diff --git a/docs/group-project-page.md b/docs/group-project-page.md
index 391cb7d..a48cd14 100644
--- a/docs/group-project-page.md
+++ b/docs/group-project-page.md
@@ -91,6 +91,8 @@ The whole group will meet for 5 minutes with the TA to suggest a plan on how to
### Deliverable 6: Individual assignment (4% weight)
+#### Assignment 4: Computational Code and Output
+
This will be the last individual assignment. Write a computation code to implement the method proposed in the previous assignment (or suggested in the interview) and use a visualization or table to report the results. In 3 or 4 sentences give a brief interpretation of the results. If needed, comment on any unexpected result or potential problems with the analysis, and possible ways to address issues encountered. If results are as expected, explain how they address the question of interest.
### Deliverable 7: Group Final Report (5% weight)
diff --git a/materials/tutorial_12/tests_tutorial_12.R b/materials/tutorial_12/tests_tutorial_12.R
new file mode 100644
index 0000000..965565b
--- /dev/null
+++ b/materials/tutorial_12/tests_tutorial_12.R
@@ -0,0 +1,316 @@
+library(digest)
+library(testthat)
+
+test_1.1 <- function() {
+ test_that('Did not assign answer to an object called "Crabs_vs_width_scatterplot"', {
+ expect_true(exists("Crabs_vs_width_scatterplot"))
+ })
+
+ test_that("Solution should be a ggplot object", {
+ expect_true(is.ggplot(Crabs_vs_width_scatterplot))
+ })
+
+ properties <- c(Crabs_vs_width_scatterplot$layers[[1]]$mapping, Crabs_vs_width_scatterplot$mapping)
+
+ test_that("Plot should have width on the x-axis", {
+ expect_true("width" == rlang::get_expr(properties$x))
+ })
+
+ test_that("Plot does not have the correct layers", {
+ expect_true("GeomPoint" %in% class(Crabs_vs_width_scatterplot$layers[[1]]$geom))
+
+ # Remove if not needed:
+ # expect_true("GeomVline" %in% class(Crabs_vs_width_scatterplot$layers[[2]]$geom))
+ })
+
+ test_that("Plot does not use the correct data", {
+ expect_equal(digest(nrow(Crabs_vs_width_scatterplot$data)), "8c2afe893b01f3a8c63e1aef3b5aad9e")
+ expect_equal(digest(round(sum(Crabs_vs_width_scatterplot$data$width))), "17a701ffb51e7429bcd57678dd80b402")
+
+ # If width is not known:
+ # expect_equal(digest(round(sum(pull(Crabs_vs_width_scatterplot$data, rlang::get_expr(properties$x))))), "HASH_HERE")
+ })
+
+ test_that("x-axis label should be descriptive and human readable", {
+ expect_false(Crabs_vs_width_scatterplot$labels$x == toString(rlang::get_expr(properties$x)))
+ })
+
+ test_that("Plot should have a title", {
+ expect_true("title" %in% names(Crabs_vs_width_scatterplot$labels))
+ })
+
+ print("Success!")
+}
+
+test_1.2 <- function() {
+ test_that('Did not assign answer to an object called "Crabs_group_avg_width"', {
+ expect_true(exists("Crabs_group_avg_width"))
+ })
+
+ test_that("Solution should be a data frame", {
+ expect_true("data.frame" %in% class(Crabs_group_avg_width))
+ })
+
+ expected_colnames <- c("width_intervals", "mean_n_males")
+ given_colnames <- colnames(Crabs_group_avg_width)
+ test_that("Data frame does not have the correct columns", {
+ expect_equal(length(setdiff(
+ union(expected_colnames, given_colnames),
+ intersect(expected_colnames, given_colnames)
+ )), 0)
+ })
+
+ test_that("Data frame does not contain the correct number of rows", {
+ expect_equal(digest(as.integer(nrow(Crabs_group_avg_width))), "71db8a6cad03244e6e50f0ad8bc95a65")
+ })
+
+ test_that("Data frame does not contain the correct data", {
+ expect_equal(digest(as.integer(sum(Crabs_group_avg_width$mean_n_males) * 10e4)), "12273d78c0faff1ea67c8f8e9c42edd0")
+ })
+
+ print("Success!")
+}
+
+test_1.3 <- function() {
+ test_that('Did not assign answer to an object called "Crabs_avg_width_scatterplot"', {
+ expect_true(exists("Crabs_avg_width_scatterplot"))
+ })
+
+ test_that("Solution should be a ggplot object", {
+ expect_true(is.ggplot(Crabs_avg_width_scatterplot))
+ })
+
+ properties <- c(Crabs_avg_width_scatterplot$layers[[1]]$mapping, Crabs_avg_width_scatterplot$mapping)
+
+ test_that("Plot should have width_intervals on the x-axis", {
+ expect_true("width_intervals" == rlang::get_expr(properties$x))
+ })
+
+ test_that("Plot does not have the correct layers", {
+ expect_true("GeomPoint" %in% class(Crabs_avg_width_scatterplot$layers[[1]]$geom))
+
+ # Remove if not needed:
+ # expect_true("GeomVline" %in% class(Crabs_avg_width_scatterplot$layers[[2]]$geom))
+ })
+
+ test_that("Plot does not use the correct data", {
+ expect_equal(digest(nrow(Crabs_avg_width_scatterplot$data)), "71db8a6cad03244e6e50f0ad8bc95a65")
+ expect_equal(digest(round(sum(Crabs_avg_width_scatterplot$data$mean_n_males))), "e1d9279a9999b2d3bb972ab3267b49c7")
+
+ # If width_intervals is not known:
+ # expect_equal(digest(round(sum(pull(Crabs_avg_width_scatterplot$data, rlang::get_expr(properties$x))))), "HASH_HERE")
+ })
+
+ test_that("x-axis label should be descriptive and human readable", {
+ expect_false(Crabs_avg_width_scatterplot$labels$x == toString(rlang::get_expr(properties$x)))
+ })
+
+ test_that("Plot should have a title", {
+ expect_true("title" %in% names(Crabs_avg_width_scatterplot$labels))
+ })
+
+ print("Success!")
+}
+
+test_1.4 <- function() {
+ test_that('Did not assign answer to an object called "answer1.4"', {
+ expect_true(exists("answer1.4"))
+ })
+
+ test_that('Solution should be a single character ("A", "B", or "C")', {
+ expect_match(answer1.4, "a|b|c", ignore.case = TRUE)
+ })
+
+ answer_hash <- digest(tolower(answer1.4))
+
+ test_that("Solution is incorrect", {
+ expect_equal(answer_hash, "127a2ec00989b9f7faf671ed470be7f8")
+ })
+
+ print("Success!")
+}
+
+test_1.5 <- function() {
+ test_that('Did not assign answer to an object called "Crabs_vs_width_scatterplot"', {
+ expect_true(exists("Crabs_vs_width_scatterplot"))
+ })
+
+ test_that("Solution should be a ggplot object", {
+ expect_true(is.ggplot(Crabs_vs_width_scatterplot))
+ })
+
+ properties <- c(Crabs_vs_width_scatterplot$layers[[1]]$mapping, Crabs_vs_width_scatterplot$mapping)
+
+ test_that("Plot should have width on the x-axis", {
+ expect_true("width" == rlang::get_expr(properties$x))
+ })
+
+ test_that("Plot does not have the correct layers", {
+ expect_true("GeomPoint" %in% class(Crabs_vs_width_scatterplot$layers[[1]]$geom))
+ })
+
+
+ test_that("Plot does not use the correct data", {
+ expect_equal(digest(nrow(Crabs_vs_width_scatterplot$data)), "8c2afe893b01f3a8c63e1aef3b5aad9e")
+ expect_equal(digest(round(sum(Crabs_vs_width_scatterplot$data$width))), "17a701ffb51e7429bcd57678dd80b402")
+
+ # If width is not known:
+ # expect_equal(digest(round(sum(pull(Crabs_vs_width_scatterplot$data, rlang::get_expr(properties$x))))), "HASH_HERE")
+ })
+
+ test_that("x-axis label should be descriptive and human readable", {
+ expect_false(Crabs_vs_width_scatterplot$labels$x == toString(rlang::get_expr(properties$x)))
+ })
+
+ test_that("Plot should have a title", {
+ expect_true("title" %in% names(Crabs_vs_width_scatterplot$labels))
+ })
+
+ print("Success!")
+}
+
+
+test_1.6 <- function() {
+ test_that('Did not assign answer to an object called "crabs_Poisson_model"', {
+ expect_true(exists("crabs_Poisson_model"))
+ })
+
+ test_that("Solution should be a glm object", {
+ expect_true("glm" %in% class(crabs_Poisson_model))
+ })
+
+ test_that("Data frame does not contain the correct data", {
+ expect_equal(digest(as.integer(sum(crabs_Poisson_model$residuals) * 10e4)), "5ab35ebc157c4f75476569c445d5a1cc")
+ })
+
+ print("Success!")
+}
+
+test_1.7 <- function() {
+ test_that('Did not assign answer to an object called "crabs_Poisson_model_results"', {
+ expect_true(exists("crabs_Poisson_model_results"))
+ })
+
+ test_that("Solution should be a data frame", {
+ expect_true("data.frame" %in% class(crabs_Poisson_model_results))
+ })
+
+ expected_colnames <- c('term','estimate','std.error','statistic','p.value','conf.low','conf.high')
+
+
+ given_colnames <- colnames(crabs_Poisson_model_results)
+ test_that("Data frame does not have the correct columns", {
+ expect_equal(length(setdiff(
+ union(expected_colnames, given_colnames),
+ intersect(expected_colnames, given_colnames)
+ )), 0)
+ })
+
+ test_that("Data frame does not contain the correct number of rows", {
+ expect_equal(digest(as.integer(nrow(crabs_Poisson_model_results))), "dd4ad37ee474732a009111e3456e7ed7")
+ })
+
+ test_that("Data frame does not contain the correct data", {
+ expect_equal(digest(as.integer(sum(crabs_Poisson_model_results$conf.low) * 10e6)), "79cef645627327fe2632a29728949fa8")
+ expect_equal(digest(as.integer(sum(crabs_Poisson_model_results$conf.high) * 10e6)), "b2f64ff96ebb309861a640e306397224")
+ expect_equal(digest(as.integer(sum(crabs_Poisson_model_results$statistic) * 10e6)), "09ad422316ee480535d0735aedfb2543")
+ })
+
+ print("Success!")
+}
+
+test_1.8 <- function() {
+ test_that('Did not assign answer to an object called "crabs_Poisson_model_results"', {
+ expect_true(exists("crabs_Poisson_model_results"))
+ })
+
+ test_that("Solution should be a data frame", {
+ expect_true("data.frame" %in% class(crabs_Poisson_model_results))
+ })
+
+ expected_colnames <- c('term','estimate','std.error','statistic','p.value','conf.low','conf.high', 'exp.estimate', 'exp.conf.low', 'exp.conf.high')
+
+ given_colnames <- colnames(crabs_Poisson_model_results)
+ test_that("Data frame does not have the correct columns", {
+ expect_equal(length(setdiff(
+ union(expected_colnames, given_colnames),
+ intersect(expected_colnames, given_colnames)
+ )), 0)
+ })
+
+ test_that("Data frame does not contain the correct number of rows", {
+ expect_equal(digest(as.integer(nrow(crabs_Poisson_model_results))), "dd4ad37ee474732a009111e3456e7ed7")
+ })
+
+ test_that("Data frame does not contain the correct data", {
+ expect_equal(digest(as.integer(sum(crabs_Poisson_model_results$exp.estimate) * 10e6)), "dd50342cffd6d0cde245787cbdd97021")
+ expect_equal(digest(as.integer(sum(crabs_Poisson_model_results$exp.conf.low) * 10e6)), "5aa9e6879f68968bdb958bdc1a4cdf32")
+ expect_equal(digest(as.integer(sum(crabs_Poisson_model_results$exp.conf.high) * 10e6)), "b6866c669fd240e038b16d7e35b90592")
+ })
+
+ print("Success!")
+}
+
+test_1.9 <- function() {
+ test_that('Did not assign answer to an object called "answer1.9"', {
+ expect_true(exists("answer1.9"))
+ })
+
+ answer_hash <- digest(tolower(answer1.9))
+ test_that("Solution is incorrect", {
+ expect_equal(answer_hash, "fd4d64bc84d8d1ac10b94c23bda1a016")
+ })
+
+ print("Success!")
+}
+
+test_1.10 <- function() {
+ test_that('Did not assign answer to an object called "answer1.10"', {
+ expect_true(exists("answer1.10"))
+ })
+
+ test_that('Solution should be a single character ("A", "B", "C", or "D")', {
+ expect_match(answer1.10, "a|b|c|d", ignore.case = TRUE)
+ })
+
+ answer_hash <- digest(tolower(answer1.10))
+ test_that("Solution is incorrect", {
+ expect_equal(answer_hash, "6e7a8c1c098e8817e3df3fd1b21149d1")
+ })
+
+ print("Success!")
+}
+
+test_1.11 <- function() {
+ test_that('Did not assign answer to an object called "answer1.11"', {
+ expect_true(exists("answer1.11"))
+ })
+
+ test_that('Solution should be a single character ("A", "B", "C", or "D")', {
+ expect_match(answer1.11, "a|b|c|d", ignore.case = TRUE)
+ })
+
+ answer_hash <- digest(tolower(answer1.11))
+ test_that("Solution is incorrect", {
+ expect_equal(answer_hash, "ddf100612805359cd81fdc5ce3b9fbba")
+ })
+
+ print("Success!")
+}
+
+test_1.12 <- function() {
+ test_that('Did not assign answer to an object called "answer1.12"', {
+ expect_true(exists("answer1.12"))
+ })
+
+ answer_as_numeric <- as.numeric(answer1.12)
+ test_that("Solution should be a number", {
+ expect_false(is.na(answer_as_numeric))
+ })
+
+ test_that("Solution is incorrect", {
+ expect_equal(digest(as.integer(answer_as_numeric * 10e6)), "07660fdc17d69e9b645c10b4a1f810be")
+ })
+
+ print("Success!")
+}
diff --git a/materials/tutorial_12/tutorial_12.ipynb b/materials/tutorial_12/tutorial_12.ipynb
new file mode 100644
index 0000000..1130586
--- /dev/null
+++ b/materials/tutorial_12/tutorial_12.ipynb
@@ -0,0 +1,1516 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "2d0f31d79866375b7f3161d0277a32a3",
+ "grade": false,
+ "grade_id": "cell-f1e1d845873036f4",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "# Tutorial 12: Binary or Discrete Counts Responses"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "d94e81925b1456028fb504d00e3a9284",
+ "grade": false,
+ "grade_id": "cell-82d9926086d47a80",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "#### Lecture and Tutorial Learning Goals:\n",
+ "After completing this week's lecture and tutorial work, you will be able to:\n",
+ "\n",
+ "1. Describe the logistic regression estimation procedure (categorical data as the response variable and explanatory variables), and Poisson regression estimation procedure (discrete counts as the response variable and explanatory variables).\n",
+ "2. Discuss the relationship between linear regression and logistic and Poisson regression. Discuss the consequences of modeling data that is more suitable for logistic and Poisson regression models as a linear regression model.\n",
+ "3. Interpret the coefficients and $p$-values in the logistic and Poisson regression settings.\n",
+ "4. Discuss useful diagnostics for logistic and Poisson regression and explain why they should be performed.\n",
+ "5. Write a computer script to perform logistic and Poisson regression and perform model diagnostics. Interpret and communicate the results from that computer script."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "6dd126a878384f00ec2c6c116565c9ff",
+ "grade": false,
+ "grade_id": "cell-a2a153352bc44a68",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Run this cell before continuing.\n",
+ "library(tidyverse)\n",
+ "library(repr)\n",
+ "library(infer)\n",
+ "library(gridExtra)\n",
+ "library(mlbench)\n",
+ "library(AER)\n",
+ "library(ISLR)\n",
+ "library(broom)\n",
+ "library(qqplotr)\n",
+ "library(performance)\n",
+ "library(see)\n",
+ "library(MASS)\n",
+ "library(glmbb)\n",
+ "library(cowplot)\n",
+ "source(\"tests_tutorial_12.R\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "d500b40fbbd8d191dc155e0b17c231df",
+ "grade": false,
+ "grade_id": "cell-801b28434bf00853",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "## 1. Poisson Regression\n",
+ "\n",
+ "Let us proceed with Poisson regression. This class of GLM is intended for cases where the response is a count (i.e., an integer). What do we mean by this? Firstly, we will load the dataset to be used in this section. The data frame `crabs` ([Brockmann, 1996](https://ubc.summon.serialssolutions.com/2.0.0/link/0/eLvHCXMwrV3JasMwEBUlpZBLl7Sl6YY_oE4sy5sgFEpICKU9NadcjFYSmtghCyRf0t-tJC_EPhRaepMHS8jSSPM0M3oGALkdx67tCYRyKV0GacCp9CBX60RCySMcYiigy2qpOq_F1ZiMLqL0v-mFYrZvvd4JXXcPEnOUZbU1z6W-fxdoh6eO-XQUvDyGCnXrbK_hxCkDDCHKgs9KH20FelDOR_pzWxXbdYhljTEanoF10e8iC6V2SbDK9Pg_H3gOTnPsar1kynYBjkTSAs1yC923wMkkNaVL8NVbkNXn8wcxjJ8b0euaZysTvyuzVJUYH9i6Kpsl1ijVOSbTtPZ2f6V6nYuectnbbLGdb2tNLNP5fjkVi1J-BcbDwbg_svOfQNhMIRdsu4KHEnIsIiQYJoGQvsKQmAuOsE8lojRnrPEIlZQHHIUEEs8PBVHQjaJr0EjSRNwAS8ebmMNk4DHkRYgQSt2ARoxRnzuhJG2AitmNlxnVR3x4REI41sMf6-GP8-GPd20QmVn7RZV4MB7p0u3fq96BZpY-rn1B96CxWW3FgyGOeDR6_g1rIQSN)) is a dataset detailing the **counts** of satellite male crabs residing around a female crab nest: `n_males`. \n",
+ "\n",
+ "> The data frame `crabs` contains 173 observations on horseshoe crabs (*Limulus polyphemus*). The response is the count of male crabs (`n_males`) around a female breeding nest. It is subject to four input variables: a factor for the `color` of the prosoma with four levels, a factor for the condition of the posterior `spine` with three levels, the continuous variables for carapace `width` (cm), and `weight` (g).\n",
+ "\n",
+ "Run the cell below before proceeding."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "d7f194427d74287ab36175b312da4159",
+ "grade": false,
+ "grade_id": "cell-c42393c3147ab6d0",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "data(crabs)\n",
+ "crabs <- crabs %>%\n",
+ " rename(n_males = satell) %>%\n",
+ " dplyr::select(-y)\n",
+ "\n",
+ "str(crabs)\n",
+ "head(crabs)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "53940b5e4147ecc0a1d918a66012392c",
+ "grade": false,
+ "grade_id": "cell-3245a397a3c0a56f",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.1**\n",
+ "
{points: 1}\n",
+ "\n",
+ "Create a scatterplot of `n_males` versus carapace `width` (via `geom_point()`), even though `n_males` is not continuous. The `ggplot()` object's name will be `Crabs_vs_width_scatterplot`. Recall that the response must be placed on the $y$-axis, whereas the continuous input must be on the $x$-axis. Include proper axis labels and title.\n",
+ "\n",
+ "*Fill out those parts indicated with `...`, uncomment the corresponding code in the cell below, and run it.*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "4e3dcaff4a02b8b0c12a3587d988c936",
+ "grade": false,
+ "grade_id": "cell-267744b8584d386b",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Crabs_vs_width_scatterplot <- \n",
+ "# ... %>%\n",
+ "# ggplot() +\n",
+ "# ...(aes(..., ...)) +\n",
+ "# labs(y = ..., x = ...) +\n",
+ "# ggtitle(...) +\n",
+ "# theme(\n",
+ "# text = element_text(size = 16.5),\n",
+ "# plot.title = element_text(face = \"bold\"),\n",
+ "# axis.title = element_text(face = \"bold\"),\n",
+ "# legend.title = element_text(face = \"bold\")) + \n",
+ "# scale_x_continuous(breaks = seq(20, 34, 2))\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "\n",
+ "Crabs_vs_width_scatterplot"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "0ab5a2b196ff6c3c1ed3a2b30946f711",
+ "grade": true,
+ "grade_id": "cell-4ea95eb36fa39c91",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "test_1.1()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "6142c87aa6d0d6518e09687fd9fa8908",
+ "grade": false,
+ "grade_id": "cell-c3c5c48f76ee8a52",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "Note the characteristic horizontal pattern in the points of `Crabs_vs_width_scatterplot`, since the $y$-axis has repeated counts associated with different `width` values. Graphically speaking, is the variable the carapace `width` associated with `n_males`?\n",
+ "\n",
+ "From the `Crabs_vs_width_scatterplot` above, it is hard to graphically conclude something about the relationship between `n_males` and caparace `width`. Hence, let us plot the average `n_males` by non-overlapped carapace `width` groups. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "5033c071993b6ed60129dcf2bbd9196c",
+ "grade": false,
+ "grade_id": "cell-d8dc6bd50a1e51f1",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.2**\n",
+ "
{points: 1}\n",
+ "\n",
+ "Create a data frame called `Crabs_group_avg_width`, which is created from `crabs` and has two columns:\n",
+ "\n",
+ "- `width_intervals`: a column created with column `width` via function `cut()` with `breaks = 10` (i.e., bins).\n",
+ "- `mean_n_males`: the average `n_males` by each bin.\n",
+ "\n",
+ "*Fill out those parts indicated with `...`, uncomment the corresponding code in the cell below, and run it.*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "e6cfebcffc2b10cd7d5b47614f365c70",
+ "grade": false,
+ "grade_id": "cell-e136e294390e75db",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Crabs_group_avg_width <- \n",
+ "# ... %>%\n",
+ "# ...(width_intervals = ...(..., ...)) %>%\n",
+ "# group_by(...) %>% \n",
+ "# summarise(... = ...(...)) \n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "\n",
+ "Crabs_group_avg_width"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "429c9f614730ce806230cd149ce6cd13",
+ "grade": true,
+ "grade_id": "cell-24bf5f18b3ee2717",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "test_1.2()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "5ab1e5a66cefd3b5a20e56692594c572",
+ "grade": false,
+ "grade_id": "cell-fcf23138e5f9e32f",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.3**\n",
+ "
{points: 1}\n",
+ "\n",
+ "Create another scatterplot of `mean_n_males` on the $y$-axis versus the carapace `width_intervals` on the $x$-axis using `Crabs_group_avg_width` with `geom_point()`. The `ggplot()` object's name will be `Crabs_avg_width_scatterplot`. Include proper axis labels and title.\n",
+ "\n",
+ "*Fill out those parts indicated with `...`, uncomment the corresponding code in the cell below, and run it.*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "062821791d38686c9c2f8ec6bba4af44",
+ "grade": false,
+ "grade_id": "cell-72dcede930e14421",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Crabs_avg_width_scatterplot <- ... %>%\n",
+ "# ggplot() +\n",
+ "# ...(aes(..., ...), colour = \"red\", size = 4) +\n",
+ "# labs(y = ..., x = ...) +\n",
+ "# ggtitle(...) +\n",
+ "# theme(\n",
+ "# text = element_text(size = 14),\n",
+ "# plot.title = element_text(face = \"bold\"),\n",
+ "# axis.title = element_text(face = \"bold\"),\n",
+ "# legend.title = element_text(face = \"bold\"),\n",
+ "# axis.text.x = element_text(angle = 45, hjust = 1)\n",
+ "# )\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "\n",
+ "options(repr.plot.width = 15, repr.plot.height = 7)\n",
+ "plot_grid(Crabs_vs_width_scatterplot, Crabs_avg_width_scatterplot, ncol = 2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "06d23e2fc7522963d9ad92d2bc5ceedc",
+ "grade": true,
+ "grade_id": "cell-c470e7f855f8f4c8",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "test_1.3()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "a227e6065d0254d0e966f2b955bdde66",
+ "grade": false,
+ "grade_id": "cell-58f301f5cab01d1a",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.4**\n",
+ "
{points: 1}\n",
+ "\n",
+ "By looking at `Crabs_avg_width_scatterplot`, graphically speaking, what is the relationship between `n_males` and carapace `width`?\n",
+ "\n",
+ "**A.** Positive.\n",
+ "\n",
+ "**B.** Negative.\n",
+ "\n",
+ "**C.** No relationship.\n",
+ "\n",
+ "*Assign your answer to the object `answer1.4` (character type surrounded by quotes).*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "d417bb44aa9ac1ad7ea23c927da2496f",
+ "grade": false,
+ "grade_id": "cell-5f4531deea0754da",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# answer1.4 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "\n",
+ "answer1.4"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "3fe969a9a11de52d0812d9d263da9ee1",
+ "grade": true,
+ "grade_id": "cell-32224ab4e0d00a59",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "test_1.4()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "ca474bd507ac23a98657c19cea770765",
+ "grade": false,
+ "grade_id": "cell-98e957ee7f6dfc9c",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "If we compare `Crabs_vs_width_scatterplot` and `Crabs_avg_width_scatterplot`, we can see that working with the averages of `n_males` by carapace `width` intervals gives us a clearer perspective of the relationship between these two variables. Nonetheless, we need to find a suitable model to confirm this statistically.\n",
+ "\n",
+ "Recall that the residual component in an ordinary linear regression model, namely $\\varepsilon_i$, is assumed to be Normally distributed, making the response $Y_i$ Normally distributed. In this case, our response variable is the \"Number of male crabs\" (count!). Counts distributions can be asymmetric, and they are non-negative, thus the Normal distribution might not be adequate. Nor the logistic regression, since we are not estimating proportions. \n",
+ "\n",
+ "A very useful distribution to model counts (not the only one!) is the Poisson distribution. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "5790238897dcac0b0d03098c3cb3fa8f",
+ "grade": false,
+ "grade_id": "cell-d07f008bc4129b8f",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**The Poisson Regression**\n",
+ "\n",
+ "A Poisson random variable takes discrete non-negative integer values that count something in a given timeframe or even in a space such as a geographic unit. A particularity of the Poisson distribution is that its mean is equal to its variance. Thus, any factor that affects the mean will also affect the variance. This fact could be a potential drawback for using a Poisson regression model.\n",
+ "\n",
+ "The Poisson regression model is given by:\n",
+ "\n",
+ "$$Y_i|\\mathbf{X}_i \\sim \\text{Poisson}(\\lambda_i),$$\n",
+ "\n",
+ "$$\\log(\\lambda_i) = \\beta_0 + \\beta_1X_{1,i} + \\ldots + \\beta_pX_{1,p}$$\n",
+ "\n",
+ "or equivalently,\n",
+ "\n",
+ "$$\\lambda_i = e^{\\beta_0 + \\beta_1X_{1,i} + \\ldots + \\beta_pX_{1,p}}$$\n",
+ "\n",
+ "where each variable has its own mean, $\\lambda_i$, and variance, also $\\lambda_i$. The parameter $\\lambda_i$ is interpreted as the risk of an event occurring in a given timeframe or even a space. Note that $\\lambda_i$ cannot be negative, hence it is a useful function to link $E[Y_i|\\mathbf{X}_i]=\\lambda_i$ with the the linear component."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "e4b6ad031020360f52082c24f73240fe",
+ "grade": false,
+ "grade_id": "cell-27605b7706aedb71",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.5**\n",
+ "
{points: 1}\n",
+ "\n",
+ "For our `crabs` dataset the events are the number of male crabs, `n_males`, around a space: the female breeding nest. Suppose we want to make inference on whether the carapace `width` is related to the response `n_males`. Thus, we could use Poisson regression. Let $X_{\\texttt{width}_i}$ be the $i$th value for the input `width` in our dataset `crabs`. The model's regression equation will be:\n",
+ "\n",
+ "$$\\log(\\lambda_i) = \\beta_0 + \\beta_1 X_{\\texttt{width}_i}.$$\n",
+ "\n",
+ "Let us plot the predictions of this model on top of `Crabs_vs_width_scatterplot`. Use `geom_smooth()` with `method = \"glm\"` and `method.args = list(family = poisson)`.\n",
+ "\n",
+ "*Fill out those parts indicated with `...`, uncomment the corresponding code in the cell below, and run it.*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "dcf2caa2b54bc8b29705e3acac978768",
+ "grade": false,
+ "grade_id": "cell-3a1e6d7e495e79d0",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Crabs_vs_width_scatterplot <- \n",
+ "# Crabs_vs_width_scatterplot +\n",
+ "# ...(\n",
+ "# aes(..., ...), \n",
+ "# ..., se = FALSE,\n",
+ "# ...)\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "\n",
+ "Crabs_vs_width_scatterplot"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "419c75caf1de4a00fff5719224bc8cb6",
+ "grade": true,
+ "grade_id": "cell-e6f45c72012f6548",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_1.5()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "29bc289c41116e36fe2426e5055d88eb",
+ "grade": false,
+ "grade_id": "cell-04bbdae892ba5783",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "As seen in other models, the parameters $\\beta_0, \\beta_1, \\dots, \\beta_{p}$ are unknown population coefficients that we want to estimate using data. \n",
+ "\n",
+ "In order to fit a Poisson regression model, we can also use the function `glm()` and its argument `family = poisson` (required to specify the Poisson nature of the response), which obtains the estimates $\\hat{\\beta}_0, \\hat{\\beta}_1, \\dots \\hat{\\beta}_{p}$. The estimates are obtained through maximum likelihood where we assume a Poisson joint probability mass function of the $n$ responses $Y_i$."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "7709c39efbfc26544edb44656db777bd",
+ "grade": false,
+ "grade_id": "cell-1cd6b2d8388adfad",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.6**\n",
+ "
{points: 1}\n",
+ "\n",
+ "Using `glm()`, estimate a Poisson regression model with `n_males` as a response and two input variables: `width` ($X_{\\texttt{width}_i}$) and `color` ($X_{\\texttt{color_darker}_i}$, $X_{\\texttt{color_light}_i}$, and $X_{\\texttt{color_medium}_i}$) for the $i$th observation:\n",
+ "\n",
+ "$$\n",
+ "log(\\lambda_i) = \\beta_0 + \\beta_1 X_{\\texttt{width}_i} + \\beta_2 X_{\\texttt{color_darker}_i} + \\beta_3 X_{\\texttt{color_light}_i} + \\beta_4 X_{\\texttt{color_medium}_i}.\n",
+ "$$\n",
+ "\n",
+ "Note that the ordinal input `color` has four levels (where `dark` is the baseline): "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "ef6561481dd7f355a99408c62d1dcd9a",
+ "grade": false,
+ "grade_id": "cell-a5c51d7457cd83a8",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "levels(crabs$color)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "9f11986d9d874da9b2c6d7a4a127d49d",
+ "grade": false,
+ "grade_id": "cell-3474c3e0f1563e34",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "Therefore, we have three dummy variables: $X_{\\texttt{color_darker}_i}$, $X_{\\texttt{color_light}_i}$, and $X_{\\texttt{color_medium}_i}$. Depending on the `color`, these dummy variables take on the following values:\n",
+ "\n",
+ "- When `color` is `darker`, then $X_{\\texttt{color_darker}_i} = 1$ while the other two dummy variables $X_{\\texttt{color_light}_i} = X_{\\texttt{color_medium}_i} = 0$.\n",
+ "- When `color` is `light`, then $X_{\\texttt{light}_i} = 1$ while the other two dummy variables $X_{\\texttt{color_darker}_i} = X_{\\texttt{color_medium}_i} = 0$.\n",
+ "- When `color` is `medium`, then $X_{\\texttt{medium}_i} = 1$ while the other two dummy variables $X_{\\texttt{color_darker}_i} = X_{\\texttt{color_light}_i} = 0$.\n",
+ "\n",
+ "Call the model `crabs_Poisson_model`.\n",
+ " \n",
+ "*Fill out those parts indicated with `...`, uncomment the corresponding code in the cell below, and run it.* "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "0c23fb53ad786e0805a83daabc30eb8a",
+ "grade": false,
+ "grade_id": "cell-cce6669279499835",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# crabs_Poisson_model <- ...(\n",
+ "# ...,\n",
+ "# ...,\n",
+ "# ...\n",
+ "# )\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "\n",
+ "summary(crabs_Poisson_model)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "01ca38e188374d6711e79151a5fec892",
+ "grade": true,
+ "grade_id": "cell-97cc9b41beb464bf",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "test_1.6()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "a275f0f6e422f602fbd0bcfa871d3289",
+ "grade": false,
+ "grade_id": "cell-b7f592bf64da6cf8",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.7**\n",
+ "
{points: 1}\n",
+ "\n",
+ "Report the estimated coefficients, their standard errors, and corresponding $p$-values using `tidy()` with `crabs_Poisson_model`. Include the corresponding asymptotic 95% confidence intervals. Store the results in the variable `crabs_Poisson_model_results`.\n",
+ "\n",
+ "*Fill out those parts indicated with `...`, uncomment the corresponding code in the cell below, and run it.*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "b8ee3cebef5e66c1989656ce43ab8951",
+ "grade": false,
+ "grade_id": "cell-4deafdcc73d1ddc4",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# crabs_Poisson_model_results <- \n",
+ "# ...(..., conf.int = TRUE) %>%\n",
+ "# mutate_if(is.numeric, round, 3)\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "\n",
+ "crabs_Poisson_model_results"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "338e86ff29326cb30c941ede1a46255c",
+ "grade": true,
+ "grade_id": "cell-441e381e29ca858b",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_1.7()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "a9c205d86ae2d22d334014e8ae098f8b",
+ "grade": false,
+ "grade_id": "cell-f95b8a861b29b15b",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.8**\n",
+ "
{points: 1}\n",
+ "\n",
+ "Since the link function is logarithmic, we can also interpret the exponentiated coefficients. Add to `crabs_Poisson_model_results` the estimate, and 95% confidence interval, of $e^{\\beta_j}, j=0,...,p$. \n",
+ "\n",
+ "*Fill out those parts indicated with `...`, uncomment the corresponding code in the cell below, and run it.*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "4ae1de2b5efd7ca21e55ad9390c55f1b",
+ "grade": false,
+ "grade_id": "cell-faa353ab9211c114",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# crabs_Poisson_model_results <- \n",
+ "# crabs_Poisson_model_results %>%\n",
+ "# mutate(\n",
+ "# exp.estimate = ...,\n",
+ "# exp.conf.low = ...,\n",
+ "# exp.conf.high = ...) %>%\n",
+ "# mutate_if(is.numeric, round, 4)\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "\n",
+ "crabs_Poisson_model_results\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "a615caefcc12fee9864b794e383c0a3d",
+ "grade": true,
+ "grade_id": "cell-1822a6990bcbbd0f",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "test_1.8()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "7a2c5ac3641b01bf800bd44b047d7fdc",
+ "grade": false,
+ "grade_id": "cell-d029850491a79223",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "Note that you can also get the exponentiated estimated coefficients using `tidy` when the link function is `log` or `logit`. Note that `std.error` and `statistic` are not adjusted and it does not report the raw coefficients."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "72d9f39879dae06280ccf6b1d8876395",
+ "grade": false,
+ "grade_id": "cell-d41aa0a8719a312b",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "tidy(crabs_Poisson_model, exponentiate = FALSE,conf.int = TRUE)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "44c3dd5dd57ecf3fa13ce816a7cac29e",
+ "grade": false,
+ "grade_id": "cell-87f78f36becf5ba3",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "tidy(crabs_Poisson_model, exponentiate = TRUE,conf.int = TRUE)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "2f9677618f8b5a521ccf7c8e681f0609",
+ "grade": false,
+ "grade_id": "cell-7c2691fb066e9a1e",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.9**\n",
+ "
{points: 1}\n",
+ "\n",
+ "Using a **significance level $\\alpha = 0.05$**, and the output in `crabs_Poisson_model_results`, which of the following statements is TRUE?\n",
+ "\n",
+ "**A.** There's enough evidence to reject the null hypothesis that the coefficient of carapace `width` is zero.\n",
+ "\n",
+ "**B.** There's enough evidence to reject the null hypothesis that, for any width, the mean numbers of male crabs with `dark` and `darker` colors of the prosoma are equal.\n",
+ "\n",
+ "**C.** There's enough evidence to reject the null hypothesis that, for any width, the mean numbers of male crabs with `dark` and `light` colors of the prosoma are equal. \n",
+ "\n",
+ "**D.** There's enough evidence to reject the null hypothesis that, for any width, the mean numbers of male crabs with `dark` and `medium` colors of the prosoma are equal. \n",
+ "\n",
+ "*Assign your answers to the object `answer1.9`. Your answers have to be included in a single string indicating the correct options **in alphabetical order** and surrounded by quotes (e.g., `\"ABCD\"` indicates you are selecting the four options).*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "d65d30b90231faecc44c6d528236c653",
+ "grade": false,
+ "grade_id": "cell-d1466876e62cbd8d",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer1.9 <- \n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "\n",
+ "answer1.9"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "9a60387f4dcfb40d6906afa52ea7a93e",
+ "grade": true,
+ "grade_id": "cell-c4b2bdbb07660385",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_1.9()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "2bfbb69a735c6be77b3a76f62ea9b164",
+ "grade": false,
+ "grade_id": "cell-3cce4716202a3ac2",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Interpretation of estimated coefficients for continuous covariates** \n",
+ "\n",
+ "Firstly, let us focus on the coefficient interpretation corresponding to carapace `width`, *while keeping `color` constant*. Consider an observation with a given value $X_{\\texttt{width}} = \\texttt{w}$ cm, and another observation with a given $X_{\\texttt{width + 1}} = \\texttt{w} + 1$ cm (i.e., an increase of $1$ cm). Then we have their corresponding regression equations:\n",
+ "\n",
+ "$$\n",
+ "\\log \\lambda_{\\texttt{width}} = \\beta_0 + \\beta_1 \\overbrace{\\texttt{w}}^{X_{\\texttt{width}}} + \\overbrace{\\beta_2 X_{\\texttt{color_darker}} + \\beta_3 X_{\\texttt{color_light}} + \\beta_4 X_{\\texttt{color_medium}}}^{\\text{Constant}}\n",
+ "$$\n",
+ "$$\n",
+ "\\log \\lambda_{\\texttt{width + 1}} = \\beta_0 + \\beta_1 \\underbrace{(\\texttt{w} + 1)}_{X_{\\texttt{width + 1}}} + \\underbrace{\\beta_2 X_{\\texttt{color_darker}} + \\beta_3 X_{\\texttt{color_light}} + \\beta_4 X_{\\texttt{color_medium}}.}_{\\text{Constant}}\n",
+ "$$\n",
+ "\n",
+ "We take the difference between both equations as:\n",
+ "\n",
+ "\\begin{align*}\n",
+ "\\log \\lambda_{\\texttt{width + 1}} - \\log \\lambda_{\\texttt{width}} &= \\beta_1 (\\texttt{w} + 1) - \\beta_1 \\texttt{w} \\\\\n",
+ "&= \\beta_1.\n",
+ "\\end{align*}\n",
+ "\n",
+ "Then, we apply the logarithm property for a ratio:\n",
+ "\n",
+ "\\begin{align*}\n",
+ "\\log \\frac{\\lambda_{\\texttt{width + 1}} }{\\lambda_{\\texttt{width}}} &= \\log \\lambda_{\\texttt{width + 1}} - \\log \\lambda_{\\texttt{width}} \\\\\n",
+ "&= \\beta_1.\n",
+ "\\end{align*}\n",
+ "\n",
+ "Finally, we have to exponentiate the previous equation:\n",
+ "\n",
+ "$$\n",
+ "\\frac{\\lambda_{\\texttt{width + 1}} }{\\lambda_{\\texttt{width}}} = e^{\\beta_1}.\n",
+ "$$\n",
+ "\n",
+ "This expression indicates that the mean count varies in a multiplicative way when a continuous covariate increases by 1 unit, i.e., $\\lambda_{\\texttt{width + 1}}= e^{\\beta_1}\\lambda_{\\texttt{width}}$."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "d18dc4097eb487b67385f4107bf2cafc",
+ "grade": false,
+ "grade_id": "cell-30680ad0b12b005a",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.10**\n",
+ "
{points: 1}\n",
+ "\n",
+ "**Using the column `exp.estimate` from `crabs_Poisson_model_results`**, what is the correct interpretation of the regression equation's estimated slope for `width`?\n",
+ "\n",
+ "**A.** The mean count of male crabs (`n_males`) around a female breeding nest decreases by $161\\%$ when increasing the carapace `width` by $1$ cm, *while keeping `color` constant*.\n",
+ "\n",
+ "**B.** The mean count of male crabs (`n_males`) around a female breeding nest increases by $161\\%$ when increasing the carapace `width` by $1$ cm, *while keeping `color` constant*.\n",
+ "\n",
+ "**C.** The mean count of male crabs (`n_males`) around a female breeding nest increases by $16.1\\%$ when increasing the carapace `width` by $1$ cm, *while keeping `color` constant*.\n",
+ "\n",
+ "**D.** The mean count of male crabs (`n_males`) around a female breeding nest decreases by $16.1\\%$ when increasing the carapace `width` by $1$ cm, *while keeping `color` constant*.\n",
+ "\n",
+ "**E.** The mean count of *dark* male crabs (`n_males`) around a female breeding nest increases by $161\\%$ when increasing the carapace `width` by $1$ cm.\n",
+ "\n",
+ "**F.** The mean count of *dark* male crabs (`n_males`) around a female breeding nest increases by $16.1\\%$ when increasing the carapace `width` by $1$ cm.\n",
+ "\n",
+ "*Assign your answer to the object `answer1.10`. Your answer should be one of `\"A\"`, `\"B\"`, `\"C\"`, `\"D\"`, `\"E\"`, or `\"F\"` surrounded by quotes.*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "52bf049475f2209a74c35c60713d28d2",
+ "grade": false,
+ "grade_id": "cell-813f996317ff61dd",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# answer1.10 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "\n",
+ "answer1.10"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "f7233916807a2fc8edae2069861a8ade",
+ "grade": true,
+ "grade_id": "cell-01b57f216ff8f262",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "test_1.10()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "f52d9e399a01697d185382fab3d2c8e2",
+ "grade": false,
+ "grade_id": "cell-01590258a807acee",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Interpretation of estimated coefficients of dummy variables** \n",
+ "\n",
+ "*Keeping `width` constant*, at any value (recall assumption of additive models!) consider an observation from the `darker` group: \n",
+ "\n",
+ "Then we have their corresponding regression equations:\n",
+ "\n",
+ "$$\n",
+ "\\log \\lambda_{\\texttt{width}} = \\beta_0 + \\beta_1 \\overbrace{\\texttt{w}}^{\\text{Constant}} + \\beta_2 X_{\\texttt{color_darker}} + \\beta_3 X_{\\texttt{color_light}} + \\beta_4 X_{\\texttt{color_medium}}\n",
+ "$$\n",
+ "\n",
+ "$$\n",
+ "\\log \\lambda_{\\texttt{width,dark}} = \\beta_0 + \\beta_1 \\texttt{w} + \\beta_2 \\times 0 + \\beta_3 \\times 0 + \\beta_4 \\times 0 \n",
+ "$$\n",
+ "\n",
+ "$$\n",
+ "\\log \\lambda_{\\texttt{width,darker}} = \\beta_0 + \\beta_1 \\texttt{w} + \\beta_2 \\times 1 + \\beta_3 \\times 0 + \\beta_4 \\times 0.\n",
+ "$$\n",
+ "\n",
+ "We take the difference between both equations as:\n",
+ "\n",
+ "\\begin{align*}\n",
+ "\\log \\lambda_{\\texttt{width,darker}} - \\log \\lambda_{\\texttt{width,dark}} = \\beta_2.\n",
+ "\\end{align*}\n",
+ "\n",
+ "Then, we apply the logarithm property for a ratio:\n",
+ "\n",
+ "\\begin{align*}\n",
+ "\\log \\frac{\\lambda_{\\texttt{width,darker}} }{\\lambda_{\\texttt{width,dark}}} &= \\log \\lambda_{\\texttt{width,darker}} - \\log \\lambda_{\\texttt{width,dark}} \\\\\n",
+ "&= \\beta_2.\n",
+ "\\end{align*}\n",
+ "\n",
+ "Finally, we have to exponentiate the previous equation:\n",
+ "\n",
+ "$$\n",
+ "\\frac{\\lambda_{\\texttt{width,darker}} }{\\lambda_{\\texttt{width,dark}}} = e^{\\beta_2}.\n",
+ "$$\n",
+ "\n",
+ "The expression $\\frac{\\lambda_{\\texttt{width,darker}} }{\\lambda_{\\texttt{width,dark}}} = e^{\\beta_2}$ indicates that the mean count changes in a multiplicative way between the two groups."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "2730f7c988816ff03732f5225d70d2eb",
+ "grade": false,
+ "grade_id": "cell-f8af88cea66668b0",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.11**\n",
+ "
{points: 1}\n",
+ "\n",
+ "Let us move on to the interpretation of the coefficient corresponding to `light` from `color` (with reference level `dark`).\n",
+ "\n",
+ "Using the `crabs_Poisson_model_results` tibble, what is the correct interpretation of the regression estimated coefficient for the dummy variable `light`?\n",
+ "\n",
+ "**A.** The mean count of male crabs (`n_males`) around a female breeding nest is $54.7\\%$ lower in the `light` prosoma group compared to `dark` group, *while keeping the carapace `width` constant (for any width value).*\n",
+ "\n",
+ "**B.** The mean count of male crabs (`n_males`) around a female breeding nest $54.7\\%$ higher in the `light` prosoma group compared to `dark` group, *while keeping the carapace `width` constant (for any width value).*\n",
+ "\n",
+ "**C.** The mean count of male crabs (`n_males`) around a female breeding nest $154.7\\%$ higher in the `light` prosoma group compared to `dark` group, *while keeping the carapace `width` constant (for any width value).*\n",
+ "\n",
+ "**D.** The mean count of male crabs (`n_males`) around a female breeding nest $154.7\\%$ lower in the `light` prosoma group compared to `dark` group, *while keeping the carapace `width` constant (for any width value).*\n",
+ "\n",
+ "**E.** The mean count of male crabs (`n_males`) around a female breeding nest is $54.7\\%$ lower in the `dark` prosoma group compared to `light` group, *while keeping the carapace `width` constant (for any width value).*\n",
+ "\n",
+ "**F.** The mean count of male crabs (`n_males`) around a female breeding nest $54.7\\%$ higher in the `dark` prosoma group compared to `light` group, *while keeping the carapace `width` constant (for any width value).*\n",
+ "\n",
+ "*Assign your answer to the object `answer1.11`. Your answer should be one of `\"A\"`, `\"B\"`, `\"C\"`, `\"D\"`, `\"E\"`, or `\"F\"` surrounded by quotes.*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "ff760b89269d4912c27982a3fa0e2af6",
+ "grade": false,
+ "grade_id": "cell-9dd1384a6af3a36a",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# answer1.11 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "\n",
+ "answer1.11"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "d9edace5350e6c726a32663d2322356c",
+ "grade": true,
+ "grade_id": "cell-27d30eed0b7b8bae",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "test_1.11()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "0ebd56f5239783cd05b4b3d5763e2d14",
+ "grade": false,
+ "grade_id": "cell-96565a64571f2a45",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.12**\n",
+ "
{points: 1}\n",
+ "\n",
+ "Suppose we want to predict the mean count of male crabs (`n_males`) around a female breeding nest with a carapace `width` of $27.5$ cm and a `light` `color` of the prosoma. Then, using the function `predict()` with the object `crabs_Poisson_model`, obtain the corresponding prediction.\n",
+ "\n",
+ "> **Hint:** Check the argument `type` when coding this prediction.\n",
+ "\n",
+ "*Assign your answer to the object `answer1.12`. Fill out those parts indicated with `...`, uncomment the corresponding code in the cell below, and run it.*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "e495e0ababf6655472414e9b0822ac7a",
+ "grade": false,
+ "grade_id": "cell-5f4ccaec6bf73492",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# answer1.12 <- \n",
+ "# ...(...,\n",
+ "# tibble(..., ..., ...),\n",
+ "# type = ...\n",
+ "# )\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "\n",
+ "answer1.12"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "81c218f35362c5b32755ab5d436812c3",
+ "grade": true,
+ "grade_id": "cell-0df8c510b063bb57",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "test_1.12()"
+ ]
+ }
+ ],
+ "metadata": {
+ "jupytext": {
+ "formats": "ipynb,Rmd"
+ },
+ "kernelspec": {
+ "display_name": "R",
+ "language": "R",
+ "name": "ir"
+ },
+ "language_info": {
+ "codemirror_mode": "r",
+ "file_extension": ".r",
+ "mimetype": "text/x-r-source",
+ "name": "R",
+ "pygments_lexer": "r",
+ "version": "4.2.3"
+ },
+ "latex_envs": {
+ "LaTeX_envs_menu_present": true,
+ "autoclose": false,
+ "autocomplete": true,
+ "bibliofile": "biblio.bib",
+ "cite_by": "apalike",
+ "current_citInitial": 1,
+ "eqLabelWithNumbers": true,
+ "eqNumInitial": 1,
+ "hotkeys": {
+ "equation": "Ctrl-E",
+ "itemize": "Ctrl-I"
+ },
+ "labels_anchors": false,
+ "latex_user_defs": false,
+ "report_style_numbering": false,
+ "user_envs_cfg": false
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/materials/worksheet_12/tests_worksheet_12.R b/materials/worksheet_12/tests_worksheet_12.R
new file mode 100644
index 0000000..0e4a4f1
--- /dev/null
+++ b/materials/worksheet_12/tests_worksheet_12.R
@@ -0,0 +1,357 @@
+library(digest)
+library(testthat)
+
+test_1.0 <- function() {
+ test_that('Did not assign answer to an object called "default.df"', {
+ expect_true(exists("default.df"))
+ })
+
+ test_that("Solution should be a data frame", {
+ expect_true("data.frame" %in% class(default.df))
+ })
+
+ expected_colnames <- c("default", "student", "balance", "income")
+ given_colnames <- colnames(default.df)
+ test_that("Data frame does not have the correct columns", {
+ expect_equal(length(setdiff(
+ union(expected_colnames, given_colnames),
+ intersect(expected_colnames, given_colnames)
+ )), 0)
+ })
+
+ test_that("Data frame does not contain the correct number of rows", {
+ expect_equal(digest(as.integer(nrow(default.df))), "f1a30baa3072c1aad822c059f35c6841")
+ })
+
+ test_that("Data frame does not contain the correct data", {
+ expect_equal(digest(as.integer(sum(default.df$balance))), "fe77e9bf661be6a426411dc60fe6b6cd")
+ })
+
+ print("Success!")
+}
+
+test_1.1 <- function() {
+ test_that('Did not assign answer to an object called "Default_SLR_plot"', {
+ expect_true(exists("Default_SLR_plot"))
+ })
+
+ test_that("Solution should be a ggplot object", {
+ expect_true(is.ggplot(Default_SLR_plot))
+ })
+
+ properties <- c(Default_SLR_plot$layers[[1]]$mapping, Default_SLR_plot$mapping)
+
+ test_that("Plot should have balance on the x-axis", {
+ expect_true("balance" == rlang::get_expr(properties$x))
+ })
+
+ test_that("Plot does not have the correct layers", {
+ expect_true("GeomPoint" %in% class(Default_SLR_plot$layers[[1]]$geom))
+ })
+
+
+ test_that("Plot does not use the correct data", {
+ expect_equal(digest(nrow(Default_SLR_plot$data)), "f1a30baa3072c1aad822c059f35c6841")
+ expect_equal(digest(round(sum(Default_SLR_plot$data$balance))), "3267136924eba9966e7fa0114b916d95")
+ })
+
+ test_that("x-axis label should be descriptive and human readable", {
+ expect_false(Default_SLR_plot$labels$x == toString(rlang::get_expr(properties$x)))
+ })
+
+ test_that("Plot should have a title", {
+ expect_true("title" %in% names(Default_SLR_plot$labels))
+ })
+
+ print("Success!")
+}
+
+test_1.2 <- function() {
+ test_that('Did not assign answer to an object called "logistic_curve"', {
+ expect_true(exists("logistic_curve"))
+ })
+
+ test_that("Solution should be a ggplot object", {
+ expect_true(is.ggplot(logistic_curve))
+ })
+
+ properties <- c(logistic_curve$layers[[1]]$mapping, logistic_curve$mapping)
+
+ test_that("Plot should have z on the x-axis", {
+ expect_true("z" == rlang::get_expr(properties$x))
+ })
+
+ test_that("Plot does not have the correct layers", {
+ expect_true("GeomLine" %in% class(logistic_curve$layers[[1]]$geom))
+
+ # Remove if not needed:
+ # expect_true("GeomVline" %in% class(logistic_curve$layers[[2]]$geom))
+ })
+
+ test_that("Plot does not use the correct data", {
+ expect_equal(digest(nrow(logistic_curve$data)), "cada6c6b62b103dca67f23bd7d60ac3c")
+ expect_equal(digest(round(sum(logistic_curve$data$z))), "908d1fd10b357ed0ceaaec823abf81bc")
+ })
+
+ test_that("Plot should have a title", {
+ expect_true("title" %in% names(logistic_curve$labels))
+ })
+
+ print("Success!")
+}
+
+test_1.3 <- function() {
+ test_that('Did not assign answer to an object called "answer1.3"', {
+ expect_true(exists("answer1.3"))
+ })
+
+ answer_as_numeric <- as.numeric(answer1.3)
+ test_that("Solution should be a number", {
+ expect_false(is.na(answer_as_numeric))
+ })
+
+ test_that("Solution is incorrect", {
+ expect_equal(digest(as.integer(answer_as_numeric * 10e6)), "9ce8f1f78c56eccda919dc6404297331")
+ })
+
+ print("Success!")
+}
+
+test_1.4 <- function() {
+ test_that('Did not assign answer to an object called "Default_SLR_plot"', {
+ expect_true(exists("Default_SLR_plot"))
+ })
+
+ test_that("Solution should be a ggplot object", {
+ expect_true(is.ggplot(Default_SLR_plot))
+ })
+
+ properties <- c(Default_SLR_plot$layers[[1]]$mapping, Default_SLR_plot$mapping)
+
+ test_that("Plot should have balance on the x-axis", {
+ expect_true("balance" == rlang::get_expr(properties$x))
+ })
+
+ test_that("Plot does not have the correct layers", {
+ expect_true("GeomPoint" %in% class(Default_SLR_plot$layers[[1]]$geom))
+ })
+
+
+ test_that("Plot does not use the correct data", {
+ expect_equal(digest(nrow(Default_SLR_plot$data)), "f1a30baa3072c1aad822c059f35c6841")
+ expect_equal(digest(round(sum(Default_SLR_plot$data$balance))), "3267136924eba9966e7fa0114b916d95")
+ })
+
+ test_that("x-axis label should be descriptive and human readable", {
+ expect_false(Default_SLR_plot$labels$x == toString(rlang::get_expr(properties$x)))
+ })
+
+ test_that("Plot should have a title", {
+ expect_true("title" %in% names(Default_SLR_plot$labels))
+ })
+
+ print("Success!")
+}
+
+#-----
+test_1.5 <- function() {
+ test_that('Did not assign answer to an object called "Default_binary_log_student"', {
+ expect_true(exists("Default_binary_log_student"))
+ })
+
+ test_that("Solution should be a glm object", {
+ expect_true("glm" %in% class(Default_binary_log_student))
+ })
+
+ test_that("Data frame does not contain the correct data", {
+ expect_equal(digest(as.integer(sum(Default_binary_log_student$residuals) * 10e4)),
+ "1473d70e5646a26de3c52aa1abd85b1f")
+ })
+
+ print("Success!")
+}
+
+test_1.6 <- function() {
+ test_that('Did not assign answer to an object called "answer1.6"', {
+ expect_true(exists("answer1.6"))
+ })
+
+ test_that('Solution should be a single character ("A", "B", "C", or "D")', {
+ expect_match(answer1.6, "a|b|c|d", ignore.case = TRUE)
+ })
+
+ answer_hash <- digest(tolower(answer1.6))
+
+ test_that("Solution is incorrect", {
+ expect_equal(answer_hash, "ddf100612805359cd81fdc5ce3b9fbba")
+ })
+
+ print("Success!")
+}
+
+
+test_1.7 <- function() {
+ test_that('Did not assign answer to an object called "Default_binary_log_model"', {
+ expect_true(exists("Default_binary_log_model"))
+ })
+
+ test_that("Solution should be a glm object", {
+ expect_true("glm" %in% class(Default_binary_log_model))
+ })
+
+ test_that("Data frame does not contain the correct data", {
+ expect_equal(digest(as.integer(sum(Default_binary_log_model$residuals) * 10e4)), "4cd0d70c6d524273008afa4703cd9df0")
+ })
+
+ print("Success!")
+}
+
+
+test_1.8 <- function() {
+ test_that('Did not assign answer to an object called "answer1.8"', {
+ expect_true(exists("answer1.8"))
+ })
+
+ test_that('Solution should be a single character ("A", "B", "C", or "D")', {
+ expect_match(answer1.8, "a|b|c|d", ignore.case = TRUE)
+ })
+
+ answer_hash <- digest(tolower(answer1.8))
+
+ test_that("Solution is incorrect", {
+ expect_equal(answer_hash, "6e7a8c1c098e8817e3df3fd1b21149d1")
+ })
+
+ print("Success!")
+}
+
+test_1.9 <- function() {
+ test_that('Did not assign answer to an object called "answer1.9"', {
+ expect_true(exists("answer1.9"))
+ })
+
+ test_that('Solution should be a single character ("A", "B", "C", or "D")', {
+ expect_match(answer1.9, "a|b|c|d", ignore.case = TRUE)
+ })
+
+ answer_hash <- digest(tolower(answer1.9))
+
+ test_that("Solution is incorrect", {
+ expect_equal(answer_hash, "127a2ec00989b9f7faf671ed470be7f8")
+ })
+
+ print("Success!")
+}
+
+test_1.10 <- function() {
+ test_that('Did not assign answer to an object called "Default_binary_log_model_results"', {
+ expect_true(exists("Default_binary_log_model_results"))
+ })
+
+ test_that("Solution should be a data frame", {
+ expect_true("data.frame" %in% class(Default_binary_log_model_results))
+ })
+
+ expected_colnames <- c('term','estimate','std.error','statistic','p.value','conf.low','conf.high')
+
+
+ given_colnames <- colnames(Default_binary_log_model_results)
+ test_that("Data frame does not have the correct columns", {
+ expect_equal(length(setdiff(
+ union(expected_colnames, given_colnames),
+ intersect(expected_colnames, given_colnames)
+ )), 0)
+ })
+
+ test_that("Data frame does not contain the correct number of rows", {
+ expect_equal(digest(as.integer(nrow(Default_binary_log_model_results))), "234a2a5581872457b9fe1187d1616b13")
+ })
+
+ test_that("Data frame does not contain the correct data", {
+ expect_equal(digest(as.integer(sum(Default_binary_log_model_results$conf.low) * 10e6)), "ba24456096c2af163742607b3e494ef7")
+ expect_equal(digest(as.integer(sum(Default_binary_log_model_results$conf.high) * 10e6)), "f2c22cf099c18a563f722da22c975d80")
+ expect_equal(digest(as.integer(sum(Default_binary_log_model_results$statistic) * 10e6)), "273c6d0ba411836667be62df7641a7b4")
+ })
+
+ print("Success!")
+}
+
+test_1.11 <- function() {
+ test_that('Did not assign answer to an object called "Default_binary_odds_model_results"', {
+ expect_true(exists("Default_binary_odds_model_results"))
+ })
+
+ test_that("Solution should be a data frame", {
+ expect_true("data.frame" %in% class(Default_binary_odds_model_results))
+ })
+
+ expected_colnames <- c('term','estimate','std.error','statistic','p.value','conf.low','conf.high')
+
+ given_colnames <- colnames(Default_binary_odds_model_results)
+ test_that("Data frame does not have the correct columns", {
+ expect_equal(length(setdiff(
+ union(expected_colnames, given_colnames),
+ intersect(expected_colnames, given_colnames)
+ )), 0)
+ })
+
+ test_that("Data frame does not contain the correct number of rows", {
+ expect_equal(digest(as.integer(nrow(Default_binary_odds_model_results))), "234a2a5581872457b9fe1187d1616b13")
+ })
+
+ test_that("Data frame does not contain the correct data", {
+ expect_equal(digest(as.integer(sum(Default_binary_odds_model_results$estimate) * 10e6)), "7a4bb9d19684e603bffc6f1961a7eda1")
+ expect_equal(digest(as.integer(sum(Default_binary_odds_model_results$conf.low) * 10e6)), "55679ab470e07149e5a38ce2213a15df")
+ expect_equal(digest(as.integer(sum(Default_binary_odds_model_results$conf.high) * 10e6)), "bdccf6329a8314620c8305cc31634021")
+ })
+
+ print("Success!")
+}
+
+test_1.12 <- function() {
+ test_that('Did not assign answer to an object called "answer1.12"', {
+ expect_true(exists("answer1.12"))
+ })
+
+ answer_hash <- digest(tolower(answer1.12))
+ test_that("Solution is incorrect", {
+ expect_equal(answer_hash, "8b63e49106226d2a45958a7e24c97c37")
+ })
+
+ print("Success!")
+}
+
+
+test_1.13 <- function() {
+ test_that('Did not assign answer to an object called "answer1.13"', {
+ expect_true(exists("answer1.13"))
+ })
+
+ answer_as_numeric <- as.numeric(answer1.13)
+ test_that("Solution should be a number", {
+ expect_false(is.na(answer_as_numeric))
+ })
+
+ test_that("Solution is incorrect", {
+ expect_equal(digest(as.integer(answer_as_numeric * 10e6)), "7ea6059985f925469f09bb3af79c7dc1")
+ })
+
+ print("Success!")
+}
+
+test_1.14 <- function() {
+ test_that('Did not assign answer to an object called "answer1.14"', {
+ expect_true(exists("answer1.14"))
+ })
+
+ answer_as_numeric <- as.numeric(answer1.14)
+ test_that("Solution should be a number", {
+ expect_false(is.na(answer_as_numeric))
+ })
+
+ test_that("Solution is incorrect", {
+ expect_equal(digest(as.integer(answer_as_numeric * 10e6)), "26b7f981cf15e1b5f55fa6a141d8a3a0")
+ })
+
+ print("Success!")
+}
diff --git a/materials/worksheet_12/worksheet_12.ipynb b/materials/worksheet_12/worksheet_12.ipynb
new file mode 100644
index 0000000..3e3073f
--- /dev/null
+++ b/materials/worksheet_12/worksheet_12.ipynb
@@ -0,0 +1,2137 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "43f2a4dab79483c826b08ed3405ba5fb",
+ "grade": false,
+ "grade_id": "cell-f1e1d845873036f4",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "# Worksheet 12: Binary or Discrete Counts Responses"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "338d963ee69f8faedab69db0f6d25bdb",
+ "grade": false,
+ "grade_id": "cell-82d9926086d47a80",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "#### Lecture and Tutorial Learning Goals:\n",
+ "After completing this week's lecture and tutorial work, you will be able to:\n",
+ "\n",
+ "1. Describe the logistic regression estimation procedure (binary response variable), and Poisson regression estimation procedure (discrete counts as the response variable).\n",
+ "2. Discuss the relationship between linear regression and logistic and Poisson regression. Discuss the consequences of modeling data that is more suitable for logistic and Poisson regression models as a linear regression model.\n",
+ "3. Interpret the coefficients and $p$-values in the logistic and Poisson regression settings.\n",
+ "4. Write a computer script to perform logistic and Poisson regression and perform model diagnostics. Interpret and communicate the results from that computer script."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "087c055e30ff16cf47ce67825099376e",
+ "grade": false,
+ "grade_id": "cell-a2a153352bc44a68",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Run this cell before continuing.\n",
+ "library(tidyverse)\n",
+ "library(repr)\n",
+ "library(digest)\n",
+ "library(infer)\n",
+ "library(gridExtra)\n",
+ "library(mlbench)\n",
+ "library(AER)\n",
+ "library(ISLR)\n",
+ "library(broom)\n",
+ "library(qqplotr)\n",
+ "library(performance)\n",
+ "library(see)\n",
+ "library(MASS)\n",
+ "library(glmbb)\n",
+ "library(cowplot)\n",
+ "source(\"tests_worksheet_12.R\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "ea100688ea767bf0d06fb72cc460f9d8",
+ "grade": false,
+ "grade_id": "cell-82e529f0a9216400",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "## 1. Intro\n",
+ "\n",
+ "In previous week, you explored the Multiple Linear Regression (MLR) as a way to model the mean of a numeric response variable, $Y$, given a set of covariate $\\mathbf{X}$:\n",
+ "\n",
+ "$$\n",
+ "E\\left[Y\\left|\\mathbf{X}=\\left(X_1,...,X_p\\right)\\right.\\right] = \\beta_0 + \\beta_1X_1 + \\ldots + \\beta_pX_p\n",
+ "$$\n",
+ "\n",
+ "However, in some situations, the MLR is not suitable. This week we are going to study two of those situations that commonly arises in practice:\n",
+ "\n",
+ "- **in this worksheet**: the case of dichotomous response variables (e.g., yes/no, success/failure, win/lose, sick/not sick) \n",
+ "- **in the tutorial**: the case of response variables representing counts (e.g., number of cases of a rare disease in Vancouver in a period of one year; the number of accidents in the Canada Highway in a period of one month)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "bd123337ed378d509d674d57afb45d0e",
+ "grade": false,
+ "grade_id": "cell-9be71f65643c5906",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "## 1. Logistic Regression\n",
+ "\n",
+ "Logistic Regressions are commonly used to model the probability of an event against a set of observed covariates.\n",
+ "\n",
+ "As learned for Linear Regression, Logistic Regression can be used \n",
+ "\n",
+ "- to **estimate and test** the true relation between different type of variables and a **binary response**\n",
+ "\n",
+ "\n",
+ "- to **predict** the *probability* of a **binary response** (aka, classifier)\n",
+ "\n",
+ "**For example**: we can use a logistic regression to\n",
+ "\n",
+ "- compare the presence of a bacteria between groups taking a new drug and a placebo, respectively.\n",
+ " - **response**: `present` or `not present`\n",
+ "\n",
+ "- predict whether or not a customer will default on a loan given their income and demographic variables.\n",
+ " - **response**: `default` or `not default`\n",
+ "\n",
+ "- know how GPA, ACT score, and number of AP classes taken is associated with the probability of getting accepted into a particular university.\n",
+ " - **response**: `accepted` or `not accepted`"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "a73772cca747b2773c4a701080994e04",
+ "grade": false,
+ "grade_id": "cell-f1fa818e59b35d73",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "#### The response variable:\n",
+ "\n",
+ "In many applications, the response variable will not be a numeric vector in the dataset. In those cases, we need to create a new numerical response variable to fit a logistic model.\n",
+ "> the function in R to fit a logistic regression requires a numerical response and won't create it for us!\n",
+ "\n",
+ "Mathematically, we have to construct a binary response $Y_i$ that represents the \"proportion\" of sucess (S) for a given event of interest: \n",
+ "\n",
+ "$$\n",
+ "Y_i =\n",
+ "\\begin{cases}\n",
+ "1 \\; \\; \\; \\; \\mbox{if the $i$th observation is S},\\\\\n",
+ "0 \\; \\; \\; \\; \t\\mbox{otherwise.}\n",
+ "\\end{cases}\n",
+ "$$\n",
+ "\n",
+ "In the examples above, we would set:\n",
+ "\n",
+ "- $Y_i = 1$ if the bacteria is present in the blood sample of the $i$th patient \n",
+ "\n",
+ "\n",
+ "- $Y_i = 1$ if the $i$th customer defaulted on their loan\n",
+ "\n",
+ "\n",
+ "- $Y_i = 1$ if the $i$th student was accepted to a particular university\n",
+ "\n",
+ "\n",
+ "In statistics, we refer to each of cases as a **Bernoulli** trial with a $p_i$ probability of success, i.e., \n",
+ "\n",
+ "$$Y_i \\sim \\text{Bernoulli}(p_i)$$\n",
+ "\n",
+ "where \n",
+ "\n",
+ "$$\n",
+ "E(Y_i) = p_i\n",
+ "$$"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "7630360deb8048c895326b3305a2de58",
+ "grade": false,
+ "grade_id": "cell-7aad649d27ed77eb",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "In this worksheet will focus on the dataset `Default` from [*An Introduction to Statistical Learning*](https://www.statlearning.com/) (James et al., 2013). This is a dataset of $n = 10,000$ observations with the following variables:\n",
+ "\n",
+ "- `default`: a binary response indicating whether the customer defaulted on their debt (`Yes` or `No`).\n",
+ "- `student`: a binary input variable indicating whether the customer is a student (`Yes` or `No`).\n",
+ "- `balance`: a continuous input variable indicating the remaining average balance of the customer's credit card (after the monthly payment).\n",
+ "- `income`: a continuous input variable indicating the customer's income."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "b96b0bf2d2dc5fe1c06048f774f68692",
+ "grade": false,
+ "grade_id": "cell-be65b9244ff2e164",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "head(Default)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "ee19e16c1430b3529cead015211b6e38",
+ "grade": false,
+ "grade_id": "cell-435c6ac14d8ae3f6",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.0**\n",
+ "
{points: 1}\n",
+ "\n",
+ "We want to assess whether the response `default` is statistically associated to the input variables `student`, `balance`, and `income`. \n",
+ "\n",
+ "We need to construct a response $Y_i$ such that $Y_i = 1$ if the $i$th customer is in default, and $0$ otherwise.\n",
+ "\n",
+ "In the `default` column, replace the levels `Yes` and `No` with the numerical values `1` and `0`, respectively.\n",
+ "\n",
+ "**Note**: the response $y$ is an interger, *not* a factor, since we need R to interpret them as sample proportions."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "744d7894a687b060c55550bcc554dcf4",
+ "grade": false,
+ "grade_id": "cell-f583c10048ebd9d7",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# default.df <- \n",
+ "# Default %>% \n",
+ "# ...(... = ...(..., 1, 0))\n",
+ "\n",
+ "#head(default.df)\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "\n",
+ "head(default.df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "4f4480133999e39828e209691ad87b1d",
+ "grade": true,
+ "grade_id": "cell-80e35ff00a0d8926",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "test_1.0()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "b2be64412944109133d7ea2c01c1893b",
+ "grade": false,
+ "grade_id": "cell-52cf22804e4f9008",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "#### The conditional expectation \n",
+ "\n",
+ "We have seen before that a MLR\n",
+ "\n",
+ "In this first exercise, we use a simple linear regression (SLR), as we have done before, to estimate the relation between `balance` and the response `default`.\n",
+ "\n",
+ "As we previously recalled, a SLR will model: \n",
+ "\n",
+ "$$\n",
+ "E\\left[Y_\\texttt{default}|X_\\texttt{balance}\\right] = \\beta_0 + \\beta_1 X_\\texttt{balance}\n",
+ "$$\n",
+ "\n",
+ "However, since `default` is a binary variable we also know that its expectation equals the probability of default:\n",
+ "\n",
+ "$$\n",
+ "E\\left[Y_\\texttt{default}|X_\\texttt{balance}\\right] = p_{\\texttt{default}}\n",
+ "$$\n",
+ "\n",
+ "> for simplicity, the subscript $i$ is omitted\n",
+ "\n",
+ "Let's examine if there is a problem using a linear regression to model this conditional expectation. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "dd379fb4c76ebda289ac9c49500544ed",
+ "grade": false,
+ "grade_id": "cell-ec0e34aef0968060",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.1**\n",
+ "
{points: 1}\n",
+ "\n",
+ "Create a plot of the data (using `geom_point()`) along with the estimated regression line (using `geom_smooth()` with `method = \"lm\"`). Include proper axis labels. The `ggplot()` object's name will be `Default_SLR_plot`.\n",
+ "\n",
+ "*Fill out those parts indicated with `...`, uncomment the corresponding code in the cell below, and run it.*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "b4c8ae8575b46169fd561af4abec1c3d",
+ "grade": false,
+ "grade_id": "cell-beeb841f20cbe0b7",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "options(repr.plot.width = 7, repr.plot.height = 5) # Adjust these numbers so the plot looks good in your desktop.\n",
+ "\n",
+ "# Default_SLR_plot <- ...(...) +\n",
+ "# ...(aes(..., ...)) +\n",
+ "# ...(aes(..., ...), method = ..., se = FALSE) +\n",
+ "# labs(y = ..., x = ...) +\n",
+ "# ggtitle(...) +\n",
+ "# ylim(-0.5, 1.5) +\n",
+ "# theme(\n",
+ "# text = element_text(size = 16.5),\n",
+ "# plot.title = element_text(face = \"bold\"),\n",
+ "# axis.title = element_text(face = \"bold\"),\n",
+ "# legend.title = element_text(face = \"bold\")) +\n",
+ "# scale_x_continuous(breaks = seq(0, 2500, 500))\n",
+ "\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "\n",
+ "Default_SLR_plot"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "517a6cdb14fdc0d88ff2ae2218bd6f20",
+ "grade": true,
+ "grade_id": "cell-5f01635dd4cf6b6b",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "test_1.1()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "de8210a0fb82dc1a06e33e8cbb2325d5",
+ "grade": false,
+ "grade_id": "cell-a0c60c4720d6d478",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Class Discussion:** \n",
+ "\n",
+ "Do you see any problems with our model? "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "bd5a09693ac165ea8c55941e878670aa",
+ "grade": false,
+ "grade_id": "cell-e2bdf22d86042c11",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "#### Logistic Regression: an alternative to LR\n",
+ "\n",
+ "The problem stems from using the *linear* model to estimate a probability! \n",
+ "\n",
+ "Mathematically, the linear component $\\boldsymbol{X_i}^\\top\\boldsymbol{\\beta}$ can take any value in $\\mathbb{R}$ while $p_i$ has to be in [0, 1].\n",
+ "\n",
+ "A natural way to solve this problem is to use a curve, instead of a line, with a range between $[0,1]$. One of such curves is the logistic curve:\n",
+ "\n",
+ "$$E(Y_i|X_{\\textit{i,balance}}) = p_i = \\frac{e^{\\beta_0 + \\beta_1X_{\\textit{i,balance}}}}{1+e^{\\beta_0 + \\beta_1X_{\\textit{i,balance}}}}$$ \n",
+ "\n",
+ "**Note** that we are still using a linear component but not to directly model the conditional expectation. With some algebra, we can show that:\n",
+ "\n",
+ "\\begin{equation*}\n",
+ "\\log\\left(\\frac{p_i}{1 - p_i}\\right) = \\beta_0 + \\beta_1 X_{i,\\texttt{balance}}.\n",
+ "\\end{equation*}\n",
+ "\n",
+ "**Definition**: $p_i$ to $1 - p_i$ are also known as the **odds** and can be estimated by *number of sucesses* to *number of failures*\n",
+ "\n",
+ "For example, among 7056 non-student customers, 206 defaulted on their debt. Thus, the odds that non-student customers default is 206 to 7056. \n",
+ "\n",
+ "The function $\\log\\left(\\frac{p_i}{1 - p_i}\\right)$ is called `logit`, and it is *logarithm of the odds*. \n",
+ "\n",
+ "Let's explore a bit this new function."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "c9d23e8446f38afabaefba22f7523158",
+ "grade": false,
+ "grade_id": "cell-33c529c35aa6d29d",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.2**\n",
+ "
{points: 1}\n",
+ "\n",
+ "Let's see how the logistic curve looks like. In this exercise, you are going to plot the logistic curve to see how it behaves.\n",
+ "\n",
+ "_Save the plot in an object named `logistic_curve`._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "a9065d3e33d8b03430b7e82093470e67",
+ "grade": false,
+ "grade_id": "cell-ebd9f096f835b1b0",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# logistic_curve <-\n",
+ "# tibble(z = seq(-10,10,0.01),\n",
+ "# logistic_z = ...) %>% \n",
+ "# ggplot(aes(z, ...)) + \n",
+ "# geom_line() +\n",
+ "# geom_hline(yintercept = 1, lty=2) + \n",
+ "# geom_hline(yintercept = 0, lty=2) +\n",
+ "# theme(text = element_text(size = 20)) + \n",
+ "# ggtitle(\"Logistic curve\")\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "\n",
+ "logistic_curve"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "ecf65b1c668b9efa77e328b0933067be",
+ "grade": true,
+ "grade_id": "cell-7728e71dc711badb",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_1.2()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "71579f9316adb6afbf36f0e331550237",
+ "grade": false,
+ "grade_id": "cell-1f6a141622c13896",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.3: Understanding the odds**\n",
+ "
{points: 1}\n",
+ "\n",
+ "Vancouver Canucks is playing against Calgary Flames in the Final of NHL. The match will be at Rogers' arena, Canucks home. It is expected that out of 18,910 seats in the arena, 13700 seats will be occupied by Canucks fans. During the match, prizes are randomly distributed among the seats. What are the odds that a Canucks fan wins a given prize? \n",
+ "\n",
+ "Assign your answer to an object named `answer1.3`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "836ce27b9d3fa4140dea20c3ceb8bb8f",
+ "grade": false,
+ "grade_id": "cell-ee8b9ad3125f8346",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "#answer1.3 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "\n",
+ "answer1.3"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "15f77319d1592683114eb33627f52723",
+ "grade": true,
+ "grade_id": "cell-85f66b5726146ab1",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_1.3()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "636357e23eb20f894aa0814c62f33a38",
+ "grade": false,
+ "grade_id": "cell-8a7c7401c453d69c",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.4:**\n",
+ "
{points: 1}\n",
+ "\n",
+ "Let us plot the predictions of the binary logistic regression model on top of `Default_SLR_plot`. Use `geom_smooth()` with `method = \"glm\"` and `method.args = c(family = binomial)`.\n",
+ "\n",
+ "*Fill out those parts indicated with `...`, uncomment the corresponding code in the cell below, and run it.*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "c6c6408783c4d50cd85a3d3497d72a2b",
+ "grade": false,
+ "grade_id": "cell-47acd27ded854e48",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Default_SLR_plot <- \n",
+ "# Default_SLR_plot +\n",
+ "# ...(aes(..., ...),\n",
+ "# method = ...,\n",
+ "# method.args = ..., \n",
+ "# se = FALSE, color = \"red\") +\n",
+ "# ylim(-0.5,1.5) +\n",
+ "# ggtitle(\"Simple Linear Regression and Binary Logistic Regression\")\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "\n",
+ "Default_SLR_plot"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "e2abe04eb35bee83f8eb82ef34ad35bb",
+ "grade": true,
+ "grade_id": "cell-4989d973853e512d",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "test_1.4()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "55d066dcc3c785ec147c6536046c0507",
+ "grade": false,
+ "grade_id": "cell-a628cff07e7aa9ce",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "Much better, isn't it? "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "01ce4ec9ae2e693f80c6850ff13ff62e",
+ "grade": false,
+ "grade_id": "cell-ed0a86b15f54a046",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "#### The model\n",
+ "\n",
+ "Let's review our model!\n",
+ "\n",
+ "The response: \n",
+ "\n",
+ "$$\n",
+ "Y_i =\n",
+ "\\begin{cases}\n",
+ "1 \\; \\; \\; \\; \\mbox{if the $i$th observation is a success},\\\\\n",
+ "0 \\; \\; \\; \\; \t\\mbox{otherwise}\n",
+ "\\end{cases}\n",
+ "$$\n",
+ "\n",
+ "can only take the values $0$ or $1$. \n",
+ "\n",
+ "The conditional expected value of this variable is the probability that $Y_i$ takes on the value of $1$, or probability of success, denoted as $p_i$. Hence:\n",
+ "\n",
+ "$$\\left.Y_i\\right|\\mathbf{X}_{i} \\sim \\text{Bernoulli}(p_i).$$\n",
+ "\n",
+ "The **logistic regression** models the probability $p_i$ given the information of a set of covariates but *not* directly as a linear function of them. Instead, it re-expresses $p_i$ on an unrestricted scale, called the **log-odds** :\n",
+ "\n",
+ "$$\n",
+ "\\mbox{logit}(p_i) = \\log \\bigg( \\frac{p_i}{1 - p_i}\\bigg) = \\beta_0 + \\beta_1 X_{i1} + \\beta_2 X_{i2} + \\ldots + \\beta_{q} X_{iq},\n",
+ "$$\n",
+ "\n",
+ "> the logarithm of the odds is the logarithm of the ratio of the probability of a sucess to the probability of a failure of an event\n",
+ "\n",
+ "Or equivalently\n",
+ "\n",
+ "$$\n",
+ "p_i = \\frac{\\exp\\big[\\beta_0 + \\beta_1 X_{i1} + \\beta_2 X_{i2} + \\ldots + \\beta_{q} X_{iq}\\big]}{1 + \\exp\\big[\\beta_0 + \\beta_1 X_{i1} + \\beta_2 X_{i2} + \\ldots + \\beta_{q} X_{iq}\\big]}.\n",
+ "$$\n",
+ "\n",
+ "**In our example**\n",
+ "\n",
+ "$$\n",
+ "\\mbox{logit}(p_{i,\\texttt{default}}) = \\log \\bigg( \\frac{p_{i,\\texttt{default}}}{1 - p_{i,\\texttt{default}}}\\bigg) = \\beta_0 + \\beta_1 X_{i,\\texttt{balance}}\n",
+ "$$\n",
+ "\n",
+ "or equivalently\n",
+ "\n",
+ "$$\n",
+ "p_{i,\\texttt{default}} = \\frac{e^{\\beta_0 + \\beta_1 X_{i,\\texttt{balance}}}}{1 + e^{\\beta_0 + \\beta_1 X_{i,\\texttt{balance}}}}.\n",
+ "$$\n",
+ "\n",
+ "In this example, the *odds* are interpreted as how likely the $i$th customer is to be in default compared to how unlikely it is, at a fixed value of their balance. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "478bbd3214d6111adc5062fe76be7d3b",
+ "grade": false,
+ "grade_id": "cell-f7510c5644481477",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "### 1.1 Estimation\n",
+ "\n",
+ "The question now is, how do we estimate the coefficients $\\beta_j$'s? \n",
+ "\n",
+ "So far, in the case of linear regression, we have been using the Least Square Estimators. However, due to type of response of the logistic model this objective function is no longer appropriate.\n",
+ "\n",
+ "A common method to estimate a logistic regression (and many other models) is a method called **Maximum Likelihood Estimation**. Details of MLE are outside the scope of this course but we will still implement it using R! "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "f61f4627cfa5bf01614da8ce6769a34c",
+ "grade": false,
+ "grade_id": "cell-9e11a13acd8b9ad1",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "\n",
+ "**Interpretation**\n",
+ "\n",
+ "As before, the interpretation of the coefficients will depend on the type of input variable. \n",
+ "\n",
+ "Let's start by looking at a model with only 1 categorical covariate. For example, being or not being a student.\n",
+ "\n",
+ "We know that in this case, `R` will create a dummy variable $X$ to include in the model. But how do we interpret its coefficient??\n",
+ "\n",
+ "> for LR, the intercept was the mean of the response of the reference group (and the *estimate* was the *sample* mean)\n",
+ "\n",
+ "> for LR, the \"slope\" was the *difference* between the mean of the response of the treatment group vs that of the reference group (and the *estimate* was the difference of *sample* means)\n",
+ "\n",
+ "But for LR, we modeled directly the expected value of the response. We now need to adjust the interpretations to changes in *log-odds*!!\n",
+ "\n",
+ "- Intercept: $\\hat{\\beta}_0$ represents the log-odds of the reference group (e.g., non-students)\n",
+ "\n",
+ "- Slope: $\\hat{\\beta}_1$ represents the difference in log-odds between the treatment and the reference group (e.g., students vs. non-students)\n",
+ "\n",
+ "Since log-odds are difficult to interpret, it is common to also interpret the exponentiated version of the coefficients:\n",
+ "\n",
+ "- Intercept: $e^{\\hat{\\beta}_0}$ represents the odds of the reference group, i.e., proportion of success relative to proportion of failures in the sample \n",
+ "\n",
+ "- Slope: $e^{\\hat{\\beta}_1}$ represents the *odds ratio*, i.e., ratio between the odds of the treatment vs the odds of the reference group\n",
+ "\n",
+ "*Run the cell below to compute these quantities. Read and follow calculations*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "default.df %>% dplyr::select(default, student) %>% \n",
+ " group_by(default, student)%>%\n",
+ " summarise(n=n(), .groups = 'drop')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "81016176e41809c5ea79cda11899053e",
+ "grade": false,
+ "grade_id": "cell-30b101cea83cb90f",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# b0 = \\hat{beta}_0\n",
+ "tot_studentNo <- 6850 + 206\n",
+ "# probability of default if non-student\n",
+ "# note that the same is true for #default1_studentNo/#default0_studentNo\n",
+ "p_studentNo <- 206/tot_studentNo\n",
+ "b0 <- log(p_studentNo/(1-p_studentNo))\n",
+ "\n",
+ "# b1 = \\hat{beta}_1\n",
+ "tot_studentYes <- 2817 + 127\n",
+ "# probability of default if student\n",
+ "p_studentYes <- 127/tot_studentYes\n",
+ "# log odds of students vs. log odds of non-students\n",
+ "b1 <- log(p_studentYes/(1-p_studentYes)) - b0\n",
+ "\n",
+ "print(\"log-odds for non-students (b0) and for students\")\n",
+ "c(b0,log(p_studentYes/(1-p_studentYes))) %>% round(3)\n",
+ "\n",
+ "\n",
+ "print(\"odds for non-students (exp(b0)) and for students\")\n",
+ "c(exp(b0),(p_studentYes/(1-p_studentYes))) %>% round(3)\n",
+ "\n",
+ "\n",
+ "print(\"odds ratio (exp(b1))\")\n",
+ "c(exp(b1), \n",
+ " (p_studentYes/(1-p_studentYes))/(p_studentNo/(1-p_studentNo))) %>% \n",
+ " round(3)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "5e2a17e94044cbaf3b4874c5b156e9f7",
+ "grade": false,
+ "grade_id": "cell-a3e8e0c543a5d4ea",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.5:**\n",
+ "
{points: 1}\n",
+ "\n",
+ "In order to fit the model, we can use the function `glm()` and its argument `family = binomial` (required to specify the binary nature of the response). \n",
+ "\n",
+ "Let us use the function `glm()` to estimate a binary logistic regression. Using `default.df`, we will fit a binary logistic model with `default` as the response and `student` as input variable.\n",
+ " \n",
+ "Store the model in an object named `Default_binary_log_student`. The `glm()` parameters are analogous to `lm()` (`formula` and `data`) with the addition of `family = binomial` for this specific model. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "73c8f88203971ea30b4fdffd5a7bcd8e",
+ "grade": false,
+ "grade_id": "cell-8ac0e64c9daf4ffc",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Default_binary_log_student <- \n",
+ "# ...(\n",
+ "# formula = ...,\n",
+ "# data = ...,\n",
+ "# family = ...)\n",
+ "\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "\n",
+ "summary(Default_binary_log_student)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "9f286a3c5b2ad6c51b25dc02ee65f1ac",
+ "grade": true,
+ "grade_id": "cell-debd081e1606b49a",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "test_1.5()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "04b74bd9ec3596fb96c3024af41b5e06",
+ "grade": false,
+ "grade_id": "cell-de489b368280eba8",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "Note that you can also use the function `tidy()` to obtain a summary table of results."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "ac8e6a193d25e0a49d57dbfa9ae4c707",
+ "grade": false,
+ "grade_id": "cell-68c04287f7d21da3",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "#Run this cell to get a tidy summary table\n",
+ "\n",
+ "Default_binary_log_student_results <-\n",
+ " tidy(Default_binary_log_student) %>% \n",
+ " mutate(exp.estimate = exp(estimate)) %>% \n",
+ " mutate_if(is.numeric, round, 3)\n",
+ "\n",
+ "Default_binary_log_student_results "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Note**: you can also get the exponentiated coefficients with the argument `exponentiate = TRUE` in `tidy()`. Note that the SEs are adjusted as well so this would be a better approach!"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "tidy(Default_binary_log_student, exponentiate = TRUE)%>% \n",
+ " mutate_if(is.numeric, round, 3)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "d138132eae20c50ea25285328a5e0a2f",
+ "grade": false,
+ "grade_id": "cell-6665a0ff5a730bdf",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.6**\n",
+ "
{points: 1}\n",
+ "\n",
+ "Considering the `Default_binary_log_student_results` tibble, what is the correct interpretation of the $\\hat{\\beta}_\\textit{student}$?\n",
+ "\n",
+ "**A.** The odds of default are $49.9\\%$ higher for non-student custormers compared to student customers.\n",
+ "\n",
+ "**B.** The odds of default are $49.9\\%$ higher for student custormers compared to non-student customers.\n",
+ "\n",
+ "**C.** The odds of default are $40.5\\%$ higher for non-student custormers compared to student customers.\n",
+ "\n",
+ "**D.** The odds of default are $40.5\\%$ higher for student custormers compared to non-student customers.\n",
+ "\n",
+ "*Assign your answer to the object `answer1.6` (character type surrounded by quotes).*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "5061c90ecf4e6a196e542aea2f516667",
+ "grade": false,
+ "grade_id": "cell-9d862eda2ad912b4",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# answer1.6 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "48a900d29872eb46d146064466276e40",
+ "grade": true,
+ "grade_id": "cell-454d0bf3ba416081",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "test_1.6()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "de9f91a15d444e3d56b341b6d427ec1b",
+ "grade": false,
+ "grade_id": "cell-4bf8d13d2489425d",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "#### Multiple covariates\n",
+ "\n",
+ "And the interpretation extends to the case of more covariates of different types\n",
+ "\n",
+ "- $\\hat{\\beta}_1$ gives the changes in log odds per one-unit increase in X, \n",
+ " - or equivalently it multiplies the odds by $e^{\\hat{\\beta}_1}$\n",
+ " \n",
+ "> **Note**: because the relationship between $p_i$ and $X_i$ is not a straight line, $\\hat{\\beta}_1$ does not correspond to the change in $p_i$ associated with a one-unit change in $X_i$. But regardless,\n",
+ "if $\\hat{\\beta}_1$ is positive then increasing $X_i$ will be associated with increasing $p_i$ and visceversa.\n",
+ "\n",
+ "> **Note**: when the odds decreases, i.e., negative change in log-odds, it is easier to interpret $1/e^{\\hat{\\beta}_1}$. That is, to redefine odds= #failures/#successes"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "0804ff676094f93274dcd376ffabea74",
+ "grade": false,
+ "grade_id": "cell-148b1dc1b2667581",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.7:**\n",
+ "
{points: 1}\n",
+ "\n",
+ "In order to fit the model, we can use the function `glm()` and its argument `family = binomial` (required to specify the binary nature of the response). \n",
+ "\n",
+ "Let us use the function `glm()` to estimate a binary logistic regression. Using `default.df`, we will fit a binary logistic model with `default` as the response and `student`, `balance`, and `income` as input variables.\n",
+ " \n",
+ "Store the model in an object named `Default_binary_log_model`. The `glm()` parameters are analogous to `lm()` (`formula` and `data`) with the addition of `family = binomial` for this specific model. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "7fc03bcc53654a144e281aa0f3c33b70",
+ "grade": false,
+ "grade_id": "cell-7097573be6a38bf2",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Default_binary_log_model <- \n",
+ "# ...(\n",
+ "# ...,\n",
+ "# ...,\n",
+ "# ...)\n",
+ "\n",
+ "#tidy(Default_binary_log_model) %>% \n",
+ "# mutate_if(is.numeric, round, 3)\n",
+ "#tidy(Default_binary_log_model, exponentiate = TRUE) %>% \n",
+ "# mutate_if(is.numeric, round, 3)\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "\n",
+ "tidy(Default_binary_log_model) %>% \n",
+ " mutate_if(is.numeric, round, 3)\n",
+ "tidy(Default_binary_log_model, exponentiate = TRUE) %>% \n",
+ " mutate_if(is.numeric, round, 3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "b186bf9f116ab25d76f66c78304a40df",
+ "grade": true,
+ "grade_id": "cell-a0e923b260f01656",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "test_1.7()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "2054c9f83c1326c5421a8da34c840a94",
+ "grade": false,
+ "grade_id": "cell-5fe352592ed23811",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.8**\n",
+ "
{points: 1}\n",
+ "\n",
+ "Considering the `Default_binary_log_model_results` tibble, what is the correct interpretation of the $\\hat{\\beta}_\\textit{student}$?\n",
+ "\n",
+ "**A.** Since, $1 / 0.524 = 1.908$, we estimate that the odds of non-default are $90.8\\%$ higher for non-student custormers compared to student customers, while keeping the rest of the input variables constant.\n",
+ "\n",
+ "**B.** Since, $1 / 0.524 = 1.908$, we estimate that the odds of default are $90.8\\%$ higher for non-student custormers compared to student customers, while keeping the rest of the input variables constant.\n",
+ "\n",
+ "**C.** Since, $1 / 0.524 = 1.908$, we estimate that the odds of non-default are $90.8\\%$ higher for student custormers compared to non-student customers, while keeping the rest of the input variables constant.\n",
+ "\n",
+ "**D.** Since, $1 / 0.524 = 1.908$, we estimate that the odds of default are $90.8\\%$ higher for student custormers compared to non-student customers, while keeping the rest of the input variables constant.\n",
+ "\n",
+ "*Assign your answer to the object `answer1.8` (character type surrounded by quotes).*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "a52e8dcb232f6c3226e33b6217364aed",
+ "grade": false,
+ "grade_id": "cell-11677807d101b133",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# answer1.8 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "f8b8b2dbc78416121455ee492281fa03",
+ "grade": true,
+ "grade_id": "cell-30f034478ea08799",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_1.8()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "66633e2f35bab1d5f2a72e59e1348e6d",
+ "grade": false,
+ "grade_id": "cell-7960f6f613499491",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.9**\n",
+ "
{points: 1}\n",
+ "\n",
+ "What is the correct interpretation of the regression equation's estimated slope for `balance`?\n",
+ "\n",
+ "**A.** A $\\$1$ increase in `balance`, is associated with a $0.6\\%$ increase in the odds of a customer to be in default, while keeping the rest of the input variables constant. \n",
+ "\n",
+ "**B.** A $\\$1$ increase in `balance`, is associated with a $0.6\\%$ increase in the odds of a customer to not be in default, while keeping the rest of the input variables constant. \n",
+ "\n",
+ "**C.** A $\\$1$ increase in `balance`, is associated with a $0.6\\%$ increase in the odds of a student customer to be in default, while keeping the rest of the input variables constant. \n",
+ "\n",
+ "**D.** A $\\$1$ increase in `balance`, is associated with a $0.6\\%$ increase in the odds of a non-student customer to be in default, while keeping the rest of the input variables constant.\n",
+ "\n",
+ "*Assign your answer to the object `answer1.9` (character type surrounded by quotes).*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "19623ffd8cd12291e421b7f8a0cc12cb",
+ "grade": false,
+ "grade_id": "cell-7441e6aa1307023f",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# answer1.9 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "4809d604ee9b9b23b53ce01c29a0e108",
+ "grade": true,
+ "grade_id": "cell-347ad929fad50479",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "test_1.9()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "a7dd6ecd9ec8878252d59647e82b6d6a",
+ "grade": false,
+ "grade_id": "cell-8357d4c87f849b10",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "### 1.2 Inference\n",
+ "\n",
+ "We can use this estimated model to make inference about the population parameters, i.e., we can determine whether an input variable is statistically associated with the logarithm of the odds through hypothesis testing for the parameters $\\beta_j$. \n",
+ "\n",
+ "To do that, we need additional information about the estimators of the regression coefficients, $\\hat{\\beta}_j$. In particular, we need their sampling distribution and corresponding standard errors, $\\mbox{SE}\\left(\\hat{\\beta}_j\\right)$. \n",
+ "\n",
+ "**Theoretical test**:\n",
+ "\n",
+ "To test the hypotheses\n",
+ "\\begin{gather*}\n",
+ "H_0: \\beta_j = 0\\\\\n",
+ "H_a: \\beta_j \\neq 0.\n",
+ "\\end{gather*}\n",
+ "\n",
+ "you can use the Wald statistic $z_j$\n",
+ "\n",
+ "$$z_j = \\frac{\\hat{\\beta}_j}{\\mbox{SE}\\left(\\hat{\\beta}_j\\right)}$$\n",
+ "\n",
+ "which under $H_0$ has an approximately standard normal distribution provided the sample size $n$ is large enough.\n",
+ "\n",
+ "> this statistic is analogous to the $t$-value used in LR. \n",
+ "\n",
+ "Furthermore, given a specified level of confidence, we can construct approximate $(1 - \\alpha) \\times 100\\%$ confidence intervals for the corresponding true value of $\\beta_j$:\n",
+ "\n",
+ "$$\\hat{\\beta}_j \\pm z_{\\alpha/2}\\mbox{SE}\\left(\\hat{\\beta}_j\\right),$$\n",
+ "\n",
+ "where $z_{\\alpha/2}$ is the upper $\\alpha/2$ quantile of the standard normal distribution."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "d2bcf7253b1378ab67cead3e877bb314",
+ "grade": false,
+ "grade_id": "cell-c18026a5fd84fca7",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.10: Inference**\n",
+ "
{points: 1}\n",
+ "\n",
+ "Report the estimated coefficients, their standard errors, and corresponding $p$-values by calling `tidy()` on `Default_binary_log_model`. Include the corresponding asymptotic 95% confidence intervals. \n",
+ "\n",
+ "_Store the results in the variable `Default_binary_log_model_results`._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "b72573f0960e69b8e55c9ac8b373b79a",
+ "grade": false,
+ "grade_id": "cell-ad37611aa165ca7a",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Default_binary_log_model_results <- \n",
+ "# ...(..., conf.int = TRUE) \n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "\n",
+ "Default_binary_log_model_results"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "92a3208529cd7c7a0d2ccc4fdaf2e45f",
+ "grade": true,
+ "grade_id": "cell-38f7c2eea95906c9",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_1.10()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "34f7ea6dd9dba6baa48fb9550c013c93",
+ "grade": false,
+ "grade_id": "cell-82749136a082e937",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.11: Inference**\n",
+ "
{points: 1}\n",
+ "\n",
+ "Use `tidy()` to the estimated effect each of the variables has on the **odds** to the `Default_binary_log_model_results` tibble. Make sure to also include the confidence interval for these effects."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "1c4cbbede9240f2a0d3392e18fb5cf5e",
+ "grade": false,
+ "grade_id": "cell-366329ae6e91a284",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Default_binary_odds_model_results <- \n",
+ "# ... %>%\n",
+ "# mutate_if(is.numeric, round, 6)\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "\n",
+ "Default_binary_odds_model_results"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "505b346f8721786c9ec10aa064641509",
+ "grade": true,
+ "grade_id": "cell-b905bb95b0e06997",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "test_1.11()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "f95edcda4f00e0f3fc5b6b692dfed17f",
+ "grade": false,
+ "grade_id": "cell-d9141f6447689b01",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.12**\n",
+ "
{points: 1}\n",
+ "\n",
+ "Using a **significance level $\\alpha = 0.05$**, which inputs are statistically associated to the probability of default in `Default_binary_log_model_results`?\n",
+ "\n",
+ "**A.** The categorical input `student`.\n",
+ "\n",
+ "**B.** The continuous input `balance`.\n",
+ "\n",
+ "**C.** The continuous input `income`.\n",
+ "\n",
+ "*Assign your answers to the object `answer1.12`. Your answers have to be included in a single string indicating the correct options **in alphabetical order** and surrounded by quotes (e.g., `\"ABC\"` indicates you are selecting the three options).*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "1921a0f4b26d34ce61fe714041abb812",
+ "grade": false,
+ "grade_id": "cell-822cb96c53313b4e",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer1.12 <- \n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "535f53f696cf6606d87d1581eff0df56",
+ "grade": true,
+ "grade_id": "cell-a04e784c42d742dc",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_1.12()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "2ee271aada5209b247a9c9de38590baf",
+ "grade": false,
+ "grade_id": "cell-a1792c454ca79341",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "### 1.3 Prediction\n",
+ "\n",
+ "Besides inference, we can use an estimated logistic regression model to predict the probability of success. \n",
+ "\n",
+ "\\begin{gather*} \n",
+ "\\log \\bigg( \\frac{\\hat{p}_i}{1 - \\hat{p}_i}\\bigg) = \\hat{\\beta}_0 +\\hat{\\beta}_1 x_{i1} + \\ldots + \\hat{\\beta}_p x_{iq} \\\\\n",
+ "\\end{gather*}\n",
+ "\n",
+ "where lower letters $x$ were used to denote a particular observed value for the $i$th experimental unit.\n",
+ "\n",
+ "**For example:**\n",
+ "\n",
+ "Suppose we want to predict the odds of a student who has a credit card balance of \\\\$2200 and an income of \\\\$35000 to be in default relative to not being in default.\n",
+ "\n",
+ "Mathematically (omitting $i$th for simplicity), our predicted log odds will be \n",
+ "\n",
+ "\\begin{gather*} \n",
+ "\\log \\bigg( \\frac{\\hat{p}_\\texttt{default}}{1 - \\hat{p}_\\texttt{default}}\\bigg) = \\underbrace{-10.869045}_{\\hat{\\beta}_0} - \\underbrace{0.646776}_{\\hat{\\beta}_1} \\times (1) + \\underbrace{0.005737}_{\\hat{\\beta}_2} \\times (2200) + \\underbrace{0.000003}_{\\hat{\\beta}_2} \\times (35000)= 1.21 \\\\\n",
+ "\\end{gather*}\n",
+ "\n",
+ "Next, by taking the exponential on both sides of the equation, we obtain our predicted *odds*: \n",
+ "\n",
+ "$$\n",
+ "\\frac{\\hat{p}_\\texttt{default}}{1 - \\hat{p}_\\texttt{default}} = e^{1.21} = 3.36.\n",
+ "$$\n",
+ "\n",
+ "Finally, solving the above for $\\hat{p}_\\texttt{default}$, we obtain our predicted probability of default\n",
+ "\n",
+ "$$\n",
+ "\\hat{p}_\\texttt{default} = 3.36/4.36 = 0.7706\n",
+ "$$"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "f4143eda53199547d5343caf421d55c4",
+ "grade": false,
+ "grade_id": "cell-cfbfd19ea63f8152",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.13**\n",
+ "
{points: 1}\n",
+ "\n",
+ "Using `predict` and `Default_binary_log_model`, obtain the odds prediction above.\n",
+ "\n",
+ "> **Hint:** Check the argument `type` when coding this prediction.\n",
+ "\n",
+ "*Assign your answer to the object `answer1.13`. Fill out those parts indicated with `...`, uncomment the corresponding code in the cell below, and run it.*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "f7aaf749f2d98f24a73849cc5ef62f72",
+ "grade": false,
+ "grade_id": "cell-23f0d706c79c9f1c",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer1.13 <- exp(...(...,\n",
+ "# tibble(..., ..., ...),\n",
+ "# type = ...))\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "\n",
+ "answer1.13"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "dfa62c02c59648c0455073c973242fe7",
+ "grade": true,
+ "grade_id": "cell-e8a22ef2e4ac8fe6",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_1.13()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "e4e818ff3ea708b153256ef71333c4e8",
+ "grade": false,
+ "grade_id": "cell-f84da54cd4da6b01",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.14**\n",
+ "
{points: 1}\n",
+ "\n",
+ "We can also predict probabilities for classification purposes, i.e., whether the customer will default or not. Using the function `predict()` with the object `Default_binary_log_model`, obtain the estimated probability for a customer being in default. This customer is a `student` who has a credit card `balance` of `2200` with an income of `35000`.\n",
+ "\n",
+ "> **Hint:** Check the argument `type` when coding this prediction.\n",
+ "\n",
+ "*Assign your answer to the object `answer1.14`. Fill out those parts indicated with `...`, uncomment the corresponding code in the cell below, and run it.*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "4ce8362e7211329066581c675826eb3d",
+ "grade": false,
+ "grade_id": "cell-ee0c66cf674510cc",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer1.14 <- \n",
+ "# ...(...,\n",
+ "# tibble(..., ..., ...),\n",
+ "# type = ...)\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "\n",
+ "answer1.14"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "6c96e6a22f12b6ca559b6781d603b09f",
+ "grade": true,
+ "grade_id": "cell-03c8e23b29226318",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_1.14()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "423fa69640684569da769ed243209f7a",
+ "grade": false,
+ "grade_id": "cell-5d4b54303734b4c1",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Overdispersion**\n",
+ "\n",
+ "The variance of a binary response variable is a function of the mean: $p(1-p)$. What this means is that, the estimate of the mean also provides an estimate of the variance of the response. \n",
+ "\n",
+ "Since the logistic regression is built assuming that the response is *Bernoulli*, the estimated $\\hat{p}$ conditions the estimated variance of the response to be $\\hat{p}(1-\\hat{p})$. \n",
+ "\n",
+ "Unfortunately, in real applications, even in situation where the model seems to be estimating the mean well, the variability of the data is not quite compatible with the model's assumed variance.\n",
+ "\n",
+ "> this misspecification in the variance affects the SE of the coefficients, not their estimates.\n",
+ "\n",
+ "A way around this problem is to estimate a dispersion parameter, usually called $\\phi$, to correct the standard error of our estimators. An easy implementation is to change the `family` argument to a `quasibinomial`. Let's see an example. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "8add326ccea85b3907d01445cdf32c31",
+ "grade": false,
+ "grade_id": "cell-5d6361d8c9695383",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "summary(glm(\n",
+ " formula = default ~ student + balance + income,\n",
+ " data = Default,\n",
+ " family = quasibinomial))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "030f81bd873d8a52465cf2f85955b749",
+ "grade": false,
+ "grade_id": "cell-69697cc0a73122ef",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ },
+ "tags": []
+ },
+ "source": [
+ "**Note** that the estimates haven't changed but the SE were adjusted to account for some overdispersion in the data.\n",
+ "\n",
+ "### 1.4 Conclusions\n",
+ "\n",
+ "- The (conditional) expectation of a binary response is the probability of success.\n",
+ "\n",
+ "- A LR can not be used to model the conditional expectation of a binary response since its range extends beyond the interval $[0,1]$\n",
+ "\n",
+ "- Instead, one can model a function of the conditional probability. A common choice in logistic regression is to use the *logit* function (logarithm of odds)\n",
+ "\n",
+ "- The interpretation of the coefficients depends on the type of variables and the form of the model:\n",
+ "\n",
+ "The raw coefficients are interpreted as:\n",
+ "\n",
+ "- log-odds of a reference group\n",
+ "- difference of log-odds of a treatment vs a control group\n",
+ "- changes in log-odds per unit change in the input\n",
+ " \n",
+ "The exponentiated coefficients are interpreted as:\n",
+ "- odds of a reference group\n",
+ "- odds ratio of a treatment vs a control group\n",
+ "- multiplicative changes in odds per unit change in the input\n",
+ " \n",
+ "- The estimated logistic model can be used to make inference using the Wald's test\n",
+ "\n",
+ "- The estimated logistic model can be used to make predictions\n",
+ " - the probability of success\n",
+ " - the odds of success relative to failure"
+ ]
+ }
+ ],
+ "metadata": {
+ "jupytext": {
+ "formats": "ipynb,Rmd"
+ },
+ "kernelspec": {
+ "display_name": "R",
+ "language": "R",
+ "name": "ir"
+ },
+ "language_info": {
+ "codemirror_mode": "r",
+ "file_extension": ".r",
+ "mimetype": "text/x-r-source",
+ "name": "R",
+ "pygments_lexer": "r",
+ "version": "4.2.3"
+ },
+ "latex_envs": {
+ "LaTeX_envs_menu_present": true,
+ "autoclose": false,
+ "autocomplete": true,
+ "bibliofile": "biblio.bib",
+ "cite_by": "apalike",
+ "current_citInitial": 1,
+ "eqLabelWithNumbers": true,
+ "eqNumInitial": 1,
+ "hotkeys": {
+ "equation": "Ctrl-E",
+ "itemize": "Ctrl-I"
+ },
+ "labels_anchors": false,
+ "latex_user_defs": false,
+ "report_style_numbering": false,
+ "user_envs_cfg": false
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}