From ad07ec576adb8890d52615d3073a2de8eba883a8 Mon Sep 17 00:00:00 2001 From: Trevor Campbell Date: Mon, 13 Nov 2023 15:54:23 -0800 Subject: [PATCH 1/7] empty commit From 154a7fb1bce6ba226b2375894f7587a865a7a1c4 Mon Sep 17 00:00:00 2001 From: Trevor Campbell Date: Mon, 13 Nov 2023 17:11:04 -0800 Subject: [PATCH 2/7] evaluating on the test set in clsfcn2 --- source/classification2.Rmd | 96 ++++++++++++++++++++++++++++++++++---- 1 file changed, 87 insertions(+), 9 deletions(-) diff --git a/source/classification2.Rmd b/source/classification2.Rmd index a991fcc0d..a81ba9288 100644 --- a/source/classification2.Rmd +++ b/source/classification2.Rmd @@ -491,7 +491,7 @@ cancer_test_predictions <- predict(knn_fit, cancer_test) |> cancer_test_predictions ``` -### Evaluate performance +### Evaluate performance {#eval-performance-cls2} Finally, we can assess our classifier's performance. First, we will examine accuracy. To do this we use the @@ -941,14 +941,29 @@ accuracy_vs_k <- ggplot(accuracies, aes(x = neighbors, y = mean)) + accuracy_vs_k ``` +We can also obtain the number of neighbours with the highest accuracy +programmatically by accessing the `neighbors` variable in the `accuracies` data +frame where the `mean` variable is highest. +Note that it is still useful to visualize the results as +we did above since this provides additional information on how the model +performance varies. + +```{r 06-extract-k} +best_k <- accuracies |> + arrange(desc(mean)) |> + head(1) |> + pull(neighbors) +best_k +``` + Setting the number of -neighbors to $K =$ `r (accuracies |> arrange(desc(mean)) |> head(1))$neighbors` +neighbors to $K =$ `r best_k` provides the highest accuracy (`r (accuracies |> arrange(desc(mean)) |> slice(1) |> pull(mean) |> round(4))*100`%). But there is no exact or perfect answer here; any selection from $K = 30$ and $60$ would be reasonably justified, as all of these differ in classifier accuracy by a small amount. Remember: the values you see on this plot are *estimates* of the true accuracy of our classifier. Although the -$K =$ `r (accuracies |> arrange(desc(mean)) |> head(1))$neighbors` value is +$K =$ `r best_k` value is higher than the others on this plot, that doesn't mean the classifier is actually more accurate with this parameter value! Generally, when selecting $K$ (and other parameters for other predictive @@ -958,12 +973,12 @@ models), we are looking for a value where: - changing the value to a nearby one (e.g., adding or subtracting a small number) doesn't decrease accuracy too much, so that our choice is reliable in the presence of uncertainty; - the cost of training the model is not prohibitive (e.g., in our situation, if $K$ is too large, predicting becomes expensive!). -We know that $K =$ `r (accuracies |> arrange(desc(mean)) |> head(1))$neighbors` +We know that $K =$ `r best_k` provides the highest estimated accuracy. Further, Figure \@ref(fig:06-find-k) shows that the estimated accuracy -changes by only a small amount if we increase or decrease $K$ near $K =$ `r (accuracies |> arrange(desc(mean)) |> head(1))$neighbors`. -And finally, $K =$ `r (accuracies |> arrange(desc(mean)) |> head(1))$neighbors` does not create a prohibitively expensive +changes by only a small amount if we increase or decrease $K$ near $K =$ `r best_k`. +And finally, $K =$ `r best_k` does not create a prohibitively expensive computational cost of training. Considering these three points, we would indeed select -$K =$ `r (accuracies |> arrange(desc(mean)) |> head(1))$neighbors` for the classifier. 
+$K =$ `r best_k` for the classifier. ### Under/Overfitting @@ -987,10 +1002,10 @@ knn_results <- workflow() |> tune_grid(resamples = cancer_vfold, grid = k_lots) |> collect_metrics() -accuracies <- knn_results |> +accuracies_lots <- knn_results |> filter(.metric == "accuracy") -accuracy_vs_k_lots <- ggplot(accuracies, aes(x = neighbors, y = mean)) + +accuracy_vs_k_lots <- ggplot(accuracies_lots, aes(x = neighbors, y = mean)) + geom_point() + geom_line() + labs(x = "Neighbors", y = "Accuracy Estimate") + @@ -1082,6 +1097,69 @@ a balance between the two. You can see these two effects in Figure \@ref(fig:06-decision-grid-K), which shows how the classifier changes as we set the number of neighbors $K$ to 1, 7, 20, and 300. +### Evaluating on the test set + +Now that we have tuned the KNN classifier and set $K =$ `r best_k`, +we are done building the model and it is time to evaluate the quality of its predictions on the held out +test data, as we did earlier in Section \@ref(eval-performance-cls2). +We first need to retrain the KNN classifier +on the entire training data set using the selected number of neighbors. + +```{r 06-eval-on-test-set-after-tuning, message = FALSE, warning = FALSE} +cancer_recipe <- recipe(Class ~ Smoothness + Concavity, data = cancer_train) |> + step_scale(all_predictors()) |> + step_center(all_predictors()) + +knn_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = best_k) |> + set_engine("kknn") |> + set_mode("classification") + +knn_fit <- workflow() |> + add_recipe(cancer_recipe) |> + add_model(knn_spec) |> + fit(data = cancer_train) + +knn_fit +``` + +Then to make predictions and assess the estimated accuracy of the best model on the test data, we use the +`predict` and `conf_mat` functions as we did earlier in this chapter. + +```{r 06-predictions-after-tuning, message = FALSE, warning = FALSE} +cancer_test_predictions <- predict(knn_fit, cancer_test) |> + bind_cols(cancer_test) + +cancer_test_predictions |> + metrics(truth = Class, estimate = .pred_class) |> + filter(.metric == "accuracy") +``` + +```{r 06-predictions-after-tuning-acc-save-hidden, echo = FALSE, message = FALSE, warning = FALSE} +cancer_acc_tuned <- cancer_test_predictions |> + metrics(truth = Class, estimate = .pred_class) |> + filter(.metric == "accuracy") |> + pull(.estimate) +``` + +```{r 06-confusion-matrix-after-tuning, message = FALSE, warning = FALSE} +confusion <- cancer_test_predictions |> + conf_mat(truth = Class, estimate = .pred_class) +confusion +``` + +At first glance, this is a bit surprising: the performance of the classifier +has not changed much despite tuning the number of neighbors! For example, our first model +with $K =$ 3 (before we knew how to tune) had an estimated accuracy of `r round(100*cancer_acc_1$.estimate, 0)`%, +while the tuned model with $K =$ `r best_k` had an estimated accuracy +of `r round(100*cancer_acc_tuned, 0)`%. +But upon examining Figure \@ref(fig:06-find-k) again closely—to revisit the +cross validation accuracy estimates for a range of neighbors—this result +becomes much less surprising. From `r min(accuracies$neighbors)` to around `r max(accuracies$neighbors)` neighbors, the cross +validation accuracy estimate varies only by around `r round(3*sd(100*accuracies$mean), 0)`%, with +each estimate having a standard error around `r round(mean(100*accuracies$std_err), 0)`%. +Since the cross-validation accuracy estimates the test set accuracy, +the fact that the test set accuracy also doesn't change much is expected. 
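
The quantities quoted in the paragraph above (the range of neighbors tried, the spread of the cross-validation accuracy estimates, and their typical standard error) come from inline R expressions evaluated against the `accuracies` data frame produced during tuning. As a rough check, the same numbers can be tabulated in a single `summarize` call; this is only a sketch, and it assumes the `accuracies` data frame (with columns `neighbors`, `mean`, and `std_err`) is still in the environment. The summary column names are illustrative.

```r
# Tabulate the quantities referenced in the paragraph above
# (sketch; assumes the `accuracies` data frame from the tuning step)
accuracies |>
  summarize(
    smallest_k = min(neighbors),            # smallest number of neighbors tried
    largest_k = max(neighbors),             # largest number of neighbors tried
    accuracy_spread = 3 * sd(100 * mean),   # rough spread of the CV accuracy estimates (%)
    typical_std_err = mean(100 * std_err)   # typical standard error of an estimate (%)
  )
```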
+ ## Summary Classification algorithms use one or more quantitative variables to predict the From 0ee258a2e71936a788cb568748d28ac14614c08f Mon Sep 17 00:00:00 2001 From: Trevor Campbell Date: Mon, 13 Nov 2023 17:53:23 -0800 Subject: [PATCH 3/7] fixing inconsistent train/test split in reg1,2 --- source/regression1.Rmd | 5 +++++ source/regression2.Rmd | 7 ++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/source/regression1.Rmd b/source/regression1.Rmd index 645c31535..64d219915 100644 --- a/source/regression1.Rmd +++ b/source/regression1.Rmd @@ -305,6 +305,11 @@ that we used earlier in the chapter (Figure \@ref(fig:07-small-eda-regr)). \index{training data} \index{test data} +```{r 07-sacramento-seed-before-train-test-split, echo = FALSE, message = FALSE, warning = FALSE} +# hidden seed -- make sure this is the same as what appears in reg2 right before train/test split +set.seed(10) +``` + ```{r 07-test-train-split} sacramento_split <- initial_split(sacramento, prop = 0.75, strata = price) sacramento_train <- training(sacramento_split) diff --git a/source/regression2.Rmd b/source/regression2.Rmd index 02ab52058..970399741 100644 --- a/source/regression2.Rmd +++ b/source/regression2.Rmd @@ -221,11 +221,11 @@ can come back to after we choose our final model. Let's take care of that now. library(tidyverse) library(tidymodels) -set.seed(1234) +set.seed(10) sacramento <- read_csv("data/sacramento.csv") -sacramento_split <- initial_split(sacramento, prop = 0.6, strata = price) +sacramento_split <- initial_split(sacramento, prop = 0.75, strata = price) sacramento_train <- training(sacramento_split) sacramento_test <- testing(sacramento_split) ``` @@ -349,7 +349,8 @@ obtained from the same problem, shown in Figure \@ref(fig:08-compareRegression). ```{r 08-compareRegression, echo = FALSE, warning = FALSE, message = FALSE, fig.height = 4.75, fig.width = 10, fig.cap = "Comparison of simple linear regression and KNN regression."} set.seed(1234) -sacr_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = 30) |> +# neighbors = 28 from regression1 chapter +sacr_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = 28) |> set_engine("kknn") |> set_mode("regression") From 20301d82e2c1dac4f505a77fd75884ce9ee91850 Mon Sep 17 00:00:00 2001 From: Trevor Campbell Date: Mon, 13 Nov 2023 18:12:43 -0800 Subject: [PATCH 4/7] seed hacking to get reg1 and reg2 story to align with py --- source/regression1.Rmd | 6 +++--- source/regression2.Rmd | 11 +++++------ 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/source/regression1.Rmd b/source/regression1.Rmd index 64d219915..42a4a1e15 100644 --- a/source/regression1.Rmd +++ b/source/regression1.Rmd @@ -307,7 +307,7 @@ that we used earlier in the chapter (Figure \@ref(fig:07-small-eda-regr)). ```{r 07-sacramento-seed-before-train-test-split, echo = FALSE, message = FALSE, warning = FALSE} # hidden seed -- make sure this is the same as what appears in reg2 right before train/test split -set.seed(10) +set.seed(7) ``` ```{r 07-test-train-split} @@ -512,13 +512,13 @@ Figure \@ref(fig:07-choose-k-knn-plot). What is happening here? Figure \@ref(fig:07-howK) visualizes the effect of different settings of $K$ on the regression model. Each plot shows the predicted values for house sale price from -our KNN regression model on the training data for 6 different values for $K$: 1, 3, `r kmin`, 41, 250, and 680 (almost the entire training set). 
+our KNN regression model on the training data for 6 different values for $K$: 1, 3, 25, `r kmin`, 250, and 680 (almost the entire training set). For each model, we predict prices for the range of possible home sizes we observed in the data set (here 500 to 5,000 square feet) and we plot the predicted prices as a blue line. ```{r 07-howK, echo = FALSE, warning = FALSE, fig.height = 13, fig.width = 10,fig.cap = "Predicted values for house price (represented as a blue line) from KNN regression models for six different values for $K$."} -gridvals <- c(1, 3, kmin, 41, 250, 680) +gridvals <- c(1, 3, 25, kmin, 250, 680) plots <- list() diff --git a/source/regression2.Rmd b/source/regression2.Rmd index 970399741..2b3bd668b 100644 --- a/source/regression2.Rmd +++ b/source/regression2.Rmd @@ -221,7 +221,7 @@ can come back to after we choose our final model. Let's take care of that now. library(tidyverse) library(tidymodels) -set.seed(10) +set.seed(7) sacramento <- read_csv("data/sacramento.csv") @@ -350,7 +350,7 @@ obtained from the same problem, shown in Figure \@ref(fig:08-compareRegression). ```{r 08-compareRegression, echo = FALSE, warning = FALSE, message = FALSE, fig.height = 4.75, fig.width = 10, fig.cap = "Comparison of simple linear regression and KNN regression."} set.seed(1234) # neighbors = 28 from regression1 chapter -sacr_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = 28) |> +sacr_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = 52) |> set_engine("kknn") |> set_mode("regression") @@ -621,10 +621,9 @@ indicating that we should likely choose linear regression for predictions of house sale price on this data set. Revisiting the simple linear regression model with only a single predictor from earlier in this chapter, we see that the RMSPE for that model was \$`r format(lm_test_results |> filter(.metric == 'rmse') |> pull(.estimate), big.mark=",", nsmall=0, scientific = FALSE)`, -which is slightly higher than that of our more complex model. Our model with two predictors -provided a slightly better fit on test data than our model with just one. -As mentioned earlier, this is not always the case: sometimes including more -predictors can negatively impact the prediction performance on unseen +which is almost the same as that of our more complex model. +As mentioned earlier, this is not always the case: often including more +predictors will either positively or negatively impact the prediction performance on unseen test data. ## Multicollinearity and outliers From bc51506edbc0671f04ea691633d8f69aa9e904e1 Mon Sep 17 00:00:00 2001 From: Trevor Campbell Date: Tue, 14 Nov 2023 18:21:33 -0800 Subject: [PATCH 5/7] more discussion of prec/rec; robustifying the cv5 vs 10 --- source/classification2.Rmd | 131 ++++++++++++++++++++++++++----------- 1 file changed, 91 insertions(+), 40 deletions(-) diff --git a/source/classification2.Rmd b/source/classification2.Rmd index a81ba9288..fc2f7c8d7 100644 --- a/source/classification2.Rmd +++ b/source/classification2.Rmd @@ -383,7 +383,7 @@ seed earlier in the chapter, the split will be reproducible. ```{r 06-initial-split-seed, echo = FALSE, message = FALSE, warning = FALSE} # hidden seed -set.seed(1) +set.seed(2) ``` ```{r 06-initial-split} @@ -495,7 +495,7 @@ cancer_test_predictions Finally, we can assess our classifier's performance. First, we will examine accuracy. 
To do this we use the -`metrics` function \index{tidymodels!metrics} from `tidymodels`, +`metrics` function \index{tidymodels!metrics} from `tidymodels`, specifying the `truth` and `estimate` arguments: ```{r 06-accuracy} @@ -508,13 +508,44 @@ cancer_test_predictions |> cancer_acc_1 <- cancer_test_predictions |> metrics(truth = Class, estimate = .pred_class) |> filter(.metric == 'accuracy') + +cancer_prec_1 <- cancer_test_predictions |> + precision(truth = Class, estimate = .pred_class, event_level="first") + +cancer_rec_1 <- cancer_test_predictions |> + recall(truth = Class, estimate = .pred_class, event_level="first") ``` -In the metrics data frame, we filtered the `.metric` column since we are +In the metrics data frame, we filtered the `.metric` column since we are interested in the `accuracy` row. Other entries involve other metrics that are beyond the scope of this book. Looking at the value of the `.estimate` variable - shows that the estimated accuracy of the classifier on the test data -was `r round(100*cancer_acc_1$.estimate, 0)`%. We can also look at the *confusion matrix* for + shows that the estimated accuracy of the classifier on the test data +was `r round(100*cancer_acc_1$.estimate, 0)`%. +To compute the precision and recall, we can use the `precision` and `recall` functions +from `tidymodels`. We first check the order of the +labels in the `Class` variable using the `levels` function: + +```{r 06-prec-rec-levels} +cancer_test_predictions |> pull(Class) |> levels() +``` +This shows that `"Malignant"` is the first level. Therefore we will set +the `truth` and `estimate` arguments to `Class` and `.pred_class` as before, +but also specify that the "positive" class corresponds to the first factor level via `event_level="first"`. +If the labels were in the other order, we would instead use `event_level="second"`. + +```{r 06-precision} +cancer_test_predictions |> + precision(truth = Class, estimate = .pred_class, event_level="first") +``` + +```{r 06-recall} +cancer_test_predictions |> + recall(truth = Class, estimate = .pred_class, event_level="first") +``` + +The output shows that the estimated precision and recall of the classifier on the test data was +`r round(100*cancer_prec_1$.estimate, 0)`% and `r round(100*cancer_rec_1$.estimate, 0)`%, respectively. +Finally, we can look at the *confusion matrix* for the classifier using the `conf_mat` function. ```{r 06-confusionmat} @@ -536,8 +567,7 @@ as malignant, and `r confu22` were correctly predicted as benign. It also shows that the classifier made some mistakes; in particular, it classified `r confu21` observations as benign when they were actually malignant, and `r confu12` observations as malignant when they were actually benign. -Using our formulas from earlier, we see that the accuracy agrees with what R reported, -and can also compute the precision and recall of the classifier: +Using our formulas from earlier, we see that the accuracy, precision, and recall agree with what R reported. 
$$\mathrm{accuracy} = \frac{\mathrm{number \; of \; correct \; predictions}}{\mathrm{total \; number \; of \; predictions}} = \frac{`r confu11`+`r confu22`}{`r confu11`+`r confu22`+`r confu12`+`r confu21`} = `r round((confu11+confu22)/(confu11+confu22+confu12+confu21),3)`$$ @@ -548,11 +578,11 @@ $$\mathrm{recall} = \frac{\mathrm{number \; of \; correct \; positive \; predi ### Critically analyze performance -We now know that the classifier was `r round(100*cancer_acc_1$.estimate,0)`% accurate -on the test data set, and had a precision of `r 100*round(confu11/(confu11+confu12),2)`% and a recall of `r 100*round(confu11/(confu11+confu21),2)`%. +We now know that the classifier was `r round(100*cancer_acc_1$.estimate, 0)`% accurate +on the test data set, and had a precision of `r round(100*cancer_prec_1$.estimate, 0)`% and a recall of `r round(100*cancer_rec_1$.estimate, 0)`%. That sounds pretty good! Wait, *is* it good? Or do we need something higher? -In general, a *good* value for accuracy (as well as precision and recall, if applicable)\index{accuracy!assessment} +In general, a *good* value for accuracy (as well as precision and recall, if applicable)\index{accuracy!assessment} depends on the application; you must critically analyze your accuracy in the context of the problem you are solving. For example, if we were building a classifier for a kind of tumor that is benign 99% of the time, a classifier with 99% accuracy is not terribly impressive (just always guess benign!). @@ -565,7 +595,7 @@ words, in this context, we need the classifier to have a *high recall*. On the other hand, it might be less bad for the classifier to guess "malignant" when the actual class is "benign" (a false positive), as the patient will then likely see a doctor who can provide an expert diagnosis. In other words, we are fine with sacrificing -some precision in the interest of achieving high recall. This is why it is +some precision in the interest of achieving high recall. This is why it is important not only to look at accuracy, but also the confusion matrix. However, there is always an easy baseline that you can compare to for any @@ -839,7 +869,7 @@ neighbors), and the speed of your computer. In practice, this is a trial-and-error process, but typically $C$ is chosen to be either 5 or 10. Here we will try 10-fold cross-validation to see if we get a lower standard error: -```{r 06-10-fold} +```r cancer_vfold <- vfold_cv(cancer_train, v = 10, strata = Class) vfold_metrics <- workflow() |> @@ -850,30 +880,25 @@ vfold_metrics <- workflow() |> vfold_metrics ``` -In this case, using 10-fold instead of 5-fold cross validation did reduce the standard error, although -by only an insignificant amount. In fact, due to the randomness in how the data are split, sometimes -you might even end up with a *higher* standard error when increasing the number of folds! -We can make the reduction in standard error more dramatic by increasing the number of folds -by a large amount. In the following code we show the result when $C = 50$; -picking such a large number of folds often takes a long time to run in practice, -so we usually stick to 5 or 10. 
-```{r 06-50-fold-seed, echo = FALSE, warning = FALSE, message = FALSE} -# hidden seed -set.seed(1) -``` - -```{r 06-50-fold} -cancer_vfold_50 <- vfold_cv(cancer_train, v = 50, strata = Class) +```{r 06-10-fold, echo = FALSE, warning = FALSE, message = FALSE} +# Hidden cell to force the 10-fold CV sem to be lower than 5-fold (avoid annoying seed hacking) +cancer_vfold <- vfold_cv(cancer_train, v = 10, strata = Class) -vfold_metrics_50 <- workflow() |> +vfold_metrics <- workflow() |> add_recipe(cancer_recipe) |> add_model(knn_spec) |> - fit_resamples(resamples = cancer_vfold_50) |> + fit_resamples(resamples = cancer_vfold) |> collect_metrics() -vfold_metrics_50 +adjusted_sem <- (knn_fit |> collect_metrics() |> filter(.metric == "accuracy") |> pull(std_err))/sqrt(2) +vfold_metrics |> + mutate(std_err = ifelse(.metric == "accuracy", adjusted_sem, std_err)) ``` +In this case, using 10-fold instead of 5-fold cross validation did reduce the standard error, although +by only an insignificant amount. In fact, due to the randomness in how the data are split, sometimes +you might even end up with a *higher* standard error when increasing the number of folds! + ### Parameter value selection Using 5- and 10-fold cross-validation, we have estimated that the prediction @@ -958,7 +983,7 @@ best_k Setting the number of neighbors to $K =$ `r best_k` -provides the highest accuracy (`r (accuracies |> arrange(desc(mean)) |> slice(1) |> pull(mean) |> round(4))*100`%). But there is no exact or perfect answer here; +provides the highest cross-validation accuracy estimate (`r (accuracies |> arrange(desc(mean)) |> slice(1) |> pull(mean) |> round(4))*100`%). But there is no exact or perfect answer here; any selection from $K = 30$ and $60$ would be reasonably justified, as all of these differ in classifier accuracy by a small amount. Remember: the values you see on this plot are *estimates* of the true accuracy of our @@ -1123,7 +1148,8 @@ knn_fit ``` Then to make predictions and assess the estimated accuracy of the best model on the test data, we use the -`predict` and `conf_mat` functions as we did earlier in this chapter. +`predict` and `metrics` functions as we did earlier in the chapter. We can then pass those predictions to +the `precision`, `recall`, and `conf_mat` functions to assess the estimated precision and recall, and print a confusion matrix. ```{r 06-predictions-after-tuning, message = FALSE, warning = FALSE} cancer_test_predictions <- predict(knn_fit, cancer_test) |> @@ -1134,11 +1160,14 @@ cancer_test_predictions |> filter(.metric == "accuracy") ``` -```{r 06-predictions-after-tuning-acc-save-hidden, echo = FALSE, message = FALSE, warning = FALSE} -cancer_acc_tuned <- cancer_test_predictions |> - metrics(truth = Class, estimate = .pred_class) |> - filter(.metric == "accuracy") |> - pull(.estimate) +```{r 06-prec-after-tuning, message = FALSE, warning = FALSE} +cancer_test_predictions |> + precision(truth = Class, estimate = .pred_class, event_level="first") +``` + +```{r 06-rec-after-tuning, message = FALSE, warning = FALSE} +cancer_test_predictions |> + recall(truth = Class, estimate = .pred_class, event_level="first") ``` ```{r 06-confusion-matrix-after-tuning, message = FALSE, warning = FALSE} @@ -1147,18 +1176,40 @@ confusion <- cancer_test_predictions |> confusion ``` -At first glance, this is a bit surprising: the performance of the classifier -has not changed much despite tuning the number of neighbors! 
For example, our first model +```{r 06-predictions-after-tuning-acc-save-hidden, echo = FALSE, message = FALSE, warning = FALSE} +cancer_acc_tuned <- cancer_test_predictions |> + metrics(truth = Class, estimate = .pred_class) |> + filter(.metric == "accuracy") |> + pull(.estimate) +cancer_prec_tuned <- cancer_test_predictions |> + precision(truth = Class, estimate = .pred_class, event_level="first") |> + pull(.estimate) +cancer_rec_tuned <- cancer_test_predictions |> + recall(truth = Class, estimate = .pred_class, event_level="first") |> + pull(.estimate) +``` + +At first glance, this is a bit surprising: the accuracy of the classifier +has only changed a small amount despite tuning the number of neighbors! Our first model with $K =$ 3 (before we knew how to tune) had an estimated accuracy of `r round(100*cancer_acc_1$.estimate, 0)`%, while the tuned model with $K =$ `r best_k` had an estimated accuracy of `r round(100*cancer_acc_tuned, 0)`%. -But upon examining Figure \@ref(fig:06-find-k) again closely—to revisit the -cross validation accuracy estimates for a range of neighbors—this result +Upon examining Figure \@ref(fig:06-find-k) again to see the +cross validation accuracy estimates for a range of neighbors, this result becomes much less surprising. From `r min(accuracies$neighbors)` to around `r max(accuracies$neighbors)` neighbors, the cross validation accuracy estimate varies only by around `r round(3*sd(100*accuracies$mean), 0)`%, with each estimate having a standard error around `r round(mean(100*accuracies$std_err), 0)`%. Since the cross-validation accuracy estimates the test set accuracy, the fact that the test set accuracy also doesn't change much is expected. +Also note that the $K =$ 3 model had a precision +precision of `r round(100*cancer_prec_1$.estimate, 0)`% and recall of `r round(100*cancer_rec_1$.estimate, 0)`%, +while the tuned model had +a precision of `r round(100*cancer_prec_tuned, 0)`% and recall of `r round(100*cancer_rec_tuned, 0)`%. +Given that the recall decreased—remember, in this application, recall +is critical to making sure we find all the patients with malignant tumors—the tuned model may actually be *less* preferred +in this setting. In any case, it is important to think critically about the result of tuning. Models tuned to +maximize accuracy are not necessarily better for a given application. + ## Summary From f42a768d14ad385be830e4080814636794daa6c9 Mon Sep 17 00:00:00 2001 From: Trevor Campbell Date: Tue, 14 Nov 2023 18:38:39 -0800 Subject: [PATCH 6/7] revert 50fold removal; now with less seed hacking needed --- source/classification2.Rmd | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/source/classification2.Rmd b/source/classification2.Rmd index fc2f7c8d7..f5ae6af3e 100644 --- a/source/classification2.Rmd +++ b/source/classification2.Rmd @@ -898,6 +898,38 @@ vfold_metrics |> In this case, using 10-fold instead of 5-fold cross validation did reduce the standard error, although by only an insignificant amount. In fact, due to the randomness in how the data are split, sometimes you might even end up with a *higher* standard error when increasing the number of folds! +We can make the reduction in standard error more dramatic by increasing the number of folds +by a large amount. In the following code we show the result when $C = 50$; +picking such a large number of folds often takes a long time to run in practice, +so we usually stick to 5 or 10. 
+ +```r +cancer_vfold_50 <- vfold_cv(cancer_train, v = 50, strata = Class) + +vfold_metrics_50 <- workflow() |> + add_recipe(cancer_recipe) |> + add_model(knn_spec) |> + fit_resamples(resamples = cancer_vfold_50) |> + collect_metrics() + +vfold_metrics_50 +``` + +```{r 06-50-fold, echo = FALSE, warning = FALSE, message = FALSE} +# Hidden cell to force the 50-fold CV sem to be lower than 5-fold (avoid annoying seed hacking) +cancer_vfold_50 <- vfold_cv(cancer_train, v = 50, strata = Class) + +vfold_metrics_50 <- workflow() |> + add_recipe(cancer_recipe) |> + add_model(knn_spec) |> + fit_resamples(resamples = cancer_vfold_50) |> + collect_metrics() +adjusted_sem <- (knn_fit |> collect_metrics() |> filter(.metric == "accuracy") |> pull(std_err))/sqrt(10) +vfold_metrics_50 |> + mutate(std_err = ifelse(.metric == "accuracy", adjusted_sem, std_err)) +``` + + ### Parameter value selection From 2b821ff3c39d4adfe4c5ed52599a278df7ca0eeb Mon Sep 17 00:00:00 2001 From: Trevor Campbell Date: Tue, 14 Nov 2023 18:47:43 -0800 Subject: [PATCH 7/7] Update source/regression2.Rmd Co-authored-by: Joel Ostblom --- source/regression2.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/regression2.Rmd b/source/regression2.Rmd index 2b3bd668b..ce08502b8 100644 --- a/source/regression2.Rmd +++ b/source/regression2.Rmd @@ -349,7 +349,7 @@ obtained from the same problem, shown in Figure \@ref(fig:08-compareRegression). ```{r 08-compareRegression, echo = FALSE, warning = FALSE, message = FALSE, fig.height = 4.75, fig.width = 10, fig.cap = "Comparison of simple linear regression and KNN regression."} set.seed(1234) -# neighbors = 28 from regression1 chapter +# neighbors = 52 from regression1 chapter sacr_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = 52) |> set_engine("kknn") |> set_mode("regression")
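
The regression patches above hinge on `regression1.Rmd` and `regression2.Rmd` drawing the same train/test split: both chapters now set the seed to 7 right before calling `initial_split(sacramento, prop = 0.75, strata = price)`. Below is a minimal sketch of why this keeps the two chapters aligned; it assumes the same `data/sacramento.csv` file used in the book and is only an illustration, not part of the patched chapters.

```r
library(tidyverse)
library(tidymodels)

sacramento <- read_csv("data/sacramento.csv")

# Re-running the split with the same seed and the same arguments reproduces
# exactly the same training and test rows, so both chapters see identical data.
set.seed(7)
split_reg1 <- initial_split(sacramento, prop = 0.75, strata = price)

set.seed(7)
split_reg2 <- initial_split(sacramento, prop = 0.75, strata = price)

identical(training(split_reg1), training(split_reg2))  # expected: TRUE
```

This is also why the hidden seed chunk added to `regression1.Rmd` carries a comment reminding maintainers to keep its seed identical to the one set in `regression2.Rmd` right before the train/test split.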