case-study-text-classification-spam-and-ham-.html

<!DOCTYPE html>
<html lang="en">

<head>

  <meta charset="UTF-8">
  <meta http-equiv="X-UA-Compatible" content="IE=edge">
  <title>Chapter 15 Case Study - Text classification: Spam and Ham. | Machine Learning with R</title>
  <meta name="description" content="This book is about using R for machine learning purposes.">
  <meta name="generator" content="bookdown  and GitBook 2.6.7">

  <meta property="og:title" content="Chapter 15 Case Study - Text classification: Spam and Ham. | Machine Learning with R" />
  <meta property="og:type" content="book" />
  
  
  <meta property="og:description" content="This book is about using R for machine learning purposes." />
  <meta name="github-repo" content="fderyckel/machinelearningwithr" />

  <meta name="twitter:card" content="summary" />
  <meta name="twitter:title" content="Chapter 15 Case Study - Text classification: Spam and Ham. | Machine Learning with R" />
  
  <meta name="twitter:description" content="This book is about using R for machine learning purposes." />
  

<meta name="author" content="François de Ryckel">


<meta name="date" content="2019-02-23">

  <meta name="viewport" content="width=device-width, initial-scale=1">
  <meta name="apple-mobile-web-app-capable" content="yes">
  <meta name="apple-mobile-web-app-status-bar-style" content="black">
  
  
<link rel="prev" href="model-evaluation.html">
<link rel="next" href="mushroom.html">
<script src="libs/jquery-2.2.3/jquery.min.js"></script>
<link href="libs/gitbook-2.6.7/css/style.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-table.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-bookdown.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-highlight.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-search.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-fontsettings.css" rel="stylesheet" />


<script src="libs/kePrint-0.0.1/kePrint.js"></script>


<!-- Syntax-highlighting styles for the code listings on this page: first the layout rules for the .sourceCode containers and the td.lineNumbers cells, then one colour/weight rule per token class (span.kw, span.st, span.co, ...), each labelled by its trailing comment. -->
<style type="text/css">
div.sourceCode { overflow-x: auto; }
table.sourceCode, tr.sourceCode, td.lineNumbers, td.sourceCode {
  margin: 0; padding: 0; vertical-align: baseline; border: none; }
table.sourceCode { width: 100%; line-height: 100%; }
td.lineNumbers { text-align: right; padding-right: 4px; padding-left: 4px; color: #aaaaaa; border-right: 1px solid #aaaaaa; }
td.sourceCode { padding-left: 5px; }
code > span.kw { color: #007020; font-weight: bold; } /* Keyword */
code > span.dt { color: #902000; } /* DataType */
code > span.dv { color: #40a070; } /* DecVal */
code > span.bn { color: #40a070; } /* BaseN */
code > span.fl { color: #40a070; } /* Float */
code > span.ch { color: #4070a0; } /* Char */
code > span.st { color: #4070a0; } /* String */
code > span.co { color: #60a0b0; font-style: italic; } /* Comment */
code > span.ot { color: #007020; } /* Other */
code > span.al { color: #ff0000; font-weight: bold; } /* Alert */
code > span.fu { color: #06287e; } /* Function */
code > span.er { color: #ff0000; font-weight: bold; } /* Error */
code > span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
code > span.cn { color: #880000; } /* Constant */
code > span.sc { color: #4070a0; } /* SpecialChar */
code > span.vs { color: #4070a0; } /* VerbatimString */
code > span.ss { color: #bb6688; } /* SpecialString */
code > span.im { } /* Import */
code > span.va { color: #19177c; } /* Variable */
code > span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
code > span.op { color: #666666; } /* Operator */
code > span.bu { } /* BuiltIn */
code > span.ex { } /* Extension */
code > span.pp { color: #bc7a00; } /* Preprocessor */
code > span.at { color: #7d9029; } /* Attribute */
code > span.do { color: #ba2121; font-style: italic; } /* Documentation */
code > span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
code > span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
code > span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
</style>

<link rel="stylesheet" href="style.css" type="text/css" />
</head>

<body>


  <div class="book without-animation with-summary font-size-2 font-family-1" data-basepath=".">

    <div class="book-summary">
      <nav role="navigation">

<ul class="summary">
<li><strong><a href="./">Machine Learning with R</a></strong></li>

<li class="divider"></li>
<li class="chapter" data-level="1" data-path="index.html"><a href="index.html"><i class="fa fa-check"></i><b>1</b> Prerequisites</a><ul>
<li class="chapter" data-level="1.1" data-path="index.html"><a href="index.html#pre-requisite-and-conventions"><i class="fa fa-check"></i><b>1.1</b> Pre-requisite and conventions</a></li>
<li class="chapter" data-level="1.2" data-path="index.html"><a href="index.html#organization"><i class="fa fa-check"></i><b>1.2</b> Organization</a></li>
<li class="chapter" data-level="1.3" data-path="index.html"><a href="index.html#packages"><i class="fa fa-check"></i><b>1.3</b> Packages</a></li>
</ul></li>
<li class="chapter" data-level="2" data-path="testinference.html"><a href="testinference.html"><i class="fa fa-check"></i><b>2</b> Tests and inferences</a><ul>
<li class="chapter" data-level="2.1" data-path="testinference.html"><a href="testinference.html#normality"><i class="fa fa-check"></i><b>2.1</b> Assumption of normality</a><ul>
<li class="chapter" data-level="2.1.1" data-path="testinference.html"><a href="testinference.html#visual-check-of-normality"><i class="fa fa-check"></i><b>2.1.1</b> Visual check of normality</a></li>
<li class="chapter" data-level="2.1.2" data-path="testinference.html"><a href="testinference.html#normality-tests"><i class="fa fa-check"></i><b>2.1.2</b> Normality tests</a></li>
</ul></li>
<li class="chapter" data-level="2.2" data-path="testinference.html"><a href="testinference.html#ttest"><i class="fa fa-check"></i><b>2.2</b> T-tests</a></li>
<li class="chapter" data-level="2.3" data-path="testinference.html"><a href="testinference.html#anova---analyse-of-variance."><i class="fa fa-check"></i><b>2.3</b> ANOVA - Analyse of variance.</a></li>
<li class="chapter" data-level="2.4" data-path="testinference.html"><a href="testinference.html#covariance"><i class="fa fa-check"></i><b>2.4</b> Covariance</a></li>
</ul></li>
<li class="chapter" data-level="3" data-path="mlr.html"><a href="mlr.html"><i class="fa fa-check"></i><b>3</b> Single &amp; Multiple Linear Regression</a><ul>
<li class="chapter" data-level="3.1" data-path="mlr.html"><a href="mlr.html#single-variable-regression"><i class="fa fa-check"></i><b>3.1</b> Single variable regression</a></li>
<li class="chapter" data-level="3.2" data-path="mlr.html"><a href="mlr.html#multi-variables-regression"><i class="fa fa-check"></i><b>3.2</b> Multi-variables regression</a><ul>
<li class="chapter" data-level="3.2.1" data-path="mlr.html"><a href="mlr.html#predicting-wine-price-again"><i class="fa fa-check"></i><b>3.2.1</b> Predicting wine price (again!)</a></li>
</ul></li>
<li class="chapter" data-level="3.3" data-path="mlr.html"><a href="mlr.html#model-diagnostic-and-evaluation"><i class="fa fa-check"></i><b>3.3</b> Model diagnostic and evaluation</a></li>
<li class="chapter" data-level="3.4" data-path="mlr.html"><a href="mlr.html#final-example---boston-dataset---with-backward-elimination"><i class="fa fa-check"></i><b>3.4</b> Final example - Boston dataset - with backward elimination</a><ul>
<li class="chapter" data-level="3.4.1" data-path="mlr.html"><a href="mlr.html#model-diagmostic"><i class="fa fa-check"></i><b>3.4.1</b> Model diagnostic</a></li>
</ul></li>
<li class="chapter" data-level="3.5" data-path="mlr.html"><a href="mlr.html#references"><i class="fa fa-check"></i><b>3.5</b> References</a></li>
</ul></li>
<li class="chapter" data-level="4" data-path="logistic.html"><a href="logistic.html"><i class="fa fa-check"></i><b>4</b> Logistic Regression</a><ul>
<li class="chapter" data-level="4.1" data-path="logistic.html"><a href="logistic.html#introduction"><i class="fa fa-check"></i><b>4.1</b> Introduction</a></li>
<li class="chapter" data-level="4.2" data-path="logistic.html"><a href="logistic.html#the-logistic-equation."><i class="fa fa-check"></i><b>4.2</b> The logistic equation.</a></li>
<li class="chapter" data-level="4.3" data-path="logistic.html"><a href="logistic.html#performance-of-logistic-regression-model"><i class="fa fa-check"></i><b>4.3</b> Performance of Logistic Regression Model</a></li>
<li class="chapter" data-level="4.4" data-path="logistic.html"><a href="logistic.html#setting-up"><i class="fa fa-check"></i><b>4.4</b> Setting up</a></li>
<li class="chapter" data-level="4.5" data-path="logistic.html"><a href="logistic.html#example-1---graduate-admission"><i class="fa fa-check"></i><b>4.5</b> Example 1 - Graduate Admission</a></li>
<li class="chapter" data-level="4.6" data-path="logistic.html"><a href="logistic.html#example-2---diabetes"><i class="fa fa-check"></i><b>4.6</b> Example 2 - Diabetes</a><ul>
<li class="chapter" data-level="4.6.1" data-path="logistic.html"><a href="logistic.html#accounting-for-missing-values"><i class="fa fa-check"></i><b>4.6.1</b> Accounting for missing values</a></li>
<li class="chapter" data-level="4.6.2" data-path="logistic.html"><a href="logistic.html#imputting-missing-values"><i class="fa fa-check"></i><b>4.6.2</b> Imputting Missing Values</a></li>
<li class="chapter" data-level="4.6.3" data-path="logistic.html"><a href="logistic.html#roc-and-auc"><i class="fa fa-check"></i><b>4.6.3</b> ROC and AUC</a></li>
</ul></li>
<li class="chapter" data-level="4.7" data-path="logistic.html"><a href="logistic.html#references-1"><i class="fa fa-check"></i><b>4.7</b> References</a></li>
</ul></li>
<li class="chapter" data-level="5" data-path="softmax-and-multinomial-regressions.html"><a href="softmax-and-multinomial-regressions.html"><i class="fa fa-check"></i><b>5</b> Softmax and multinomial regressions</a><ul>
<li class="chapter" data-level="5.1" data-path="softmax-and-multinomial-regressions.html"><a href="softmax-and-multinomial-regressions.html#multinomial-logistic-regression"><i class="fa fa-check"></i><b>5.1</b> Multinomial Logistic Regression</a></li>
<li class="chapter" data-level="5.2" data-path="softmax-and-multinomial-regressions.html"><a href="softmax-and-multinomial-regressions.html#references-2"><i class="fa fa-check"></i><b>5.2</b> References</a></li>
</ul></li>
<li class="chapter" data-level="6" data-path="gradient-descent.html"><a href="gradient-descent.html"><i class="fa fa-check"></i><b>6</b> Gradient Descent</a><ul>
<li class="chapter" data-level="6.1" data-path="gradient-descent.html"><a href="gradient-descent.html#example-on-functions"><i class="fa fa-check"></i><b>6.1</b> Example on functions</a></li>
<li class="chapter" data-level="6.2" data-path="gradient-descent.html"><a href="gradient-descent.html#example-on-regressions"><i class="fa fa-check"></i><b>6.2</b> Example on regressions</a></li>
</ul></li>
<li class="chapter" data-level="7" data-path="knnchapter.html"><a href="knnchapter.html"><i class="fa fa-check"></i><b>7</b> KNN - K Nearest Neighbour</a><ul>
<li class="chapter" data-level="7.1" data-path="knnchapter.html"><a href="knnchapter.html#example-1.-prostate-cancer-dataset"><i class="fa fa-check"></i><b>7.1</b> Example 1. Prostate Cancer dataset</a></li>
<li class="chapter" data-level="7.2" data-path="knnchapter.html"><a href="knnchapter.html#example-2.-wine-dataset"><i class="fa fa-check"></i><b>7.2</b> Example 2. Wine dataset</a><ul>
<li class="chapter" data-level="7.2.1" data-path="knnchapter.html"><a href="knnchapter.html#understand-the-data"><i class="fa fa-check"></i><b>7.2.1</b> Understand the data</a></li>
</ul></li>
<li class="chapter" data-level="7.3" data-path="knnchapter.html"><a href="knnchapter.html#references-3"><i class="fa fa-check"></i><b>7.3</b> References</a></li>
</ul></li>
<li class="chapter" data-level="8" data-path="kmeans.html"><a href="kmeans.html"><i class="fa fa-check"></i><b>8</b> Kmeans clustering</a><ul>
<li class="chapter" data-level="8.1" data-path="kmeans.html"><a href="kmeans.html#multinomial-logistic-regression-1"><i class="fa fa-check"></i><b>8.1</b> Multinomial Logistic Regression</a></li>
<li class="chapter" data-level="8.2" data-path="kmeans.html"><a href="kmeans.html#references-4"><i class="fa fa-check"></i><b>8.2</b> References</a></li>
</ul></li>
<li class="chapter" data-level="9" data-path="hierclust.html"><a href="hierclust.html"><i class="fa fa-check"></i><b>9</b> Hierarchical Clustering</a><ul>
<li class="chapter" data-level="9.1" data-path="hierclust.html"><a href="hierclust.html#example-on-the-pokemon-dataset"><i class="fa fa-check"></i><b>9.1</b> Example on the Pokemon dataset</a></li>
<li class="chapter" data-level="9.2" data-path="hierclust.html"><a href="hierclust.html#example-on-regressions-1"><i class="fa fa-check"></i><b>9.2</b> Example on regressions</a></li>
<li class="chapter" data-level="9.3" data-path="hierclust.html"><a href="hierclust.html#references-5"><i class="fa fa-check"></i><b>9.3</b> References</a></li>
</ul></li>
<li class="chapter" data-level="10" data-path="pca.html"><a href="pca.html"><i class="fa fa-check"></i><b>10</b> Principal Component Analysis</a><ul>
<li class="chapter" data-level="10.1" data-path="pca.html"><a href="pca.html#pca-on-an-easy-example."><i class="fa fa-check"></i><b>10.1</b> PCA on an easy example.</a></li>
<li class="chapter" data-level="10.2" data-path="pca.html"><a href="pca.html#references."><i class="fa fa-check"></i><b>10.2</b> References.</a></li>
</ul></li>
<li class="chapter" data-level="11" data-path="trees-and-classification.html"><a href="trees-and-classification.html"><i class="fa fa-check"></i><b>11</b> Trees and Classification</a><ul>
<li class="chapter" data-level="11.1" data-path="trees-and-classification.html"><a href="trees-and-classification.html#introduction-1"><i class="fa fa-check"></i><b>11.1</b> Introduction</a></li>
<li class="chapter" data-level="11.2" data-path="trees-and-classification.html"><a href="trees-and-classification.html#first-example."><i class="fa fa-check"></i><b>11.2</b> First example.</a></li>
<li class="chapter" data-level="11.3" data-path="trees-and-classification.html"><a href="trees-and-classification.html#second-example."><i class="fa fa-check"></i><b>11.3</b> Second Example.</a></li>
<li class="chapter" data-level="11.4" data-path="trees-and-classification.html"><a href="trees-and-classification.html#how-does-a-tree-decide-where-to-split"><i class="fa fa-check"></i><b>11.4</b> How does a tree decide where to split?</a></li>
<li class="chapter" data-level="11.5" data-path="trees-and-classification.html"><a href="trees-and-classification.html#third-example."><i class="fa fa-check"></i><b>11.5</b> Third example.</a></li>
<li class="chapter" data-level="11.6" data-path="trees-and-classification.html"><a href="trees-and-classification.html#references-6"><i class="fa fa-check"></i><b>11.6</b> References</a></li>
</ul></li>
<li class="chapter" data-level="12" data-path="random-forest.html"><a href="random-forest.html"><i class="fa fa-check"></i><b>12</b> Random Forest</a><ul>
<li class="chapter" data-level="12.1" data-path="random-forest.html"><a href="random-forest.html#how-does-it-work"><i class="fa fa-check"></i><b>12.1</b> How does it work?</a></li>
<li class="chapter" data-level="12.2" data-path="random-forest.html"><a href="random-forest.html#references-7"><i class="fa fa-check"></i><b>12.2</b> References</a></li>
</ul></li>
<li class="chapter" data-level="13" data-path="svm.html"><a href="svm.html"><i class="fa fa-check"></i><b>13</b> Support Vector Machine</a><ul>
<li class="chapter" data-level="13.1" data-path="svm.html"><a href="svm.html#support-vecotr-regression"><i class="fa fa-check"></i><b>13.1</b> Support Vector Regression</a><ul>
<li class="chapter" data-level="13.1.1" data-path="svm.html"><a href="svm.html#create-data"><i class="fa fa-check"></i><b>13.1.1</b> Create data</a></li>
<li class="chapter" data-level="13.1.2" data-path="svm.html"><a href="svm.html#tuning-a-svm-model"><i class="fa fa-check"></i><b>13.1.2</b> Tuning a SVM model</a></li>
<li class="chapter" data-level="13.1.3" data-path="svm.html"><a href="svm.html#discussion-on-parameters"><i class="fa fa-check"></i><b>13.1.3</b> Discussion on parameters</a></li>
</ul></li>
<li class="chapter" data-level="13.2" data-path="svm.html"><a href="svm.html#references-8"><i class="fa fa-check"></i><b>13.2</b> References</a></li>
</ul></li>
<li class="chapter" data-level="14" data-path="model-evaluation.html"><a href="model-evaluation.html"><i class="fa fa-check"></i><b>14</b> Model Evaluation</a><ul>
<li class="chapter" data-level="14.1" data-path="model-evaluation.html"><a href="model-evaluation.html#biais-variance-tradeoff"><i class="fa fa-check"></i><b>14.1</b> Bias variance tradeoff</a></li>
<li class="chapter" data-level="14.2" data-path="model-evaluation.html"><a href="model-evaluation.html#bagging"><i class="fa fa-check"></i><b>14.2</b> Bagging</a></li>
<li class="chapter" data-level="14.3" data-path="model-evaluation.html"><a href="model-evaluation.html#crossvalidation"><i class="fa fa-check"></i><b>14.3</b> Cross Validation</a></li>
</ul></li>
<li class="chapter" data-level="15" data-path="case-study-text-classification-spam-and-ham-.html"><a href="case-study-text-classification-spam-and-ham-.html"><i class="fa fa-check"></i><b>15</b> Case Study - Text classification: Spam and Ham.</a></li>
<li class="chapter" data-level="16" data-path="mushroom.html"><a href="mushroom.html"><i class="fa fa-check"></i><b>16</b> Case Study - Mushrooms Classification</a><ul>
<li class="chapter" data-level="16.1" data-path="mushroom.html"><a href="mushroom.html#import-the-data"><i class="fa fa-check"></i><b>16.1</b> Import the data</a></li>
<li class="chapter" data-level="16.2" data-path="mushroom.html"><a href="mushroom.html#tidy-the-data"><i class="fa fa-check"></i><b>16.2</b> Tidy the data</a></li>
<li class="chapter" data-level="16.3" data-path="mushroom.html"><a href="mushroom.html#understand-the-data-1"><i class="fa fa-check"></i><b>16.3</b> Understand the data</a><ul>
<li class="chapter" data-level="16.3.1" data-path="mushroom.html"><a href="mushroom.html#transform-the-data"><i class="fa fa-check"></i><b>16.3.1</b> Transform the data</a></li>
<li class="chapter" data-level="16.3.2" data-path="mushroom.html"><a href="mushroom.html#visualize-the-data"><i class="fa fa-check"></i><b>16.3.2</b> Visualize the data</a></li>
<li class="chapter" data-level="16.3.3" data-path="mushroom.html"><a href="mushroom.html#modeling"><i class="fa fa-check"></i><b>16.3.3</b> Modeling</a></li>
</ul></li>
<li class="chapter" data-level="16.4" data-path="mushroom.html"><a href="mushroom.html#communication"><i class="fa fa-check"></i><b>16.4</b> Communication</a></li>
</ul></li>
<li class="chapter" data-level="17" data-path="case-study-the-adults-dataset-.html"><a href="case-study-the-adults-dataset-.html"><i class="fa fa-check"></i><b>17</b> Case study - The adults dataset.</a><ul>
<li class="chapter" data-level="17.1" data-path="case-study-the-adults-dataset-.html"><a href="case-study-the-adults-dataset-.html#introduction-2"><i class="fa fa-check"></i><b>17.1</b> Introduction</a></li>
<li class="chapter" data-level="17.2" data-path="case-study-the-adults-dataset-.html"><a href="case-study-the-adults-dataset-.html#import-the-data-1"><i class="fa fa-check"></i><b>17.2</b> Import the data</a></li>
<li class="chapter" data-level="17.3" data-path="case-study-the-adults-dataset-.html"><a href="case-study-the-adults-dataset-.html#tidy-the-data-1"><i class="fa fa-check"></i><b>17.3</b> Tidy the data</a></li>
</ul></li>
<li class="chapter" data-level="18" data-path="breastcancer.html"><a href="breastcancer.html"><i class="fa fa-check"></i><b>18</b> Case Study - Wisconsin Breast Cancer</a><ul>
<li class="chapter" data-level="18.1" data-path="breastcancer.html"><a href="breastcancer.html#import-the-data-2"><i class="fa fa-check"></i><b>18.1</b> Import the data</a></li>
<li class="chapter" data-level="18.2" data-path="breastcancer.html"><a href="breastcancer.html#tidy-the-data-2"><i class="fa fa-check"></i><b>18.2</b> Tidy the data</a></li>
<li class="chapter" data-level="18.3" data-path="breastcancer.html"><a href="breastcancer.html#understand-the-data-2"><i class="fa fa-check"></i><b>18.3</b> Understand the data</a><ul>
<li class="chapter" data-level="18.3.1" data-path="breastcancer.html"><a href="breastcancer.html#transform-the-data-1"><i class="fa fa-check"></i><b>18.3.1</b> Transform the data</a></li>
<li class="chapter" data-level="18.3.2" data-path="breastcancer.html"><a href="breastcancer.html#pre-process-the-data"><i class="fa fa-check"></i><b>18.3.2</b> Pre-process the data</a></li>
<li class="chapter" data-level="18.3.3" data-path="breastcancer.html"><a href="breastcancer.html#model-the-data-1"><i class="fa fa-check"></i><b>18.3.3</b> Model the data</a></li>
</ul></li>
<li class="chapter" data-level="18.4" data-path="breastcancer.html"><a href="breastcancer.html#references-9"><i class="fa fa-check"></i><b>18.4</b> References</a></li>
</ul></li>
<li class="chapter" data-level="19" data-path="final-words.html"><a href="final-words.html"><i class="fa fa-check"></i><b>19</b> Final Words</a></li>
<li class="chapter" data-level="" data-path="references-10.html"><a href="references-10.html"><i class="fa fa-check"></i>References</a></li>
</ul>

      </nav>
    </div>

    <div class="book-body">
      <div class="body-inner">
        <div class="book-header" role="navigation">
          <h1>
            <i class="fa fa-circle-o-notch fa-spin"></i><a href="./">Machine Learning with R</a>
          </h1>
        </div>

        <div class="page-wrapper" tabindex="-1" role="main">
          <div class="page-inner">

            <section class="normal" id="section-">
<div id="case-study---text-classification-spam-and-ham." class="section level1">
<h1><span class="header-section-number">Chapter 15</span> Case Study - Text classification: Spam and Ham.</h1>
<p>This chapter has been inspired by the Coursera course on <a href="https://www.coursera.org/learn/ml-foundations/">Machine Learning Foundations: A Case Study Approach</a> given by Carlos Guestrin and by Emily Fox from the University of Washington. This course is part of the <a href="https://www.coursera.org/specializations/machine-learning">Machine Learning Specialization</a>.</p>
<p>The task was to apply classification on an Amazon review dataset. Given a review, we create a model that will decide if the review is <em>positive</em> (associated with a rating of 4 or 5) or <em>negative</em> (associated with a rating of 1 or 2). This is a supervised learning task as the rating associated with each review is used as the response variable.</p>
<p>What we have done here is to create a subset of the dataset with only one product: the <em>Philips Avent Bottle</em>.</p>
<p>As usual, let’s first load the libraries</p>
<p>Let’s have a quick look at our data.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">df &lt;-<span class="st"> </span><span class="kw">read_csv</span>(<span class="st">&quot;dataset/toyamazonPhilips.csv&quot;</span>)
df &lt;-<span class="st"> </span><span class="kw">as_tibble</span>(df)
df2 &lt;-<span class="st"> </span>df[,<span class="dv">2</span><span class="op">:</span><span class="dv">3</span>]

<span class="co">#Let&#39;s have a quick look at the reviews</span>
<span class="kw">library</span>(pander)
<span class="kw">pandoc.table</span>(df2[<span class="dv">1</span><span class="op">:</span><span class="dv">3</span>,], 
             <span class="dt">justify =</span> <span class="st">&#39;left&#39;</span>, <span class="dt">style =</span> <span class="st">&#39;grid&#39;</span>)</code></pre></div>
<pre><code>## 
## 
## +--------------------------------+--------+
## | review                         | rating |
## +================================+========+
## | I was recommended to use these | 5      |
## | bottles by a girlfriend who    |        |
## | had used them with her baby    |        |
## | and sworn by them. I started   |        |
## | with a set of 4oz bottles with |        |
## | newborn nipples (#1) and have  |        |
## | graduated now to the 9oz       |        |
## | bottles with the next size up  |        |
## | nipples (#2). I will simply    |        |
## | buy new nipples when she&#39;s     |        |
## | ready for the next step. I am  |        |
## | absolutely thrilled with these |        |
## | bottles.  I do have an         |        |
## | occasional leak but it is      |        |
## | always my own fault for not    |        |
## | screwing the lid on tight      |        |
## | enough!I have not had any      |        |
## | problems with leaking when I   |        |
## | have them put together         |        |
## | correctly. They were           |        |
## | especially good for going back |        |
## | and forth between breast       |        |
## | feeding and bottle feeding. My |        |
## | now 4mos old baby never had    |        |
## | any difficulties feeding       |        |
## | either way and transitioned    |        |
## | very smoothly. One reader      |        |
## | referred to air being pushed   |        |
## | into the bottle during feeding |        |
## | and causing her baby to        |        |
## | swallow air. The air wooshing  |        |
## | in is a good thing! It         |        |
## | prevents a vacuum from         |        |
## | developing inside the bottle   |        |
## | and allows the baby to         |        |
## | continuously feed without      |        |
## | having to stop to relieve the  |        |
## | pressure inside the bottle.    |        |
## | The only time I ever had any   |        |
## | problems with mine swallowing  |        |
## | air was when she had pretty    |        |
## | much outgrown the newborn      |        |
## | nipples and would try to suck  |        |
## | too hard when she was very     |        |
## | hungry. This was immediately   |        |
## | corrected by buying the next   |        |
## | size up. They&#39;re also very     |        |
## | easy to clean and can be       |        |
## | effectively washed in the      |        |
## | dishwasher because of their    |        |
## | wide neck. Regular bottles     |        |
## | cannot. I, out of necessity,   |        |
## | used another brand of bottle   |        |
## | with her today at a relative&#39;s |        |
## | house and was reminded why I   |        |
## | like Avent bottles so much!    |        |
## +--------------------------------+--------+
## | If I had not been given a ton  | 2      |
## | of Avent bottles, I would have |        |
## | chosen some other system.  The |        |
## | leaking is terrible!!!  You    |        |
## | have to buy the disks          |        |
## | separately, you should get     |        |
## | them for free because they are |        |
## | absolutely essential.  The     |        |
## | only way to mix formula in the |        |
## | bottle or transport liquid is  |        |
## | to use the disks in the ring,  |        |
## | then switch to the nipple when |        |
## | you are ready to feed.  The    |        |
## | only reason I give it a two is |        |
## | because I do like that you can |        |
## | pump directly into the bottle  |        |
## | with the ISIS breast pump.     |        |
## | And, I like the sippy cups.    |        |
## +--------------------------------+--------+
## | Leaks! Especially difficult to | 1      |
## | get a tight seal if you use    |        |
## | one hand (while holding baby). |        |
## | A much better design is the    |        |
## | Breast Flow Learning Curve     |        |
## | First Years bottles. Instead   |        |
## | buy The First Years 3pk.       |        |
## | Breastflow 5oz. Bottles These  |        |
## | worked much better for me.     |        |
## +--------------------------------+--------+</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co">#Let&#39;s see the table of ratings.  </span>
<span class="kw">table</span>(df2<span class="op">$</span>rating)</code></pre></div>
<pre><code>## 
##  1  2  3  4  5 
## 45 33 17 30 66</code></pre>
<p>Interestingly, the ratings of the Avent bottles are concentrated at the extremes. It might be that people only write reviews if they are super excited or very frustrated with a product. Because we want this to be a binary classification exercise, we’ll do some transformation on these ratings. First we combine the positive reviews together (the 4 and 5 ratings) and the negative reviews together (the 1 and 2 ratings). Then we take out the neutral reviews (the 3 ratings).</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># We&#39;ll put a 1 for great reviews (4 or 5) or a 0 for bad reviews (1 or 2)</span>
<span class="co"># We remove all the reviews that have a rating of 3</span>
df2 &lt;-<span class="st"> </span>df <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">filter</span>(rating <span class="op">!=</span><span class="st"> </span><span class="dv">3</span>) <span class="op">%&gt;%</span><span class="st"> </span>
<span class="st">                    </span><span class="kw">mutate</span>(<span class="dt">rating_new =</span> <span class="kw">if_else</span>(rating <span class="op">&gt;=</span><span class="st"> </span><span class="dv">4</span>, <span class="dv">1</span>, <span class="dv">0</span>))
df_training &lt;-<span class="st">  </span>df2[<span class="dv">1</span><span class="op">:</span><span class="dv">150</span>, ]</code></pre></div>
<p>Now we create our corpus, tokenize it into a document-term matrix, and then convert the result back to a data frame.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(tm)
corpus_toy &lt;-<span class="st"> </span><span class="kw">Corpus</span>(<span class="kw">VectorSource</span>(df_training<span class="op">$</span>review))
tdm_toy &lt;-<span class="st"> </span><span class="kw">DocumentTermMatrix</span>(corpus_toy, <span class="kw">list</span>(<span class="dt">removePunctuation =</span> <span class="ot">TRUE</span>, 
                                               <span class="dt">removeNumbers =</span> <span class="ot">TRUE</span>))

training_set_toy &lt;-<span class="st"> </span><span class="kw">as.matrix</span>(tdm_toy)

training_set_toy &lt;-<span class="st"> </span><span class="kw">cbind</span>(training_set_toy, df_training<span class="op">$</span>rating_new)

<span class="kw">colnames</span>(training_set_toy)[<span class="kw">ncol</span>(training_set_toy)] &lt;-<span class="st"> &quot;y&quot;</span>

training_set_toy &lt;-<span class="st"> </span><span class="kw">as.data.frame</span>(training_set_toy)
training_set_toy<span class="op">$</span>y &lt;-<span class="st"> </span><span class="kw">as.factor</span>(training_set_toy<span class="op">$</span>y)</code></pre></div>
<p>Now that we have our data frame ready, let’s create our model using the <code>svmLinear3</code> method.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">review_toy_model &lt;-<span class="st"> </span>caret<span class="op">::</span><span class="kw">train</span>(y <span class="op">~</span>., <span class="dt">data =</span> training_set_toy, <span class="dt">method =</span> <span class="st">&#39;svmLinear3&#39;</span>)</code></pre></div>
<p>Now we try our model on new review data</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">test_review_data &lt;-<span class="st"> </span>df2[<span class="dv">151</span><span class="op">:</span><span class="dv">174</span>, ]

test_corpus &lt;-<span class="st"> </span><span class="kw">Corpus</span>(<span class="kw">VectorSource</span>(test_review_data<span class="op">$</span>review))
test_tdm &lt;-<span class="st"> </span><span class="kw">DocumentTermMatrix</span>(test_corpus, <span class="dt">control=</span><span class="kw">list</span>(<span class="dt">dictionary =</span> <span class="kw">Terms</span>(tdm_toy)))
test_tdm &lt;-<span class="st"> </span><span class="kw">as.matrix</span>(test_tdm)

<span class="co">#Build the prediction  </span>
model_toy_result &lt;-<span class="st"> </span><span class="kw">predict</span>(review_toy_model, <span class="dt">newdata =</span> test_tdm)

check_accuracy &lt;-<span class="st"> </span><span class="kw">as.data.frame</span>(<span class="kw">cbind</span>(<span class="dt">prediction =</span> model_toy_result, 
                                      <span class="dt">rating =</span> test_review_data<span class="op">$</span>rating_new))

check_accuracy &lt;-<span class="st"> </span>check_accuracy <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">mutate</span>(<span class="dt">prediction =</span> <span class="kw">as.integer</span>(prediction) <span class="op">-</span><span class="st"> </span><span class="dv">1</span>)

check_accuracy<span class="op">$</span>accuracy &lt;-<span class="st"> </span><span class="kw">if_else</span>(check_accuracy<span class="op">$</span>prediction <span class="op">==</span><span class="st"> </span>check_accuracy<span class="op">$</span>rating, <span class="dv">1</span>, <span class="dv">0</span>)
<span class="kw">round</span>(<span class="kw">prop.table</span>(<span class="kw">table</span>(check_accuracy<span class="op">$</span>accuracy)), <span class="dv">3</span>)</code></pre></div>
<pre><code>## 
##     0     1 
## 0.208 0.792</code></pre>
<p>Another way to deal with text classification is to use the <code>RTextTools</code> library.<br />
We can use the same data frame that we used in our previous method. As we did before with <code>DocumentTermMatrix</code>, we start by creating a matrix of terms.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(RTextTools)
product_review_matrix &lt;-<span class="st"> </span><span class="kw">create_matrix</span>(df2[,<span class="dv">2</span>], <span class="dt">language =</span> <span class="st">&quot;English&quot;</span>, 
                                       <span class="dt">removeNumbers =</span> <span class="ot">TRUE</span>, 
                                       <span class="dt">removePunctuation =</span> <span class="ot">TRUE</span>, 
                                       <span class="dt">removeStopwords =</span> <span class="ot">FALSE</span>, <span class="dt">stemWords =</span> <span class="ot">FALSE</span>)

product_review_container &lt;-<span class="st"> </span><span class="kw">create_container</span>(product_review_matrix,
                                             df2<span class="op">$</span>rating_new, 
                                             <span class="dt">trainSize =</span> <span class="dv">1</span><span class="op">:</span><span class="dv">150</span>, <span class="dt">testSize =</span> <span class="dv">151</span><span class="op">:</span><span class="dv">174</span>, 
                                             <span class="dt">virgin =</span> <span class="ot">FALSE</span>)

product_review_model &lt;-<span class="st"> </span><span class="kw">train_model</span>(product_review_container, <span class="dt">algorithm =</span> <span class="st">&quot;SVM&quot;</span>)

product_review_model_result &lt;-<span class="st"> </span><span class="kw">classify_model</span>(product_review_container, product_review_model)
x &lt;-<span class="st"> </span><span class="kw">as.data.frame</span>(<span class="kw">cbind</span>(df2<span class="op">$</span>rating_new[<span class="dv">151</span><span class="op">:</span><span class="dv">174</span>], product_review_model_result<span class="op">$</span>SVM_LABEL))
<span class="kw">colnames</span>(x) &lt;-<span class="st"> </span><span class="kw">c</span>(<span class="st">&quot;actual_ratings&quot;</span>, <span class="st">&quot;predicted_ratings&quot;</span>)
x &lt;-<span class="st"> </span>x <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">mutate</span>(<span class="dt">predicted_ratings =</span> predicted_ratings <span class="op">-</span><span class="st"> </span><span class="dv">1</span>)
<span class="kw">round</span>(<span class="kw">prop.table</span>(<span class="kw">table</span>(x<span class="op">$</span>actual_ratings <span class="op">==</span><span class="st"> </span>x<span class="op">$</span>predicted_ratings)), <span class="dv">3</span>)</code></pre></div>
<pre><code>## 
## FALSE  TRUE 
##  0.25  0.75</code></pre>

</div>
            </section>

          </div>
        </div>
      </div>
<a href="model-evaluation.html" class="navigation navigation-prev " aria-label="Previous page"><i class="fa fa-angle-left"></i></a>
<a href="mushroom.html" class="navigation navigation-next " aria-label="Next page"><i class="fa fa-angle-right"></i></a>
    </div>
  </div>
<script src="libs/gitbook-2.6.7/js/app.min.js"></script>
<script src="libs/gitbook-2.6.7/js/lunr.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-search.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-sharing.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-fontsettings.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-bookdown.js"></script>
<script src="libs/gitbook-2.6.7/js/jquery.highlight.js"></script>
<script>
gitbook.require(["gitbook"], function(gitbook) {
  // Share-button switches: only facebook and twitter are turned on here;
  // "all" lists the services passed along as the complete set.
  var sharingOptions = {
    github: false,
    facebook: true,
    twitter: true,
    google: false,
    linkedin: false,
    weibo: false,
    instapaper: false,
    vk: false,
    all: ["facebook", "google", "twitter", "linkedin", "weibo", "instapaper"]
  };

  // Initial reader font settings.
  var fontOptions = {
    theme: "white",
    family: "sans",
    size: 2
  };

  // "Suggest edit" link pointing at the chapter's R Markdown source on GitHub.
  var editOptions = {
    link: "https://github.com/fderyckel/machinelearningwithr/edit/master/23-spam_and_ham.Rmd",
    text: "Suggest edit to this page"
  };

  // No history link is configured for this page.
  var historyOptions = {
    link: null,
    text: null
  };

  // Table-of-contents setting.
  var tocOptions = {
    collapse: "section"
  };

  // Start GitBook with the assembled configuration (no download entry is set).
  gitbook.start({
    sharing: sharingOptions,
    fontsettings: fontOptions,
    edit: editOptions,
    history: historyOptions,
    download: null,
    toc: tocOptions
  });
});
</script>

<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
  (function () {
    // Location of the MathJax script. The value is empty in this page
    // (presumably filled in by the page generator when a custom MathJax
    // build is requested), so the default hosted build below is used.
    var src = "";
    if (src === "" || src === "true") src = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-MML-AM_CHTML";

    // Keep the explicit https: scheme instead of turning the URL into a
    // protocol-relative one: a protocol-relative URL would fetch MathJax
    // over plain HTTP whenever the page itself is served over HTTP, while
    // an https: URL works from http:, https: and file: pages alike.
    var script = document.createElement("script");
    script.src = src;
    document.head.appendChild(script);
  })();
</script>
</body>

</html>