Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

sms-spam submission #5

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
270 changes: 270 additions & 0 deletions sms_spam.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,270 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n",
"from sklearn.cross_validation import train_test_split\n",
"from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB\n",
"from sklearn.pipeline import Pipeline\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ham\tGo until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...\r\n",
"ham\tOk lar... Joking wif u oni...\r\n",
"spam\tFree entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's\r\n",
"ham\tU dun say so early hor... U c already then say...\r\n",
"ham\tNah I don't think he goes to usf, he lives around here though\r\n",
"spam\tFreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv\r\n",
"ham\tEven my brother is not like to speak with me. They treat me like aids patent.\r\n",
"ham\tAs per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune\r\n",
"spam\tWINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.\r\n",
"spam\tHad your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030\r\n"
]
}
],
"source": [
"!head ./smsspamcollection/SMSSpamCollection"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ham_spam</th>\n",
" <th>text</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ham</td>\n",
" <td>Go until jurong point, crazy.. Available only ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>ham</td>\n",
" <td>Ok lar... Joking wif u oni...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>spam</td>\n",
" <td>Free entry in 2 a wkly comp to win FA Cup fina...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>ham</td>\n",
" <td>U dun say so early hor... U c already then say...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>ham</td>\n",
" <td>Nah I don't think he goes to usf, he lives aro...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ham_spam text\n",
"0 ham Go until jurong point, crazy.. Available only ...\n",
"1 ham Ok lar... Joking wif u oni...\n",
"2 spam Free entry in 2 a wkly comp to win FA Cup fina...\n",
"3 ham U dun say so early hor... U c already then say...\n",
"4 ham Nah I don't think he goes to usf, he lives aro..."
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"columns = ['ham_spam', 'text']\n",
"df = pd.read_csv('./smsspamcollection/SMSSpamCollection', sep='\\t', names=columns)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def test_classifier(pipe, *split_args):\n",
" #classifier = classifierType()\n",
" pipe.fit(split_args[0], split_args[2])\n",
"# predicted = classifier.predict(X_test)\n",
" train_score = pipe.score(split_args[0], split_args[2])\n",
" test_score = pipe.score(split_args[1], split_args[3])\n",
" print('Train score: {}, Test score: {}'.format(train_score, test_score))\n",
" return pipe"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"X = df.text\n",
"y = df.ham_spam\n",
"args = train_test_split(X, y, test_size=0.4, random_state=0) # X_train, X_test, y_train, y_test\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train score: 0.9934190846545019, Test score: 0.9856437864513234\n"
]
}
],
"source": [
"spam_pipe = Pipeline([('bag_of_words', CountVectorizer()),\n",
"# ('tfidf', TfidfTransformer()),\n",
" ('bayes', MultinomialNB())])\n",
"spam_pipe\n",
"classifier = test_classifier(spam_pipe, *args)\n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train score: 0.9650014956625785, Test score: 0.9587258860475549\n",
"Not as good with Tfidf in the pipeline\n"
]
}
],
"source": [
"spam_pipe = Pipeline([('bag_of_words', CountVectorizer()),\n",
" ('tfidf', TfidfTransformer()),\n",
" ('bayes', MultinomialNB())])\n",
"spam_pipe\n",
"classifier = test_classifier(spam_pipe, *args)\n",
"print('Not as good with Tfidf in the pipeline')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}