From 965a22241a1c476dc18729f3f833606c0ba98866 Mon Sep 17 00:00:00 2001 From: Susan Li Date: Sun, 21 Oct 2018 21:04:27 -0400 Subject: [PATCH] Update notebook --- BOW_TFIDF_Xgboost_update.ipynb | 647 +++++++++++++++++++++++++++++++++ 1 file changed, 647 insertions(+) create mode 100644 BOW_TFIDF_Xgboost_update.ipynb diff --git a/BOW_TFIDF_Xgboost_update.ipynb b/BOW_TFIDF_Xgboost_update.ipynb new file mode 100644 index 0000000..8c5d2dc --- /dev/null +++ b/BOW_TFIDF_Xgboost_update.ipynb @@ -0,0 +1,647 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n", + "from sklearn import linear_model\n", + "import numpy as np\n", + "from sklearn.model_selection import train_test_split\n", + "import scipy\n", + "from sklearn.metrics import log_loss\n", + "import xgboost as xgb\n", + "from sklearn.metrics import accuracy_score\n", + "from sklearn.metrics import roc_auc_score\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idqid1qid2question1question2is_duplicate
0012What is the step by step guide to invest in sh...What is the step by step guide to invest in sh...0
1134What is the story of Kohinoor (Koh-i-Noor) Dia...What would happen if the Indian government sto...0
2256How can I increase the speed of my internet co...How can Internet speed be increased by hacking...0
3378Why am I mentally very lonely? How can I solve...Find the remainder when [math]23^{24}[/math] i...0
44910Which one dissolve in water quikly sugar, salt...Which fish would survive in salt water?0
\n", + "
" + ], + "text/plain": [ + " id qid1 qid2 question1 \\\n", + "0 0 1 2 What is the step by step guide to invest in sh... \n", + "1 1 3 4 What is the story of Kohinoor (Koh-i-Noor) Dia... \n", + "2 2 5 6 How can I increase the speed of my internet co... \n", + "3 3 7 8 Why am I mentally very lonely? How can I solve... \n", + "4 4 9 10 Which one dissolve in water quikly sugar, salt... \n", + "\n", + " question2 is_duplicate \n", + "0 What is the step by step guide to invest in sh... 0 \n", + "1 What would happen if the Indian government sto... 0 \n", + "2 How can Internet speed be increased by hacking... 0 \n", + "3 Find the remainder when [math]23^{24}[/math] i... 0 \n", + "4 Which fish would survive in salt water? 0 " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv('quora_train.csv')\n", + "df = df.dropna(how=\"any\").reset_index(drop=True)\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAY0AAAEHCAYAAABSjBpvAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAEjNJREFUeJzt3X+s3XV9x/Hny1acDhWUSliLK9EuE11EbYDofjBZoLBlxQ0y2CIdNqszkGiim2iygD9IMIuasSkLhEoxTmSoo3HV2iHOGRF6kQpU1N4gSi2BahFxRB343h/nc/VwOb3303srp3ifj+Sb8z3vz4/v5yRtX/n+OKepKiRJ6vGUcS9AkvTkYWhIkroZGpKkboaGJKmboSFJ6mZoSJK6GRqSpG6GhiSpm6EhSeq2eNwL2N8OO+ywWr58+biXIUlPKrfccsv3qmrJbP1+5UJj+fLlTExMjHsZkvSkkuTbPf28PCVJ6mZoSJK6GRqSpG6GhiSpm6EhSepmaEiSuhkakqRuhoYkqduv3Jf7niyWn/+f417Cr5S7L/7jcS9BWhBmPdNIcmSSG5LcmWR7kje2+oVJvptkW9tOHRrztiSTSb6R5OSh+qpWm0xy/lD9qCQ3JdmR5GNJDmr1p7X3k619+f788JKkfdNzeeoR4M1V9SLgeODcJEe3tvdX1TFt2wTQ2s4EXgysAj6YZFGSRcAHgFOAo4GzhuZ5T5trBfAAsLbV1wIPVNULgfe3fpKkMZk1NKrq3qr6Stt/CLgTWDrDkNXA1VX1k6r6FjAJHNu2yaq6q6p+ClwNrE4S4NXAtW38BuC0obk2tP1rgRNbf0nSGOzTjfB2eehlwE2tdF6S25KsT3Joqy0F7hkatrPV9lZ/LvCDqnpkWv0xc7X2B1t/SdIYdIdGkoOBjwNvqqofApcCLwCOAe4F3jvVdcTwmkN9prmmr21dkokkE7t3757xc0iS5q4rNJI8lUFgfKSqPgFQVfdV1aNV9TPgcgaXn2BwpnDk0PBlwK4Z6t8DDkmyeFr9MXO19mcDe6avr6ouq6qVVbVyyZJZfw5ekjRHPU9PBbgCuLOq3jdUP2Ko22uAO9r+RuDM9uTTUcAK4GZgK7CiPSl1EIOb5RurqoAbgNPb+DXAdUNzrWn7pwOfa/0lSWPQ8z2NVwGvBW5Psq3V3s7g6adjGFwuuht4PUBVbU9yDfA1Bk9enVtVjwIkOQ/YDCwC1lfV9jbfW4Grk7wbuJVBSNFeP5xkksEZxpnz+KySpHmaNTSq6ouMvrewaYYxFwEXjahvGjWuqu7iF5e3hus/Bs6YbY2SpCeGPyMiSepmaEiSuhkakqRuhoYkqZuhIUnqZmhIkroZGpKkboaGJKmboSFJ6mZoSJK6GRqSpG6GhiSpm6EhSepmaEiSuhkakqRuhoYkqZuhIUnqZmhIkroZGpKkboaGJKmboSFJ6mZoSJK6GRqSpG6GhiSpm6EhSepmaEiSuhkakqRuhoYkqZuhIUnqZmhIkroZGpKkboaGJKnbrKGR5MgkNyS5M8n2JG9s9eck2ZJkR3s9tNWT5JIkk0luS/LyobnWtP47kqwZqr8iye1tzCVJMtMxJEnj0XOm8Qjw5qp6EXA8cG6So4HzgeuragVwfXsPcAqwom3rgEthEADABcBxwLHABUMhcGnrOzVuVavv7RiSpDGYNTSq6t6q+krbfwi4E1gKrAY2tG4bgNPa/mrgqhr4MnBIkiOAk4EtVbWnqh4AtgCrWtuzqurGqirgqmlzjTqGJGkM9umeRpLlwMuAm4DDq+peGAQL8LzWbSlwz9Cwna02U33niDozHGP6utYlmUgysXv37n35SJKkfdAdGkkOBj4OvKmqfjhT1xG1mkO9W1VdVlUrq2rlkiVL9mWoJGkfdIVGkqcyCIyPVNUnWvm+dmmJ9np/q+8EjhwavgzYNUt92Yj6TMeQJI1Bz9NTAa4A7qyq9w01bQSmnoBaA1w3VD+7PUV1PPBgu7S0GTgpyaHtBvhJwObW9lCS49uxzp4216hjSJLGYHFHn1cBrwVuT7Kt1d4OXAxck2Qt8B3gjNa2CTgVmAQeBs4BqKo9Sd4FbG393llVe9r+G4ArgacDn24bMxxDkjQGs4ZGVX2R0fcdAE4c0b+Ac/cy13pg/Yj6BPCSEfXvjzqGJGk8/Ea4JKmboSFJ6mZoSJK6GRqSpG6GhiSpm6EhSepmaEiSuhkakqRuhoYkqZuhIUnqZmhIkroZGpKkboaGJKmboSFJ6mZoSJK6GRqSpG6GhiSpm6EhSepmaEiSuhkakqRuhoYkqZuhIUnqZmhIkroZGpKkboaGJKmboSFJ6mZoSJK6GRqSpG6GhiSpm6EhSepmaEiSus0aGknWJ7k/yR1DtQuTfDfJtradOtT2tiSTSb6R5OSh+qpWm0xy/lD9qCQ3JdmR5GNJDmr1p7X3k619+f760JKkuek507gSWDWi/v6qOqZtmwCSHA2cCby4jflgkkVJFgEfAE4BjgbOan0B3tPmWgE8AKxt9bXAA1X1QuD9rZ8kaYxmDY2q+gKwp3O+1cDVVfWTqvoWMAkc27bJqrqrqn4KXA2sThLg1cC1bfwG4LShuTa0/WuBE1t/SdKYzOeexnlJbmuXrw5ttaXAPUN9drba3urPBX5QVY9Mqz9mrtb+YOsvSRqTxXMcdynwLqDa63uB1wGjzgSK0eFUM/RnlrbHSLIOWAfw/Oc/f6Z1S5rNhc8e9wp+tVz44LhXsF/N6Uyjqu6rqker6mfA5QwuP8HgTOHIoa7LgF0z1L8HHJJk8bT6Y+Zq7c9mL5fJquqyqlpZVSuXLFkyl48kSeowp9BIcsTQ29cAU09WbQTObE8+HQWsAG4GtgIr2pNSBzG4Wb6xqgq4ATi9jV8DXDc015q2fzrwudZfkjQms16eSvJR4ATgsCQ7gQuAE5Icw+By0d3A6wGqanuSa4CvAY8A51bVo22e84DNwCJgfVVtb4d4K3B1kncDtwJXtPoVwIeTTDI4wzhz3p9WkjQvs4ZGVZ01onzFiNpU/4uAi0bUNwGbRtTv4heXt4brPwbOmG19kqQnjt8IlyR1MzQkSd0MDUlSN0NDktTN0JAkdTM0JEndDA1JUjdDQ5LUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0MDUlSN0NDktTN0JAkdTM0JEndDA1JUjdDQ5LUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0MDUlSN0NDktTN0JAkdTM0JEndDA1JUjdDQ5LUbdbQSLI+yf1J7hiqPSfJliQ72uuhrZ4klySZTHJbkpcPjVnT+u9Ismao/ookt7cxlyTJTMeQJI1Pz5nGlcCqabXzgeuragVwfXsPcAqwom3rgEthEADABcBxwLHABUMhcGnrOzVu1SzHkCSNyayhUVVfAPZMK68GNrT9DcBpQ/WrauDLwCFJjgBOBrZU1Z6qegDYAqxqbc+qqhurqoCrps016hiSpDGZ6z2Nw6vqXoD2+rxWXwrcM9RvZ6vNVN85oj7TMR4nybokE0kmdu/ePcePJEmazf6+EZ4RtZpDfZ9U1WVVtbKqVi5ZsmRfh0uSOs01NO5rl5Zor/e3+k7gyKF+y4Bds9SXjajPdAxJ0pjMNTQ2AlNPQK0Brhuqn92eojoeeLBdWtoMnJTk0HYD/CRgc2t7KMnx7amps6fNNeoYkqQxWTxbhyQfBU4ADkuyk8FTUBcD1yRZC3wHOKN13wScCkwCDwPnAFTVniTvAra2fu+sqqmb629g8ITW04FPt40ZjiFJGpNZQ6OqztpL04kj+hZw7l7mWQ+sH1GfAF4yov79UceQJI2P3wiXJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0MDUlSN0NDktTN0JAkdTM0JEndDA1JUjdDQ5LUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0MDUlSN0NDktTN0JAkdTM0JEndDA1JUjdDQ5LUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0MDUlSN0NDktRtXqGR5O4ktyfZlmSi1Z6TZEuSHe310FZPkkuSTCa5LcnLh+ZZ0/rvSLJmqP6KNv9kG5v5rFeSND/740zjD6vqmKpa2d6fD1xfVSuA69t7gFOAFW1bB1wKg5ABLgCOA44FLpgKmtZn3dC4VfthvZKkOfplXJ5aDWxo+xuA04bqV9XAl4FDkhwBnAxsqao9VfUAsAVY1dqeVVU3VlUBVw3NJUkag/mGRgGfTXJLknWtdnhV3QvQXp/X6kuBe4bG7my1meo7R9QfJ8m6JBNJJnbv3j3PjyRJ2pvF8xz/qqraleR5wJYkX5+h76j7ETWH+uOLVZcBlwGsXLlyZB9J0vzN60yjqna11/uBTzK4J3Ffu7REe72/dd8JHDk0fBmwa5b6shF1SdKYzDk0kvx6kmdO7QMnAXcAG4GpJ6DWANe1/Y3A2e0pquOBB9vlq83ASUkObTfATwI2t7aHkhzfnpo6e2guSdIYzOfy1OHAJ9tTsIuBf6uqzyTZClyTZC3wHeCM1n8TcCowCTwMnANQVXuSvAvY2vq9s6r2tP03AFcCTwc+3TZJ0pjMOTSq6i7gpSPq3wdOHFEv4Ny9zLUeWD+iPgG8ZK5rlCTtX34jXJLUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0MDUlSN0NDktTN0JAkdTM0JEndDA1JUjdDQ5LUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0MDUlSN0NDktTN0JAkdTM0JEndDA1JUjdDQ5LUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0O+NBIsirJN5JMJjl/3OuRpIXsgA6NJIuADwCnAEcDZyU5eryrkqSF64AODeBYYLKq7qqqnwJXA6vHvCZJWrAO9NBYCtwz9H5nq0mSxmDxuBcwi4yo1eM6JeuAde3tj5J845e6qoXlMOB7417EbPKeca9AY/Ck+LPJO0b9M3ZA+s2eTgd6aOwEjhx6vwzYNb1TVV0GXPZELWohSTJRVSvHvQ5pOv9sjseBfnlqK7AiyVFJDgLOBDaOeU2StGAd0GcaVfVIkvOAzcAiYH1VbR/zsiRpwTqgQwOgqjYBm8a9jgXMy346UPlncwxS9bj7ypIkjXSg39OQJB1ADA1JUrcD/p6GnjhJfpvBN+6XMvg+zC5gY1XdOdaFSTpgeKYhAJK8lcHPtAS4mcHjzgE+6g9FSprijXABkOSbwIur6v+m1Q8CtlfVivGsTJpZknOq6kPjXsdC4ZmGpvwM+I0R9SNam3Sgese4F7CQeE9DU94EXJ9kB7/4kcjnAy8EzhvbqiQgyW17awIOfyLXstB5eUo/l+QpDH6OfimDv4w7ga1V9ehYF6YFL8l9wMnAA9ObgC9V1aizZP0SeKahn6uqnwFfHvc6pBE+BRxcVdumNyT5/BO/nIXLMw1JUjdvhEuSuhkakqRuhoYWrCRfmuf4v07yL/MYf3eSw+azliSnJTl6rmuQ9pWhoQWrql457jVMmcdaTgMMDT1hDA0tWEl+1F6PSPKFJNuS3JHk92YYc06Sbyb5b+BVQ/Urk5w+Yu4T2tyfTPK1JP/aHm0euZa2//dJbk/y1SQXt9rfJNnaah9P8owkrwT+FPjHtvYXtO0zSW5J8j/t98Sk/cZHbiX4S2BzVV2UZBHwjFGdkhzB4NvHrwAeBG4Abu2Y/1gGZwPfBj4D/Blw7V6OcQqDs4fjqurhJM9pTZ+oqstbn3cDa6vqn5NsBD5VVde2tuuBv62qHUmOAz4IvLpjjVIXQ0Ma/Djj+iRPBf5j1HcBmuOAz1fVboAkHwN+q2P+m6vqrjbmo8DvspfQAP4I+FBVPQxQVXta/SUtLA4BDmbwXyA/RpKDgVcC/55kqvy0jvVJ3bw8pQWvqr4A/D7wXeDDSc6eqfte6o/Q/j5l8C/2QTOMmenLUdlL+5XAeVX1OwzOdn5tRJ+nAD+oqmOGthfNcCxpnxkaWvCS/CZwf7v8cwXw8r10vQk4Iclz21nJGUNtdzO4bAWD/5PkqUNtxyY5qt3L+AvgizMs57PA65I8o61t6vLUM4F723H/aqj/Q62Nqvoh8K0kZ7SxSfLSGY4l7TNDQ4ITgG1JbgX+HPinUZ2q6l7gQuBG4L+Arww1Xw78QZKbGVzG+t+hthuBi4E7gG8Bn9zbQqrqM8BGYCLJNuAtrekfGITWFuDrQ0OuBv4uya1JXsAgUNYm+SqwnUGASfuNPyMi/RIlOQF4S1X9ybjXIu0PnmlIkrp5piGNkOQmHv/k0Wur6vZxrEc6UBgakqRuXp6SJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1+38qOCjM258g7gAAAABJRU5ErkJggg==\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df.groupby(\"is_duplicate\")['id'].count().plot.bar()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "df.drop(['id', 'qid1', 'qid2'], axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "What is the step by step guide to invest in share market in india?\n", + "What is the step by step guide to invest in share market?\n", + "\n", + "What is the story of Kohinoor (Koh-i-Noor) Diamond?\n", + "What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?\n", + "\n", + "How can I increase the speed of my internet connection while using a VPN?\n", + "How can Internet speed be increased by hacking through DNS?\n", + "\n", + "Why am I mentally very lonely? How can I solve it?\n", + "Find the remainder when [math]23^{24}[/math] is divided by 24,23?\n", + "\n", + "Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?\n", + "Which fish would survive in salt water?\n", + "\n", + "Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?\n", + "I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?\n", + "\n", + "Should I buy tiago?\n", + "What keeps childern active and far from phone and video games?\n", + "\n", + "How can I be a good geologist?\n", + "What should I do to be a great geologist?\n", + "\n", + "When do you use シ instead of し?\n", + "When do you use \"&\" instead of \"and\"?\n", + "\n", + "Motorola (company): Can I hack my Charter Motorolla DCX3400?\n", + "How do I hack Motorola DCX3400 for free internet?\n", + "\n" + ] + } + ], + "source": [ + "a = 0 \n", + "for i in range(a,a+10):\n", + " print(df.question1[i])\n", + " print(df.question2[i])\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "SPECIAL_TOKENS = {\n", + " 'quoted': 'quoted_item',\n", + " 'non-ascii': 'non_ascii_word',\n", + " 'undefined': 'something'\n", + "}\n", + "\n", + "def clean(text, stem_words=True):\n", + " import re\n", + " from string import punctuation\n", + " from nltk.stem import SnowballStemmer\n", + " from nltk.corpus import stopwords\n", + " \n", + " def pad_str(s):\n", + " return ' '+s+' '\n", + " \n", + " if pd.isnull(text):\n", + " return ''\n", + "\n", + "# stops = set(stopwords.words(\"english\"))\n", + " # Clean the text, with the option to stem words.\n", + " \n", + " # Empty question\n", + " \n", + " if type(text) != str or text=='':\n", + " return ''\n", + "\n", + " # Clean the text\n", + " text = re.sub(\"\\'s\", \" \", text) # we have cases like \"Sam is\" or \"Sam's\" (i.e. his) these two cases aren't separable, I choose to compromise are kill \"'s\" directly\n", + " text = re.sub(\" whats \", \" what is \", text, flags=re.IGNORECASE)\n", + " text = re.sub(\"\\'ve\", \" have \", text)\n", + " text = re.sub(\"can't\", \"can not\", text)\n", + " text = re.sub(\"n't\", \" not \", text)\n", + " text = re.sub(\"i'm\", \"i am\", text, flags=re.IGNORECASE)\n", + " text = re.sub(\"\\'re\", \" are \", text)\n", + " text = re.sub(\"\\'d\", \" would \", text)\n", + " text = re.sub(\"\\'ll\", \" will \", text)\n", + " text = re.sub(\"e\\.g\\.\", \" eg \", text, flags=re.IGNORECASE)\n", + " text = re.sub(\"b\\.g\\.\", \" bg \", text, flags=re.IGNORECASE)\n", + " text = re.sub(\"(\\d+)(kK)\", \" \\g<1>000 \", text)\n", + " text = re.sub(\"e-mail\", \" email \", text, flags=re.IGNORECASE)\n", + " text = re.sub(\"(the[\\s]+|The[\\s]+)?U\\.S\\.A\\.\", \" America \", text, flags=re.IGNORECASE)\n", + " text = re.sub(\"(the[\\s]+|The[\\s]+)?United State(s)?\", \" America \", text, flags=re.IGNORECASE)\n", + " text = re.sub(\"\\(s\\)\", \" \", text, flags=re.IGNORECASE)\n", + " text = re.sub(\"[c-fC-F]\\:\\/\", \" disk \", text)\n", + " \n", + " # remove comma between numbers, i.e. 15,000 -> 15000\n", + " \n", + " text = re.sub('(?<=[0-9])\\,(?=[0-9])', \"\", text)\n", + " \n", + "# # all numbers should separate from words, this is too aggressive\n", + " \n", + "# def pad_number(pattern):\n", + "# matched_string = pattern.group(0)\n", + "# return pad_str(matched_string)\n", + "# text = re.sub('[0-9]+', pad_number, text)\n", + " \n", + " # add padding to punctuations and special chars, we still need them later\n", + " \n", + " text = re.sub('\\$', \" dollar \", text)\n", + " text = re.sub('\\%', \" percent \", text)\n", + " text = re.sub('\\&', \" and \", text)\n", + " \n", + "# def pad_pattern(pattern):\n", + "# matched_string = pattern.group(0)\n", + "# return pad_str(matched_string)\n", + "# text = re.sub('[\\!\\?\\@\\^\\+\\*\\/\\,\\~\\|\\`\\=\\:\\;\\.\\#\\\\\\]', pad_pattern, text) \n", + " \n", + " text = re.sub('[^\\x00-\\x7F]+', pad_str(SPECIAL_TOKENS['non-ascii']), text) # replace non-ascii word with special word\n", + " \n", + " # indian dollar\n", + " \n", + " text = re.sub(\"(?<=[0-9])rs \", \" rs \", text, flags=re.IGNORECASE)\n", + " text = re.sub(\" rs(?=[0-9])\", \" rs \", text, flags=re.IGNORECASE)\n", + " \n", + " # clean text rules get from : https://www.kaggle.com/currie32/the-importance-of-cleaning-text\n", + " text = re.sub(r\" (the[\\s]+|The[\\s]+)?US(A)? \", \" America \", text)\n", + " text = re.sub(r\" UK \", \" England \", text, flags=re.IGNORECASE)\n", + " text = re.sub(r\" india \", \" India \", text)\n", + " text = re.sub(r\" switzerland \", \" Switzerland \", text)\n", + " text = re.sub(r\" china \", \" China \", text)\n", + " text = re.sub(r\" chinese \", \" Chinese \", text) \n", + " text = re.sub(r\" imrovement \", \" improvement \", text, flags=re.IGNORECASE)\n", + " text = re.sub(r\" intially \", \" initially \", text, flags=re.IGNORECASE)\n", + " text = re.sub(r\" quora \", \" Quora \", text, flags=re.IGNORECASE)\n", + " text = re.sub(r\" dms \", \" direct messages \", text, flags=re.IGNORECASE) \n", + " text = re.sub(r\" demonitization \", \" demonetization \", text, flags=re.IGNORECASE) \n", + " text = re.sub(r\" actived \", \" active \", text, flags=re.IGNORECASE)\n", + " text = re.sub(r\" kms \", \" kilometers \", text, flags=re.IGNORECASE)\n", + " text = re.sub(r\" cs \", \" computer science \", text, flags=re.IGNORECASE) \n", + " text = re.sub(r\" upvote\", \" up vote\", text, flags=re.IGNORECASE)\n", + " text = re.sub(r\" iPhone \", \" phone \", text, flags=re.IGNORECASE)\n", + " text = re.sub(r\" \\0rs \", \" rs \", text, flags=re.IGNORECASE)\n", + " text = re.sub(r\" calender \", \" calendar \", text, flags=re.IGNORECASE)\n", + " text = re.sub(r\" ios \", \" operating system \", text, flags=re.IGNORECASE)\n", + " text = re.sub(r\" gps \", \" GPS \", text, flags=re.IGNORECASE)\n", + " text = re.sub(r\" gst \", \" GST \", text, flags=re.IGNORECASE)\n", + " text = re.sub(r\" programing \", \" programming \", text, flags=re.IGNORECASE)\n", + " text = re.sub(r\" bestfriend \", \" best friend \", text, flags=re.IGNORECASE)\n", + " text = re.sub(r\" dna \", \" DNA \", text, flags=re.IGNORECASE)\n", + " text = re.sub(r\" III \", \" 3 \", text)\n", + " text = re.sub(r\" banglore \", \" Banglore \", text, flags=re.IGNORECASE)\n", + " text = re.sub(r\" J K \", \" JK \", text, flags=re.IGNORECASE)\n", + " text = re.sub(r\" J\\.K\\. \", \" JK \", text, flags=re.IGNORECASE)\n", + " \n", + " # replace the float numbers with a random number, it will be parsed as number afterward, and also been replaced with word \"number\"\n", + " \n", + " text = re.sub('[0-9]+\\.[0-9]+', \" 87 \", text)\n", + " \n", + " \n", + " # Remove punctuation from text\n", + " text = ''.join([c for c in text if c not in punctuation]).lower()\n", + " # Return a list of words\n", + " return text" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "df['question1'] = df['question1'].apply(clean)\n", + "df['question2'] = df['question2'].apply(clean)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "what is the step by step guide to invest in share market in india\n", + "what is the step by step guide to invest in share market\n", + "\n", + "what is the story of kohinoor kohinoor diamond\n", + "what would happen if the indian government stole the kohinoor kohinoor diamond back\n", + "\n", + "how can i increase the speed of my internet connection while using a vpn\n", + "how can internet speed be increased by hacking through dns\n", + "\n", + "why am i mentally very lonely how can i solve it\n", + "find the remainder when math2324math is divided by 2423\n", + "\n", + "which one dissolve in water quikly sugar salt methane and carbon di oxide\n", + "which fish would survive in salt water\n", + "\n", + "astrology i am a capricorn sun cap moon and cap risingwhat does that say about me\n", + "i am a triple capricorn sun moon and ascendant in capricorn what does this say about me\n", + "\n", + "should i buy tiago\n", + "what keeps childern active and far from phone and video games\n", + "\n", + "how can i be a good geologist\n", + "what should i do to be a great geologist\n", + "\n", + "when do you use nonasciiword instead of nonasciiword \n", + "when do you use and instead of and\n", + "\n", + "motorola company can i hack my charter motorolla dcx3400\n", + "how do i hack motorola dcx3400 for free internet\n", + "\n" + ] + } + ], + "source": [ + "a = 0 \n", + "for i in range(a,a+10):\n", + " print(df.question1[i])\n", + " print(df.question2[i])\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### BOW + Xgboost Model" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "count_vect = CountVectorizer(analyzer='word', token_pattern=r'\\w{1,}')\n", + "count_vect.fit(pd.concat((df['question1'],df['question2'])).unique())\n", + "trainq1_trans = count_vect.transform(df['question1'].values)\n", + "trainq2_trans = count_vect.transform(df['question2'].values)\n", + "labels = df['is_duplicate'].values\n", + "X = scipy.sparse.hstack((trainq1_trans,trainq2_trans))\n", + "y = labels\n", + "X_train,X_valid,y_train,y_valid = train_test_split(X,y, test_size = 0.33, random_state = 42)\n", + "xgb_model = xgb.XGBClassifier(max_depth=50, n_estimators=80, learning_rate=0.1, colsample_bytree=.7, gamma=0, reg_alpha=4, objective='binary:logistic', eta=0.3, silent=1, subsample=0.8).fit(X_train, y_train) \n", + "xgb_prediction = xgb_model.predict(X_valid)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "training score: 0.6177597850983563\n", + "validation score: 0.6154032008574583\n", + " precision recall f1-score support\n", + "\n", + " 0 0.70 0.95 0.80 84267\n", + " 1 0.77 0.30 0.43 49148\n", + "\n", + " micro avg 0.71 0.71 0.71 133415\n", + " macro avg 0.73 0.62 0.62 133415\n", + "weighted avg 0.72 0.71 0.66 133415\n", + "\n" + ] + } + ], + "source": [ + "from sklearn.metrics import f1_score, classification_report, accuracy_score\n", + "\n", + "print('training score:', f1_score(y_train, xgb_model.predict(X_train), average='macro'))\n", + "print('validation score:', f1_score(y_valid, xgb_model.predict(X_valid), average='macro'))\n", + "print(classification_report(y_valid, xgb_prediction))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Word level TF-IDF" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\\w{1,}', max_features=5000)\n", + "tfidf_vect.fit(pd.concat((df['question1'],df['question2'])).unique())\n", + "trainq1_trans = tfidf_vect.transform(df['question1'].values)\n", + "trainq2_trans = tfidf_vect.transform(df['question2'].values)\n", + "labels = df['is_duplicate'].values\n", + "X = scipy.sparse.hstack((trainq1_trans,trainq2_trans))\n", + "y = labels\n", + "X_train,X_valid,y_train,y_valid = train_test_split(X,y, test_size = 0.33, random_state = 42)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "word level tf-idf training score: 0.8493408114951853\n", + "word level tf-idf validation score: 0.7576508867065961\n", + " precision recall f1-score support\n", + "\n", + " 0 0.79 0.90 0.84 84267\n", + " 1 0.77 0.60 0.67 49148\n", + "\n", + " micro avg 0.79 0.79 0.79 133415\n", + " macro avg 0.78 0.75 0.76 133415\n", + "weighted avg 0.79 0.79 0.78 133415\n", + "\n" + ] + } + ], + "source": [ + "from sklearn.metrics import f1_score, classification_report, accuracy_score\n", + "xgb_model = xgb.XGBClassifier(max_depth=50, n_estimators=80, learning_rate=0.1, colsample_bytree=.7, gamma=0, reg_alpha=4, objective='binary:logistic', eta=0.3, silent=1, subsample=0.8).fit(X_train, y_train) \n", + "xgb_prediction = xgb_model.predict(X_valid)\n", + "print('word level tf-idf training score:', f1_score(y_train, xgb_model.predict(X_train), average='macro'))\n", + "print('word level tf-idf validation score:', f1_score(y_valid, xgb_model.predict(X_valid), average='macro'))\n", + "print(classification_report(y_valid, xgb_prediction))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### N-gram Level TF-IDF" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\\w{1,}', ngram_range=(2,3), max_features=5000)\n", + "tfidf_vect_ngram.fit(pd.concat((df['question1'],df['question2'])).unique())\n", + "trainq1_trans = tfidf_vect_ngram.transform(df['question1'].values)\n", + "trainq2_trans = tfidf_vect_ngram.transform(df['question2'].values)\n", + "labels = df['is_duplicate'].values\n", + "X = scipy.sparse.hstack((trainq1_trans,trainq2_trans))\n", + "y = labels\n", + "X_train,X_valid,y_train,y_valid = train_test_split(X,y, test_size = 0.33, random_state = 42)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "n-gram level tf-idf training score: 0.7193864239031045\n", + "n-gram level tf-idf validation score: 0.67470696099733\n", + " precision recall f1-score support\n", + "\n", + " 0 0.73 0.92 0.81 84267\n", + " 1 0.75 0.42 0.54 49148\n", + "\n", + " micro avg 0.73 0.73 0.73 133415\n", + " macro avg 0.74 0.67 0.67 133415\n", + "weighted avg 0.74 0.73 0.71 133415\n", + "\n" + ] + } + ], + "source": [ + "xgb_model = xgb.XGBClassifier(max_depth=50, n_estimators=80, learning_rate=0.1, colsample_bytree=.7, gamma=0, reg_alpha=4, objective='binary:logistic', eta=0.3, silent=1, subsample=0.8).fit(X_train, y_train) \n", + "xgb_prediction = xgb_model.predict(X_valid)\n", + "print('n-gram level tf-idf training score:', f1_score(y_train, xgb_model.predict(X_train), average='macro'))\n", + "print('n-gram level tf-idf validation score:', f1_score(y_valid, xgb_model.predict(X_valid), average='macro'))\n", + "print(classification_report(y_valid, xgb_prediction))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Character Level TF-IDF " + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "character level tf-idf training score: 0.9844717869102682\n", + "character level tf-idf validation score: 0.8008380950798113\n", + " precision recall f1-score support\n", + "\n", + " 0 0.83 0.91 0.87 84267\n", + " 1 0.81 0.67 0.74 49148\n", + "\n", + " micro avg 0.82 0.82 0.82 133415\n", + " macro avg 0.82 0.79 0.80 133415\n", + "weighted avg 0.82 0.82 0.82 133415\n", + "\n" + ] + } + ], + "source": [ + "from sklearn.metrics import f1_score, classification_report, accuracy_score\n", + "tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\\w{1,}', ngram_range=(2,3), max_features=5000)\n", + "tfidf_vect_ngram_chars.fit(pd.concat((df['question1'],df['question2'])).unique())\n", + "trainq1_trans = tfidf_vect_ngram_chars.transform(df['question1'].values)\n", + "trainq2_trans = tfidf_vect_ngram_chars.transform(df['question2'].values)\n", + "labels = df['is_duplicate'].values\n", + "X = scipy.sparse.hstack((trainq1_trans,trainq2_trans))\n", + "y = labels\n", + "X_train,X_valid,y_train,y_valid = train_test_split(X,y, test_size = 0.33, random_state = 42)\n", + "xgb_model = xgb.XGBClassifier(max_depth=50, n_estimators=80, learning_rate=0.1, colsample_bytree=.7, gamma=0, reg_alpha=4, objective='binary:logistic', eta=0.3, silent=1, subsample=0.8).fit(X_train, y_train) \n", + "xgb_prediction = xgb_model.predict(X_valid)\n", + "print('character level tf-idf training score:', f1_score(y_train, xgb_model.predict(X_train), average='macro'))\n", + "print('character level tf-idf validation score:', f1_score(y_valid, xgb_model.predict(X_valid), average='macro'))\n", + "print(classification_report(y_valid, xgb_prediction))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}