diff --git a/BOW_TFIDF_Xgboost_update.ipynb b/BOW_TFIDF_Xgboost_update.ipynb
new file mode 100644
index 0000000..8c5d2dc
--- /dev/null
+++ b/BOW_TFIDF_Xgboost_update.ipynb
@@ -0,0 +1,647 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import re\n",
+ "from string import punctuation\n",
+ "\n",
+ "import matplotlib.pyplot as plt\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import scipy\n",
+ "import scipy.sparse\n",
+ "import seaborn as sns\n",
+ "import xgboost as xgb\n",
+ "from sklearn import linear_model\n",
+ "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n",
+ "from sklearn.metrics import accuracy_score\n",
+ "from sklearn.metrics import classification_report, f1_score\n",
+ "from sklearn.metrics import log_loss\n",
+ "from sklearn.metrics import roc_auc_score\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "%matplotlib inline"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " qid1 | \n",
+ " qid2 | \n",
+ " question1 | \n",
+ " question2 | \n",
+ " is_duplicate | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " What is the step by step guide to invest in sh... | \n",
+ " What is the step by step guide to invest in sh... | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " What is the story of Kohinoor (Koh-i-Noor) Dia... | \n",
+ " What would happen if the Indian government sto... | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 5 | \n",
+ " 6 | \n",
+ " How can I increase the speed of my internet co... | \n",
+ " How can Internet speed be increased by hacking... | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 7 | \n",
+ " 8 | \n",
+ " Why am I mentally very lonely? How can I solve... | \n",
+ " Find the remainder when [math]23^{24}[/math] i... | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 4 | \n",
+ " 9 | \n",
+ " 10 | \n",
+ " Which one dissolve in water quikly sugar, salt... | \n",
+ " Which fish would survive in salt water? | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id qid1 qid2 question1 \\\n",
+ "0 0 1 2 What is the step by step guide to invest in sh... \n",
+ "1 1 3 4 What is the story of Kohinoor (Koh-i-Noor) Dia... \n",
+ "2 2 5 6 How can I increase the speed of my internet co... \n",
+ "3 3 7 8 Why am I mentally very lonely? How can I solve... \n",
+ "4 4 9 10 Which one dissolve in water quikly sugar, salt... \n",
+ "\n",
+ " question2 is_duplicate \n",
+ "0 What is the step by step guide to invest in sh... 0 \n",
+ "1 What would happen if the Indian government sto... 0 \n",
+ "2 How can Internet speed be increased by hacking... 0 \n",
+ "3 Find the remainder when [math]23^{24}[/math] i... 0 \n",
+ "4 Which fish would survive in salt water? 0 "
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Read quora_train.csv, drop every row that has a missing value and renumber the index.\n",
+ "df = (\n",
+ "    pd.read_csv('quora_train.csv')\n",
+ "    .dropna(how=\"any\")\n",
+ "    .reset_index(drop=True)\n",
+ ")\n",
+ "\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAY0AAAEHCAYAAABSjBpvAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAEjNJREFUeJzt3X+s3XV9x/Hny1acDhWUSliLK9EuE11EbYDofjBZoLBlxQ0y2CIdNqszkGiim2iygD9IMIuasSkLhEoxTmSoo3HV2iHOGRF6kQpU1N4gSi2BahFxRB343h/nc/VwOb3303srp3ifj+Sb8z3vz4/v5yRtX/n+OKepKiRJ6vGUcS9AkvTkYWhIkroZGpKkboaGJKmboSFJ6mZoSJK6GRqSpG6GhiSpm6EhSeq2eNwL2N8OO+ywWr58+biXIUlPKrfccsv3qmrJbP1+5UJj+fLlTExMjHsZkvSkkuTbPf28PCVJ6mZoSJK6GRqSpG6GhiSpm6EhSepmaEiSuhkakqRuhoYkqduv3Jf7niyWn/+f417Cr5S7L/7jcS9BWhBmPdNIcmSSG5LcmWR7kje2+oVJvptkW9tOHRrztiSTSb6R5OSh+qpWm0xy/lD9qCQ3JdmR5GNJDmr1p7X3k619+f788JKkfdNzeeoR4M1V9SLgeODcJEe3tvdX1TFt2wTQ2s4EXgysAj6YZFGSRcAHgFOAo4GzhuZ5T5trBfAAsLbV1wIPVNULgfe3fpKkMZk1NKrq3qr6Stt/CLgTWDrDkNXA1VX1k6r6FjAJHNu2yaq6q6p+ClwNrE4S4NXAtW38BuC0obk2tP1rgRNbf0nSGOzTjfB2eehlwE2tdF6S25KsT3Joqy0F7hkatrPV9lZ/LvCDqnpkWv0xc7X2B1t/SdIYdIdGkoOBjwNvqqofApcCLwCOAe4F3jvVdcTwmkN9prmmr21dkokkE7t3757xc0iS5q4rNJI8lUFgfKSqPgFQVfdV1aNV9TPgcgaXn2BwpnDk0PBlwK4Z6t8DDkmyeFr9MXO19mcDe6avr6ouq6qVVbVyyZJZfw5ekjRHPU9PBbgCuLOq3jdUP2Ko22uAO9r+RuDM9uTTUcAK4GZgK7CiPSl1EIOb5RurqoAbgNPb+DXAdUNzrWn7pwOfa/0lSWPQ8z2NVwGvBW5Psq3V3s7g6adjGFwuuht4PUBVbU9yDfA1Bk9enVtVjwIkOQ/YDCwC1lfV9jbfW4Grk7wbuJVBSNFeP5xkksEZxpnz+KySpHmaNTSq6ouMvrewaYYxFwEXjahvGjWuqu7iF5e3hus/Bs6YbY2SpCeGPyMiSepmaEiSuhkakqRuhoYkqZuhIUnqZmhIkroZGpKkboaGJKmboSFJ6mZoSJK6GRqSpG6GhiSpm6EhSepmaEiSuhkakqRuhoYkqZuhIUnqZmhIkroZGpKkboaGJKmboSFJ6mZoSJK6GRqSpG6GhiSpm6EhSepmaEiSuhkakqRuhoYkqZuhIUnqZmhIkroZGpKkboaGJKnbrKGR5MgkNyS5M8n2JG9s9eck2ZJkR3s9tNWT5JIkk0luS/LyobnWtP47kqwZqr8iye1tzCVJMtMxJEnj0XOm8Qjw5qp6EXA8cG6So4HzgeuragVwfXsPcAqwom3rgEthEADABcBxwLHABUMhcGnrOzVuVavv7RiSpDGYNTSq6t6q+krbfwi4E1gKrAY2tG4bgNPa/mrgqhr4MnBIkiOAk4EtVbWnqh4AtgCrWtuzqurGqirgqmlzjTqGJGkM9umeRpLlwMuAm4DDq+peGAQL8LzWbSlwz9Cwna02U33niDozHGP6utYlmUgysXv37n35SJKkfdAdGkkOBj4OvKmqfjhT1xG1mkO9W1VdVlUrq2rlkiVL9mWoJGkfdIVGkqcyCIyPVNUnWvm+dmmJ9np/q+8EjhwavgzYNUt92Yj6TMeQJI1Bz9NTAa4A
7qyq9w01bQSmnoBaA1w3VD+7PUV1PPBgu7S0GTgpyaHtBvhJwObW9lCS49uxzp4216hjSJLGYHFHn1cBrwVuT7Kt1d4OXAxck2Qt8B3gjNa2CTgVmAQeBs4BqKo9Sd4FbG393llVe9r+G4ArgacDn24bMxxDkjQGs4ZGVX2R0fcdAE4c0b+Ac/cy13pg/Yj6BPCSEfXvjzqGJGk8/Ea4JKmboSFJ6mZoSJK6GRqSpG6GhiSpm6EhSepmaEiSuhkakqRuhoYkqZuhIUnqZmhIkroZGpKkboaGJKmboSFJ6mZoSJK6GRqSpG6GhiSpm6EhSepmaEiSuhkakqRuhoYkqZuhIUnqZmhIkroZGpKkboaGJKmboSFJ6mZoSJK6GRqSpG6GhiSpm6EhSepmaEiSus0aGknWJ7k/yR1DtQuTfDfJtradOtT2tiSTSb6R5OSh+qpWm0xy/lD9qCQ3JdmR5GNJDmr1p7X3k619+f760JKkuek507gSWDWi/v6qOqZtmwCSHA2cCby4jflgkkVJFgEfAE4BjgbOan0B3tPmWgE8AKxt9bXAA1X1QuD9rZ8kaYxmDY2q+gKwp3O+1cDVVfWTqvoWMAkc27bJqrqrqn4KXA2sThLg1cC1bfwG4LShuTa0/WuBE1t/SdKYzOeexnlJbmuXrw5ttaXAPUN9drba3urPBX5QVY9Mqz9mrtb+YOsvSRqTxXMcdynwLqDa63uB1wGjzgSK0eFUM/RnlrbHSLIOWAfw/Oc/f6Z1S5rNhc8e9wp+tVz44LhXsF/N6Uyjqu6rqker6mfA5QwuP8HgTOHIoa7LgF0z1L8HHJJk8bT6Y+Zq7c9mL5fJquqyqlpZVSuXLFkyl48kSeowp9BIcsTQ29cAU09WbQTObE8+HQWsAG4GtgIr2pNSBzG4Wb6xqgq4ATi9jV8DXDc015q2fzrwudZfkjQms16eSvJR4ATgsCQ7gQuAE5Icw+By0d3A6wGqanuSa4CvAY8A51bVo22e84DNwCJgfVVtb4d4K3B1kncDtwJXtPoVwIeTTDI4wzhz3p9WkjQvs4ZGVZ01onzFiNpU/4uAi0bUNwGbRtTv4heXt4brPwbOmG19kqQnjt8IlyR1MzQkSd0MDUlSN0NDktTN0JAkdTM0JEndDA1JUjdDQ5LUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0MDUlSN0NDktTN0JAkdTM0JEndDA1JUjdDQ5LUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0MDUlSN0NDktTN0JAkdTM0JEndDA1JUjdDQ5LUbdbQSLI+yf1J7hiqPSfJliQ72uuhrZ4klySZTHJbkpcPjVnT+u9Ismao/ookt7cxlyTJTMeQJI1Pz5nGlcCqabXzgeuragVwfXsPcAqwom3rgEthEADABcBxwLHABUMhcGnrOzVu1SzHkCSNyayhUVVfAPZMK68GNrT9DcBpQ/WrauDLwCFJjgBOBrZU1Z6qegDYAqxqbc+qqhurqoCrps016hiSpDGZ6z2Nw6vqXoD2+rxWXwrcM9RvZ6vNVN85oj7TMR4nybokE0kmdu/ePcePJEmazf6+EZ4RtZpDfZ9U1WVVtbKqVi5ZsmRfh0uSOs01NO5rl5Zor/e3+k7gyKF+y4Bds9SXjajPdAxJ0pjMNTQ2AlNPQK0Brhuqn92eojoeeLBdWtoMnJTk0HYD/CRgc2t7KMnx7amps6fNNeoYkqQxWTxbhyQfBU4ADkuyk8FTUBcD1yRZC3wHOKN13wScCkwCDwPnAFTVniTvAra2fu+sqqmb629g8ITW04FPt40ZjiFJGpNZQ6OqztpL04kj+hZw7l7mWQ+sH1GfAF4yov79UceQJI2P3wiXJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0MDUlSN0NDktTN0JAkdTM0JEndDA1JUjdDQ5LUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0M
DUlSN0NDktTN0JAkdTM0JEndDA1JUjdDQ5LUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0MDUlSN0NDktRtXqGR5O4ktyfZlmSi1Z6TZEuSHe310FZPkkuSTCa5LcnLh+ZZ0/rvSLJmqP6KNv9kG5v5rFeSND/740zjD6vqmKpa2d6fD1xfVSuA69t7gFOAFW1bB1wKg5ABLgCOA44FLpgKmtZn3dC4VfthvZKkOfplXJ5aDWxo+xuA04bqV9XAl4FDkhwBnAxsqao9VfUAsAVY1dqeVVU3VlUBVw3NJUkag/mGRgGfTXJLknWtdnhV3QvQXp/X6kuBe4bG7my1meo7R9QfJ8m6JBNJJnbv3j3PjyRJ2pvF8xz/qqraleR5wJYkX5+h76j7ETWH+uOLVZcBlwGsXLlyZB9J0vzN60yjqna11/uBTzK4J3Ffu7REe72/dd8JHDk0fBmwa5b6shF1SdKYzDk0kvx6kmdO7QMnAXcAG4GpJ6DWANe1/Y3A2e0pquOBB9vlq83ASUkObTfATwI2t7aHkhzfnpo6e2guSdIYzOfy1OHAJ9tTsIuBf6uqzyTZClyTZC3wHeCM1n8TcCowCTwMnANQVXuSvAvY2vq9s6r2tP03AFcCTwc+3TZJ0pjMOTSq6i7gpSPq3wdOHFEv4Ny9zLUeWD+iPgG8ZK5rlCTtX34jXJLUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0MDUlSN0NDktTN0JAkdTM0JEndDA1JUjdDQ5LUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0MDUlSN0NDktTN0JAkdTM0JEndDA1JUjdDQ5LUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0O+NBIsirJN5JMJjl/3OuRpIXsgA6NJIuADwCnAEcDZyU5eryrkqSF64AODeBYYLKq7qqqnwJXA6vHvCZJWrAO9NBYCtwz9H5nq0mSxmDxuBcwi4yo1eM6JeuAde3tj5J845e6qoXlMOB7417EbPKeca9AY/Ck+LPJO0b9M3ZA+s2eTgd6aOwEjhx6vwzYNb1TVV0GXPZELWohSTJRVSvHvQ5pOv9sjseBfnlqK7AiyVFJDgLOBDaOeU2StGAd0GcaVfVIkvOAzcAiYH1VbR/zsiRpwTqgQwOgqjYBm8a9jgXMy346UPlncwxS9bj7ypIkjXSg39OQJB1ADA1JUrcD/p6GnjhJfpvBN+6XMvg+zC5gY1XdOdaFSTpgeKYhAJK8lcHPtAS4mcHjzgE+6g9FSprijXABkOSbwIur6v+m1Q8CtlfVivGsTJpZknOq6kPjXsdC4ZmGpvwM+I0R9SNam3Sgese4F7CQeE9DU94EXJ9kB7/4kcjnAy8EzhvbqiQgyW17awIOfyLXstB5eUo/l+QpDH6OfimDv4w7ga1V9ehYF6YFL8l9wMnAA9ObgC9V1aizZP0SeKahn6uqnwFfHvc6pBE+BRxcVdumNyT5/BO/nIXLMw1JUjdvhEuSuhkakqRuhoYWrCRfmuf4v07yL/MYf3eSw+azliSnJTl6rmuQ9pWhoQWrql457jVMmcdaTgMMDT1hDA0tWEl+1F6PSPKFJNuS3JHk92YYc06Sbyb5b+BVQ/Urk5w+Yu4T2tyfTPK1JP/aHm0euZa2//dJbk/y1SQXt9rfJNnaah9P8owkrwT+FPjHtvYXtO0zSW5J8j/t98Sk/cZHbiX4S2BzVV2UZBHwjFGdkhzB4NvHrwAeBG4Abu2Y/1gGZwPfBj4D/Blw7V6OcQqDs4fjqurhJM9pTZ+oqstbn3cDa6vqn5NsBD5VVde2tuuBv62qHUmOAz4IvLpjjVIXQ0Ma/Djj+iRPBf5j1HcBmuOAz1fVboAkHwN+q2P+m6vqrjbmo8DvspfQAP4I+FBVPQxQVXta/SUtLA4BDmbwXyA/RpKDgVcC/55kqvy0jvVJ3bw8pQWvqr4A/D7wXeDDSc6e
qfte6o/Q/j5l8C/2QTOMmenLUdlL+5XAeVX1OwzOdn5tRJ+nAD+oqmOGthfNcCxpnxkaWvCS/CZwf7v8cwXw8r10vQk4Iclz21nJGUNtdzO4bAWD/5PkqUNtxyY5qt3L+AvgizMs57PA65I8o61t6vLUM4F723H/aqj/Q62Nqvoh8K0kZ7SxSfLSGY4l7TNDQ4ITgG1JbgX+HPinUZ2q6l7gQuBG4L+Arww1Xw78QZKbGVzG+t+hthuBi4E7gG8Bn9zbQqrqM8BGYCLJNuAtrekfGITWFuDrQ0OuBv4uya1JXsAgUNYm+SqwnUGASfuNPyMi/RIlOQF4S1X9ybjXIu0PnmlIkrp5piGNkOQmHv/k0Wur6vZxrEc6UBgakqRuXp6SJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1+38qOCjM258g7gAAAABJRU5ErkJggg==\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Class balance: number of rows for each value of is_duplicate.\n",
+ "ax = df.groupby(\"is_duplicate\")['id'].count().plot.bar()\n",
+ "# Label the figure so it stands alone; the trailing ';' hides the matplotlib repr.\n",
+ "ax.set(title=\"Question pairs per class\", xlabel=\"is_duplicate\", ylabel=\"number of pairs\");"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Remove the identifier columns. Rebinding df (instead of inplace=True) together with\n",
+ "# errors='ignore' keeps the cell safe to re-run after the columns are already gone.\n",
+ "df = df.drop(columns=['id', 'qid1', 'qid2'], errors='ignore')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "What is the step by step guide to invest in share market in india?\n",
+ "What is the step by step guide to invest in share market?\n",
+ "\n",
+ "What is the story of Kohinoor (Koh-i-Noor) Diamond?\n",
+ "What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?\n",
+ "\n",
+ "How can I increase the speed of my internet connection while using a VPN?\n",
+ "How can Internet speed be increased by hacking through DNS?\n",
+ "\n",
+ "Why am I mentally very lonely? How can I solve it?\n",
+ "Find the remainder when [math]23^{24}[/math] is divided by 24,23?\n",
+ "\n",
+ "Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?\n",
+ "Which fish would survive in salt water?\n",
+ "\n",
+ "Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?\n",
+ "I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?\n",
+ "\n",
+ "Should I buy tiago?\n",
+ "What keeps childern active and far from phone and video games?\n",
+ "\n",
+ "How can I be a good geologist?\n",
+ "What should I do to be a great geologist?\n",
+ "\n",
+ "When do you use シ instead of し?\n",
+ "When do you use \"&\" instead of \"and\"?\n",
+ "\n",
+ "Motorola (company): Can I hack my Charter Motorolla DCX3400?\n",
+ "How do I hack Motorola DCX3400 for free internet?\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Print the first ten question pairs, each pair followed by a blank line.\n",
+ "a = 0\n",
+ "for i in range(a, a + 10):\n",
+ "    print(df.loc[i, 'question1'], df.loc[i, 'question2'], '', sep='\\n')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "SPECIAL_TOKENS = {\n",
+ "    'quoted': 'quoted_item',\n",
+ "    'non-ascii': 'non_ascii_word',\n",
+ "    'undefined': 'something'\n",
+ "}\n",
+ "\n",
+ "\n",
+ "def clean(text, stem_words=True):\n",
+ "    \"\"\"Normalise one raw question string before vectorisation.\n",
+ "\n",
+ "    Expands contractions, unifies a few country/currency spellings, fixes some\n",
+ "    common typos, replaces every run of non-ASCII characters with a placeholder\n",
+ "    token, strips ASCII punctuation and lower-cases the result.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    text : object\n",
+ "        Raw question. Anything that is not a non-empty ``str`` (NaN, None,\n",
+ "        numbers, ...) yields ``''``.\n",
+ "    stem_words : bool, default True\n",
+ "        Unused; kept only so existing callers keep working. No stemming is done.\n",
+ "\n",
+ "    Returns\n",
+ "    -------\n",
+ "    str\n",
+ "        The cleaned, lower-cased text as a single string (not a list of words).\n",
+ "    \"\"\"\n",
+ "    # One check covers NaN/None, non-string values and the empty string.\n",
+ "    if not isinstance(text, str) or not text:\n",
+ "        return ''\n",
+ "\n",
+ "    # --- Contractions and abbreviations ---\n",
+ "    # \"'s\" is ambiguous (\"Sam is\" vs. possessive), so it is simply dropped.\n",
+ "    text = re.sub(r\"'s\", \" \", text)\n",
+ "    text = re.sub(r\" whats \", \" what is \", text, flags=re.IGNORECASE)\n",
+ "    text = re.sub(r\"'ve\", \" have \", text)\n",
+ "    # Case-insensitive so that \"Can't\" is not turned into \"Ca not\" by the n't rule below.\n",
+ "    text = re.sub(r\"can't\", \"can not\", text, flags=re.IGNORECASE)\n",
+ "    text = re.sub(r\"n't\", \" not \", text)\n",
+ "    text = re.sub(r\"i'm\", \"i am\", text, flags=re.IGNORECASE)\n",
+ "    text = re.sub(r\"'re\", \" are \", text)\n",
+ "    text = re.sub(r\"'d\", \" would \", text)\n",
+ "    text = re.sub(r\"'ll\", \" will \", text)\n",
+ "    text = re.sub(r\"e\\.g\\.\", \" eg \", text, flags=re.IGNORECASE)\n",
+ "    text = re.sub(r\"b\\.g\\.\", \" bg \", text, flags=re.IGNORECASE)\n",
+ "    # \"5k\" / \"5K\" -> \"5000\"; the trailing word boundary leaves units such as \"5kg\" alone.\n",
+ "    text = re.sub(r\"(\\d+)[kK]\\b\", r\" \\g<1>000 \", text)\n",
+ "    text = re.sub(r\"e-mail\", \" email \", text, flags=re.IGNORECASE)\n",
+ "    text = re.sub(r\"(the[\\s]+|The[\\s]+)?U\\.S\\.A\\.\", \" America \", text, flags=re.IGNORECASE)\n",
+ "    text = re.sub(r\"(the[\\s]+|The[\\s]+)?United State(s)?\", \" America \", text, flags=re.IGNORECASE)\n",
+ "    text = re.sub(r\"\\(s\\)\", \" \", text, flags=re.IGNORECASE)\n",
+ "    text = re.sub(r\"[c-fC-F]:/\", \" disk \", text)\n",
+ "\n",
+ "    # Remove the comma between digits, i.e. 15,000 -> 15000.\n",
+ "    text = re.sub(r\"(?<=[0-9]),(?=[0-9])\", \"\", text)\n",
+ "\n",
+ "    # Spell out symbols that carry meaning before punctuation is stripped.\n",
+ "    text = re.sub(r\"\\$\", \" dollar \", text)\n",
+ "    text = re.sub(r\"%\", \" percent \", text)\n",
+ "    text = re.sub(r\"&\", \" and \", text)\n",
+ "\n",
+ "    # Replace each run of non-ASCII characters with the placeholder token.\n",
+ "    # (Its underscores are removed again by the punctuation pass at the end.)\n",
+ "    text = re.sub(r\"[^\\x00-\\x7F]+\", ' ' + SPECIAL_TOKENS['non-ascii'] + ' ', text)\n",
+ "\n",
+ "    # \"rs\" glued to a number (\"100rs\", \"rs100\") is split off as its own token.\n",
+ "    text = re.sub(r\"(?<=[0-9])rs \", \" rs \", text, flags=re.IGNORECASE)\n",
+ "    text = re.sub(r\" rs(?=[0-9])\", \" rs \", text, flags=re.IGNORECASE)\n",
+ "\n",
+ "    # Rules taken from https://www.kaggle.com/currie32/the-importance-of-cleaning-text\n",
+ "    # Rules that only change letter case do not affect the returned text, because it is\n",
+ "    # lower-cased at the end; they are kept as they appear in that rule list.\n",
+ "    text = re.sub(r\" (the[\\s]+|The[\\s]+)?US(A)? \", \" America \", text)\n",
+ "    text = re.sub(r\" UK \", \" England \", text, flags=re.IGNORECASE)\n",
+ "    text = re.sub(r\" india \", \" India \", text)\n",
+ "    text = re.sub(r\" switzerland \", \" Switzerland \", text)\n",
+ "    text = re.sub(r\" china \", \" China \", text)\n",
+ "    text = re.sub(r\" chinese \", \" Chinese \", text)\n",
+ "    text = re.sub(r\" imrovement \", \" improvement \", text, flags=re.IGNORECASE)\n",
+ "    text = re.sub(r\" intially \", \" initially \", text, flags=re.IGNORECASE)\n",
+ "    text = re.sub(r\" quora \", \" Quora \", text, flags=re.IGNORECASE)\n",
+ "    text = re.sub(r\" dms \", \" direct messages \", text, flags=re.IGNORECASE)\n",
+ "    text = re.sub(r\" demonitization \", \" demonetization \", text, flags=re.IGNORECASE)\n",
+ "    text = re.sub(r\" actived \", \" active \", text, flags=re.IGNORECASE)\n",
+ "    text = re.sub(r\" kms \", \" kilometers \", text, flags=re.IGNORECASE)\n",
+ "    text = re.sub(r\" cs \", \" computer science \", text, flags=re.IGNORECASE)\n",
+ "    text = re.sub(r\" upvote\", \" up vote\", text, flags=re.IGNORECASE)\n",
+ "    text = re.sub(r\" iPhone \", \" phone \", text, flags=re.IGNORECASE)\n",
+ "    # NOTE(review): \\0 is the regex escape for the NUL character, so this rule only fires\n",
+ "    # on text containing a literal NUL before \"rs\" - presumably a typo; confirm the intent.\n",
+ "    text = re.sub(r\" \\0rs \", \" rs \", text, flags=re.IGNORECASE)\n",
+ "    text = re.sub(r\" calender \", \" calendar \", text, flags=re.IGNORECASE)\n",
+ "    text = re.sub(r\" ios \", \" operating system \", text, flags=re.IGNORECASE)\n",
+ "    text = re.sub(r\" gps \", \" GPS \", text, flags=re.IGNORECASE)\n",
+ "    text = re.sub(r\" gst \", \" GST \", text, flags=re.IGNORECASE)\n",
+ "    text = re.sub(r\" programing \", \" programming \", text, flags=re.IGNORECASE)\n",
+ "    text = re.sub(r\" bestfriend \", \" best friend \", text, flags=re.IGNORECASE)\n",
+ "    text = re.sub(r\" dna \", \" DNA \", text, flags=re.IGNORECASE)\n",
+ "    text = re.sub(r\" III \", \" 3 \", text)\n",
+ "    text = re.sub(r\" banglore \", \" Banglore \", text, flags=re.IGNORECASE)\n",
+ "    text = re.sub(r\" J K \", \" JK \", text, flags=re.IGNORECASE)\n",
+ "    text = re.sub(r\" J\\.K\\. \", \" JK \", text, flags=re.IGNORECASE)\n",
+ "\n",
+ "    # Every decimal number is collapsed to the same arbitrary placeholder number.\n",
+ "    text = re.sub(r\"[0-9]+\\.[0-9]+\", \" 87 \", text)\n",
+ "\n",
+ "    # Strip ASCII punctuation and lower-case; the result is one string.\n",
+ "    return text.translate(str.maketrans('', '', punctuation)).lower()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Run clean() over both question columns, overwriting the raw text.\n",
+ "for column in ('question1', 'question2'):\n",
+ "    df[column] = df[column].apply(clean)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "what is the step by step guide to invest in share market in india\n",
+ "what is the step by step guide to invest in share market\n",
+ "\n",
+ "what is the story of kohinoor kohinoor diamond\n",
+ "what would happen if the indian government stole the kohinoor kohinoor diamond back\n",
+ "\n",
+ "how can i increase the speed of my internet connection while using a vpn\n",
+ "how can internet speed be increased by hacking through dns\n",
+ "\n",
+ "why am i mentally very lonely how can i solve it\n",
+ "find the remainder when math2324math is divided by 2423\n",
+ "\n",
+ "which one dissolve in water quikly sugar salt methane and carbon di oxide\n",
+ "which fish would survive in salt water\n",
+ "\n",
+ "astrology i am a capricorn sun cap moon and cap risingwhat does that say about me\n",
+ "i am a triple capricorn sun moon and ascendant in capricorn what does this say about me\n",
+ "\n",
+ "should i buy tiago\n",
+ "what keeps childern active and far from phone and video games\n",
+ "\n",
+ "how can i be a good geologist\n",
+ "what should i do to be a great geologist\n",
+ "\n",
+ "when do you use nonasciiword instead of nonasciiword \n",
+ "when do you use and instead of and\n",
+ "\n",
+ "motorola company can i hack my charter motorolla dcx3400\n",
+ "how do i hack motorola dcx3400 for free internet\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Print the first ten question pairs again, each pair followed by a blank line.\n",
+ "a = 0\n",
+ "for i in range(a, a + 10):\n",
+ "    print(df.loc[i, 'question1'], df.loc[i, 'question2'], '', sep='\\n')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Bag-of-Words (BOW) + XGBoost Model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Bag-of-words counts; the token pattern also keeps single-character tokens.\n",
+ "count_vect = CountVectorizer(analyzer='word', token_pattern=r'\\w{1,}')\n",
+ "# The vocabulary is learned from every distinct question of both columns.\n",
+ "count_vect.fit(pd.concat((df['question1'], df['question2'])).unique())\n",
+ "\n",
+ "# One sparse matrix per question column, placed side by side as the feature matrix.\n",
+ "trainq1_trans = count_vect.transform(df['question1'].values)\n",
+ "trainq2_trans = count_vect.transform(df['question2'].values)\n",
+ "X = scipy.sparse.hstack((trainq1_trans, trainq2_trans))\n",
+ "\n",
+ "labels = df['is_duplicate'].values\n",
+ "y = labels\n",
+ "\n",
+ "X_train, X_valid, y_train, y_valid = train_test_split(\n",
+ "    X, y, test_size=0.33, random_state=42\n",
+ ")\n",
+ "\n",
+ "# NOTE(review): learning_rate (0.1) and eta (0.3) are both passed - confirm which is intended.\n",
+ "xgb_model = xgb.XGBClassifier(\n",
+ "    max_depth=50,\n",
+ "    n_estimators=80,\n",
+ "    learning_rate=0.1,\n",
+ "    colsample_bytree=.7,\n",
+ "    gamma=0,\n",
+ "    reg_alpha=4,\n",
+ "    objective='binary:logistic',\n",
+ "    eta=0.3,\n",
+ "    silent=1,\n",
+ "    subsample=0.8,\n",
+ ")\n",
+ "xgb_model.fit(X_train, y_train)\n",
+ "xgb_prediction = xgb_model.predict(X_valid)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "training score: 0.6177597850983563\n",
+ "validation score: 0.6154032008574583\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " 0 0.70 0.95 0.80 84267\n",
+ " 1 0.77 0.30 0.43 49148\n",
+ "\n",
+ " micro avg 0.71 0.71 0.71 133415\n",
+ " macro avg 0.73 0.62 0.62 133415\n",
+ "weighted avg 0.72 0.71 0.66 133415\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Macro-averaged F1 on both splits, then the per-class report for the validation split.\n",
+ "# f1_score / classification_report come from the import cell at the top of the notebook.\n",
+ "print('training score:', f1_score(y_train, xgb_model.predict(X_train), average='macro'))\n",
+ "# Reuse the validation predictions that were already computed instead of predicting again.\n",
+ "print('validation score:', f1_score(y_valid, xgb_prediction, average='macro'))\n",
+ "print(classification_report(y_valid, xgb_prediction))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Word level TF-IDF"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Word-level TF-IDF with the vocabulary capped at 5000 terms.\n",
+ "tfidf_vect = TfidfVectorizer(\n",
+ "    analyzer='word',\n",
+ "    token_pattern=r'\\w{1,}',\n",
+ "    max_features=5000,\n",
+ ")\n",
+ "# Vocabulary and IDF weights are learned from every distinct question of both columns.\n",
+ "tfidf_vect.fit(pd.concat((df['question1'], df['question2'])).unique())\n",
+ "\n",
+ "# One sparse matrix per question column, placed side by side as the feature matrix.\n",
+ "trainq1_trans = tfidf_vect.transform(df['question1'].values)\n",
+ "trainq2_trans = tfidf_vect.transform(df['question2'].values)\n",
+ "X = scipy.sparse.hstack((trainq1_trans, trainq2_trans))\n",
+ "\n",
+ "labels = df['is_duplicate'].values\n",
+ "y = labels\n",
+ "\n",
+ "X_train, X_valid, y_train, y_valid = train_test_split(\n",
+ "    X, y, test_size=0.33, random_state=42\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "word level tf-idf training score: 0.8493408114951853\n",
+ "word level tf-idf validation score: 0.7576508867065961\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " 0 0.79 0.90 0.84 84267\n",
+ " 1 0.77 0.60 0.67 49148\n",
+ "\n",
+ " micro avg 0.79 0.79 0.79 133415\n",
+ " macro avg 0.78 0.75 0.76 133415\n",
+ "weighted avg 0.79 0.79 0.78 133415\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Train XGBoost on the word-level TF-IDF features and report macro F1 on both splits.\n",
+ "# f1_score / classification_report come from the import cell at the top of the notebook.\n",
+ "# NOTE(review): learning_rate (0.1) and eta (0.3) are both passed - confirm which is intended.\n",
+ "xgb_model = xgb.XGBClassifier(\n",
+ "    max_depth=50,\n",
+ "    n_estimators=80,\n",
+ "    learning_rate=0.1,\n",
+ "    colsample_bytree=.7,\n",
+ "    gamma=0,\n",
+ "    reg_alpha=4,\n",
+ "    objective='binary:logistic',\n",
+ "    eta=0.3,\n",
+ "    silent=1,\n",
+ "    subsample=0.8,\n",
+ ").fit(X_train, y_train)\n",
+ "xgb_prediction = xgb_model.predict(X_valid)\n",
+ "print('word level tf-idf training score:', f1_score(y_train, xgb_model.predict(X_train), average='macro'))\n",
+ "# Reuse xgb_prediction rather than running the validation prediction a second time.\n",
+ "print('word level tf-idf validation score:', f1_score(y_valid, xgb_prediction, average='macro'))\n",
+ "print(classification_report(y_valid, xgb_prediction))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### N-gram Level TF-IDF"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# TF-IDF over word bigrams and trigrams with the vocabulary capped at 5000 n-grams.\n",
+ "tfidf_vect_ngram = TfidfVectorizer(\n",
+ "    analyzer='word',\n",
+ "    token_pattern=r'\\w{1,}',\n",
+ "    ngram_range=(2,3),\n",
+ "    max_features=5000,\n",
+ ")\n",
+ "# Vocabulary and IDF weights are learned from every distinct question of both columns.\n",
+ "tfidf_vect_ngram.fit(pd.concat((df['question1'], df['question2'])).unique())\n",
+ "\n",
+ "# One sparse matrix per question column, placed side by side as the feature matrix.\n",
+ "trainq1_trans = tfidf_vect_ngram.transform(df['question1'].values)\n",
+ "trainq2_trans = tfidf_vect_ngram.transform(df['question2'].values)\n",
+ "X = scipy.sparse.hstack((trainq1_trans, trainq2_trans))\n",
+ "\n",
+ "labels = df['is_duplicate'].values\n",
+ "y = labels\n",
+ "\n",
+ "X_train, X_valid, y_train, y_valid = train_test_split(\n",
+ "    X, y, test_size=0.33, random_state=42\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "n-gram level tf-idf training score: 0.7193864239031045\n",
+ "n-gram level tf-idf validation score: 0.67470696099733\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " 0 0.73 0.92 0.81 84267\n",
+ " 1 0.75 0.42 0.54 49148\n",
+ "\n",
+ " micro avg 0.73 0.73 0.73 133415\n",
+ " macro avg 0.74 0.67 0.67 133415\n",
+ "weighted avg 0.74 0.73 0.71 133415\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Train XGBoost on the n-gram TF-IDF features and report macro F1 on both splits.\n",
+ "# f1_score / classification_report come from the import cell at the top of the notebook.\n",
+ "# NOTE(review): learning_rate (0.1) and eta (0.3) are both passed - confirm which is intended.\n",
+ "xgb_model = xgb.XGBClassifier(\n",
+ "    max_depth=50,\n",
+ "    n_estimators=80,\n",
+ "    learning_rate=0.1,\n",
+ "    colsample_bytree=.7,\n",
+ "    gamma=0,\n",
+ "    reg_alpha=4,\n",
+ "    objective='binary:logistic',\n",
+ "    eta=0.3,\n",
+ "    silent=1,\n",
+ "    subsample=0.8,\n",
+ ").fit(X_train, y_train)\n",
+ "xgb_prediction = xgb_model.predict(X_valid)\n",
+ "print('n-gram level tf-idf training score:', f1_score(y_train, xgb_model.predict(X_train), average='macro'))\n",
+ "# Reuse xgb_prediction rather than running the validation prediction a second time.\n",
+ "print('n-gram level tf-idf validation score:', f1_score(y_valid, xgb_prediction, average='macro'))\n",
+ "print(classification_report(y_valid, xgb_prediction))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Character Level TF-IDF "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "character level tf-idf training score: 0.9844717869102682\n",
+ "character level tf-idf validation score: 0.8008380950798113\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " 0 0.83 0.91 0.87 84267\n",
+ " 1 0.81 0.67 0.74 49148\n",
+ "\n",
+ " micro avg 0.82 0.82 0.82 133415\n",
+ " macro avg 0.82 0.79 0.80 133415\n",
+ "weighted avg 0.82 0.82 0.82 133415\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "from sklearn.metrics import f1_score, classification_report, accuracy_score\n",
+ "# Character-level TF-IDF over 2- and 3-character n-grams, vocabulary capped at 5000.\n",
+ "# token_pattern is not passed: scikit-learn only uses it when analyzer='word'.\n",
+ "tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', ngram_range=(2,3), max_features=5000)\n",
+ "# Vocabulary and IDF weights are learned from every distinct question of both columns.\n",
+ "tfidf_vect_ngram_chars.fit(pd.concat((df['question1'], df['question2'])).unique())\n",
+ "\n",
+ "# One sparse matrix per question column, placed side by side as the feature matrix.\n",
+ "trainq1_trans = tfidf_vect_ngram_chars.transform(df['question1'].values)\n",
+ "trainq2_trans = tfidf_vect_ngram_chars.transform(df['question2'].values)\n",
+ "labels = df['is_duplicate'].values\n",
+ "X = scipy.sparse.hstack((trainq1_trans, trainq2_trans))\n",
+ "y = labels\n",
+ "X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.33, random_state=42)\n",
+ "\n",
+ "# f1_score / classification_report come from the import cell at the top of the notebook.\n",
+ "# NOTE(review): learning_rate (0.1) and eta (0.3) are both passed - confirm which is intended.\n",
+ "xgb_model = xgb.XGBClassifier(\n",
+ "    max_depth=50,\n",
+ "    n_estimators=80,\n",
+ "    learning_rate=0.1,\n",
+ "    colsample_bytree=.7,\n",
+ "    gamma=0,\n",
+ "    reg_alpha=4,\n",
+ "    objective='binary:logistic',\n",
+ "    eta=0.3,\n",
+ "    silent=1,\n",
+ "    subsample=0.8,\n",
+ ").fit(X_train, y_train)\n",
+ "xgb_prediction = xgb_model.predict(X_valid)\n",
+ "print('character level tf-idf training score:', f1_score(y_train, xgb_model.predict(X_train), average='macro'))\n",
+ "# Reuse xgb_prediction rather than running the validation prediction a second time.\n",
+ "print('character level tf-idf validation score:', f1_score(y_valid, xgb_prediction, average='macro'))\n",
+ "print(classification_report(y_valid, xgb_prediction))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}