From 89f084ddcebb6f41a1c43a1495b448ee357d6e68 Mon Sep 17 00:00:00 2001 From: Susan Li Date: Sun, 21 Oct 2018 21:03:43 -0400 Subject: [PATCH] Delete BOW_TFIDF_Xgboost.ipynb --- BOW_TFIDF_Xgboost.ipynb | 647 ---------------------------------------- 1 file changed, 647 deletions(-) delete mode 100644 BOW_TFIDF_Xgboost.ipynb diff --git a/BOW_TFIDF_Xgboost.ipynb b/BOW_TFIDF_Xgboost.ipynb deleted file mode 100644 index 8c5d2dc..0000000 --- a/BOW_TFIDF_Xgboost.ipynb +++ /dev/null @@ -1,647 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n", - "from sklearn import linear_model\n", - "import numpy as np\n", - "from sklearn.model_selection import train_test_split\n", - "import scipy\n", - "from sklearn.metrics import log_loss\n", - "import xgboost as xgb\n", - "from sklearn.metrics import accuracy_score\n", - "from sklearn.metrics import roc_auc_score\n", - "import seaborn as sns\n", - "import matplotlib.pyplot as plt\n", - "%matplotlib inline" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idqid1qid2question1question2is_duplicate
0012What is the step by step guide to invest in sh...What is the step by step guide to invest in sh...0
1134What is the story of Kohinoor (Koh-i-Noor) Dia...What would happen if the Indian government sto...0
2256How can I increase the speed of my internet co...How can Internet speed be increased by hacking...0
3378Why am I mentally very lonely? How can I solve...Find the remainder when [math]23^{24}[/math] i...0
44910Which one dissolve in water quikly sugar, salt...Which fish would survive in salt water?0
\n", - "
" - ], - "text/plain": [ - " id qid1 qid2 question1 \\\n", - "0 0 1 2 What is the step by step guide to invest in sh... \n", - "1 1 3 4 What is the story of Kohinoor (Koh-i-Noor) Dia... \n", - "2 2 5 6 How can I increase the speed of my internet co... \n", - "3 3 7 8 Why am I mentally very lonely? How can I solve... \n", - "4 4 9 10 Which one dissolve in water quikly sugar, salt... \n", - "\n", - " question2 is_duplicate \n", - "0 What is the step by step guide to invest in sh... 0 \n", - "1 What would happen if the Indian government sto... 0 \n", - "2 How can Internet speed be increased by hacking... 0 \n", - "3 Find the remainder when [math]23^{24}[/math] i... 0 \n", - "4 Which fish would survive in salt water? 0 " - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = pd.read_csv('quora_train.csv')\n", - "df = df.dropna(how=\"any\").reset_index(drop=True)\n", - "\n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAY0AAAEHCAYAAABSjBpvAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAEjNJREFUeJzt3X+s3XV9x/Hny1acDhWUSliLK9EuE11EbYDofjBZoLBlxQ0y2CIdNqszkGiim2iygD9IMIuasSkLhEoxTmSoo3HV2iHOGRF6kQpU1N4gSi2BahFxRB343h/nc/VwOb3303srp3ifj+Sb8z3vz4/v5yRtX/n+OKepKiRJ6vGUcS9AkvTkYWhIkroZGpKkboaGJKmboSFJ6mZoSJK6GRqSpG6GhiSpm6EhSeq2eNwL2N8OO+ywWr58+biXIUlPKrfccsv3qmrJbP1+5UJj+fLlTExMjHsZkvSkkuTbPf28PCVJ6mZoSJK6GRqSpG6GhiSpm6EhSepmaEiSuhkakqRuhoYkqduv3Jf7niyWn/+f417Cr5S7L/7jcS9BWhBmPdNIcmSSG5LcmWR7kje2+oVJvptkW9tOHRrztiSTSb6R5OSh+qpWm0xy/lD9qCQ3JdmR5GNJDmr1p7X3k619+f788JKkfdNzeeoR4M1V9SLgeODcJEe3tvdX1TFt2wTQ2s4EXgysAj6YZFGSRcAHgFOAo4GzhuZ5T5trBfAAsLbV1wIPVNULgfe3fpKkMZk1NKrq3qr6Stt/CLgTWDrDkNXA1VX1k6r6FjAJHNu2yaq6q6p+ClwNrE4S4NXAtW38BuC0obk2tP1rgRNbf0nSGOzTjfB2eehlwE2tdF6S25KsT3Joqy0F7hkatrPV9lZ/LvCDqnpkWv0xc7X2B1t/SdIYdIdGkoOBjwNvqqofApcCLwCOAe4F3jvVdcTwmkN9prmmr21dkokkE7t3757xc0iS5q4rNJI8lUFgfKSqPgFQVfdV1aNV9TPgcgaXn2BwpnDk0PBlwK4Z6t8DDkmyeFr9MXO19mcDe6avr6ouq6qVVbVyyZJZfw5ekjRHPU9PBbgCuLOq3jdUP2Ko22uAO9r+RuDM9uTTUcAK4GZgK7CiPSl1EIOb5RurqoAbgNPb+DXAdUNzrWn7pwOfa/0lSWPQ8z2NVwGvBW5Psq3V3s7g6adjGFwuuht4PUBVbU9yDfA1Bk9enVtVjwIkOQ/YDCwC1lfV9jbfW4Grk7wbuJVBSNFeP5xkksEZxpnz+KySpHmaNTSq6ouMvrewaYYxFwEXjahvGjWuqu7iF5e3hus/Bs6YbY2SpCeGPyMiSepmaEiSuhkakqRuhoYkqZuhIUnqZmhIkroZGpKkboaGJKmboSFJ6mZoSJK6GRqSpG6GhiSpm6EhSepmaEiSuhkakqRuhoYkqZuhIUnqZmhIkroZGpKkboaGJKmboSFJ6mZoSJK6GRqSpG6GhiSpm6EhSepmaEiSuhkakqRuhoYkqZuhIUnqZmhIkroZGpKkboaGJKnbrKGR5MgkNyS5M8n2JG9s9eck2ZJkR3s9tNWT5JIkk0luS/LyobnWtP47kqwZqr8iye1tzCVJMtMxJEnj0XOm8Qjw5qp6EXA8cG6So4HzgeuragVwfXsPcAqwom3rgEthEADABcBxwLHABUMhcGnrOzVuVavv7RiSpDGYNTSq6t6q+krbfwi4E1gKrAY2tG4bgNPa/mrgqhr4MnBIkiOAk4EtVbWnqh4AtgCrWtuzqurGqirgqmlzjTqGJGkM9umeRpLlwMuAm4DDq+peGAQL8LzWbSlwz9Cwna02U33niDozHGP6utYlmUgysXv37n35SJKkfdAdGkkOBj4OvKmqfjhT1xG1mkO9W1VdVlUrq2rlkiVL9mWoJGkfdIVGkqcyCIyPVNUnWvm+dmmJ9np/q+8EjhwavgzYNUt92Yj6TMeQJI1Bz9NTAa4A7qyq9w01bQSmnoBaA1w3VD+7PUV1PPBgu7S0GTgpyaHtBvhJwObW9lCS49uxzp4216hjSJLGYHFHn1cBrwVuT7Kt1d4OXAxck2Qt8B3gjNa2CTgVmAQeBs4BqKo9Sd4FbG393llVe9r+G4ArgacDn24bMxxDkjQGs4ZGVX2R0fcdAE4c0b+Ac/cy13pg/Yj6BPCSEfXvjzqGJGk8/Ea4JKmboSFJ6mZoSJK6GRqSpG6GhiSpm6EhSepmaEiSuhkakqRuhoYkqZuhIUnqZmhIkroZGpKkboaGJKmboSFJ6mZoSJK6GRqSpG6GhiSpm6EhSepmaEiSuhkakqRuhoYkqZuhIUnqZmhIkroZGpKkboaGJKmboSFJ6mZoSJK6GRqSpG6GhiSpm6EhSepmaEiSus0aGknWJ7k/yR1DtQuTfDfJtradOtT2tiSTSb6R5OSh+qpWm0xy/lD9qCQ3JdmR5GNJDmr1p7X3k619+f760JKkuek507gSWDWi/v6qOqZtmwCSHA2cCby4jflgkkVJFgEfAE4BjgbOan0B3tPmWgE8AKxt9bXAA1X1QuD9rZ8kaYxmDY2q+gKwp3O+1cDVVfWTqvoWMAkc27bJqrqrqn4KXA2sThLg1cC1bfwG4LShuTa0/WuBE1t/SdKYzOeexnlJbmuXrw5ttaXAPUN9drba3urPBX5QVY9Mqz9mrtb+YOsvSRqTxXMcdynwLqDa63uB1wGjzgSK0eFUM/RnlrbHSLIOWAfw/Oc/f6Z1S5rNhc8e9wp+tVz44LhXsF/N6Uyjqu6rqker6mfA5QwuP8HgTOHIoa7LgF0z1L8HHJJk8bT6Y+Zq7c9mL5fJquqyqlpZVSuXLFkyl48kSeowp9BIcsTQ29cAU09WbQTObE8+HQWsAG4GtgIr2pNSBzG4Wb6xqgq4ATi9jV8DXDc015q2fzrwudZfkjQms16eSvJR4ATgsCQ7gQuAE5Icw+By0d3A6wGqanuSa4CvAY8A51bVo22e84DNwCJgfVVtb4d4K3B1kncDtwJXtPoVwIeTTDI4wzhz3p9WkjQvs4ZGVZ01onzFiNpU/4uAi0bUNwGbRtTv4heXt4brPwbOmG19kqQnjt8IlyR1MzQkSd0MDUlSN0NDktTN0JAkdTM0JEndDA1JUjdDQ5LUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0MDUlSN0NDktTN0JAkdTM0JEndDA1JUjdDQ5LUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0MDUlSN0NDktTN0JAkdTM0JEndDA1JUjdDQ5LUbdbQSLI+yf1J7hiqPSfJliQ72uuhrZ4klySZTHJbkpcPjVnT+u9Ismao/ookt7cxlyTJTMeQJI1Pz5nGlcCqabXzgeuragVwfXsPcAqwom3rgEthEADABcBxwLHABUMhcGnrOzVu1SzHkCSNyayhUVVfAPZMK68GNrT9DcBpQ/WrauDLwCFJjgBOBrZU1Z6qegDYAqxqbc+qqhurqoCrps016hiSpDGZ6z2Nw6vqXoD2+rxWXwrcM9RvZ6vNVN85oj7TMR4nybokE0kmdu/ePcePJEmazf6+EZ4RtZpDfZ9U1WVVtbKqVi5ZsmRfh0uSOs01NO5rl5Zor/e3+k7gyKF+y4Bds9SXjajPdAxJ0pjMNTQ2AlNPQK0Brhuqn92eojoeeLBdWtoMnJTk0HYD/CRgc2t7KMnx7amps6fNNeoYkqQxWTxbhyQfBU4ADkuyk8FTUBcD1yRZC3wHOKN13wScCkwCDwPnAFTVniTvAra2fu+sqqmb629g8ITW04FPt40ZjiFJGpNZQ6OqztpL04kj+hZw7l7mWQ+sH1GfAF4yov79UceQJI2P3wiXJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0MDUlSN0NDktTN0JAkdTM0JEndDA1JUjdDQ5LUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0MDUlSN0NDktTN0JAkdTM0JEndDA1JUjdDQ5LUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0MDUlSN0NDktRtXqGR5O4ktyfZlmSi1Z6TZEuSHe310FZPkkuSTCa5LcnLh+ZZ0/rvSLJmqP6KNv9kG5v5rFeSND/740zjD6vqmKpa2d6fD1xfVSuA69t7gFOAFW1bB1wKg5ABLgCOA44FLpgKmtZn3dC4VfthvZKkOfplXJ5aDWxo+xuA04bqV9XAl4FDkhwBnAxsqao9VfUAsAVY1dqeVVU3VlUBVw3NJUkag/mGRgGfTXJLknWtdnhV3QvQXp/X6kuBe4bG7my1meo7R9QfJ8m6JBNJJnbv3j3PjyRJ2pvF8xz/qqraleR5wJYkX5+h76j7ETWH+uOLVZcBlwGsXLlyZB9J0vzN60yjqna11/uBTzK4J3Ffu7REe72/dd8JHDk0fBmwa5b6shF1SdKYzDk0kvx6kmdO7QMnAXcAG4GpJ6DWANe1/Y3A2e0pquOBB9vlq83ASUkObTfATwI2t7aHkhzfnpo6e2guSdIYzOfy1OHAJ9tTsIuBf6uqzyTZClyTZC3wHeCM1n8TcCowCTwMnANQVXuSvAvY2vq9s6r2tP03AFcCTwc+3TZJ0pjMOTSq6i7gpSPq3wdOHFEv4Ny9zLUeWD+iPgG8ZK5rlCTtX34jXJLUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0MDUlSN0NDktTN0JAkdTM0JEndDA1JUjdDQ5LUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0MDUlSN0NDktTN0JAkdTM0JEndDA1JUjdDQ5LUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0O+NBIsirJN5JMJjl/3OuRpIXsgA6NJIuADwCnAEcDZyU5eryrkqSF64AODeBYYLKq7qqqnwJXA6vHvCZJWrAO9NBYCtwz9H5nq0mSxmDxuBcwi4yo1eM6JeuAde3tj5J845e6qoXlMOB7417EbPKeca9AY/Ck+LPJO0b9M3ZA+s2eTgd6aOwEjhx6vwzYNb1TVV0GXPZELWohSTJRVSvHvQ5pOv9sjseBfnlqK7AiyVFJDgLOBDaOeU2StGAd0GcaVfVIkvOAzcAiYH1VbR/zsiRpwTqgQwOgqjYBm8a9jgXMy346UPlncwxS9bj7ypIkjXSg39OQJB1ADA1JUrcD/p6GnjhJfpvBN+6XMvg+zC5gY1XdOdaFSTpgeKYhAJK8lcHPtAS4mcHjzgE+6g9FSprijXABkOSbwIur6v+m1Q8CtlfVivGsTJpZknOq6kPjXsdC4ZmGpvwM+I0R9SNam3Sgese4F7CQeE9DU94EXJ9kB7/4kcjnAy8EzhvbqiQgyW17awIOfyLXstB5eUo/l+QpDH6OfimDv4w7ga1V9ehYF6YFL8l9wMnAA9ObgC9V1aizZP0SeKahn6uqnwFfHvc6pBE+BRxcVdumNyT5/BO/nIXLMw1JUjdvhEuSuhkakqRuhoYWrCRfmuf4v07yL/MYf3eSw+azliSnJTl6rmuQ9pWhoQWrql457jVMmcdaTgMMDT1hDA0tWEl+1F6PSPKFJNuS3JHk92YYc06Sbyb5b+BVQ/Urk5w+Yu4T2tyfTPK1JP/aHm0euZa2//dJbk/y1SQXt9rfJNnaah9P8owkrwT+FPjHtvYXtO0zSW5J8j/t98Sk/cZHbiX4S2BzVV2UZBHwjFGdkhzB4NvHrwAeBG4Abu2Y/1gGZwPfBj4D/Blw7V6OcQqDs4fjqurhJM9pTZ+oqstbn3cDa6vqn5NsBD5VVde2tuuBv62qHUmOAz4IvLpjjVIXQ0Ma/Djj+iRPBf5j1HcBmuOAz1fVboAkHwN+q2P+m6vqrjbmo8DvspfQAP4I+FBVPQxQVXta/SUtLA4BDmbwXyA/RpKDgVcC/55kqvy0jvVJ3bw8pQWvqr4A/D7wXeDDSc6eqfte6o/Q/j5l8C/2QTOMmenLUdlL+5XAeVX1OwzOdn5tRJ+nAD+oqmOGthfNcCxpnxkaWvCS/CZwf7v8cwXw8r10vQk4Iclz21nJGUNtdzO4bAWD/5PkqUNtxyY5qt3L+AvgizMs57PA65I8o61t6vLUM4F723H/aqj/Q62Nqvoh8K0kZ7SxSfLSGY4l7TNDQ4ITgG1JbgX+HPinUZ2q6l7gQuBG4L+Arww1Xw78QZKbGVzG+t+hthuBi4E7gG8Bn9zbQqrqM8BGYCLJNuAtrekfGITWFuDrQ0OuBv4uya1JXsAgUNYm+SqwnUGASfuNPyMi/RIlOQF4S1X9ybjXIu0PnmlIkrp5piGNkOQmHv/k0Wur6vZxrEc6UBgakqRuXp6SJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1+38qOCjM258g7gAAAABJRU5ErkJggg==\n", - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "df.groupby(\"is_duplicate\")['id'].count().plot.bar()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "df.drop(['id', 'qid1', 'qid2'], axis=1, inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "What is the step by step guide to invest in share market in india?\n", - "What is the step by step guide to invest in share market?\n", - "\n", - "What is the story of Kohinoor (Koh-i-Noor) Diamond?\n", - "What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?\n", - "\n", - "How can I increase the speed of my internet connection while using a VPN?\n", - "How can Internet speed be increased by hacking through DNS?\n", - "\n", - "Why am I mentally very lonely? How can I solve it?\n", - "Find the remainder when [math]23^{24}[/math] is divided by 24,23?\n", - "\n", - "Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?\n", - "Which fish would survive in salt water?\n", - "\n", - "Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?\n", - "I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?\n", - "\n", - "Should I buy tiago?\n", - "What keeps childern active and far from phone and video games?\n", - "\n", - "How can I be a good geologist?\n", - "What should I do to be a great geologist?\n", - "\n", - "When do you use シ instead of し?\n", - "When do you use \"&\" instead of \"and\"?\n", - "\n", - "Motorola (company): Can I hack my Charter Motorolla DCX3400?\n", - "How do I hack Motorola DCX3400 for free internet?\n", - "\n" - ] - } - ], - "source": [ - "a = 0 \n", - "for i in range(a,a+10):\n", - " print(df.question1[i])\n", - " print(df.question2[i])\n", - " print()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "SPECIAL_TOKENS = {\n", - " 'quoted': 'quoted_item',\n", - " 'non-ascii': 'non_ascii_word',\n", - " 'undefined': 'something'\n", - "}\n", - "\n", - "def clean(text, stem_words=True):\n", - " import re\n", - " from string import punctuation\n", - " from nltk.stem import SnowballStemmer\n", - " from nltk.corpus import stopwords\n", - " \n", - " def pad_str(s):\n", - " return ' '+s+' '\n", - " \n", - " if pd.isnull(text):\n", - " return ''\n", - "\n", - "# stops = set(stopwords.words(\"english\"))\n", - " # Clean the text, with the option to stem words.\n", - " \n", - " # Empty question\n", - " \n", - " if type(text) != str or text=='':\n", - " return ''\n", - "\n", - " # Clean the text\n", - " text = re.sub(\"\\'s\", \" \", text) # we have cases like \"Sam is\" or \"Sam's\" (i.e. his) these two cases aren't separable, I choose to compromise are kill \"'s\" directly\n", - " text = re.sub(\" whats \", \" what is \", text, flags=re.IGNORECASE)\n", - " text = re.sub(\"\\'ve\", \" have \", text)\n", - " text = re.sub(\"can't\", \"can not\", text)\n", - " text = re.sub(\"n't\", \" not \", text)\n", - " text = re.sub(\"i'm\", \"i am\", text, flags=re.IGNORECASE)\n", - " text = re.sub(\"\\'re\", \" are \", text)\n", - " text = re.sub(\"\\'d\", \" would \", text)\n", - " text = re.sub(\"\\'ll\", \" will \", text)\n", - " text = re.sub(\"e\\.g\\.\", \" eg \", text, flags=re.IGNORECASE)\n", - " text = re.sub(\"b\\.g\\.\", \" bg \", text, flags=re.IGNORECASE)\n", - " text = re.sub(\"(\\d+)(kK)\", \" \\g<1>000 \", text)\n", - " text = re.sub(\"e-mail\", \" email \", text, flags=re.IGNORECASE)\n", - " text = re.sub(\"(the[\\s]+|The[\\s]+)?U\\.S\\.A\\.\", \" America \", text, flags=re.IGNORECASE)\n", - " text = re.sub(\"(the[\\s]+|The[\\s]+)?United State(s)?\", \" America \", text, flags=re.IGNORECASE)\n", - " text = re.sub(\"\\(s\\)\", \" \", text, flags=re.IGNORECASE)\n", - " text = re.sub(\"[c-fC-F]\\:\\/\", \" disk \", text)\n", - " \n", - " # remove comma between numbers, i.e. 15,000 -> 15000\n", - " \n", - " text = re.sub('(?<=[0-9])\\,(?=[0-9])', \"\", text)\n", - " \n", - "# # all numbers should separate from words, this is too aggressive\n", - " \n", - "# def pad_number(pattern):\n", - "# matched_string = pattern.group(0)\n", - "# return pad_str(matched_string)\n", - "# text = re.sub('[0-9]+', pad_number, text)\n", - " \n", - " # add padding to punctuations and special chars, we still need them later\n", - " \n", - " text = re.sub('\\$', \" dollar \", text)\n", - " text = re.sub('\\%', \" percent \", text)\n", - " text = re.sub('\\&', \" and \", text)\n", - " \n", - "# def pad_pattern(pattern):\n", - "# matched_string = pattern.group(0)\n", - "# return pad_str(matched_string)\n", - "# text = re.sub('[\\!\\?\\@\\^\\+\\*\\/\\,\\~\\|\\`\\=\\:\\;\\.\\#\\\\\\]', pad_pattern, text) \n", - " \n", - " text = re.sub('[^\\x00-\\x7F]+', pad_str(SPECIAL_TOKENS['non-ascii']), text) # replace non-ascii word with special word\n", - " \n", - " # indian dollar\n", - " \n", - " text = re.sub(\"(?<=[0-9])rs \", \" rs \", text, flags=re.IGNORECASE)\n", - " text = re.sub(\" rs(?=[0-9])\", \" rs \", text, flags=re.IGNORECASE)\n", - " \n", - " # clean text rules get from : https://www.kaggle.com/currie32/the-importance-of-cleaning-text\n", - " text = re.sub(r\" (the[\\s]+|The[\\s]+)?US(A)? \", \" America \", text)\n", - " text = re.sub(r\" UK \", \" England \", text, flags=re.IGNORECASE)\n", - " text = re.sub(r\" india \", \" India \", text)\n", - " text = re.sub(r\" switzerland \", \" Switzerland \", text)\n", - " text = re.sub(r\" china \", \" China \", text)\n", - " text = re.sub(r\" chinese \", \" Chinese \", text) \n", - " text = re.sub(r\" imrovement \", \" improvement \", text, flags=re.IGNORECASE)\n", - " text = re.sub(r\" intially \", \" initially \", text, flags=re.IGNORECASE)\n", - " text = re.sub(r\" quora \", \" Quora \", text, flags=re.IGNORECASE)\n", - " text = re.sub(r\" dms \", \" direct messages \", text, flags=re.IGNORECASE) \n", - " text = re.sub(r\" demonitization \", \" demonetization \", text, flags=re.IGNORECASE) \n", - " text = re.sub(r\" actived \", \" active \", text, flags=re.IGNORECASE)\n", - " text = re.sub(r\" kms \", \" kilometers \", text, flags=re.IGNORECASE)\n", - " text = re.sub(r\" cs \", \" computer science \", text, flags=re.IGNORECASE) \n", - " text = re.sub(r\" upvote\", \" up vote\", text, flags=re.IGNORECASE)\n", - " text = re.sub(r\" iPhone \", \" phone \", text, flags=re.IGNORECASE)\n", - " text = re.sub(r\" \\0rs \", \" rs \", text, flags=re.IGNORECASE)\n", - " text = re.sub(r\" calender \", \" calendar \", text, flags=re.IGNORECASE)\n", - " text = re.sub(r\" ios \", \" operating system \", text, flags=re.IGNORECASE)\n", - " text = re.sub(r\" gps \", \" GPS \", text, flags=re.IGNORECASE)\n", - " text = re.sub(r\" gst \", \" GST \", text, flags=re.IGNORECASE)\n", - " text = re.sub(r\" programing \", \" programming \", text, flags=re.IGNORECASE)\n", - " text = re.sub(r\" bestfriend \", \" best friend \", text, flags=re.IGNORECASE)\n", - " text = re.sub(r\" dna \", \" DNA \", text, flags=re.IGNORECASE)\n", - " text = re.sub(r\" III \", \" 3 \", text)\n", - " text = re.sub(r\" banglore \", \" Banglore \", text, flags=re.IGNORECASE)\n", - " text = re.sub(r\" J K \", \" JK \", text, flags=re.IGNORECASE)\n", - " text = re.sub(r\" J\\.K\\. \", \" JK \", text, flags=re.IGNORECASE)\n", - " \n", - " # replace the float numbers with a random number, it will be parsed as number afterward, and also been replaced with word \"number\"\n", - " \n", - " text = re.sub('[0-9]+\\.[0-9]+', \" 87 \", text)\n", - " \n", - " \n", - " # Remove punctuation from text\n", - " text = ''.join([c for c in text if c not in punctuation]).lower()\n", - " # Return a list of words\n", - " return text" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "df['question1'] = df['question1'].apply(clean)\n", - "df['question2'] = df['question2'].apply(clean)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "what is the step by step guide to invest in share market in india\n", - "what is the step by step guide to invest in share market\n", - "\n", - "what is the story of kohinoor kohinoor diamond\n", - "what would happen if the indian government stole the kohinoor kohinoor diamond back\n", - "\n", - "how can i increase the speed of my internet connection while using a vpn\n", - "how can internet speed be increased by hacking through dns\n", - "\n", - "why am i mentally very lonely how can i solve it\n", - "find the remainder when math2324math is divided by 2423\n", - "\n", - "which one dissolve in water quikly sugar salt methane and carbon di oxide\n", - "which fish would survive in salt water\n", - "\n", - "astrology i am a capricorn sun cap moon and cap risingwhat does that say about me\n", - "i am a triple capricorn sun moon and ascendant in capricorn what does this say about me\n", - "\n", - "should i buy tiago\n", - "what keeps childern active and far from phone and video games\n", - "\n", - "how can i be a good geologist\n", - "what should i do to be a great geologist\n", - "\n", - "when do you use nonasciiword instead of nonasciiword \n", - "when do you use and instead of and\n", - "\n", - "motorola company can i hack my charter motorolla dcx3400\n", - "how do i hack motorola dcx3400 for free internet\n", - "\n" - ] - } - ], - "source": [ - "a = 0 \n", - "for i in range(a,a+10):\n", - " print(df.question1[i])\n", - " print(df.question2[i])\n", - " print()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### BOW + Xgboost Model" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "count_vect = CountVectorizer(analyzer='word', token_pattern=r'\\w{1,}')\n", - "count_vect.fit(pd.concat((df['question1'],df['question2'])).unique())\n", - "trainq1_trans = count_vect.transform(df['question1'].values)\n", - "trainq2_trans = count_vect.transform(df['question2'].values)\n", - "labels = df['is_duplicate'].values\n", - "X = scipy.sparse.hstack((trainq1_trans,trainq2_trans))\n", - "y = labels\n", - "X_train,X_valid,y_train,y_valid = train_test_split(X,y, test_size = 0.33, random_state = 42)\n", - "xgb_model = xgb.XGBClassifier(max_depth=50, n_estimators=80, learning_rate=0.1, colsample_bytree=.7, gamma=0, reg_alpha=4, objective='binary:logistic', eta=0.3, silent=1, subsample=0.8).fit(X_train, y_train) \n", - "xgb_prediction = xgb_model.predict(X_valid)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "training score: 0.6177597850983563\n", - "validation score: 0.6154032008574583\n", - " precision recall f1-score support\n", - "\n", - " 0 0.70 0.95 0.80 84267\n", - " 1 0.77 0.30 0.43 49148\n", - "\n", - " micro avg 0.71 0.71 0.71 133415\n", - " macro avg 0.73 0.62 0.62 133415\n", - "weighted avg 0.72 0.71 0.66 133415\n", - "\n" - ] - } - ], - "source": [ - "from sklearn.metrics import f1_score, classification_report, accuracy_score\n", - "\n", - "print('training score:', f1_score(y_train, xgb_model.predict(X_train), average='macro'))\n", - "print('validation score:', f1_score(y_valid, xgb_model.predict(X_valid), average='macro'))\n", - "print(classification_report(y_valid, xgb_prediction))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Word level TF-IDF" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\\w{1,}', max_features=5000)\n", - "tfidf_vect.fit(pd.concat((df['question1'],df['question2'])).unique())\n", - "trainq1_trans = tfidf_vect.transform(df['question1'].values)\n", - "trainq2_trans = tfidf_vect.transform(df['question2'].values)\n", - "labels = df['is_duplicate'].values\n", - "X = scipy.sparse.hstack((trainq1_trans,trainq2_trans))\n", - "y = labels\n", - "X_train,X_valid,y_train,y_valid = train_test_split(X,y, test_size = 0.33, random_state = 42)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "word level tf-idf training score: 0.8493408114951853\n", - "word level tf-idf validation score: 0.7576508867065961\n", - " precision recall f1-score support\n", - "\n", - " 0 0.79 0.90 0.84 84267\n", - " 1 0.77 0.60 0.67 49148\n", - "\n", - " micro avg 0.79 0.79 0.79 133415\n", - " macro avg 0.78 0.75 0.76 133415\n", - "weighted avg 0.79 0.79 0.78 133415\n", - "\n" - ] - } - ], - "source": [ - "from sklearn.metrics import f1_score, classification_report, accuracy_score\n", - "xgb_model = xgb.XGBClassifier(max_depth=50, n_estimators=80, learning_rate=0.1, colsample_bytree=.7, gamma=0, reg_alpha=4, objective='binary:logistic', eta=0.3, silent=1, subsample=0.8).fit(X_train, y_train) \n", - "xgb_prediction = xgb_model.predict(X_valid)\n", - "print('word level tf-idf training score:', f1_score(y_train, xgb_model.predict(X_train), average='macro'))\n", - "print('word level tf-idf validation score:', f1_score(y_valid, xgb_model.predict(X_valid), average='macro'))\n", - "print(classification_report(y_valid, xgb_prediction))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### N-gram Level TF-IDF" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\\w{1,}', ngram_range=(2,3), max_features=5000)\n", - "tfidf_vect_ngram.fit(pd.concat((df['question1'],df['question2'])).unique())\n", - "trainq1_trans = tfidf_vect_ngram.transform(df['question1'].values)\n", - "trainq2_trans = tfidf_vect_ngram.transform(df['question2'].values)\n", - "labels = df['is_duplicate'].values\n", - "X = scipy.sparse.hstack((trainq1_trans,trainq2_trans))\n", - "y = labels\n", - "X_train,X_valid,y_train,y_valid = train_test_split(X,y, test_size = 0.33, random_state = 42)" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "n-gram level tf-idf training score: 0.7193864239031045\n", - "n-gram level tf-idf validation score: 0.67470696099733\n", - " precision recall f1-score support\n", - "\n", - " 0 0.73 0.92 0.81 84267\n", - " 1 0.75 0.42 0.54 49148\n", - "\n", - " micro avg 0.73 0.73 0.73 133415\n", - " macro avg 0.74 0.67 0.67 133415\n", - "weighted avg 0.74 0.73 0.71 133415\n", - "\n" - ] - } - ], - "source": [ - "xgb_model = xgb.XGBClassifier(max_depth=50, n_estimators=80, learning_rate=0.1, colsample_bytree=.7, gamma=0, reg_alpha=4, objective='binary:logistic', eta=0.3, silent=1, subsample=0.8).fit(X_train, y_train) \n", - "xgb_prediction = xgb_model.predict(X_valid)\n", - "print('n-gram level tf-idf training score:', f1_score(y_train, xgb_model.predict(X_train), average='macro'))\n", - "print('n-gram level tf-idf validation score:', f1_score(y_valid, xgb_model.predict(X_valid), average='macro'))\n", - "print(classification_report(y_valid, xgb_prediction))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Character Level TF-IDF " - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "character level tf-idf training score: 0.9844717869102682\n", - "character level tf-idf validation score: 0.8008380950798113\n", - " precision recall f1-score support\n", - "\n", - " 0 0.83 0.91 0.87 84267\n", - " 1 0.81 0.67 0.74 49148\n", - "\n", - " micro avg 0.82 0.82 0.82 133415\n", - " macro avg 0.82 0.79 0.80 133415\n", - "weighted avg 0.82 0.82 0.82 133415\n", - "\n" - ] - } - ], - "source": [ - "from sklearn.metrics import f1_score, classification_report, accuracy_score\n", - "tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\\w{1,}', ngram_range=(2,3), max_features=5000)\n", - "tfidf_vect_ngram_chars.fit(pd.concat((df['question1'],df['question2'])).unique())\n", - "trainq1_trans = tfidf_vect_ngram_chars.transform(df['question1'].values)\n", - "trainq2_trans = tfidf_vect_ngram_chars.transform(df['question2'].values)\n", - "labels = df['is_duplicate'].values\n", - "X = scipy.sparse.hstack((trainq1_trans,trainq2_trans))\n", - "y = labels\n", - "X_train,X_valid,y_train,y_valid = train_test_split(X,y, test_size = 0.33, random_state = 42)\n", - "xgb_model = xgb.XGBClassifier(max_depth=50, n_estimators=80, learning_rate=0.1, colsample_bytree=.7, gamma=0, reg_alpha=4, objective='binary:logistic', eta=0.3, silent=1, subsample=0.8).fit(X_train, y_train) \n", - "xgb_prediction = xgb_model.predict(X_valid)\n", - "print('character level tf-idf training score:', f1_score(y_train, xgb_model.predict(X_train), average='macro'))\n", - "print('character level tf-idf validation score:', f1_score(y_valid, xgb_model.predict(X_valid), average='macro'))\n", - "print(classification_report(y_valid, xgb_prediction))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}