From 7c6d65c19fc5151ad292ba2bdf996aa5831c60e0 Mon Sep 17 00:00:00 2001 From: Susan Li Date: Fri, 19 Oct 2018 14:59:28 -0400 Subject: [PATCH] Add notebook --- Quora_bow_tfidf_randomforest.ipynb | 993 +++++++++++++++++++++++++++++ 1 file changed, 993 insertions(+) create mode 100644 Quora_bow_tfidf_randomforest.ipynb diff --git a/Quora_bow_tfidf_randomforest.ipynb b/Quora_bow_tfidf_randomforest.ipynb new file mode 100644 index 0000000..2c0a05b --- /dev/null +++ b/Quora_bow_tfidf_randomforest.ipynb @@ -0,0 +1,993 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idqid1qid2question1question2is_duplicate
0012What is the step by step guide to invest in sh...What is the step by step guide to invest in sh...0
1134What is the story of Kohinoor (Koh-i-Noor) Dia...What would happen if the Indian government sto...0
2256How can I increase the speed of my internet co...How can Internet speed be increased by hacking...0
3378Why am I mentally very lonely? How can I solve...Find the remainder when [math]23^{24}[/math] i...0
44910Which one dissolve in water quikly sugar, salt...Which fish would survive in salt water?0
551112Astrology: I am a Capricorn Sun Cap moon and c...I'm a triple Capricorn (Sun, Moon and ascendan...1
661314Should I buy tiago?What keeps childern active and far from phone ...0
771516How can I be a good geologist?What should I do to be a great geologist?1
881718When do you use シ instead of し?When do you use \"&\" instead of \"and\"?0
991920Motorola (company): Can I hack my Charter Moto...How do I hack Motorola DCX3400 for free internet?0
\n", + "
" + ], + "text/plain": [ + " id qid1 qid2 question1 \\\n", + "0 0 1 2 What is the step by step guide to invest in sh... \n", + "1 1 3 4 What is the story of Kohinoor (Koh-i-Noor) Dia... \n", + "2 2 5 6 How can I increase the speed of my internet co... \n", + "3 3 7 8 Why am I mentally very lonely? How can I solve... \n", + "4 4 9 10 Which one dissolve in water quikly sugar, salt... \n", + "5 5 11 12 Astrology: I am a Capricorn Sun Cap moon and c... \n", + "6 6 13 14 Should I buy tiago? \n", + "7 7 15 16 How can I be a good geologist? \n", + "8 8 17 18 When do you use シ instead of し? \n", + "9 9 19 20 Motorola (company): Can I hack my Charter Moto... \n", + "\n", + " question2 is_duplicate \n", + "0 What is the step by step guide to invest in sh... 0 \n", + "1 What would happen if the Indian government sto... 0 \n", + "2 How can Internet speed be increased by hacking... 0 \n", + "3 Find the remainder when [math]23^{24}[/math] i... 0 \n", + "4 Which fish would survive in salt water? 0 \n", + "5 I'm a triple Capricorn (Sun, Moon and ascendan... 1 \n", + "6 What keeps childern active and far from phone ... 0 \n", + "7 What should I do to be a great geologist? 1 \n", + "8 When do you use \"&\" instead of \"and\"? 0 \n", + "9 How do I hack Motorola DCX3400 for free internet? 
0 " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv('quora_train.csv')\n", + "# df = df.sample(frac=0.10, random_state=99)\n", + "df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id 0\n", + "qid1 0\n", + "qid2 0\n", + "question1 1\n", + "question2 2\n", + "is_duplicate 0\n", + "dtype: int64" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(404290, 6)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "df.dropna(axis=0, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(404287, 6)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id 0\n", + "qid1 0\n", + "qid2 0\n", + "question1 0\n", + "question2 0\n", + "is_duplicate 0\n", + "dtype: int64" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAY0AAAEHCAYAAABSjBpvAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAEjNJREFUeJzt3X+s3XV9x/Hny1acDhWUSliLK9EuE11EbYDofjBZoLBlxQ0y2CIdNqszkGiim2iygD9IMIuasSkLhEoxTmSoo3HV2iHOGRF6kQpU1N4gSi2BahFxRB343h/nc/VwOb3303srp3ifj+Sb8z3vz4/v5yRtX/n+OKepKiRJ6vGUcS9AkvTkYWhIkroZGpKkboaGJKmboSFJ6mZoSJK6GRqSpG6GhiSpm6EhSeq2eNwL2N8OO+ywWr58+biXIUlPKrfccsv3qmrJbP1+5UJj+fLlTExMjHsZkvSkkuTbPf28PCVJ6mZoSJK6GRqSpG6GhiSpm6EhSepmaEiSuhkakqRuhoYkqduv3Jf7niyWn/+f417Cr5S7L/7jcS9BWhBmPdNIcmSSG5LcmWR7kje2+oVJvptkW9tOHRrztiSTSb6R5OSh+qpWm0xy/lD9qCQ3JdmR5GNJDmr1p7X3k619+f788JKkfdNzeeoR4M1V9SLgeODcJEe3tvdX1TFt2wTQ2s4EXgysAj6YZFGSRcAHgFOAo4GzhuZ5T5trBfAAsLbV1wIPVNULgfe3fpKkMZk1NKrq3qr6Stt/CLgTWDrDkNXA1VX1k6r6FjAJHNu2yaq6q6p+ClwNrE4S4NXAtW38BuC0obk2tP1rgRNbf0nSGOzTjfB2eehlwE2tdF6S25KsT3Joqy0F7hkatrPV9lZ/LvCDqnpkWv0xc7X2B1t/SdIYdIdGkoOBjwNvqqofApcCLwCOAe4F3jvVdcTwmkN9prmmr21dkokkE7t3757xc0iS5q4rNJI8lUFgfKSqPgFQVfdV1aNV9TPgcgaXn2BwpnDk0PBlwK4Z6t8DDkmyeFr9MXO19mcDe6avr6ouq6qVVbVyyZJZfw5ekjRHPU9PBbgCuLOq3jdUP2Ko22uAO9r+RuDM9uTTUcAK4GZgK7CiPSl1EIOb5RurqoAbgNPb+DXAdUNzrWn7pwOfa/0lSWPQ8z2NVwGvBW5Psq3V3s7g6adjGFwuuht4PUBVbU9yDfA1Bk9enVtVjwIkOQ/YDCwC1lfV9jbfW4Grk7wbuJVBSNFeP5xkksEZxpnz+KySpHmaNTSq6ouMvrewaYYxFwEXjahvGjWuqu7iF5e3hus/Bs6YbY2SpCeGPyMiSepmaEiSuhkakqRuhoYkqZuhIUnqZmhIkroZGpKkboaGJKmboSFJ6mZoSJK6GRqSpG6GhiSpm6EhSepmaEiSuhkakqRuhoYkqZuhIUnqZmhIkroZGpKkboaGJKmboSFJ6mZoSJK6GRqSpG6GhiSpm6EhSepmaEiSuhkakqRuhoYkqZuhIUnqZmhIkroZGpKkboaGJKnbrKGR5MgkNyS5M8n2JG9s9eck2ZJkR3s9tNWT5JIkk0luS/LyobnWtP47kqwZqr8iye1tzCVJMtMxJEnj0XOm8Qjw5qp6EXA8cG6So4HzgeuragVwfXsPcAqwom3rgEthEADABcBxwLHABUMhcGnrOzVuVavv7RiSpDGYNTSq6t6q+krbfwi4E1gKrAY2tG4bgNPa/mrgqhr4MnBIkiOAk4EtVbWnqh4AtgCrWtuzqurGqirgqmlzjTqGJGkM9umeRpLlwMuAm4DDq+peGAQL8LzWbSlwz9Cwna02U33niDozHGP6utYlmUgysXv37n35SJKkfdAdGkkOBj4OvKmqfjhT1xG1mkO9W1VdVlUrq2rlkiVL9mWoJGkfdIVGkqcyCIyPVNUnWvm+dmmJ9np/q+8EjhwavgzYNUt92Yj6TMeQJI1Bz9NTAa4A7qyq9w01bQSmnoB
aA1w3VD+7PUV1PPBgu7S0GTgpyaHtBvhJwObW9lCS49uxzp4216hjSJLGYHFHn1cBrwVuT7Kt1d4OXAxck2Qt8B3gjNa2CTgVmAQeBs4BqKo9Sd4FbG393llVe9r+G4ArgacDn24bMxxDkjQGs4ZGVX2R0fcdAE4c0b+Ac/cy13pg/Yj6BPCSEfXvjzqGJGk8/Ea4JKmboSFJ6mZoSJK6GRqSpG6GhiSpm6EhSepmaEiSuhkakqRuhoYkqZuhIUnqZmhIkroZGpKkboaGJKmboSFJ6mZoSJK6GRqSpG6GhiSpm6EhSepmaEiSuhkakqRuhoYkqZuhIUnqZmhIkroZGpKkboaGJKmboSFJ6mZoSJK6GRqSpG6GhiSpm6EhSepmaEiSus0aGknWJ7k/yR1DtQuTfDfJtradOtT2tiSTSb6R5OSh+qpWm0xy/lD9qCQ3JdmR5GNJDmr1p7X3k619+f760JKkuek507gSWDWi/v6qOqZtmwCSHA2cCby4jflgkkVJFgEfAE4BjgbOan0B3tPmWgE8AKxt9bXAA1X1QuD9rZ8kaYxmDY2q+gKwp3O+1cDVVfWTqvoWMAkc27bJqrqrqn4KXA2sThLg1cC1bfwG4LShuTa0/WuBE1t/SdKYzOeexnlJbmuXrw5ttaXAPUN9drba3urPBX5QVY9Mqz9mrtb+YOsvSRqTxXMcdynwLqDa63uB1wGjzgSK0eFUM/RnlrbHSLIOWAfw/Oc/f6Z1S5rNhc8e9wp+tVz44LhXsF/N6Uyjqu6rqker6mfA5QwuP8HgTOHIoa7LgF0z1L8HHJJk8bT6Y+Zq7c9mL5fJquqyqlpZVSuXLFkyl48kSeowp9BIcsTQ29cAU09WbQTObE8+HQWsAG4GtgIr2pNSBzG4Wb6xqgq4ATi9jV8DXDc015q2fzrwudZfkjQms16eSvJR4ATgsCQ7gQuAE5Icw+By0d3A6wGqanuSa4CvAY8A51bVo22e84DNwCJgfVVtb4d4K3B1kncDtwJXtPoVwIeTTDI4wzhz3p9WkjQvs4ZGVZ01onzFiNpU/4uAi0bUNwGbRtTv4heXt4brPwbOmG19kqQnjt8IlyR1MzQkSd0MDUlSN0NDktTN0JAkdTM0JEndDA1JUjdDQ5LUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0MDUlSN0NDktTN0JAkdTM0JEndDA1JUjdDQ5LUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0MDUlSN0NDktTN0JAkdTM0JEndDA1JUjdDQ5LUbdbQSLI+yf1J7hiqPSfJliQ72uuhrZ4klySZTHJbkpcPjVnT+u9Ismao/ookt7cxlyTJTMeQJI1Pz5nGlcCqabXzgeuragVwfXsPcAqwom3rgEthEADABcBxwLHABUMhcGnrOzVu1SzHkCSNyayhUVVfAPZMK68GNrT9DcBpQ/WrauDLwCFJjgBOBrZU1Z6qegDYAqxqbc+qqhurqoCrps016hiSpDGZ6z2Nw6vqXoD2+rxWXwrcM9RvZ6vNVN85oj7TMR4nybokE0kmdu/ePcePJEmazf6+EZ4RtZpDfZ9U1WVVtbKqVi5ZsmRfh0uSOs01NO5rl5Zor/e3+k7gyKF+y4Bds9SXjajPdAxJ0pjMNTQ2AlNPQK0Brhuqn92eojoeeLBdWtoMnJTk0HYD/CRgc2t7KMnx7amps6fNNeoYkqQxWTxbhyQfBU4ADkuyk8FTUBcD1yRZC3wHOKN13wScCkwCDwPnAFTVniTvAra2fu+sqqmb629g8ITW04FPt40ZjiFJGpNZQ6OqztpL04kj+hZw7l7mWQ+sH1GfAF4yov79UceQJI2P3wiXJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0MDUlSN0NDktTN0JAkdTM0JEndDA1JUjdDQ5LUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0MDUlSN0NDktTN0JA
kdTM0JEndDA1JUjdDQ5LUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0MDUlSN0NDktRtXqGR5O4ktyfZlmSi1Z6TZEuSHe310FZPkkuSTCa5LcnLh+ZZ0/rvSLJmqP6KNv9kG5v5rFeSND/740zjD6vqmKpa2d6fD1xfVSuA69t7gFOAFW1bB1wKg5ABLgCOA44FLpgKmtZn3dC4VfthvZKkOfplXJ5aDWxo+xuA04bqV9XAl4FDkhwBnAxsqao9VfUAsAVY1dqeVVU3VlUBVw3NJUkag/mGRgGfTXJLknWtdnhV3QvQXp/X6kuBe4bG7my1meo7R9QfJ8m6JBNJJnbv3j3PjyRJ2pvF8xz/qqraleR5wJYkX5+h76j7ETWH+uOLVZcBlwGsXLlyZB9J0vzN60yjqna11/uBTzK4J3Ffu7REe72/dd8JHDk0fBmwa5b6shF1SdKYzDk0kvx6kmdO7QMnAXcAG4GpJ6DWANe1/Y3A2e0pquOBB9vlq83ASUkObTfATwI2t7aHkhzfnpo6e2guSdIYzOfy1OHAJ9tTsIuBf6uqzyTZClyTZC3wHeCM1n8TcCowCTwMnANQVXuSvAvY2vq9s6r2tP03AFcCTwc+3TZJ0pjMOTSq6i7gpSPq3wdOHFEv4Ny9zLUeWD+iPgG8ZK5rlCTtX34jXJLUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0MDUlSN0NDktTN0JAkdTM0JEndDA1JUjdDQ5LUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0MDUlSN0NDktTN0JAkdTM0JEndDA1JUjdDQ5LUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0O+NBIsirJN5JMJjl/3OuRpIXsgA6NJIuADwCnAEcDZyU5eryrkqSF64AODeBYYLKq7qqqnwJXA6vHvCZJWrAO9NBYCtwz9H5nq0mSxmDxuBcwi4yo1eM6JeuAde3tj5J845e6qoXlMOB7417EbPKeca9AY/Ck+LPJO0b9M3ZA+s2eTgd6aOwEjhx6vwzYNb1TVV0GXPZELWohSTJRVSvHvQ5pOv9sjseBfnlqK7AiyVFJDgLOBDaOeU2StGAd0GcaVfVIkvOAzcAiYH1VbR/zsiRpwTqgQwOgqjYBm8a9jgXMy346UPlncwxS9bj7ypIkjXSg39OQJB1ADA1JUrcD/p6GnjhJfpvBN+6XMvg+zC5gY1XdOdaFSTpgeKYhAJK8lcHPtAS4mcHjzgE+6g9FSprijXABkOSbwIur6v+m1Q8CtlfVivGsTJpZknOq6kPjXsdC4ZmGpvwM+I0R9SNam3Sgese4F7CQeE9DU94EXJ9kB7/4kcjnAy8EzhvbqiQgyW17awIOfyLXstB5eUo/l+QpDH6OfimDv4w7ga1V9ehYF6YFL8l9wMnAA9ObgC9V1aizZP0SeKahn6uqnwFfHvc6pBE+BRxcVdumNyT5/BO/nIXLMw1JUjdvhEuSuhkakqRuhoYWrCRfmuf4v07yL/MYf3eSw+azliSnJTl6rmuQ9pWhoQWrql457jVMmcdaTgMMDT1hDA0tWEl+1F6PSPKFJNuS3JHk92YYc06Sbyb5b+BVQ/Urk5w+Yu4T2tyfTPK1JP/aHm0euZa2//dJbk/y1SQXt9rfJNnaah9P8owkrwT+FPjHtvYXtO0zSW5J8j/t98Sk/cZHbiX4S2BzVV2UZBHwjFGdkhzB4NvHrwAeBG4Abu2Y/1gGZwPfBj4D/Blw7V6OcQqDs4fjqurhJM9pTZ+oqstbn3cDa6vqn5NsBD5VVde2tuuBv62qHUmOAz4IvLpjjVIXQ0Ma/Djj+iRPBf5j1HcBmuOAz1fVboAkHwN+q2P+m6vqrjbmo8DvspfQAP4I+FBVPQxQVXta/SUtLA4BDmbwXyA/RpKDgVcC/55kqvy0jvVJ3bw8pQWvqr4A/D7wXeDDSc6eqfte6o/Q/j5l8C/
2QTOMmenLUdlL+5XAeVX1OwzOdn5tRJ+nAD+oqmOGthfNcCxpnxkaWvCS/CZwf7v8cwXw8r10vQk4Iclz21nJGUNtdzO4bAWD/5PkqUNtxyY5qt3L+AvgizMs57PA65I8o61t6vLUM4F723H/aqj/Q62Nqvoh8K0kZ7SxSfLSGY4l7TNDQ4ITgG1JbgX+HPinUZ2q6l7gQuBG4L+Arww1Xw78QZKbGVzG+t+hthuBi4E7gG8Bn9zbQqrqM8BGYCLJNuAtrekfGITWFuDrQ0OuBv4uya1JXsAgUNYm+SqwnUGASfuNPyMi/RIlOQF4S1X9ybjXIu0PnmlIkrp5piGNkOQmHv/k0Wur6vZxrEc6UBgakqRuXp6SJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1+38qOCjM258g7gAAAABJRU5ErkJggg==\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df.groupby(\"is_duplicate\")['id'].count().plot.bar()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The classes are not perfectly balanced, but it is not bad, we are not going to balance it. " + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "df.drop(['id', 'qid1', 'qid2'], axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(404287, 3)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Preview a few question pairs" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "What is the step by step guide to invest in share market in india?\n", + "What is the step by step guide to invest in share market?\n", + "\n", + "What is the story of Kohinoor (Koh-i-Noor) Diamond?\n", + "What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?\n", + "\n", + "How can I increase the speed of my internet connection while using a VPN?\n", + "How can Internet speed be increased by hacking through DNS?\n", + "\n", + "Why am I mentally very lonely? 
How can I solve it?\n", + "Find the remainder when [math]23^{24}[/math] is divided by 24,23?\n", + "\n", + "Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?\n", + "Which fish would survive in salt water?\n", + "\n", + "Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?\n", + "I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?\n", + "\n", + "Should I buy tiago?\n", + "What keeps childern active and far from phone and video games?\n", + "\n", + "How can I be a good geologist?\n", + "What should I do to be a great geologist?\n", + "\n", + "When do you use シ instead of し?\n", + "When do you use \"&\" instead of \"and\"?\n", + "\n", + "Motorola (company): Can I hack my Charter Motorolla DCX3400?\n", + "How do I hack Motorola DCX3400 for free internet?\n", + "\n" + ] + } + ], + "source": [ + "a = 0 \n", + "for i in range(a,a+10):\n", + " print(df.question1[i])\n", + " print(df.question2[i])\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There will be a lot of text cleaning to do.\n", + "\n", + "Words like \"what\", \"which\" and \"how\" may carry signal here, so I decided not to remove stop words. And I do plan to stem the words." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "SPECIAL_TOKENS = {\n", + " 'quoted': 'quoted_item',\n", + " 'non-ascii': 'non_ascii_word',\n", + " 'undefined': 'something'\n", + "}\n", + "\n", + "def clean(text, stem_words=True):\n", + " import re\n", + " from string import punctuation\n", + " from nltk.stem import SnowballStemmer\n", + " from nltk.corpus import stopwords\n", + " \n", + " def pad_str(s):\n", + " return ' '+s+' '\n", + " \n", + " if pd.isnull(text):\n", + " return ''\n", + "\n", + "# stops = set(stopwords.words(\"english\"))\n", + " # Clean the text, with the option to stem words.\n", + " \n", + " # Empty question\n", + " \n", + " if type(text) != str or text=='':\n", + " return ''\n", + "\n", + " # Clean the text\n", + " text = re.sub(\"\\'s\", \" \", text) # we have cases like \"Sam is\" or \"Sam's\" (i.e. his) these two cases aren't separable, I choose to compromise are kill \"'s\" directly\n", + " text = re.sub(\" whats \", \" what is \", text, flags=re.IGNORECASE)\n", + " text = re.sub(\"\\'ve\", \" have \", text)\n", + " text = re.sub(\"can't\", \"can not\", text)\n", + " text = re.sub(\"n't\", \" not \", text)\n", + " text = re.sub(\"i'm\", \"i am\", text, flags=re.IGNORECASE)\n", + " text = re.sub(\"\\'re\", \" are \", text)\n", + " text = re.sub(\"\\'d\", \" would \", text)\n", + " text = re.sub(\"\\'ll\", \" will \", text)\n", + " text = re.sub(\"e\\.g\\.\", \" eg \", text, flags=re.IGNORECASE)\n", + " text = re.sub(\"b\\.g\\.\", \" bg \", text, flags=re.IGNORECASE)\n", + " text = re.sub(\"(\\d+)(kK)\", \" \\g<1>000 \", text)\n", + " text = re.sub(\"e-mail\", \" email \", text, flags=re.IGNORECASE)\n", + " text = re.sub(\"(the[\\s]+|The[\\s]+)?U\\.S\\.A\\.\", \" America \", text, flags=re.IGNORECASE)\n", + " text = re.sub(\"(the[\\s]+|The[\\s]+)?United State(s)?\", \" America \", text, flags=re.IGNORECASE)\n", + " text = re.sub(\"\\(s\\)\", \" \", text, 
flags=re.IGNORECASE)\n", + " text = re.sub(\"[c-fC-F]\\:\\/\", \" disk \", text)\n", + " \n", + " # remove comma between numbers, i.e. 15,000 -> 15000\n", + " \n", + " text = re.sub('(?<=[0-9])\\,(?=[0-9])', \"\", text)\n", + " \n", + "# # all numbers should separate from words, this is too aggressive\n", + " \n", + "# def pad_number(pattern):\n", + "# matched_string = pattern.group(0)\n", + "# return pad_str(matched_string)\n", + "# text = re.sub('[0-9]+', pad_number, text)\n", + " \n", + " # add padding to punctuations and special chars, we still need them later\n", + " \n", + " text = re.sub('\\$', \" dollar \", text)\n", + " text = re.sub('\\%', \" percent \", text)\n", + " text = re.sub('\\&', \" and \", text)\n", + " \n", + "# def pad_pattern(pattern):\n", + "# matched_string = pattern.group(0)\n", + "# return pad_str(matched_string)\n", + "# text = re.sub('[\\!\\?\\@\\^\\+\\*\\/\\,\\~\\|\\`\\=\\:\\;\\.\\#\\\\\\]', pad_pattern, text) \n", + " \n", + " text = re.sub('[^\\x00-\\x7F]+', pad_str(SPECIAL_TOKENS['non-ascii']), text) # replace non-ascii word with special word\n", + " \n", + " # indian dollar\n", + " \n", + " text = re.sub(\"(?<=[0-9])rs \", \" rs \", text, flags=re.IGNORECASE)\n", + " text = re.sub(\" rs(?=[0-9])\", \" rs \", text, flags=re.IGNORECASE)\n", + " \n", + " # clean text rules get from : https://www.kaggle.com/currie32/the-importance-of-cleaning-text\n", + " text = re.sub(r\" (the[\\s]+|The[\\s]+)?US(A)? 
\", \" America \", text)\n", + " text = re.sub(r\" UK \", \" England \", text, flags=re.IGNORECASE)\n", + " text = re.sub(r\" india \", \" India \", text)\n", + " text = re.sub(r\" switzerland \", \" Switzerland \", text)\n", + " text = re.sub(r\" china \", \" China \", text)\n", + " text = re.sub(r\" chinese \", \" Chinese \", text) \n", + " text = re.sub(r\" imrovement \", \" improvement \", text, flags=re.IGNORECASE)\n", + " text = re.sub(r\" intially \", \" initially \", text, flags=re.IGNORECASE)\n", + " text = re.sub(r\" quora \", \" Quora \", text, flags=re.IGNORECASE)\n", + " text = re.sub(r\" dms \", \" direct messages \", text, flags=re.IGNORECASE) \n", + " text = re.sub(r\" demonitization \", \" demonetization \", text, flags=re.IGNORECASE) \n", + " text = re.sub(r\" actived \", \" active \", text, flags=re.IGNORECASE)\n", + " text = re.sub(r\" kms \", \" kilometers \", text, flags=re.IGNORECASE)\n", + " text = re.sub(r\" cs \", \" computer science \", text, flags=re.IGNORECASE) \n", + " text = re.sub(r\" upvote\", \" up vote\", text, flags=re.IGNORECASE)\n", + " text = re.sub(r\" iPhone \", \" phone \", text, flags=re.IGNORECASE)\n", + " text = re.sub(r\" \\0rs \", \" rs \", text, flags=re.IGNORECASE)\n", + " text = re.sub(r\" calender \", \" calendar \", text, flags=re.IGNORECASE)\n", + " text = re.sub(r\" ios \", \" operating system \", text, flags=re.IGNORECASE)\n", + " text = re.sub(r\" gps \", \" GPS \", text, flags=re.IGNORECASE)\n", + " text = re.sub(r\" gst \", \" GST \", text, flags=re.IGNORECASE)\n", + " text = re.sub(r\" programing \", \" programming \", text, flags=re.IGNORECASE)\n", + " text = re.sub(r\" bestfriend \", \" best friend \", text, flags=re.IGNORECASE)\n", + " text = re.sub(r\" dna \", \" DNA \", text, flags=re.IGNORECASE)\n", + " text = re.sub(r\" III \", \" 3 \", text)\n", + " text = re.sub(r\" banglore \", \" Banglore \", text, flags=re.IGNORECASE)\n", + " text = re.sub(r\" J K \", \" JK \", text, flags=re.IGNORECASE)\n", + " 
text = re.sub(r\" J\\.K\\. \", \" JK \", text, flags=re.IGNORECASE)\n", + " \n", + " # replace the float numbers with a random number, it will be parsed as number afterward, and also been replaced with word \"number\"\n", + " \n", + " text = re.sub('[0-9]+\\.[0-9]+', \" 87 \", text)\n", + " \n", + " \n", + " # Remove punctuation from text\n", + " text = ''.join([c for c in text if c not in punctuation]).lower()\n", + " # Return a list of words\n", + " return text" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "df['question1'] = df['question1'].apply(clean)\n", + "df['question2'] = df['question2'].apply(clean)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "what is the step by step guide to invest in share market in india\n", + "what is the step by step guide to invest in share market\n", + "\n", + "what is the story of kohinoor kohinoor diamond\n", + "what would happen if the indian government stole the kohinoor kohinoor diamond back\n", + "\n", + "how can i increase the speed of my internet connection while using a vpn\n", + "how can internet speed be increased by hacking through dns\n", + "\n", + "why am i mentally very lonely how can i solve it\n", + "find the remainder when math2324math is divided by 2423\n", + "\n", + "which one dissolve in water quikly sugar salt methane and carbon di oxide\n", + "which fish would survive in salt water\n", + "\n", + "astrology i am a capricorn sun cap moon and cap risingwhat does that say about me\n", + "i am a triple capricorn sun moon and ascendant in capricorn what does this say about me\n", + "\n", + "should i buy tiago\n", + "what keeps childern active and far from phone and video games\n", + "\n", + "how can i be a good geologist\n", + "what should i do to be a great geologist\n", + "\n", + "when do you use nonasciiword instead of nonasciiword 
\n", + "when do you use and instead of and\n", + "\n", + "motorola company can i hack my charter motorolla dcx3400\n", + "how do i hack motorola dcx3400 for free internet\n", + "\n" + ] + } + ], + "source": [ + "a = 0 \n", + "for i in range(a,a+10):\n", + " print(df.question1[i])\n", + " print(df.question2[i])\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "X = df.loc[:, df.columns != 'is_duplicate']\n", + "y = df.loc[:, df.columns == 'is_duplicate']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### A simple Bag Of Words model" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use FeatureUnion to combine the features from question1 and question2." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.feature_extraction.text import CountVectorizer\n", + "from sklearn.preprocessing import FunctionTransformer\n", + "from sklearn.pipeline import FeatureUnion, Pipeline\n", + "\n", + "transformer = FeatureUnion([\n", + " ('question1_bow', \n", + " Pipeline([('extract_field',\n", + " FunctionTransformer(lambda x: x['question1'], \n", + " validate=False)),\n", + " ('bow', \n", + " CountVectorizer())])),\n", + " ('question2_bow', \n", + " Pipeline([('extract_field', \n", + " FunctionTransformer(lambda x: x['question2'], \n", + " validate=False)),\n", + " ('bow', \n", + " CountVectorizer())]))])" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(283000, 128314)" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train_count = transformer.fit_transform(X_train)\n", + "X_train_count.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\SusanLi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\sklearn\\ensemble\\forest.py:248: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.\n", + " \"10 in version 0.20 to 100 in 0.22.\", FutureWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.78 0.92 0.85 76647\n", + " 1 0.80 0.57 0.66 44640\n", + "\n", + " micro avg 0.79 0.79 0.79 121287\n", + " macro avg 0.79 0.74 0.76 121287\n", + "weighted avg 0.79 0.79 0.78 121287\n", + "\n" + ] + } + ], + "source": [ + "from sklearn.ensemble import RandomForestClassifier\n", + "\n", + "rf = RandomForestClassifier().fit(X_train_count, y_train.values.ravel())\n", + 
"X_test_count = transformer.transform(X_test)\n", + "y_pred = rf.predict(X_test_count)\n", + "from sklearn import metrics\n", + "print(metrics.classification_report(y_test, y_pred))" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\SusanLi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n", + " FutureWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.79 0.83 0.81 76647\n", + " 1 0.68 0.61 0.65 44640\n", + "\n", + " micro avg 0.75 0.75 0.75 121287\n", + " macro avg 0.73 0.72 0.73 121287\n", + "weighted avg 0.75 0.75 0.75 121287\n", + "\n" + ] + } + ], + "source": [ + "from sklearn import metrics\n", + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "logreg = LogisticRegression().fit(X_train_count, y_train.values.ravel())\n", + "X_test_count = transformer.transform(X_test)\n", + "y_pred = logreg.predict(X_test_count)\n", + "print(metrics.classification_report(y_test, y_pred))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Scaling Bag-of-Words with Tf-Idf Transformation" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.73 0.94 0.82 76647\n", + " 1 0.80 0.41 0.54 44640\n", + "\n", + " micro avg 0.74 0.74 0.74 121287\n", + " macro avg 0.76 0.67 0.68 121287\n", + "weighted avg 0.75 0.74 0.72 121287\n", + "\n" + ] + } + ], + "source": [ + "from sklearn.feature_extraction.text import TfidfTransformer\n", + "tfidf_trfm = TfidfTransformer(norm=None)\n", + "X_train_count_tfidf = 
tfidf_trfm.fit_transform(X_train_count)\n", + "X_test_count_tfidf = tfidf_trfm.transform(X_test_count)\n", + "y_pred = rf.predict(X_test_count_tfidf)\n", + "print(metrics.classification_report(y_test, y_pred))" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.preprocessing import FunctionTransformer\n", + "from sklearn.pipeline import FeatureUnion, Pipeline\n", + "\n", + "transformer = FeatureUnion([\n", + " ('question1_tfidf', \n", + " Pipeline([('extract_field',\n", + " FunctionTransformer(lambda x: x['question1'], \n", + " validate=False)),\n", + " ('tfidf', \n", + " TfidfVectorizer())])),\n", + " ('question2_tfidf', \n", + " Pipeline([('extract_field', \n", + " FunctionTransformer(lambda x: x['question2'], \n", + " validate=False)),\n", + " ('tfidf', \n", + " TfidfVectorizer())]))])" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "X_train_tfidf = transformer.fit_transform(X_train)\n", + "X_test_tfidf = transformer.transform(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(283000, 128314)" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train_tfidf.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\SusanLi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\sklearn\\ensemble\\forest.py:248: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.\n", + " \"10 in version 0.20 to 100 in 0.22.\", FutureWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test score with bow features: 
0.7880729179549334\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\SusanLi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\sklearn\\ensemble\\forest.py:248: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.\n", + " \"10 in version 0.20 to 100 in 0.22.\", FutureWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test score with tf-idf features: 0.784362709935937\n" + ] + } + ], + "source": [ + "def randomforest_classify(X_tr, y_tr, X_test, y_test, description):\n", + " ### Helper function to train a random forest classifier and score on test data\n", + " m = RandomForestClassifier().fit(X_tr, y_tr)\n", + " s = m.score(X_test, y_test)\n", + " print ('Test score with', description, 'features:', s)\n", + " return m\n", + "\n", + "m_bow = randomforest_classify(X_train_count, y_train.values.ravel(), X_test_count, y_test.values.ravel(), 'bow')\n", + "m_tfidf = randomforest_classify(X_train_tfidf, y_train.values.ravel(), X_test_tfidf, y_test.values.ravel(), 'tf-idf')" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\SusanLi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. 
Specify a solver to silence this warning.\n", + " FutureWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.78 0.86 0.82 76647\n", + " 1 0.71 0.58 0.64 44640\n", + "\n", + " micro avg 0.76 0.76 0.76 121287\n", + " macro avg 0.74 0.72 0.73 121287\n", + "weighted avg 0.75 0.76 0.75 121287\n", + "\n" + ] + } + ], + "source": [ + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "logreg = LogisticRegression().fit(X_train_tfidf, y_train.values.ravel())\n", + "X_test_tfidf = transformer.transform(X_test)\n", + "y_pred = logreg.predict(X_test_tfidf)\n", + "print(metrics.classification_report(y_test, y_pred))" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\SusanLi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\sklearn\\ensemble\\forest.py:248: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.\n", + " \"10 in version 0.20 to 100 in 0.22.\", FutureWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.78 0.92 0.84 76647\n", + " 1 0.80 0.55 0.65 44640\n", + "\n", + " micro avg 0.78 0.78 0.78 121287\n", + " macro avg 0.79 0.74 0.75 121287\n", + "weighted avg 0.79 0.78 0.77 121287\n", + "\n" + ] + } + ], + "source": [ + "from sklearn.ensemble import RandomForestClassifier\n", + "\n", + "rf = RandomForestClassifier().fit(X_train_tfidf, y_train.values.ravel())\n", + "X_test_tfidf = transformer.transform(X_test)\n", + "y_pred = rf.predict(X_test_tfidf)\n", + "print(metrics.classification_report(y_test, y_pred))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + 
"version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}