Applied-Data-Science-with-Python #1

Open
wants to merge 51 commits into
base: master
Changes from 1 commit
Commits
51 commits
ecdd653
add slides
MaxPoon Mar 12, 2017
ff0b189
make directories
MaxPoon Mar 12, 2017
508992d
add assignment
MaxPoon Mar 12, 2017
0d9d64a
add slides
MaxPoon Mar 12, 2017
0c3fb36
add demo
MaxPoon Mar 14, 2017
faed386
add assignment solution
MaxPoon Mar 15, 2017
34768b8
add demo
MaxPoon Mar 16, 2017
aa42428
add practice assignment
MaxPoon Mar 17, 2017
7b552b0
add assignment code
MaxPoon Mar 17, 2017
5f6c2ea
add demo
MaxPoon Mar 18, 2017
d18f2f4
add slide
MaxPoon Mar 18, 2017
988935f
add assignment codes
MaxPoon Mar 18, 2017
6d1fc03
add certificate
MaxPoon Mar 19, 2017
14cb176
add ipython module
MaxPoon Jun 11, 2017
cf560c9
run the module
MaxPoon Jun 12, 2017
d593888
finish assignment
MaxPoon Jun 12, 2017
b923b74
add modules
MaxPoon Jun 13, 2017
4d19311
finish assignment
MaxPoon Jun 13, 2017
99e1c8d
add module
MaxPoon Jun 14, 2017
5c01415
finish assignment
MaxPoon Jun 14, 2017
0ca6e6c
add module
MaxPoon Jun 15, 2017
47075b2
add module
MaxPoon Jun 15, 2017
bf917e0
finish assignment
MaxPoon Jun 15, 2017
53b81da
add certificate
MaxPoon Jun 15, 2017
6f505c8
add week1 practice file
MaxPoon Oct 20, 2017
15dea51
add practice
MaxPoon Oct 20, 2017
9d4fc0f
borrow solutions...
MaxPoon Oct 20, 2017
1767e8c
add ipython file
MaxPoon Oct 21, 2017
b866b0c
finish assignment2
MaxPoon Oct 21, 2017
b540b68
add practice file
MaxPoon Oct 21, 2017
10a3b91
finish assignment3
MaxPoon Oct 21, 2017
dace701
add slides
MaxPoon Oct 21, 2017
ebeb783
finish assignment4
MaxPoon Oct 21, 2017
ea5edd7
add certificate
MaxPoon Oct 21, 2017
da24a96
add slides
MaxPoon Oct 24, 2017
d4e4df5
add practice file
MaxPoon Oct 25, 2017
4523f37
add solution to assignment1
MaxPoon Oct 25, 2017
7f45d1a
add slides
MaxPoon Oct 25, 2017
627018d
add practice file
MaxPoon Oct 25, 2017
02e08b3
add solutions to assignment2
MaxPoon Oct 25, 2017
a6a6969
add slides
MaxPoon Oct 25, 2017
204c397
add solution to assignment3
MaxPoon Oct 25, 2017
281ee03
add slides
MaxPoon Oct 26, 2017
ff01784
add practice file
MaxPoon Oct 26, 2017
fa618f8
add solution to assignment4
MaxPoon Oct 26, 2017
a8cc150
add certificate
MaxPoon Oct 26, 2017
68fb869
add certificate
MaxPoon Oct 26, 2017
d202fd1
Delete jupyter_notebooks.txt
MaxPoon Oct 27, 2017
8a4eb0e
Update Assignment2.ipynb
Vipul115 Nov 23, 2017
4c968b4
Merge pull request #4 from Vipul115/patch-1
MaxPoon Nov 24, 2017
c34860e
update solution
MaxPoon Dec 19, 2017
add ipython file
MaxPoon committed Oct 21, 2017
commit 1767e8ce888b4cb8b22ed9844f1e9e6a8e8e8117
874 changes: 874 additions & 0 deletions Applied-Text-Mining-In-Python/week2/Module+2+Python.ipynb
@@ -0,0 +1,874 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Module 2 (Python 3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Basic NLP Tasks with NLTK"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"*** Introductory Examples for the NLTK Book ***\n",
"Loading text1, ..., text9 and sent1, ..., sent9\n",
"Type the name of the text or sentence to view it.\n",
"Type: 'texts()' or 'sents()' to list the materials.\n",
"text1: Moby Dick by Herman Melville 1851\n",
"text2: Sense and Sensibility by Jane Austen 1811\n",
"text3: The Book of Genesis\n",
"text4: Inaugural Address Corpus\n",
"text5: Chat Corpus\n",
"text6: Monty Python and the Holy Grail\n",
"text7: Wall Street Journal\n",
"text8: Personals Corpus\n",
"text9: The Man Who Was Thursday by G . K . Chesterton 1908\n"
]
}
],
"source": [
"import nltk\n",
"from nltk.book import *"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Counting vocabulary of words"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<Text: Wall Street Journal>"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"text7"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Pierre',\n",
" 'Vinken',\n",
" ',',\n",
" '61',\n",
" 'years',\n",
" 'old',\n",
" ',',\n",
" 'will',\n",
" 'join',\n",
" 'the',\n",
" 'board',\n",
" 'as',\n",
" 'a',\n",
" 'nonexecutive',\n",
" 'director',\n",
" 'Nov.',\n",
" '29',\n",
" '.']"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sent7"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"18"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(sent7)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"100676"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(text7)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"12408"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(set(text7))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['bottom',\n",
" 'Richmond',\n",
" 'tension',\n",
" 'limits',\n",
" 'Wedtech',\n",
" 'most',\n",
" 'boost',\n",
" '143.80',\n",
" 'Dale',\n",
" 'refunded']"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list(set(text7))[:10]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Frequency of words"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"12408"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dist = FreqDist(text7)\n",
"len(dist)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"nltk.probability.FreqDist"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"FreqDist"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Pierre', 'Vinken', ',', '61', 'years', 'old', 'will', 'join', 'the', 'board']"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vocab1 = dist.keys()\n",
"#vocab1[:10] \n",
"# In Python 3 dict.keys() returns an iterable view instead of a list\n",
"list(vocab1)[:10]"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"20"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dist['four']"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['billion',\n",
" 'company',\n",
" 'president',\n",
" 'because',\n",
" 'market',\n",
" 'million',\n",
" 'shares',\n",
" 'trading',\n",
" 'program']"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"freqwords = [w for w in vocab1 if len(w) > 5 and dist[w] > 100]\n",
"freqwords"
]
},
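{
"cell_type": "markdown",
"metadata": {},
"source": [
"The comprehension above works, but `FreqDist` also provides `most_common`, which returns `(word, count)` pairs sorted by descending frequency:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Ten highest-frequency tokens in text7 as (word, count) pairs\n",
"dist.most_common(10)"
]
},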
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Normalization and stemming"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['list', 'listed', 'lists', 'listing', 'listings']"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"input1 = \"List listed lists listing listings\"\n",
"words1 = input1.lower().split(' ')\n",
"words1"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['list', 'list', 'list', 'list', 'list']"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"porter = nltk.PorterStemmer()\n",
"[porter.stem(t) for t in words1]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Lemmatization"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Universal',\n",
" 'Declaration',\n",
" 'of',\n",
" 'Human',\n",
" 'Rights',\n",
" 'Preamble',\n",
" 'Whereas',\n",
" 'recognition',\n",
" 'of',\n",
" 'the',\n",
" 'inherent',\n",
" 'dignity',\n",
" 'and',\n",
" 'of',\n",
" 'the',\n",
" 'equal',\n",
" 'and',\n",
" 'inalienable',\n",
" 'rights',\n",
" 'of']"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"udhr = nltk.corpus.udhr.words('English-Latin1')\n",
"udhr[:20]"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['univers',\n",
" 'declar',\n",
" 'of',\n",
" 'human',\n",
" 'right',\n",
" 'preambl',\n",
" 'wherea',\n",
" 'recognit',\n",
" 'of',\n",
" 'the',\n",
" 'inher',\n",
" 'digniti',\n",
" 'and',\n",
" 'of',\n",
" 'the',\n",
" 'equal',\n",
" 'and',\n",
" 'inalien',\n",
" 'right',\n",
" 'of']"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"[porter.stem(t) for t in udhr[:20]]  # Stemming, not lemmatization: Porter truncates words to stems like 'univers'"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Universal',\n",
" 'Declaration',\n",
" 'of',\n",
" 'Human',\n",
" 'Rights',\n",
" 'Preamble',\n",
" 'Whereas',\n",
" 'recognition',\n",
" 'of',\n",
" 'the',\n",
" 'inherent',\n",
" 'dignity',\n",
" 'and',\n",
" 'of',\n",
" 'the',\n",
" 'equal',\n",
" 'and',\n",
" 'inalienable',\n",
" 'right',\n",
" 'of']"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"WNlemma = nltk.WordNetLemmatizer()\n",
"[WNlemma.lemmatize(t) for t in udhr[:20]]"
]
},
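{
"cell_type": "markdown",
"metadata": {},
"source": [
"`WordNetLemmatizer.lemmatize` assumes a noun by default; passing a POS tag (e.g. `pos='v'`) can change the result:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Default POS is noun; 'leaves' lemmatizes differently as a noun vs. a verb\n",
"WNlemma.lemmatize('leaves'), WNlemma.lemmatize('leaves', pos='v')"
]
},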
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Tokenization"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Children', \"shouldn't\", 'drink', 'a', 'sugary', 'drink', 'before', 'bed.']"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"text11 = \"Children shouldn't drink a sugary drink before bed.\"\n",
"text11.split(' ')"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Children',\n",
" 'should',\n",
" \"n't\",\n",
" 'drink',\n",
" 'a',\n",
" 'sugary',\n",
" 'drink',\n",
" 'before',\n",
" 'bed',\n",
" '.']"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nltk.word_tokenize(text11)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"text12 = \"This is the first sentence. A gallon of milk in the U.S. costs $2.99. Is this the third sentence? Yes, it is!\"\n",
"sentences = nltk.sent_tokenize(text12)\n",
"len(sentences)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['This is the first sentence.',\n",
" 'A gallon of milk in the U.S. costs $2.99.',\n",
" 'Is this the third sentence?',\n",
" 'Yes, it is!']"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sentences"
]
},
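{
"cell_type": "markdown",
"metadata": {},
"source": [
"`sent_tokenize` uses a trained sentence-boundary model (Punkt), so it handles abbreviations such as `U.S.` that a naive split on periods would break:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Naive splitting treats the period in 'U.S.' as a sentence boundary\n",
"text12.split('. ')"
]
},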
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Advanced NLP Tasks with NLTK"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### POS tagging"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MD: modal auxiliary\n",
" can cannot could couldn't dare may might must need ought shall should\n",
" shouldn't will would\n"
]
}
],
"source": [
"nltk.help.upenn_tagset('MD')"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('Children', 'NNP'),\n",
" ('should', 'MD'),\n",
" (\"n't\", 'RB'),\n",
" ('drink', 'VB'),\n",
" ('a', 'DT'),\n",
" ('sugary', 'JJ'),\n",
" ('drink', 'NN'),\n",
" ('before', 'IN'),\n",
" ('bed', 'NN'),\n",
" ('.', '.')]"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"text13 = nltk.word_tokenize(text11)\n",
"nltk.pos_tag(text13)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('Visiting', 'VBG'),\n",
" ('aunts', 'NNS'),\n",
" ('can', 'MD'),\n",
" ('be', 'VB'),\n",
" ('a', 'DT'),\n",
" ('nuisance', 'NN')]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"text14 = nltk.word_tokenize(\"Visiting aunts can be a nuisance\")\n",
"nltk.pos_tag(text14)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(S (NP Alice) (VP (V loves) (NP Bob)))\n"
]
}
],
"source": [
"# Parsing sentence structure\n",
"text15 = nltk.word_tokenize(\"Alice loves Bob\")\n",
"grammar = nltk.CFG.fromstring(\"\"\"\n",
"S -> NP VP\n",
"VP -> V NP\n",
"NP -> 'Alice' | 'Bob'\n",
"V -> 'loves'\n",
"\"\"\")\n",
"\n",
"parser = nltk.ChartParser(grammar)\n",
"trees = parser.parse_all(text15)\n",
"for tree in trees:\n",
" print(tree)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<Grammar with 13 productions>"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"text16 = nltk.word_tokenize(\"I saw the man with a telescope\")\n",
"grammar1 = nltk.data.load('mygrammar.cfg')\n",
"grammar1"
]
},
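{
"cell_type": "markdown",
"metadata": {},
"source": [
"`mygrammar.cfg` is an external file not included in this diff. An equivalent grammar can be written inline with `CFG.fromstring`; the productions below are an assumption, chosen to be consistent with the 13 productions and the two parse trees shown in the next cell:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Inline PP-attachment grammar (assumed contents of mygrammar.cfg)\n",
"grammar1 = nltk.CFG.fromstring(\"\"\"\n",
"S -> NP VP\n",
"PP -> P NP\n",
"NP -> Det N | Det N PP | 'I'\n",
"VP -> V NP | VP PP\n",
"Det -> 'a' | 'the'\n",
"N -> 'man' | 'telescope'\n",
"P -> 'with'\n",
"V -> 'saw'\n",
"\"\"\")"
]
},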
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(S\n",
" (NP I)\n",
" (VP\n",
" (VP (V saw) (NP (Det the) (N man)))\n",
" (PP (P with) (NP (Det a) (N telescope)))))\n",
"(S\n",
" (NP I)\n",
" (VP\n",
" (V saw)\n",
" (NP (Det the) (N man) (PP (P with) (NP (Det a) (N telescope))))))\n"
]
}
],
"source": [
"parser = nltk.ChartParser(grammar1)\n",
"trees = parser.parse_all(text16)\n",
"for tree in trees:\n",
" print(tree)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(S\n",
" (NP-SBJ\n",
" (NP (NNP Pierre) (NNP Vinken))\n",
" (, ,)\n",
" (ADJP (NP (CD 61) (NNS years)) (JJ old))\n",
" (, ,))\n",
" (VP\n",
" (MD will)\n",
" (VP\n",
" (VB join)\n",
" (NP (DT the) (NN board))\n",
" (PP-CLR (IN as) (NP (DT a) (JJ nonexecutive) (NN director)))\n",
" (NP-TMP (NNP Nov.) (CD 29))))\n",
" (. .))\n"
]
}
],
"source": [
"from nltk.corpus import treebank\n",
"text17 = treebank.parsed_sents('wsj_0001.mrg')[0]\n",
"print(text17)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### POS tagging and parsing ambiguity"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('The', 'DT'), ('old', 'JJ'), ('man', 'NN'), ('the', 'DT'), ('boat', 'NN')]"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"text18 = nltk.word_tokenize(\"The old man the boat\")\n",
"nltk.pos_tag(text18)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('Colorless', 'NNP'),\n",
" ('green', 'JJ'),\n",
" ('ideas', 'NNS'),\n",
" ('sleep', 'VBP'),\n",
" ('furiously', 'RB')]"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"text19 = nltk.word_tokenize(\"Colorless green ideas sleep furiously\")\n",
"nltk.pos_tag(text19)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}