Fix comments on step1 and step2 notebook, add step5a to show how to s…

…plit up images and display them
qut-dmrc · Dec 10, 2015 · a35e934 · a35e934
1 parent d984cdd
commit a35e934
Show file tree

Hide file tree

Showing 4 changed files with 316 additions and 4 deletions.
diff --git a/step1.ipynb b/step1.ipynb
@@ -208,7 +208,7 @@
    },
    "outputs": [],
    "source": [
-    "# extract the image tag from the first item\n",
+    "# extract the sign name from the first item\n",
     "sign_heading = lotsofitems[1].find(\"strong\")"
    ]
   },
@@ -232,7 +232,7 @@
    },
    "outputs": [],
    "source": [
-    "# extract the name of the sign from the image tag\n",
+    "# extract the name of the sign inside the <strong></strong> tags\n",
     "sign_name = sign_heading.get_text()"
    ]
   },

diff --git a/step2.ipynb b/step2.ipynb
@@ -124,7 +124,7 @@
     "    if an_item.find('th'):        \n",
     "        print('skipping the header row')\n",
     "    else:\n",
-    "        # extract the alt attribute from the image tag\n",
+    "        # extract the name of the sign inside the <strong></strong> tags\n",
     "        theitem = an_item.find(\"strong\").get_text()\n",
     "    \n",
     "        # add the item to a list\n",

diff --git a/step5.ipynb b/step5.ipynb
@@ -259,7 +259,9 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Now we are ready to move onto the sixth notebook - [Restructure the code for clarity](step6.ipynb)"
+    "Now we are ready to move onto the sixth notebook - [Restructure the code for clarity](step6.ipynb)\n",
+    "\n",
+    "Added extra notebook - 5a - to show how to create a row in the table for every image by duplicating the sign name, sign type and sign description columns - [Row for each image](step5a.ipynb)"
    ]
   }
  ],

diff --git a/step5a.ipynb b/step5a.ipynb
@@ -0,0 +1,310 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# ODI Queensland workshop - Web Scraping \n",
+    "\n",
+    "## QUT DMRC - 2015\n",
+    "\n",
+    "### Store the data in a dataframe with row for each image and save to disk\n",
+    "\n",
+    "This notebook extends the ``get_itemlist`` function to save all the data in a dataframe for each image by duplicating the other sign name and sign description for each image in the row.\n",
+    "\n",
+    "It then displays the images as thumbnails in a table."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Import packages"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "import bs4\n",
+    "import requests\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Scrape the page"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# this is the base_url\n",
+    "base_url = \"http://www.qld.gov.au/transport/safety/signs/\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# select which page to scrape based on the type of road sign\n",
+    "sign_type = \"regulatory\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# build the url\n",
+    "thepage = base_url + sign_type + '/'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# call the url\n",
+    "stuff = requests.get(thepage)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# transform to soup using lxml parser\n",
+    "soup = bs4.BeautifulSoup(stuff.text, \"lxml\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Define column labels for the dataframe"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# columns labels\n",
+    "colnames = [\"image_name\", \"url\", \"sign_name\", \"sign_type\", \"description\"]\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "collapsed": true
+   },
+   "source": [
+    "### Function definitions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# processes a beautiful_soup data structure and returns new signs in a dataframe\n",
+    "def get_itemlist(thesoup):\n",
+    "    \n",
+    "    # find all the tables on the page\n",
+    "    tables = thesoup.findAll('table')\n",
+    "    thelist = []\n",
+    "\n",
+    "    for table in tables:\n",
+    "        # find all the table rows\n",
+    "        lotsofitems = table.findAll('tr')\n",
+    "\n",
+    "        # check if the first row contains a 'th' elements (table header)\n",
+    "        if lotsofitems[0].find('th'): \n",
+    "\n",
+    "            # get all header elements\n",
+    "            temp = lotsofitems[0].findAll('th')\n",
+    "\n",
+    "            # check that the table header has the text we expect for the signs table\n",
+    "            if temp[0].get_text() == 'Sign' and temp[1].get_text() == 'Meaning':\n",
+    "\n",
+    "                # print('Traffic sign table found')\n",
+    "\n",
+    "                # process the table of traffic signs **** THIS IS THE UPDATED SECTION *****\n",
+    "                for an_item in lotsofitems[1:]:\n",
+    "\n",
+    "                    # sign description & title\n",
+    "                    sign_text = an_item.findAll(\"p\")\n",
+    "                    description = ''\n",
+    "                    for para in sign_text:\n",
+    "                        if para.find(\"strong\"):\n",
+    "                            # extract sign name (this assumes only the sign name is in bold)\n",
+    "                            temptemp = para.find(\"strong\").get_text()\n",
+    "                            temptemp = temptemp.split()\n",
+    "                            sign_name = \" \".join(temptemp)\n",
+    "                        else:\n",
+    "                            # extract sign description (may be multiple paragraphs)\n",
+    "                            temptemp = para.get_text()\n",
+    "                            temptemp = temptemp.split()\n",
+    "                            description += \" \".join(temptemp) + '\\n'\n",
+    "\n",
+    "                    # sign images (may be more than one image per sign name) - save image name & image url\n",
+    "                    for image in an_item.findAll(\"img\"):\n",
+    "                        # get the image name & image url\n",
+    "                        theitem = []\n",
+    "                        theitem += [image.attrs['alt']]\n",
+    "                        theitem += [image.attrs['src']]\n",
+    "                        theitem += [sign_name]\n",
+    "                        theitem += [sign_type]\n",
+    "                        theitem += [description]\n",
+    "                        \n",
+    "                        thelist += [theitem]\n",
+    "\n",
+    "                        \n",
+    "#            else:\n",
+    "#                print('Different table - with header row:', temp)\n",
+    "#        else:\n",
+    "#            print('Different table - no header row:', lotsofitems[0])\n",
+    "\n",
+    "    return pd.DataFrame(thelist,columns=colnames)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Call the function\n",
+    "\n",
+    "Call the function passing it the ``soup`` variable and store the result in the variable ```signs```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "signs = get_itemlist(soup)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# have a look\n",
+    "signs"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Display thumbnails of the images in the table\n",
+    "\n",
+    "To do this we convert the dataframe to html and display the html instead of using the built in display option."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "from IPython.display import HTML\n",
+    "\n",
+    "signs['image_full'] = signs.url.map(lambda x: '<img src=\"http://www.qld.gov.au{0}\", width=50>'.format(x))\n",
+    "\n",
+    "# have to expand the columns to prevent the image url being truncated\n",
+    "pd.set_option('display.max_colwidth', -1)\n",
+    "html = HTML(signs.to_html(escape=False, index=False))\n",
+    "pd.set_option('display.max_colwidth', 80)\n",
+    "html"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Save to disk"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# save the data as a csv file\n",
+    "signs.to_csv(\"signs5a.csv\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now we are ready to move onto the sixth notebook - [Restructure the code for clarity](step6.ipynb)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.4.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}