diff --git a/.ipynb_checkpoints/3a. Taxon Autocorrect with LSTM Autoencoders-checkpoint.ipynb b/.ipynb_checkpoints/3a. Taxon Autocorrect with LSTM Autoencoders-checkpoint.ipynb
new file mode 100644
index 0000000..35d545b
--- /dev/null
+++ b/.ipynb_checkpoints/3a. Taxon Autocorrect with LSTM Autoencoders-checkpoint.ipynb
@@ -0,0 +1,1425 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 3a. Autocorrecting Misspelt Taxon Names with Autoencoders\n",
+ "Given a list of taxon names, can we build an autocorrect model to autonomously fix erroneous records?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 184,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "import matplotlib\n",
+ "import random\n",
+ "import string\n",
+ "from keras.models import Model\n",
+ "from keras.preprocessing import sequence\n",
+ "from keras.layers import Input, LSTM, Dense"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Exploring the dataset\n",
+ "We'll use the same dataset as last time: a publicly available list of UK exports from 1975 to 2016. We'll only need the taxon names, so we'll restrict our import to the taxon column."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 185,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Taxon | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Panthera onca | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Varanus flavescens | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Varanus griseus | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Branta ruficollis | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " Leopardus pardalis | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " Leopardus wiedii | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " Diceros bicornis | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " Asarcornis scutulata | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " Branta sandvicensis | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " Branta sandvicensis | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " Cercopithecus diana | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " Rucervus duvaucelii | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " Crocodylus siamensis | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " Elephas maximus | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " Elephas maximus | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " Elephas maximus | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " Elephas maximus | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " Falco peregrinus | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " Acinonyx jubatus | \n",
+ "
\n",
+ " \n",
+ " 21 | \n",
+ " Catopuma temminckii | \n",
+ "
\n",
+ " \n",
+ " 22 | \n",
+ " Leopardus jacobitus | \n",
+ "
\n",
+ " \n",
+ " 23 | \n",
+ " Leopardus pardalis mearnsi | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " Panthera onca | \n",
+ "
\n",
+ " \n",
+ " 25 | \n",
+ " Panthera onca | \n",
+ "
\n",
+ " \n",
+ " 26 | \n",
+ " Panthera onca | \n",
+ "
\n",
+ " \n",
+ " 27 | \n",
+ " Panthera onca | \n",
+ "
\n",
+ " \n",
+ " 28 | \n",
+ " Panthera onca | \n",
+ "
\n",
+ " \n",
+ " 29 | \n",
+ " Panthera onca | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 49339 | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 49340 | \n",
+ " Mustela sibirica | \n",
+ "
\n",
+ " \n",
+ " 49341 | \n",
+ " Mustela sibirica | \n",
+ "
\n",
+ " \n",
+ " 49342 | \n",
+ " Mustela sibirica | \n",
+ "
\n",
+ " \n",
+ " 49343 | \n",
+ " Mustela sibirica | \n",
+ "
\n",
+ " \n",
+ " 49344 | \n",
+ " Mustela sibirica | \n",
+ "
\n",
+ " \n",
+ " 49345 | \n",
+ " Mustela sibirica | \n",
+ "
\n",
+ " \n",
+ " 49346 | \n",
+ " Mustela sibirica | \n",
+ "
\n",
+ " \n",
+ " 49347 | \n",
+ " Mustela sibirica | \n",
+ "
\n",
+ " \n",
+ " 49348 | \n",
+ " Mustela sibirica | \n",
+ "
\n",
+ " \n",
+ " 49349 | \n",
+ " Mustela sibirica | \n",
+ "
\n",
+ " \n",
+ " 49350 | \n",
+ " Mustela sibirica | \n",
+ "
\n",
+ " \n",
+ " 49351 | \n",
+ " Odobenus rosmarus | \n",
+ "
\n",
+ " \n",
+ " 49352 | \n",
+ " Odobenus rosmarus | \n",
+ "
\n",
+ " \n",
+ " 49353 | \n",
+ " Odobenus rosmarus | \n",
+ "
\n",
+ " \n",
+ " 49354 | \n",
+ " Odobenus rosmarus | \n",
+ "
\n",
+ " \n",
+ " 49355 | \n",
+ " Odobenus rosmarus | \n",
+ "
\n",
+ " \n",
+ " 49356 | \n",
+ " Odobenus rosmarus | \n",
+ "
\n",
+ " \n",
+ " 49357 | \n",
+ " Odobenus rosmarus | \n",
+ "
\n",
+ " \n",
+ " 49358 | \n",
+ " Odobenus rosmarus | \n",
+ "
\n",
+ " \n",
+ " 49359 | \n",
+ " Lodoicea maldivica | \n",
+ "
\n",
+ " \n",
+ " 49360 | \n",
+ " Pavo cristatus | \n",
+ "
\n",
+ " \n",
+ " 49361 | \n",
+ " Pavo cristatus | \n",
+ "
\n",
+ " \n",
+ " 49362 | \n",
+ " Pavo cristatus | \n",
+ "
\n",
+ " \n",
+ " 49363 | \n",
+ " Pavo cristatus | \n",
+ "
\n",
+ " \n",
+ " 49364 | \n",
+ " Pavo cristatus | \n",
+ "
\n",
+ " \n",
+ " 49365 | \n",
+ " Pavo cristatus | \n",
+ "
\n",
+ " \n",
+ " 49366 | \n",
+ " Pavo cristatus | \n",
+ "
\n",
+ " \n",
+ " 49367 | \n",
+ " Alligator mississippiensis | \n",
+ "
\n",
+ " \n",
+ " 49368 | \n",
+ " Varanus salvator | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
49369 rows × 1 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Taxon\n",
+ "0 Equus przewalskii\n",
+ "1 Panthera onca\n",
+ "2 Varanus flavescens\n",
+ "3 Varanus griseus\n",
+ "4 Branta ruficollis\n",
+ "5 Leopardus pardalis\n",
+ "6 Leopardus wiedii\n",
+ "7 Diceros bicornis\n",
+ "8 Asarcornis scutulata\n",
+ "9 Branta sandvicensis\n",
+ "10 Branta sandvicensis\n",
+ "11 Cercopithecus diana\n",
+ "12 Rucervus duvaucelii\n",
+ "13 Crocodylus siamensis\n",
+ "14 Elephas maximus\n",
+ "15 Elephas maximus\n",
+ "16 Elephas maximus\n",
+ "17 Elephas maximus\n",
+ "18 Equus przewalskii\n",
+ "19 Falco peregrinus\n",
+ "20 Acinonyx jubatus\n",
+ "21 Catopuma temminckii\n",
+ "22 Leopardus jacobitus\n",
+ "23 Leopardus pardalis mearnsi\n",
+ "24 Panthera onca\n",
+ "25 Panthera onca\n",
+ "26 Panthera onca\n",
+ "27 Panthera onca\n",
+ "28 Panthera onca\n",
+ "29 Panthera onca\n",
+ "... ...\n",
+ "49339 Martes flavigula\n",
+ "49340 Mustela sibirica\n",
+ "49341 Mustela sibirica\n",
+ "49342 Mustela sibirica\n",
+ "49343 Mustela sibirica\n",
+ "49344 Mustela sibirica\n",
+ "49345 Mustela sibirica\n",
+ "49346 Mustela sibirica\n",
+ "49347 Mustela sibirica\n",
+ "49348 Mustela sibirica\n",
+ "49349 Mustela sibirica\n",
+ "49350 Mustela sibirica\n",
+ "49351 Odobenus rosmarus\n",
+ "49352 Odobenus rosmarus\n",
+ "49353 Odobenus rosmarus\n",
+ "49354 Odobenus rosmarus\n",
+ "49355 Odobenus rosmarus\n",
+ "49356 Odobenus rosmarus\n",
+ "49357 Odobenus rosmarus\n",
+ "49358 Odobenus rosmarus\n",
+ "49359 Lodoicea maldivica\n",
+ "49360 Pavo cristatus\n",
+ "49361 Pavo cristatus\n",
+ "49362 Pavo cristatus\n",
+ "49363 Pavo cristatus\n",
+ "49364 Pavo cristatus\n",
+ "49365 Pavo cristatus\n",
+ "49366 Pavo cristatus\n",
+ "49367 Alligator mississippiensis\n",
+ "49368 Varanus salvator\n",
+ "\n",
+ "[49369 rows x 1 columns]"
+ ]
+ },
+ "execution_count": 185,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Only the taxon names are needed, so load just the \"Taxon\" column.\n",
+ "dataframe = pd.read_csv(\n",
+ "    \"data/goal_2_data.csv\",\n",
+ "    usecols=[\"Taxon\"],\n",
+ "    skipinitialspace=True,\n",
+ ")\n",
+ "\n",
+ "dataframe"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 186,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Loxodonta africana 3606\n",
+ "Python reticulatus 1523\n",
+ "Alligator mississippiensis 1297\n",
+ "Macaca fascicularis 1279\n",
+ "Varanus salvator 972\n",
+ "Elephas maximus 952\n",
+ "Cheloniidae spp. 866\n",
+ "Varanus niloticus 744\n",
+ "Elephantidae spp. 716\n",
+ "Crocodylus niloticus 685\n",
+ "Psittacus erithacus 632\n",
+ "Crocodylus porosus 560\n",
+ "Caiman crocodilus crocodilus 524\n",
+ "Python bivittatus 501\n",
+ "Ptyas mucosus 473\n",
+ "Chlorocebus aethiops 457\n",
+ "Falco peregrinus 403\n",
+ "Eretmochelys imbricata 393\n",
+ "Dalbergia nigra 354\n",
+ "Vicugna vicugna 337\n",
+ "Panthera pardus 325\n",
+ "Callithrix jacchus 323\n",
+ "Odobenus rosmarus 299\n",
+ "Falco rusticolus 296\n",
+ "Panthera tigris 283\n",
+ "Physeter macrocephalus 255\n",
+ "Hirudo medicinalis 249\n",
+ "Macaca mulatta 232\n",
+ "Crocodylus novaeguineae 202\n",
+ "Leopardus pardalis 201\n",
+ " ... \n",
+ "Micrastur ruficollis 1\n",
+ "Hydnophora spp. 1\n",
+ "Lycaste fulvescens 1\n",
+ "Errinopora pourtalesii 1\n",
+ "Maihueniopsis darwinii 1\n",
+ "Porites divaricata 1\n",
+ "Aloe trachyticola 1\n",
+ "Polemaetus bellicosus 1\n",
+ "Sternbergia candida 1\n",
+ "Errinopora spp. 1\n",
+ "Dracula tubeana 1\n",
+ "Chinchilla lanigera 1\n",
+ "Peniocereus spp. 1\n",
+ "Mesoplodon europaeus 1\n",
+ "Cypripedium yunnanense 1\n",
+ "Nectophrynoides minutus 1\n",
+ "Vidua paradisaea 1\n",
+ "Bulbophyllum resupinatum 1\n",
+ "Turbinicarpus mandragora 1\n",
+ "Dalbergia retusa 1\n",
+ "Pristis spp. 1\n",
+ "Masdevallia andreettaeana 1\n",
+ "Dendrobium violaceum 1\n",
+ "Favites abdita 1\n",
+ "Astrophytum myriostigma 1\n",
+ "Epiphyllum pumilum 1\n",
+ "Pterostylis fischii 1\n",
+ "Colpophyllia amaranthus 1\n",
+ "Acineta chrysantha 1\n",
+ "Anas spp. 1\n",
+ "Name: Taxon, Length: 3422, dtype: int64"
+ ]
+ },
+ "execution_count": 186,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Tally how often each taxon name occurs in the export records.\n",
+ "name_distribution = dataframe.loc[:, \"Taxon\"].value_counts()\n",
+ "name_distribution"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 187,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 187,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Number of most frequent taxon names to plot. The title is built from the\n",
+ "# same value so the heading always matches the number of bars drawn (the\n",
+ "# title used to say \"Top 10\" while 50 names were plotted).\n",
+ "top_n = 50\n",
+ "name_distribution.head(top_n).plot.bar(figsize=(10, 10), title=\"Top {} Taxon Names\".format(top_n))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Obviously we can't use this for our training set as our model would disproportionately learn to correct everything to \"Loxodonta africana\". We'll have to create a dataset of unique names..."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 188,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Total taxon names: 3422\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "array(['Equus przewalskii', 'Panthera onca', 'Varanus flavescens', ...,\n",
+ " 'Phaethornis longirostris', 'Mesoplodon stejnegeri',\n",
+ " 'Martes flavigula'], dtype=object)"
+ ]
+ },
+ "execution_count": 188,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Collapse the column to its distinct values so every taxon appears once.\n",
+ "names = pd.unique(dataframe[\"Taxon\"])\n",
+ "\n",
+ "print(\"Total taxon names: \", names.size)\n",
+ "names"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now we have a list of 3422 names that we can train our model on, but we'll need to generate some fake spelling mistakes first, so let's write a function to do that..."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 189,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Lxodonta afkicana\n"
+ ]
+ }
+ ],
+ "source": [
+ "def spelling_mistake_generator(name):\n",
+ "    \"\"\"Return `name` after applying one randomly selected kind of mistake.\n",
+ "\n",
+ "    A number from 0 to 4 is drawn and mapped to a helper:\n",
+ "    0 -> remove_letter, 1 -> add_letter, 2 -> swap_letters,\n",
+ "    3 -> remove_letter followed by add_letter, 4 -> lowercase.\n",
+ "    \"\"\"\n",
+ "    n = random.randint(0, 4)\n",
+ "    if n == 3:\n",
+ "        # The only two-step mistake: drop a letter, then add one.\n",
+ "        return add_letter(remove_letter(name))\n",
+ "    single_step_mistakes = {0: remove_letter, 1: add_letter, 2: swap_letters, 4: lowercase}\n",
+ "    return single_step_mistakes[n](name)\n",
+ "\n",
+ "def remove_letter(name):\n",
+ "    \"\"\"Return `name` with exactly one randomly chosen character deleted.\n",
+ "\n",
+ "    An empty string has nothing to delete and is returned unchanged.\n",
+ "    \"\"\"\n",
+ "    if not name:\n",
+ "        return name\n",
+ "    # randrange excludes its upper bound, so the index always lands on a real\n",
+ "    # character. randint(0, len(name)) is inclusive and could pick len(name),\n",
+ "    # in which case nothing was removed.\n",
+ "    random_slice = random.randrange(len(name))\n",
+ "    generated_name = name[:random_slice] + name[(random_slice + 1):]\n",
+ "    return generated_name\n",
+ "\n",
+ "def add_letter(name):\n",
+ "    \"\"\"Return `name` with one random ASCII letter inserted at a random position.\n",
+ "\n",
+ "    The position may be anywhere from before the first character to after the\n",
+ "    last one, so the result is always exactly one character longer.\n",
+ "    \"\"\"\n",
+ "    random_slice = random.randint(0, len(name))\n",
+ "    random_letter = random.choice(string.ascii_letters)\n",
+ "    # Resume at `random_slice`, not `random_slice + 1`: skipping ahead by one\n",
+ "    # overwrote the character at that position instead of adding a letter.\n",
+ "    generated_name = name[:random_slice] + random_letter + name[random_slice:]\n",
+ "    return generated_name\n",
+ "\n",
+ "def swap_letters(name):\n",
+ "    \"\"\"Return `name` with one randomly chosen pair of adjacent characters swapped.\n",
+ "\n",
+ "    Names shorter than two characters contain no adjacent pair and are\n",
+ "    returned unchanged.\n",
+ "    \"\"\"\n",
+ "    if len(name) < 2:\n",
+ "        # random.randint(0, len(name) - 2) raises ValueError for an empty range.\n",
+ "        return name\n",
+ "    random_slice = random.randint(0, len(name) - 2)\n",
+ "    swapped_pair = name[random_slice:random_slice + 2][::-1]\n",
+ "    generated_name = name[:random_slice] + swapped_pair + name[random_slice + 2:]\n",
+ "    return generated_name\n",
+ " \n",
+ "def reversed_string(a_string):\n",
+ "    \"\"\"Return the characters of `a_string` in reverse order.\"\"\"\n",
+ "    return \"\".join(reversed(a_string))\n",
+ "\n",
+ "def lowercase(name):\n",
+ "    \"\"\"Return `name` converted to lower case via its `lower()` method.\"\"\"\n",
+ "    lowered_name = name.lower()\n",
+ "    return lowered_name\n",
+ "\n",
+ "# Try the generator once on a sample name.\n",
+ "print(spelling_mistake_generator(name=\"Loxodonta africana\"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 190,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Loxodonta aLricana\n",
+ "Looxdonta africana\n",
+ "loxodonta africana\n",
+ "Loxodnta afrTcana\n",
+ "loxodonta africana\n",
+ "Loxodonta africMna\n",
+ "Loxodontaafricana\n",
+ "Loxodonta afrianaZ\n",
+ "Loxodonta africaq\n",
+ "loxodonta africana\n"
+ ]
+ }
+ ],
+ "source": [
+ "# The mistake is picked at random, so repeated calls show the variety produced.\n",
+ "for attempt in range(10):\n",
+ "    print(spelling_mistake_generator(name=\"Loxodonta africana\"))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Generating our Rosetta Stone\n",
+ "\n",
+ "We'll use our new spelling mistake generator to generate a new dataset where we have the erroneous data in one column, and the correct data next to it. Since we get a different mistake each time we run the generator, we'll create 100 examples of each term..."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 191,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Input | \n",
+ " Target | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Equus przeawlskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Equs przewalskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Equusprzewalskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Equus przealskiis | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Equus prUewalskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " Equus prlewalskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " equus przewalskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " Equus przewlskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " EquusPprzewalskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " Eqlus przewalskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " Equus pzrewalskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " Equus przeawlskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " Equusprzewalskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " Equus przewlskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " Equus przewalskji | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " Equs przewalskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " Equus przewlskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " Equus pzrewalskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " equus przewalskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " Equus przewalsNii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " Equus przewalsiki | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 21 | \n",
+ " Equus przealskiu | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 22 | \n",
+ " equus przewalskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 23 | \n",
+ " Equus przeawlskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " Equus przealWkii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 25 | \n",
+ " equus przewalskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 26 | \n",
+ " Equusp rzewalskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 27 | \n",
+ " Equus prEewalskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 28 | \n",
+ " equus przewalskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 29 | \n",
+ " squus przewalskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 342170 | \n",
+ " martes flavigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342171 | \n",
+ " Martes fwavgula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342172 | \n",
+ " Martek flvigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342173 | \n",
+ " Martes fwvigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342174 | \n",
+ " Martes flaigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342175 | \n",
+ " Martes flavigul | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342176 | \n",
+ " martes flavigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342177 | \n",
+ " martes flavigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342178 | \n",
+ " Martes flaJigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342179 | \n",
+ " partes flavigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342180 | \n",
+ " Martes lfavigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342181 | \n",
+ " MarteP flavigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342182 | \n",
+ " Martes flvibula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342183 | \n",
+ " Martse flavigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342184 | \n",
+ " Martes flaviula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342185 | \n",
+ " Martes flavigual | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342186 | \n",
+ " martes flavigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342187 | \n",
+ " Marts flavigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342188 | \n",
+ " Martesflavigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342189 | \n",
+ " Martes flNvigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342190 | \n",
+ " martes flavigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342191 | \n",
+ " martes flavigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342192 | \n",
+ " Maxtes flavigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342193 | \n",
+ " Martes flavigulaa | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342194 | \n",
+ " Martes flavigXla | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342195 | \n",
+ " aMrtes flavigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342196 | \n",
+ " MartesLflavigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342197 | \n",
+ " Marets flavigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342198 | \n",
+ " Martes flaviula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342199 | \n",
+ " martes flavigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
342200 rows × 2 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Input Target\n",
+ "0 Equus przeawlskii Equus przewalskii\n",
+ "1 Equs przewalskii Equus przewalskii\n",
+ "2 Equusprzewalskii Equus przewalskii\n",
+ "3 Equus przealskiis Equus przewalskii\n",
+ "4 Equus prUewalskii Equus przewalskii\n",
+ "5 Equus prlewalskii Equus przewalskii\n",
+ "6 equus przewalskii Equus przewalskii\n",
+ "7 Equus przewlskii Equus przewalskii\n",
+ "8 EquusPprzewalskii Equus przewalskii\n",
+ "9 Eqlus przewalskii Equus przewalskii\n",
+ "10 Equus pzrewalskii Equus przewalskii\n",
+ "11 Equus przeawlskii Equus przewalskii\n",
+ "12 Equusprzewalskii Equus przewalskii\n",
+ "13 Equus przewlskii Equus przewalskii\n",
+ "14 Equus przewalskji Equus przewalskii\n",
+ "15 Equs przewalskii Equus przewalskii\n",
+ "16 Equus przewlskii Equus przewalskii\n",
+ "17 Equus pzrewalskii Equus przewalskii\n",
+ "18 equus przewalskii Equus przewalskii\n",
+ "19 Equus przewalsNii Equus przewalskii\n",
+ "20 Equus przewalsiki Equus przewalskii\n",
+ "21 Equus przealskiu Equus przewalskii\n",
+ "22 equus przewalskii Equus przewalskii\n",
+ "23 Equus przeawlskii Equus przewalskii\n",
+ "24 Equus przealWkii Equus przewalskii\n",
+ "25 equus przewalskii Equus przewalskii\n",
+ "26 Equusp rzewalskii Equus przewalskii\n",
+ "27 Equus prEewalskii Equus przewalskii\n",
+ "28 equus przewalskii Equus przewalskii\n",
+ "29 squus przewalskii Equus przewalskii\n",
+ "... ... ...\n",
+ "342170 martes flavigula Martes flavigula\n",
+ "342171 Martes fwavgula Martes flavigula\n",
+ "342172 Martek flvigula Martes flavigula\n",
+ "342173 Martes fwvigula Martes flavigula\n",
+ "342174 Martes flaigula Martes flavigula\n",
+ "342175 Martes flavigul Martes flavigula\n",
+ "342176 martes flavigula Martes flavigula\n",
+ "342177 martes flavigula Martes flavigula\n",
+ "342178 Martes flaJigula Martes flavigula\n",
+ "342179 partes flavigula Martes flavigula\n",
+ "342180 Martes lfavigula Martes flavigula\n",
+ "342181 MarteP flavigula Martes flavigula\n",
+ "342182 Martes flvibula Martes flavigula\n",
+ "342183 Martse flavigula Martes flavigula\n",
+ "342184 Martes flaviula Martes flavigula\n",
+ "342185 Martes flavigual Martes flavigula\n",
+ "342186 martes flavigula Martes flavigula\n",
+ "342187 Marts flavigula Martes flavigula\n",
+ "342188 Martesflavigula Martes flavigula\n",
+ "342189 Martes flNvigula Martes flavigula\n",
+ "342190 martes flavigula Martes flavigula\n",
+ "342191 martes flavigula Martes flavigula\n",
+ "342192 Maxtes flavigula Martes flavigula\n",
+ "342193 Martes flavigulaa Martes flavigula\n",
+ "342194 Martes flavigXla Martes flavigula\n",
+ "342195 aMrtes flavigula Martes flavigula\n",
+ "342196 MartesLflavigula Martes flavigula\n",
+ "342197 Marets flavigula Martes flavigula\n",
+ "342198 Martes flaviula Martes flavigula\n",
+ "342199 martes flavigula Martes flavigula\n",
+ "\n",
+ "[342200 rows x 2 columns]"
+ ]
+ },
+ "execution_count": 191,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Repeat every unique name 100 times and pair each copy with its own typo.\n",
+ "repeated_names = np.repeat(names, 100)\n",
+ "spelling_errors = list(map(spelling_mistake_generator, repeated_names))\n",
+ "\n",
+ "# Column 0 holds the misspelt input, column 1 the correct target.\n",
+ "corpus = pd.DataFrame(\n",
+ "    np.column_stack((spelling_errors, repeated_names)),\n",
+ "    columns=[\"Input\", \"Target\"],\n",
+ ")\n",
+ "corpus"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We've ended up with 342200 records of names with minor typos and formatting mistakes. (You can change the number of repetitions from 100 to 1000 if you need more data, but I've left this as 100 to save time. 1000 and upwards takes a little while to generate)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Vectorise our data for learning\n",
+ "\n",
+ "We'll need to encode our data from letters to numbers for our model to be able to deal with it. We'll take our table, and create two lists, one of all the characters in our input dataset, and one of all the characters in our target dataset. We'll also add a start and end character to our target data as this will be useful for our model to understand when to start and stop generating..."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 192,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Markers wrapped around every target text to signal where it starts and ends.\n",
+ "start_character, end_character = '\\t', '\\n'\n",
+ "\n",
+ "# Parallel lists: the text at a given position in one pairs with the other.\n",
+ "input_texts, target_texts = [], []\n",
+ "\n",
+ "# Distinct characters encountered on each side of the corpus.\n",
+ "input_characters, target_characters = set(), set()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 193,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Takes in the input and target texts and adds their characters to the list of input and target characters\n",
+ "\n",
+ "def build_character_lists(input_text, target_text):\n",
+ "    \"\"\"Record every character of both texts in the module-level character sets.\n",
+ "\n",
+ "    Characters of `input_text` go into `input_characters` and characters of\n",
+ "    `target_text` go into `target_characters`; both sets are updated in place.\n",
+ "    \"\"\"\n",
+ "    input_characters.update(input_text)\n",
+ "    target_characters.update(target_text)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 194,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Walk the two columns in lockstep. Zipping the columns yields the same pairs\n",
+ "# as DataFrame.iterrows() without building a Series for every row, which is\n",
+ "# slow on a corpus of this size.\n",
+ "for input_text, target_name in zip(corpus[\"Input\"], corpus[\"Target\"]):\n",
+ "    # Wrap the target in the start/end markers before recording it.\n",
+ "    target_text = start_character + target_name + end_character\n",
+ "    build_character_lists(input_text, target_text)\n",
+ "    input_texts.append(input_text)\n",
+ "    target_texts.append(target_text)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 195,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of samples: 342200\n",
+ "Number of unique input tokens: 55\n",
+ "Number of unique output tokens: 56\n",
+ "Max sequence length for inputs: 36\n",
+ "Max sequence length for outputs: 38\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "['\\t',\n",
+ " '\\n',\n",
+ " ' ',\n",
+ " '-',\n",
+ " '.',\n",
+ " 'A',\n",
+ " 'B',\n",
+ " 'C',\n",
+ " 'D',\n",
+ " 'E',\n",
+ " 'F',\n",
+ " 'G',\n",
+ " 'H',\n",
+ " 'I',\n",
+ " 'J',\n",
+ " 'K',\n",
+ " 'L',\n",
+ " 'M',\n",
+ " 'N',\n",
+ " 'O',\n",
+ " 'P',\n",
+ " 'Q',\n",
+ " 'R',\n",
+ " 'S',\n",
+ " 'T',\n",
+ " 'U',\n",
+ " 'V',\n",
+ " 'W',\n",
+ " 'X',\n",
+ " 'Z',\n",
+ " 'a',\n",
+ " 'b',\n",
+ " 'c',\n",
+ " 'd',\n",
+ " 'e',\n",
+ " 'f',\n",
+ " 'g',\n",
+ " 'h',\n",
+ " 'i',\n",
+ " 'j',\n",
+ " 'k',\n",
+ " 'l',\n",
+ " 'm',\n",
+ " 'n',\n",
+ " 'o',\n",
+ " 'p',\n",
+ " 'q',\n",
+ " 'r',\n",
+ " 's',\n",
+ " 't',\n",
+ " 'u',\n",
+ " 'v',\n",
+ " 'w',\n",
+ " 'x',\n",
+ " 'y',\n",
+ " 'z']"
+ ]
+ },
+ "execution_count": 195,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Turn the collected characters into sorted lists so each one has a fixed position.\n",
+ "input_characters = sorted(input_characters)\n",
+ "target_characters = sorted(target_characters)\n",
+ "num_encoder_tokens = len(input_characters)\n",
+ "num_decoder_tokens = len(target_characters)\n",
+ "\n",
+ "# Length of the longest text on each side.\n",
+ "max_encoder_seq_length = max(map(len, input_texts))\n",
+ "max_decoder_seq_length = max(map(len, target_texts))\n",
+ "\n",
+ "print('Number of samples:', len(input_texts))\n",
+ "print('Number of unique input tokens:', num_encoder_tokens)\n",
+ "print('Number of unique output tokens:', num_decoder_tokens)\n",
+ "print('Max sequence length for inputs:', max_encoder_seq_length)\n",
+ "print('Max sequence length for outputs:', max_decoder_seq_length)\n",
+ "\n",
+ "target_characters"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We'll create two dictionaries to help us get from our characters to numbers and back for both our input and target dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 198,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{' ': 0, '-': 1, '.': 2, 'A': 3, 'B': 4, 'C': 5, 'D': 6, 'E': 7, 'F': 8, 'G': 9, 'H': 10, 'I': 11, 'J': 12, 'K': 13, 'L': 14, 'M': 15, 'N': 16, 'O': 17, 'P': 18, 'Q': 19, 'R': 20, 'S': 21, 'T': 22, 'U': 23, 'V': 24, 'W': 25, 'X': 26, 'Y': 27, 'Z': 28, 'a': 29, 'b': 30, 'c': 31, 'd': 32, 'e': 33, 'f': 34, 'g': 35, 'h': 36, 'i': 37, 'j': 38, 'k': 39, 'l': 40, 'm': 41, 'n': 42, 'o': 43, 'p': 44, 'q': 45, 'r': 46, 's': 47, 't': 48, 'u': 49, 'v': 50, 'w': 51, 'x': 52, 'y': 53, 'z': 54}\n",
+ "{'\\t': 0, '\\n': 1, ' ': 2, '-': 3, '.': 4, 'A': 5, 'B': 6, 'C': 7, 'D': 8, 'E': 9, 'F': 10, 'G': 11, 'H': 12, 'I': 13, 'J': 14, 'K': 15, 'L': 16, 'M': 17, 'N': 18, 'O': 19, 'P': 20, 'Q': 21, 'R': 22, 'S': 23, 'T': 24, 'U': 25, 'V': 26, 'W': 27, 'X': 28, 'Z': 29, 'a': 30, 'b': 31, 'c': 32, 'd': 33, 'e': 34, 'f': 35, 'g': 36, 'h': 37, 'i': 38, 'j': 39, 'k': 40, 'l': 41, 'm': 42, 'n': 43, 'o': 44, 'p': 45, 'q': 46, 'r': 47, 's': 48, 't': 49, 'u': 50, 'v': 51, 'w': 52, 'x': 53, 'y': 54, 'z': 55}\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Map every character to its position in the corresponding character list.\n",
+ "input_token_index = {char: position for position, char in enumerate(input_characters)}\n",
+ "target_token_index = {char: position for position, char in enumerate(target_characters)}\n",
+ "\n",
+ "print(input_token_index)\n",
+ "print(target_token_index)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 199,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# encoder_input_data: zeros of shape (num_pairs, max input seq length, num input characters)\n",
+ "encoder_input_data = np.zeros(\n",
+ "    shape=(len(input_texts), max_encoder_seq_length, num_encoder_tokens),\n",
+ "    dtype=np.float32,\n",
+ ")\n",
+ "\n",
+ "# decoder_input_data: zeros of shape (num_pairs, max target seq length, num target characters)\n",
+ "decoder_input_data = np.zeros(\n",
+ "    shape=(len(input_texts), max_decoder_seq_length, num_decoder_tokens),\n",
+ "    dtype=np.float32,\n",
+ ")\n",
+ "\n",
+ "# decoder_target_data has the same shape as decoder_input_data but is offset by one\n",
+ "# timestep: decoder_target_data[:, t, :] will match decoder_input_data[:, t + 1, :].\n",
+ "decoder_target_data = np.zeros(\n",
+ "    shape=(len(input_texts), max_decoder_seq_length, num_decoder_tokens),\n",
+ "    dtype=np.float32,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Building our model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 200,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# sample = index of the training example\n",
+ "# step   = time step within the text\n",
+ "# The slot for the character at each step is set to 1 (one-hot encoding).\n",
+ "\n",
+ "for sample, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):\n",
+ "    for step, char in enumerate(input_text):\n",
+ "        encoder_input_data[sample, step, input_token_index[char]] = 1.0\n",
+ "    for step, char in enumerate(target_text):\n",
+ "        token = target_token_index[char]\n",
+ "        decoder_input_data[sample, step, token] = 1.0\n",
+ "        if step == 0:\n",
+ "            # The start character is never a prediction target.\n",
+ "            continue\n",
+ "        # decoder_target_data runs one timestep ahead of decoder_input_data.\n",
+ "        decoder_target_data[sample, step - 1, token] = 1.0"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 202,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Number of units in the LSTM layers.\n",
+ "latent_dim = 256\n",
+ "\n",
+ "# The encoder takes a sequence of one-hot input characters of any length.\n",
+ "encoder_inputs = Input(shape=(None, num_encoder_tokens))\n",
+ "encoder = LSTM(units=latent_dim, return_state=True)\n",
+ "encoder_outputs, state_h, state_c = encoder(encoder_inputs)\n",
+ "\n",
+ "# Only the two states are kept; `encoder_outputs` is discarded.\n",
+ "encoder_states = [state_h, state_c]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 203,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# The decoder reads the target sequence and starts from `encoder_states`.\n",
+ "decoder_inputs = Input(shape=(None, num_decoder_tokens))\n",
+ "\n",
+ "# The decoder LSTM returns the full output sequence as well as its internal\n",
+ "# states. The states are ignored in the training model but are meant to be\n",
+ "# used at inference time.\n",
+ "decoder_lstm = LSTM(units=latent_dim, return_state=True, return_sequences=True)\n",
+ "decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)\n",
+ "\n",
+ "# Softmax layer sized to the number of target characters.\n",
+ "decoder_dense = Dense(units=num_decoder_tokens, activation='softmax')\n",
+ "decoder_outputs = decoder_dense(decoder_outputs)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 204,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# The training model maps `encoder_input_data` and `decoder_input_data`\n",
+ "# onto `decoder_target_data`.\n",
+ "model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 205,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Train on 273760 samples, validate on 68440 samples\n",
+ "Epoch 1/100\n",
+ "102336/273760 [==========>...................] - ETA: 6:04 - loss: 0.9042"
+ ]
+ },
+ {
+ "ename": "KeyboardInterrupt",
+ "evalue": "",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mbatch_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbatch_size\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0mepochs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mepochs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m validation_split=0.2)\n\u001b[0m\u001b[1;32m 10\u001b[0m \u001b[0;31m# Save model\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m's2s.h5'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/usr/local/lib/python3.6/site-packages/keras/engine/training.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, **kwargs)\u001b[0m\n\u001b[1;32m 1703\u001b[0m \u001b[0minitial_epoch\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minitial_epoch\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1704\u001b[0m \u001b[0msteps_per_epoch\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msteps_per_epoch\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1705\u001b[0;31m validation_steps=validation_steps)\n\u001b[0m\u001b[1;32m 1706\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1707\u001b[0m def evaluate(self, x=None, y=None,\n",
+ "\u001b[0;32m/usr/local/lib/python3.6/site-packages/keras/engine/training.py\u001b[0m in \u001b[0;36m_fit_loop\u001b[0;34m(self, f, ins, out_labels, batch_size, epochs, verbose, callbacks, val_f, val_ins, shuffle, callback_metrics, initial_epoch, steps_per_epoch, validation_steps)\u001b[0m\n\u001b[1;32m 1233\u001b[0m \u001b[0mins_batch\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mins_batch\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtoarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1234\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1235\u001b[0;31m \u001b[0mouts\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mins_batch\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1236\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mouts\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1237\u001b[0m \u001b[0mouts\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mouts\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/usr/local/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, inputs)\u001b[0m\n\u001b[1;32m 2476\u001b[0m \u001b[0msession\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_session\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2477\u001b[0m updated = session.run(fetches=fetches, feed_dict=feed_dict,\n\u001b[0;32m-> 2478\u001b[0;31m **self.session_kwargs)\n\u001b[0m\u001b[1;32m 2479\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mupdated\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2480\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/usr/local/lib/python3.6/site-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(self, fetches, feed_dict, options, run_metadata)\u001b[0m\n\u001b[1;32m 903\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 904\u001b[0m result = self._run(None, fetches, feed_dict, options_ptr,\n\u001b[0;32m--> 905\u001b[0;31m run_metadata_ptr)\n\u001b[0m\u001b[1;32m 906\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrun_metadata\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 907\u001b[0m \u001b[0mproto_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtf_session\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTF_GetBuffer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrun_metadata_ptr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/usr/local/lib/python3.6/site-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36m_run\u001b[0;34m(self, handle, fetches, feed_dict, options, run_metadata)\u001b[0m\n\u001b[1;32m 1135\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mfinal_fetches\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mfinal_targets\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mhandle\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mfeed_dict_tensor\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1136\u001b[0m results = self._do_run(handle, final_targets, final_fetches,\n\u001b[0;32m-> 1137\u001b[0;31m feed_dict_tensor, options, run_metadata)\n\u001b[0m\u001b[1;32m 1138\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1139\u001b[0m \u001b[0mresults\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/usr/local/lib/python3.6/site-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36m_do_run\u001b[0;34m(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)\u001b[0m\n\u001b[1;32m 1353\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhandle\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1354\u001b[0m return self._do_call(_run_fn, self._session, feeds, fetches, targets,\n\u001b[0;32m-> 1355\u001b[0;31m options, run_metadata)\n\u001b[0m\u001b[1;32m 1356\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1357\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_do_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_prun_fn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_session\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhandle\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeeds\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfetches\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/usr/local/lib/python3.6/site-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36m_do_call\u001b[0;34m(self, fn, *args)\u001b[0m\n\u001b[1;32m 1359\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_do_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1360\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1361\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1362\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mOpError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1363\u001b[0m \u001b[0mmessage\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcompat\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mas_text\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmessage\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/usr/local/lib/python3.6/site-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36m_run_fn\u001b[0;34m(session, feed_dict, fetch_list, target_list, options, run_metadata)\u001b[0m\n\u001b[1;32m 1338\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1339\u001b[0m return tf_session.TF_Run(session, options, feed_dict, fetch_list,\n\u001b[0;32m-> 1340\u001b[0;31m target_list, status, run_metadata)\n\u001b[0m\u001b[1;32m 1341\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1342\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_prun_fn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msession\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhandle\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeed_dict\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfetch_list\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
+ ]
+ }
+ ],
+ "source": [
+ "batch_size = 64 # Batch size for training.\n",
+ "epochs = 100 # Number of epochs to train for.\n",
+ "\n",
+ "# Run training\n",
+ "model.compile(optimizer='rmsprop', loss='categorical_crossentropy')\n",
+ "model.fit([encoder_input_data, decoder_input_data], decoder_target_data,\n",
+ " batch_size=batch_size,\n",
+ " epochs=epochs,\n",
+ " validation_split=0.2)\n",
+ "# Save model\n",
+ "model.save('s2s.h5')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/2. Learning about Trade Data.ipynb b/2. Learning about Trade Data.ipynb
index fff7aaa..b92c302 100644
--- a/2. Learning about Trade Data.ipynb
+++ b/2. Learning about Trade Data.ipynb
@@ -4024,7 +4024,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.4"
+ "version": "3.6.5"
}
},
"nbformat": 4,
diff --git a/3a. Taxon Autocorrect with LSTM Autoencoders.ipynb b/3a. Taxon Autocorrect with LSTM Autoencoders.ipynb
new file mode 100644
index 0000000..a6dc5f4
--- /dev/null
+++ b/3a. Taxon Autocorrect with LSTM Autoencoders.ipynb
@@ -0,0 +1,1447 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "# 3a. Autocorrecting Misspelt Taxon Names with Autoencoders\n",
+ "Given a list of taxon names, can we build an autocorrect model to autonomously fix erroneous records?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 184,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "import matplotlib\n",
+ "import random\n",
+ "import string\n",
+ "from keras.models import Model\n",
+ "from keras.preprocessing import sequence\n",
+ "from keras.layers import Input, LSTM, Dense"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Exploring the dataset\n",
+    "We'll use the same dataset as last time: a publicly available list of UK exports from 1975 to 2016. We'll only need the taxon names, so we'll restrict our import to the taxon column."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 185,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Taxon | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Panthera onca | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Varanus flavescens | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Varanus griseus | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Branta ruficollis | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " Leopardus pardalis | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " Leopardus wiedii | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " Diceros bicornis | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " Asarcornis scutulata | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " Branta sandvicensis | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " Branta sandvicensis | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " Cercopithecus diana | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " Rucervus duvaucelii | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " Crocodylus siamensis | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " Elephas maximus | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " Elephas maximus | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " Elephas maximus | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " Elephas maximus | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " Falco peregrinus | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " Acinonyx jubatus | \n",
+ "
\n",
+ " \n",
+ " 21 | \n",
+ " Catopuma temminckii | \n",
+ "
\n",
+ " \n",
+ " 22 | \n",
+ " Leopardus jacobitus | \n",
+ "
\n",
+ " \n",
+ " 23 | \n",
+ " Leopardus pardalis mearnsi | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " Panthera onca | \n",
+ "
\n",
+ " \n",
+ " 25 | \n",
+ " Panthera onca | \n",
+ "
\n",
+ " \n",
+ " 26 | \n",
+ " Panthera onca | \n",
+ "
\n",
+ " \n",
+ " 27 | \n",
+ " Panthera onca | \n",
+ "
\n",
+ " \n",
+ " 28 | \n",
+ " Panthera onca | \n",
+ "
\n",
+ " \n",
+ " 29 | \n",
+ " Panthera onca | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 49339 | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 49340 | \n",
+ " Mustela sibirica | \n",
+ "
\n",
+ " \n",
+ " 49341 | \n",
+ " Mustela sibirica | \n",
+ "
\n",
+ " \n",
+ " 49342 | \n",
+ " Mustela sibirica | \n",
+ "
\n",
+ " \n",
+ " 49343 | \n",
+ " Mustela sibirica | \n",
+ "
\n",
+ " \n",
+ " 49344 | \n",
+ " Mustela sibirica | \n",
+ "
\n",
+ " \n",
+ " 49345 | \n",
+ " Mustela sibirica | \n",
+ "
\n",
+ " \n",
+ " 49346 | \n",
+ " Mustela sibirica | \n",
+ "
\n",
+ " \n",
+ " 49347 | \n",
+ " Mustela sibirica | \n",
+ "
\n",
+ " \n",
+ " 49348 | \n",
+ " Mustela sibirica | \n",
+ "
\n",
+ " \n",
+ " 49349 | \n",
+ " Mustela sibirica | \n",
+ "
\n",
+ " \n",
+ " 49350 | \n",
+ " Mustela sibirica | \n",
+ "
\n",
+ " \n",
+ " 49351 | \n",
+ " Odobenus rosmarus | \n",
+ "
\n",
+ " \n",
+ " 49352 | \n",
+ " Odobenus rosmarus | \n",
+ "
\n",
+ " \n",
+ " 49353 | \n",
+ " Odobenus rosmarus | \n",
+ "
\n",
+ " \n",
+ " 49354 | \n",
+ " Odobenus rosmarus | \n",
+ "
\n",
+ " \n",
+ " 49355 | \n",
+ " Odobenus rosmarus | \n",
+ "
\n",
+ " \n",
+ " 49356 | \n",
+ " Odobenus rosmarus | \n",
+ "
\n",
+ " \n",
+ " 49357 | \n",
+ " Odobenus rosmarus | \n",
+ "
\n",
+ " \n",
+ " 49358 | \n",
+ " Odobenus rosmarus | \n",
+ "
\n",
+ " \n",
+ " 49359 | \n",
+ " Lodoicea maldivica | \n",
+ "
\n",
+ " \n",
+ " 49360 | \n",
+ " Pavo cristatus | \n",
+ "
\n",
+ " \n",
+ " 49361 | \n",
+ " Pavo cristatus | \n",
+ "
\n",
+ " \n",
+ " 49362 | \n",
+ " Pavo cristatus | \n",
+ "
\n",
+ " \n",
+ " 49363 | \n",
+ " Pavo cristatus | \n",
+ "
\n",
+ " \n",
+ " 49364 | \n",
+ " Pavo cristatus | \n",
+ "
\n",
+ " \n",
+ " 49365 | \n",
+ " Pavo cristatus | \n",
+ "
\n",
+ " \n",
+ " 49366 | \n",
+ " Pavo cristatus | \n",
+ "
\n",
+ " \n",
+ " 49367 | \n",
+ " Alligator mississippiensis | \n",
+ "
\n",
+ " \n",
+ " 49368 | \n",
+ " Varanus salvator | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
49369 rows × 1 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Taxon\n",
+ "0 Equus przewalskii\n",
+ "1 Panthera onca\n",
+ "2 Varanus flavescens\n",
+ "3 Varanus griseus\n",
+ "4 Branta ruficollis\n",
+ "5 Leopardus pardalis\n",
+ "6 Leopardus wiedii\n",
+ "7 Diceros bicornis\n",
+ "8 Asarcornis scutulata\n",
+ "9 Branta sandvicensis\n",
+ "10 Branta sandvicensis\n",
+ "11 Cercopithecus diana\n",
+ "12 Rucervus duvaucelii\n",
+ "13 Crocodylus siamensis\n",
+ "14 Elephas maximus\n",
+ "15 Elephas maximus\n",
+ "16 Elephas maximus\n",
+ "17 Elephas maximus\n",
+ "18 Equus przewalskii\n",
+ "19 Falco peregrinus\n",
+ "20 Acinonyx jubatus\n",
+ "21 Catopuma temminckii\n",
+ "22 Leopardus jacobitus\n",
+ "23 Leopardus pardalis mearnsi\n",
+ "24 Panthera onca\n",
+ "25 Panthera onca\n",
+ "26 Panthera onca\n",
+ "27 Panthera onca\n",
+ "28 Panthera onca\n",
+ "29 Panthera onca\n",
+ "... ...\n",
+ "49339 Martes flavigula\n",
+ "49340 Mustela sibirica\n",
+ "49341 Mustela sibirica\n",
+ "49342 Mustela sibirica\n",
+ "49343 Mustela sibirica\n",
+ "49344 Mustela sibirica\n",
+ "49345 Mustela sibirica\n",
+ "49346 Mustela sibirica\n",
+ "49347 Mustela sibirica\n",
+ "49348 Mustela sibirica\n",
+ "49349 Mustela sibirica\n",
+ "49350 Mustela sibirica\n",
+ "49351 Odobenus rosmarus\n",
+ "49352 Odobenus rosmarus\n",
+ "49353 Odobenus rosmarus\n",
+ "49354 Odobenus rosmarus\n",
+ "49355 Odobenus rosmarus\n",
+ "49356 Odobenus rosmarus\n",
+ "49357 Odobenus rosmarus\n",
+ "49358 Odobenus rosmarus\n",
+ "49359 Lodoicea maldivica\n",
+ "49360 Pavo cristatus\n",
+ "49361 Pavo cristatus\n",
+ "49362 Pavo cristatus\n",
+ "49363 Pavo cristatus\n",
+ "49364 Pavo cristatus\n",
+ "49365 Pavo cristatus\n",
+ "49366 Pavo cristatus\n",
+ "49367 Alligator mississippiensis\n",
+ "49368 Varanus salvator\n",
+ "\n",
+ "[49369 rows x 1 columns]"
+ ]
+ },
+ "execution_count": 185,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dataframe = pd.read_csv(\"data/goal_2_data.csv\", skipinitialspace=True, usecols=[\"Taxon\"])\n",
+ "\n",
+ "dataframe"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 186,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Loxodonta africana 3606\n",
+ "Python reticulatus 1523\n",
+ "Alligator mississippiensis 1297\n",
+ "Macaca fascicularis 1279\n",
+ "Varanus salvator 972\n",
+ "Elephas maximus 952\n",
+ "Cheloniidae spp. 866\n",
+ "Varanus niloticus 744\n",
+ "Elephantidae spp. 716\n",
+ "Crocodylus niloticus 685\n",
+ "Psittacus erithacus 632\n",
+ "Crocodylus porosus 560\n",
+ "Caiman crocodilus crocodilus 524\n",
+ "Python bivittatus 501\n",
+ "Ptyas mucosus 473\n",
+ "Chlorocebus aethiops 457\n",
+ "Falco peregrinus 403\n",
+ "Eretmochelys imbricata 393\n",
+ "Dalbergia nigra 354\n",
+ "Vicugna vicugna 337\n",
+ "Panthera pardus 325\n",
+ "Callithrix jacchus 323\n",
+ "Odobenus rosmarus 299\n",
+ "Falco rusticolus 296\n",
+ "Panthera tigris 283\n",
+ "Physeter macrocephalus 255\n",
+ "Hirudo medicinalis 249\n",
+ "Macaca mulatta 232\n",
+ "Crocodylus novaeguineae 202\n",
+ "Leopardus pardalis 201\n",
+ " ... \n",
+ "Micrastur ruficollis 1\n",
+ "Hydnophora spp. 1\n",
+ "Lycaste fulvescens 1\n",
+ "Errinopora pourtalesii 1\n",
+ "Maihueniopsis darwinii 1\n",
+ "Porites divaricata 1\n",
+ "Aloe trachyticola 1\n",
+ "Polemaetus bellicosus 1\n",
+ "Sternbergia candida 1\n",
+ "Errinopora spp. 1\n",
+ "Dracula tubeana 1\n",
+ "Chinchilla lanigera 1\n",
+ "Peniocereus spp. 1\n",
+ "Mesoplodon europaeus 1\n",
+ "Cypripedium yunnanense 1\n",
+ "Nectophrynoides minutus 1\n",
+ "Vidua paradisaea 1\n",
+ "Bulbophyllum resupinatum 1\n",
+ "Turbinicarpus mandragora 1\n",
+ "Dalbergia retusa 1\n",
+ "Pristis spp. 1\n",
+ "Masdevallia andreettaeana 1\n",
+ "Dendrobium violaceum 1\n",
+ "Favites abdita 1\n",
+ "Astrophytum myriostigma 1\n",
+ "Epiphyllum pumilum 1\n",
+ "Pterostylis fischii 1\n",
+ "Colpophyllia amaranthus 1\n",
+ "Acineta chrysantha 1\n",
+ "Anas spp. 1\n",
+ "Name: Taxon, Length: 3422, dtype: int64"
+ ]
+ },
+ "execution_count": 186,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "name_distribution = dataframe[\"Taxon\"].value_counts()\n",
+ "name_distribution"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 187,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 187,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "name_distribution.head(50).plot.bar(figsize=(10, 10), title=\"Top 10 Taxon Names\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "Obviously we can't use this for our training set, as our model would disproportionately learn to correct everything to \"Loxodonta africana\". We'll have to create a dataset of unique names..."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 188,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Total taxon names: 3422\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "array(['Equus przewalskii', 'Panthera onca', 'Varanus flavescens', ...,\n",
+ " 'Phaethornis longirostris', 'Mesoplodon stejnegeri',\n",
+ " 'Martes flavigula'], dtype=object)"
+ ]
+ },
+ "execution_count": 188,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "names = dataframe[\"Taxon\"].unique()\n",
+ "\n",
+ "print(\"Total taxon names: \", len(names))\n",
+ "names"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now we have a list of 3422 names that we can train our model on, but we'll need to generate some fake spelling mistakes first, so let's write a function to do that..."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 189,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Lxodonta afkicana\n"
+ ]
+ }
+ ],
+ "source": [
+ "def spelling_mistake_generator(name):\n",
+ " n = random.randint(0,4)\n",
+ " if n == 0:\n",
+ " return remove_letter(name)\n",
+ " elif n == 1:\n",
+ " return add_letter(name)\n",
+ " elif n == 2:\n",
+ " return swap_letters(name)\n",
+ " elif n == 3:\n",
+ " name = remove_letter(name)\n",
+ " return add_letter(name)\n",
+ " elif n == 4:\n",
+ " return lowercase(name)\n",
+ "\n",
+ "def remove_letter(name):\n",
+ " random_slice = random.randint(0, len(name))\n",
+ " generated_name = name[:random_slice] + name[(random_slice + 1):]\n",
+ " return generated_name\n",
+ "\n",
+ "def add_letter(name):\n",
+ " random_slice = random.randint(0, len(name))\n",
+ " random_letter = random.choice(string.ascii_letters)\n",
+ " generated_name = name[:random_slice] + random_letter + name[(random_slice + 1):]\n",
+ " return generated_name\n",
+ "\n",
+ "def swap_letters(name):\n",
+ " random_slice = random.randint(0, len(name) - 2)\n",
+ " generated_name = name[:random_slice] + reversed_string(name[random_slice:random_slice + 2]) + name[random_slice + 2:]\n",
+ " return generated_name\n",
+ " \n",
+ "def reversed_string(a_string):\n",
+ " return a_string[::-1]\n",
+ "\n",
+ "def lowercase(name):\n",
+ " return name.lower()\n",
+ "\n",
+ "print(spelling_mistake_generator(\"Loxodonta africana\"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 190,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Loxodonta aLricana\n",
+ "Looxdonta africana\n",
+ "loxodonta africana\n",
+ "Loxodnta afrTcana\n",
+ "loxodonta africana\n",
+ "Loxodonta africMna\n",
+ "Loxodontaafricana\n",
+ "Loxodonta afrianaZ\n",
+ "Loxodonta africaq\n",
+ "loxodonta africana\n"
+ ]
+ }
+ ],
+ "source": [
+ "for i in range(10):\n",
+ " print(spelling_mistake_generator(\"Loxodonta africana\"))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Generating our Rosetta Stone\n",
+ "\n",
+ "We'll use our new spelling mistake generator to generate a new dataset where we have the erroneous data in one column, and the correct data next to it. Since we get a different mistake each time we run the generator, we'll create 100 examples of each term..."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 191,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Input | \n",
+ " Target | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Equus przeawlskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Equs przewalskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Equusprzewalskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Equus przealskiis | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Equus prUewalskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " Equus prlewalskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " equus przewalskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " Equus przewlskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " EquusPprzewalskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " Eqlus przewalskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " Equus pzrewalskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " Equus przeawlskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " Equusprzewalskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " Equus przewlskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " Equus przewalskji | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " Equs przewalskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " Equus przewlskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " Equus pzrewalskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " equus przewalskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " Equus przewalsNii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " Equus przewalsiki | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 21 | \n",
+ " Equus przealskiu | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 22 | \n",
+ " equus przewalskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 23 | \n",
+ " Equus przeawlskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " Equus przealWkii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 25 | \n",
+ " equus przewalskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 26 | \n",
+ " Equusp rzewalskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 27 | \n",
+ " Equus prEewalskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 28 | \n",
+ " equus przewalskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " 29 | \n",
+ " squus przewalskii | \n",
+ " Equus przewalskii | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 342170 | \n",
+ " martes flavigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342171 | \n",
+ " Martes fwavgula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342172 | \n",
+ " Martek flvigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342173 | \n",
+ " Martes fwvigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342174 | \n",
+ " Martes flaigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342175 | \n",
+ " Martes flavigul | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342176 | \n",
+ " martes flavigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342177 | \n",
+ " martes flavigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342178 | \n",
+ " Martes flaJigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342179 | \n",
+ " partes flavigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342180 | \n",
+ " Martes lfavigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342181 | \n",
+ " MarteP flavigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342182 | \n",
+ " Martes flvibula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342183 | \n",
+ " Martse flavigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342184 | \n",
+ " Martes flaviula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342185 | \n",
+ " Martes flavigual | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342186 | \n",
+ " martes flavigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342187 | \n",
+ " Marts flavigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342188 | \n",
+ " Martesflavigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342189 | \n",
+ " Martes flNvigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342190 | \n",
+ " martes flavigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342191 | \n",
+ " martes flavigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342192 | \n",
+ " Maxtes flavigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342193 | \n",
+ " Martes flavigulaa | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342194 | \n",
+ " Martes flavigXla | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342195 | \n",
+ " aMrtes flavigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342196 | \n",
+ " MartesLflavigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342197 | \n",
+ " Marets flavigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342198 | \n",
+ " Martes flaviula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ " 342199 | \n",
+ " martes flavigula | \n",
+ " Martes flavigula | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
342200 rows × 2 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Input Target\n",
+ "0 Equus przeawlskii Equus przewalskii\n",
+ "1 Equs przewalskii Equus przewalskii\n",
+ "2 Equusprzewalskii Equus przewalskii\n",
+ "3 Equus przealskiis Equus przewalskii\n",
+ "4 Equus prUewalskii Equus przewalskii\n",
+ "5 Equus prlewalskii Equus przewalskii\n",
+ "6 equus przewalskii Equus przewalskii\n",
+ "7 Equus przewlskii Equus przewalskii\n",
+ "8 EquusPprzewalskii Equus przewalskii\n",
+ "9 Eqlus przewalskii Equus przewalskii\n",
+ "10 Equus pzrewalskii Equus przewalskii\n",
+ "11 Equus przeawlskii Equus przewalskii\n",
+ "12 Equusprzewalskii Equus przewalskii\n",
+ "13 Equus przewlskii Equus przewalskii\n",
+ "14 Equus przewalskji Equus przewalskii\n",
+ "15 Equs przewalskii Equus przewalskii\n",
+ "16 Equus przewlskii Equus przewalskii\n",
+ "17 Equus pzrewalskii Equus przewalskii\n",
+ "18 equus przewalskii Equus przewalskii\n",
+ "19 Equus przewalsNii Equus przewalskii\n",
+ "20 Equus przewalsiki Equus przewalskii\n",
+ "21 Equus przealskiu Equus przewalskii\n",
+ "22 equus przewalskii Equus przewalskii\n",
+ "23 Equus przeawlskii Equus przewalskii\n",
+ "24 Equus przealWkii Equus przewalskii\n",
+ "25 equus przewalskii Equus przewalskii\n",
+ "26 Equusp rzewalskii Equus przewalskii\n",
+ "27 Equus prEewalskii Equus przewalskii\n",
+ "28 equus przewalskii Equus przewalskii\n",
+ "29 squus przewalskii Equus przewalskii\n",
+ "... ... ...\n",
+ "342170 martes flavigula Martes flavigula\n",
+ "342171 Martes fwavgula Martes flavigula\n",
+ "342172 Martek flvigula Martes flavigula\n",
+ "342173 Martes fwvigula Martes flavigula\n",
+ "342174 Martes flaigula Martes flavigula\n",
+ "342175 Martes flavigul Martes flavigula\n",
+ "342176 martes flavigula Martes flavigula\n",
+ "342177 martes flavigula Martes flavigula\n",
+ "342178 Martes flaJigula Martes flavigula\n",
+ "342179 partes flavigula Martes flavigula\n",
+ "342180 Martes lfavigula Martes flavigula\n",
+ "342181 MarteP flavigula Martes flavigula\n",
+ "342182 Martes flvibula Martes flavigula\n",
+ "342183 Martse flavigula Martes flavigula\n",
+ "342184 Martes flaviula Martes flavigula\n",
+ "342185 Martes flavigual Martes flavigula\n",
+ "342186 martes flavigula Martes flavigula\n",
+ "342187 Marts flavigula Martes flavigula\n",
+ "342188 Martesflavigula Martes flavigula\n",
+ "342189 Martes flNvigula Martes flavigula\n",
+ "342190 martes flavigula Martes flavigula\n",
+ "342191 martes flavigula Martes flavigula\n",
+ "342192 Maxtes flavigula Martes flavigula\n",
+ "342193 Martes flavigulaa Martes flavigula\n",
+ "342194 Martes flavigXla Martes flavigula\n",
+ "342195 aMrtes flavigula Martes flavigula\n",
+ "342196 MartesLflavigula Martes flavigula\n",
+ "342197 Marets flavigula Martes flavigula\n",
+ "342198 Martes flaviula Martes flavigula\n",
+ "342199 martes flavigula Martes flavigula\n",
+ "\n",
+ "[342200 rows x 2 columns]"
+ ]
+ },
+ "execution_count": 191,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "repeated_names = np.repeat(names, 100)\n",
+ "spelling_errors = [spelling_mistake_generator(s) for s in repeated_names]\n",
+ "\n",
+ "corpus = np.column_stack((spelling_errors, repeated_names))\n",
+ "\n",
+ "corpus = pd.DataFrame(corpus, columns=[\"Input\", \"Target\"])\n",
+ "corpus"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "We've ended up with 342,200 records of names with minor typos and formatting mistakes. (You can change the number of repetitions from 100 to 1000 if you need more data, but I've left it at 100 to save time; 1000 and upwards takes a little while to generate.)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Vectorise our data for learning\n",
+ "\n",
+    "We'll need to encode our data from letters to numbers so that our model can work with it. We'll take our table and create two lists: one of all the characters in our input dataset, and one of all the characters in our target dataset. We can use these later on to one-hot encode our characters as vectors before we feed them to our model. We'll also add a start and end character to our target data, as this will help our model understand when to start and stop generating."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 192,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "input_texts = []\n",
+ "target_texts = []\n",
+ "\n",
+ "start_character = '\\t'\n",
+ "end_character = '\\n'\n",
+ "\n",
+ "input_characters = set()\n",
+ "target_characters = set()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 193,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# Takes in an input text and a target text and adds their characters to the sets of input and target characters\n",
+ "\n",
+ "def build_character_lists(input_text, target_text):\n",
+ " for char in input_text:\n",
+ " if char not in input_characters:\n",
+ " input_characters.add(char)\n",
+ " for char in target_text:\n",
+ " if char not in target_characters:\n",
+ " target_characters.add(char)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 194,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "for index, row in corpus.iterrows():\n",
+ " input_text = row[\"Input\"]\n",
+ " target_text = row[\"Target\"]\n",
+ " target_text = start_character + target_text + end_character\n",
+ " build_character_lists(input_text, target_text)\n",
+ " input_texts.append(input_text)\n",
+ " target_texts.append(target_text)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 195,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of samples: 342200\n",
+ "Number of unique input tokens: 55\n",
+ "Number of unique output tokens: 56\n",
+ "Max sequence length for inputs: 36\n",
+ "Max sequence length for outputs: 38\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "['\\t',\n",
+ " '\\n',\n",
+ " ' ',\n",
+ " '-',\n",
+ " '.',\n",
+ " 'A',\n",
+ " 'B',\n",
+ " 'C',\n",
+ " 'D',\n",
+ " 'E',\n",
+ " 'F',\n",
+ " 'G',\n",
+ " 'H',\n",
+ " 'I',\n",
+ " 'J',\n",
+ " 'K',\n",
+ " 'L',\n",
+ " 'M',\n",
+ " 'N',\n",
+ " 'O',\n",
+ " 'P',\n",
+ " 'Q',\n",
+ " 'R',\n",
+ " 'S',\n",
+ " 'T',\n",
+ " 'U',\n",
+ " 'V',\n",
+ " 'W',\n",
+ " 'X',\n",
+ " 'Z',\n",
+ " 'a',\n",
+ " 'b',\n",
+ " 'c',\n",
+ " 'd',\n",
+ " 'e',\n",
+ " 'f',\n",
+ " 'g',\n",
+ " 'h',\n",
+ " 'i',\n",
+ " 'j',\n",
+ " 'k',\n",
+ " 'l',\n",
+ " 'm',\n",
+ " 'n',\n",
+ " 'o',\n",
+ " 'p',\n",
+ " 'q',\n",
+ " 'r',\n",
+ " 's',\n",
+ " 't',\n",
+ " 'u',\n",
+ " 'v',\n",
+ " 'w',\n",
+ " 'x',\n",
+ " 'y',\n",
+ " 'z']"
+ ]
+ },
+ "execution_count": 195,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "input_characters = sorted(list(input_characters))\n",
+ "target_characters = sorted(list(target_characters))\n",
+ "num_encoder_tokens = len(input_characters)\n",
+ "num_decoder_tokens = len(target_characters)\n",
+ "max_encoder_seq_length = max([len(txt) for txt in input_texts])\n",
+ "max_decoder_seq_length = max([len(txt) for txt in target_texts])\n",
+ "\n",
+ "print('Number of samples:', len(input_texts))\n",
+ "print('Number of unique input tokens:', num_encoder_tokens)\n",
+ "print('Number of unique output tokens:', num_decoder_tokens)\n",
+ "print('Max sequence length for inputs:', max_encoder_seq_length)\n",
+ "print('Max sequence length for outputs:', max_decoder_seq_length)\n",
+ "\n",
+ "target_characters"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
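+    "We'll create two dictionaries, one for our input dataset and one for our target dataset, to help us get from our characters to numbers. (The reverse lookup, from numbers back to characters, can be built by inverting these dictionaries.)"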
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 198,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{' ': 0, '-': 1, '.': 2, 'A': 3, 'B': 4, 'C': 5, 'D': 6, 'E': 7, 'F': 8, 'G': 9, 'H': 10, 'I': 11, 'J': 12, 'K': 13, 'L': 14, 'M': 15, 'N': 16, 'O': 17, 'P': 18, 'Q': 19, 'R': 20, 'S': 21, 'T': 22, 'U': 23, 'V': 24, 'W': 25, 'X': 26, 'Y': 27, 'Z': 28, 'a': 29, 'b': 30, 'c': 31, 'd': 32, 'e': 33, 'f': 34, 'g': 35, 'h': 36, 'i': 37, 'j': 38, 'k': 39, 'l': 40, 'm': 41, 'n': 42, 'o': 43, 'p': 44, 'q': 45, 'r': 46, 's': 47, 't': 48, 'u': 49, 'v': 50, 'w': 51, 'x': 52, 'y': 53, 'z': 54}\n",
+ "{'\\t': 0, '\\n': 1, ' ': 2, '-': 3, '.': 4, 'A': 5, 'B': 6, 'C': 7, 'D': 8, 'E': 9, 'F': 10, 'G': 11, 'H': 12, 'I': 13, 'J': 14, 'K': 15, 'L': 16, 'M': 17, 'N': 18, 'O': 19, 'P': 20, 'Q': 21, 'R': 22, 'S': 23, 'T': 24, 'U': 25, 'V': 26, 'W': 27, 'X': 28, 'Z': 29, 'a': 30, 'b': 31, 'c': 32, 'd': 33, 'e': 34, 'f': 35, 'g': 36, 'h': 37, 'i': 38, 'j': 39, 'k': 40, 'l': 41, 'm': 42, 'n': 43, 'o': 44, 'p': 45, 'q': 46, 'r': 47, 's': 48, 't': 49, 'u': 50, 'v': 51, 'w': 52, 'x': 53, 'y': 54, 'z': 55}\n"
+ ]
+ }
+ ],
+ "source": [
+ "input_token_index = dict([(char, i) for i, char in enumerate(input_characters)])\n",
+ "target_token_index = dict([(char, i) for i, char in enumerate(target_characters)])\n",
+ "\n",
+ "print(input_token_index)\n",
+ "print(target_token_index)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 199,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# encoder_input_data is a 3D array of shape (num_pairs, max input seq length, num input characters)\n",
+ "encoder_input_data = np.zeros((len(input_texts), max_encoder_seq_length, num_encoder_tokens), dtype='float32')\n",
+ "\n",
+ "# decoder_input_data is a 3D array of shape (num_pairs, max target seq length, num target characters)\n",
+ "decoder_input_data = np.zeros((len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype='float32')\n",
+ "\n",
+ "# decoder_target_data is the same as decoder_input_data but offset by one timestep. decoder_target_data[:, t, :] will be the same as decoder_input_data[:, t + 1, :].\n",
+ "decoder_target_data = np.zeros((len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype='float32')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Building our model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 200,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# i = training examples\n",
+ "# t = time step\n",
+ "# c = set the position representing the character to 1 (one hot encoded character)\n",
+ "\n",
+ "for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):\n",
+ " for t, char in enumerate(input_text):\n",
+ " encoder_input_data[i, t, input_token_index[char]] = 1.\n",
+ " for t, char in enumerate(target_text):\n",
+ " # decoder_target_data is ahead of decoder_input_data by one timestep\n",
+ " decoder_input_data[i, t, target_token_index[char]] = 1.\n",
+ " if t > 0:\n",
+ " # decoder_target_data will be ahead by one timestep\n",
+ " # and will not include the start character.\n",
+ " decoder_target_data[i, t - 1, target_token_index[char]] = 1."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 202,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "latent_dim = 256\n",
+ "\n",
+ "# Define an input sequence and process it.\n",
+ "encoder_inputs = Input(shape=(None, num_encoder_tokens))\n",
+ "encoder = LSTM(latent_dim, return_state=True)\n",
+ "encoder_outputs, state_h, state_c = encoder(encoder_inputs)\n",
+ "\n",
+ "# We discard `encoder_outputs` and only keep the states.\n",
+ "encoder_states = [state_h, state_c]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 203,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Set up the decoder, using `encoder_states` as initial state.\n",
+ "decoder_inputs = Input(shape=(None, num_decoder_tokens))\n",
+ "\n",
+ "# We set up our decoder to return full output sequences,\n",
+ "# and to return internal states as well. We don't use the\n",
+ "# return states in the training model, but we will use them in inference.\n",
+ "decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)\n",
+ "decoder_outputs, _, _ = decoder_lstm(decoder_inputs,\n",
+ " initial_state=encoder_states)\n",
+ "\n",
+ "decoder_dense = Dense(num_decoder_tokens, activation='softmax')\n",
+ "decoder_outputs = decoder_dense(decoder_outputs)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 204,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Define the model that will turn\n",
+ "# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`\n",
+ "model = Model([encoder_inputs, decoder_inputs], decoder_outputs)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 206,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Train on 273760 samples, validate on 68440 samples\n",
+ "Epoch 1/100\n",
+ "273760/273760 [==============================] - 743s 3ms/step - loss: 0.2597 - val_loss: 1.3135\n",
+ "Epoch 2/100\n",
+ "273760/273760 [==============================] - 702s 3ms/step - loss: 0.0587 - val_loss: 1.5200\n",
+ "Epoch 3/100\n",
+ "273760/273760 [==============================] - 658s 2ms/step - loss: 0.0232 - val_loss: 1.6334\n",
+ "Epoch 4/100\n",
+ "273760/273760 [==============================] - 655s 2ms/step - loss: 0.0141 - val_loss: 1.7563\n",
+ "Epoch 5/100\n",
+ "273760/273760 [==============================] - 648s 2ms/step - loss: 0.0101 - val_loss: 1.7978\n",
+ "Epoch 6/100\n",
+ "273760/273760 [==============================] - 648s 2ms/step - loss: 0.0078 - val_loss: 1.8374\n",
+ "Epoch 7/100\n",
+ "273760/273760 [==============================] - 645s 2ms/step - loss: 0.0062 - val_loss: 1.8761\n",
+ "Epoch 8/100\n",
+ "273760/273760 [==============================] - 658s 2ms/step - loss: 0.0052 - val_loss: 1.9187\n",
+ "Epoch 9/100\n",
+ "273760/273760 [==============================] - 779s 3ms/step - loss: 0.0045 - val_loss: 1.9649\n",
+ "Epoch 10/100\n",
+ "273760/273760 [==============================] - 706s 3ms/step - loss: 0.0039 - val_loss: 1.9650\n",
+ "Epoch 11/100\n",
+ "273760/273760 [==============================] - 640s 2ms/step - loss: 0.0035 - val_loss: 1.9923\n",
+ "Epoch 12/100\n",
+ "242432/273760 [=========================>....] - ETA: 1:06 - loss: 0.0032"
+ ]
+ },
+ {
+ "ename": "KeyboardInterrupt",
+ "evalue": "",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mbatch_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbatch_size\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0mepochs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mepochs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m validation_split=0.2)\n\u001b[0m\u001b[1;32m 10\u001b[0m \u001b[0;31m# Save model\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m's2s.h5'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/usr/local/lib/python3.6/site-packages/keras/engine/training.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, **kwargs)\u001b[0m\n\u001b[1;32m 1703\u001b[0m \u001b[0minitial_epoch\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minitial_epoch\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1704\u001b[0m \u001b[0msteps_per_epoch\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msteps_per_epoch\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1705\u001b[0;31m validation_steps=validation_steps)\n\u001b[0m\u001b[1;32m 1706\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1707\u001b[0m def evaluate(self, x=None, y=None,\n",
+ "\u001b[0;32m/usr/local/lib/python3.6/site-packages/keras/engine/training.py\u001b[0m in \u001b[0;36m_fit_loop\u001b[0;34m(self, f, ins, out_labels, batch_size, epochs, verbose, callbacks, val_f, val_ins, shuffle, callback_metrics, initial_epoch, steps_per_epoch, validation_steps)\u001b[0m\n\u001b[1;32m 1233\u001b[0m \u001b[0mins_batch\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mins_batch\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtoarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1234\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1235\u001b[0;31m \u001b[0mouts\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mins_batch\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1236\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mouts\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1237\u001b[0m \u001b[0mouts\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mouts\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/usr/local/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, inputs)\u001b[0m\n\u001b[1;32m 2476\u001b[0m \u001b[0msession\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_session\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2477\u001b[0m updated = session.run(fetches=fetches, feed_dict=feed_dict,\n\u001b[0;32m-> 2478\u001b[0;31m **self.session_kwargs)\n\u001b[0m\u001b[1;32m 2479\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mupdated\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2480\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/usr/local/lib/python3.6/site-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(self, fetches, feed_dict, options, run_metadata)\u001b[0m\n\u001b[1;32m 903\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 904\u001b[0m result = self._run(None, fetches, feed_dict, options_ptr,\n\u001b[0;32m--> 905\u001b[0;31m run_metadata_ptr)\n\u001b[0m\u001b[1;32m 906\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrun_metadata\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 907\u001b[0m \u001b[0mproto_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtf_session\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTF_GetBuffer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrun_metadata_ptr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/usr/local/lib/python3.6/site-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36m_run\u001b[0;34m(self, handle, fetches, feed_dict, options, run_metadata)\u001b[0m\n\u001b[1;32m 1135\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mfinal_fetches\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mfinal_targets\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mhandle\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mfeed_dict_tensor\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1136\u001b[0m results = self._do_run(handle, final_targets, final_fetches,\n\u001b[0;32m-> 1137\u001b[0;31m feed_dict_tensor, options, run_metadata)\n\u001b[0m\u001b[1;32m 1138\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1139\u001b[0m \u001b[0mresults\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/usr/local/lib/python3.6/site-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36m_do_run\u001b[0;34m(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)\u001b[0m\n\u001b[1;32m 1353\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhandle\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1354\u001b[0m return self._do_call(_run_fn, self._session, feeds, fetches, targets,\n\u001b[0;32m-> 1355\u001b[0;31m options, run_metadata)\n\u001b[0m\u001b[1;32m 1356\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1357\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_do_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_prun_fn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_session\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhandle\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeeds\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfetches\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/usr/local/lib/python3.6/site-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36m_do_call\u001b[0;34m(self, fn, *args)\u001b[0m\n\u001b[1;32m 1359\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_do_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1360\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1361\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1362\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mOpError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1363\u001b[0m \u001b[0mmessage\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcompat\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mas_text\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmessage\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/usr/local/lib/python3.6/site-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36m_run_fn\u001b[0;34m(session, feed_dict, fetch_list, target_list, options, run_metadata)\u001b[0m\n\u001b[1;32m 1338\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1339\u001b[0m return tf_session.TF_Run(session, options, feed_dict, fetch_list,\n\u001b[0;32m-> 1340\u001b[0;31m target_list, status, run_metadata)\n\u001b[0m\u001b[1;32m 1341\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1342\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_prun_fn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msession\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhandle\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeed_dict\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfetch_list\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
+ ]
+ }
+ ],
+ "source": [
+ "batch_size = 64 # Batch size for training.\n",
+ "epochs = 100 # Number of epochs to train for.\n",
+ "\n",
+ "# Run training\n",
+ "model.compile(optimizer='rmsprop', loss='categorical_crossentropy')\n",
+ "model.fit([encoder_input_data, decoder_input_data], decoder_target_data,\n",
+ " batch_size=batch_size,\n",
+ " epochs=epochs,\n",
+ " validation_split=0.2)\n",
+ "# Save model\n",
+ "model.save('s2s.h5')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}