FluxML · chinglamchoi · Jan 4, 2020 · Jan 5, 2020 · Jan 5, 2020 · Jan 5, 2020
diff --git a/text/char-rnn/char-rnn.ipynb b/text/char-rnn/char-rnn.ipynb
@@ -0,0 +1,180 @@
+{
+ "cells": [
+  {
+   "outputs": [],
+   "cell_type": "markdown",
+   "source": [
+    "# Character-level Recurrent Neural Network"
+   ],
+   "metadata": {}
+  },
+  {
+   "outputs": [],
+   "cell_type": "markdown",
+   "source": [
+    "# 1. Import Dependencies"
+   ],
+   "metadata": {}
+  },
+  {
+   "outputs": [],
+   "cell_type": "code",
+   "source": [
+    "using Flux\n",
+    "using Flux: onehot, chunk, batchseq, throttle, crossentropy\n",
+    "using StatsBase: wsample\n",
+    "using Base.Iterators: partition"
+   ],
+   "metadata": {},
+   "execution_count": null
+  },
+  {
+   "outputs": [],
+   "cell_type": "markdown",
+   "source": [
+    "# 2. Data Download & Pre-processing\n",
+    "- Source of data: Shakespeare text from https://cs.stanford.edu/people/karpathy/char-rnn/\n",
+    "- Generate character tokens\n",
+    "- Partition in batches for input"
+   ],
+   "metadata": {}
+  },
+  {
+   "outputs": [],
+   "cell_type": "code",
+   "source": [
+    "cd(@__DIR__)\n",
+    "\n",
+    "isfile(\"input.txt\") ||\n",
+    "  download(\"https://cs.stanford.edu/people/karpathy/char-rnn/shakespeare_input.txt\",\n",
+    "           \"input.txt\")\n",
+    "\n",
+    "#Generate array of all chars appearing in input.txt, let total num be N:\n",
+    "text = collect(String(read(\"input.txt\")))\n",
+    "alphabet = [unique(text)..., '_'] #get unique char array\n",
+    "#Generate array of one-hot vectors for each character in the text.\n",
+    "#Each vector has N-elements, where 1 element in N is set to true (others: false):\n",
+    "text = map(ch -> onehot(ch, alphabet), text)\n",
+    "stop = onehot('_', alphabet) #generate end token\n",
+    "\n",
+    "N = length(alphabet)\n",
+    "seqlen = 50 #batch size\n",
+    "nbatch = 50 #number of batches\n",
+    "\n",
+    "Xs = collect(partition(batchseq(chunk(text, nbatch), stop), seqlen)) #get array of minibatches of input x\n",
+    "Ys = collect(partition(batchseq(chunk(text[2:end], nbatch), stop), seqlen)) #get array of minibatches of \"label\" y"
+   ],
+   "metadata": {},
+   "execution_count": null
+  },
+  {
+   "outputs": [],
+   "cell_type": "markdown",
+   "source": [
+    "# 3. Define RNN Model, Hyperparameters"
+   ],
+   "metadata": {}
+  },
+  {
+   "outputs": [],
+   "cell_type": "code",
+   "source": [
+    "#Flux's chain function joins multiple layers together, such that layer operations are performed on input sequentially.\n",
+    "m = Chain(\n",
+    "  LSTM(N, 128), #Long Short-term Memory of feature space size 128\n",
+    "  LSTM(128, 128), # output is 128-dimensional\n",
+    "  Dense(128, N), #N = number of possible tokens\n",
+    "  softmax) #calculate the probability of output char corr. to each possible char\n",
+    "\n",
+    "m = gpu(m) #use GPU acceleration\n",
+    "\n",
+    "function loss(xs, ys) #CE loss, or log loss quanitfies the performance of models with probability output\n",
+    "  l = sum(crossentropy.(m.(gpu.(xs)), gpu.(ys))) #pass to GPU and get cost\n",
+    "  Flux.truncate!(m)\n",
+    "  return l\n",
+    "end\n",
+    "\n",
+    "opt = ADAM(0.01) #use the ADAM optimiser with learning rate of 0.01\n",
+    "tx, ty = (Xs[5], Ys[5])\n",
+    "evalcb = () -> @show loss(tx, ty)"
+   ],
+   "metadata": {},
+   "execution_count": null
+  },
+  {
+   "outputs": [],
+   "cell_type": "markdown",
+   "source": [
+    "# 4. Train model"
+   ],
+   "metadata": {}
+  },
+  {
+   "outputs": [],
+   "cell_type": "code",
+   "source": [
+    "Flux.train!(loss, params(m), zip(Xs, Ys), opt,\n",
+    "            cb = throttle(evalcb, 30)) #timeout for 30 secs"
+   ],
+   "metadata": {},
+   "execution_count": null
+  },
+  {
+   "outputs": [],
+   "cell_type": "markdown",
+   "source": [
+    "# 5. Sample from input.txt and test model\n",
+    "Compose a 1000-char long verse in the style of Shakespeare!"
+   ],
+   "metadata": {}
+  },
+  {
+   "outputs": [],
+   "cell_type": "code",
+   "source": [
+    "function sample(m, alphabet, len)\n",
+    "  m = cpu(m) #use cpu as gpu offers minimal acc for seq models\n",
+    "  Flux.reset!(m)\n",
+    "  buf = IOBuffer()\n",
+    "  c = rand(alphabet) #take random input char token\n",
+    "  for i = 1:len\n",
+    "    write(buf, c)\n",
+    "    #Compose like Shakespeare char-by-char! :\n",
+    "    c = wsample(alphabet, m(onehot(c, alphabet)).data)\n",
+    "  end\n",
+    "  return String(take!(buf)) #get results from last LSTM hidden state\n",
+    "end\n",
+    "\n",
+    "#Print results\n",
+    "sample(m, alphabet, 1000) |> println"
+   ],
+   "metadata": {},
+   "execution_count": null
+  },
+  {
+   "outputs": [],
+   "cell_type": "markdown",
+   "source": [
+    "---\n",
+    "\n",
+    "*This notebook was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).*"
+   ],
+   "metadata": {}
+  }
+ ],
+ "nbformat_minor": 3,
+ "metadata": {
+  "language_info": {
+   "file_extension": ".jl",
+   "mimetype": "application/julia",
+   "name": "julia",
+   "version": "1.3.0"
+  },
+  "kernelspec": {
+   "name": "julia-1.3",
+   "display_name": "Julia 1.3.0",
+   "language": "julia"
+  }
+ },
+ "nbformat": 4
+}
diff --git a/text/char-rnn/char-rnn.jl b/text/char-rnn/char-rnn.jl
@@ -1,59 +1,80 @@
+# # Character-level Recurrent Neural Network
+#- Train model on Shakespeare's works
+#- Have model write like Shakespeare at the end
+
+# # 1. Import Dependencies
+
 using Flux
 using Flux: onehot, chunk, batchseq, throttle, crossentropy
 using StatsBase: wsample
 using Base.Iterators: partition
 
+# # 2. Data Download & Pre-processing
+# - Source of data: Shakespeare text from https://cs.stanford.edu/people/karpathy/char-rnn/
+# - Generate character tokens
+# - Partition in batches for input
 cd(@__DIR__)
 
 isfile("input.txt") ||
   download("https://cs.stanford.edu/people/karpathy/char-rnn/shakespeare_input.txt",
            "input.txt")
 
+#Generate array of all chars appearing in input.txt, let total num be N:
 text = collect(String(read("input.txt")))
-alphabet = [unique(text)..., '_']
+alphabet = [unique(text)..., '_'] #get unique char array
+#Generate array of one-hot vectors for each character in the text. 
+#Each vector has N-elements, where 1 element in N is set to true (others: false):
 text = map(ch -> onehot(ch, alphabet), text)
-stop = onehot('_', alphabet)
+stop = onehot('_', alphabet) #generate end token
 
 N = length(alphabet)
-seqlen = 50
-nbatch = 50
+seqlen = 50 #batch size
+nbatch = 50 #number of batches
 
-Xs = collect(partition(batchseq(chunk(text, nbatch), stop), seqlen))
-Ys = collect(partition(batchseq(chunk(text[2:end], nbatch), stop), seqlen))
+Xs = collect(partition(batchseq(chunk(text, nbatch), stop), seqlen)) #get array of minibatches of input x
+Ys = collect(partition(batchseq(chunk(text[2:end], nbatch), stop), seqlen)) #get array of minibatches of "label" y
 
+# # 3. Define RNN Model, Hyperparameters
+#- Define 4-layer deep RNN
+#- Define loss function as Cross Entropy loss
+#- Define optimiser as Adam with learning rate of 0.01
+#Flux's chain function joins multiple layers together, such that layer operations are performed on input sequentially. 
 m = Chain(
-  LSTM(N, 128),
-  LSTM(128, 128),
-  Dense(128, N),
-  softmax)
+  LSTM(N, 128), #Long Short-term Memory of feature space size 128
+  LSTM(128, 128), # output is 128-dimensional
+  Dense(128, N), #N = number of possible tokens
+  softmax) #calculate the probability of output char corr. to each possible char
 
-m = gpu(m)
+m = gpu(m) #use GPU acceleration
 
-function loss(xs, ys)
-  l = sum(crossentropy.(m.(gpu.(xs)), gpu.(ys)))
+function loss(xs, ys) #CE loss, or log loss quanitfies the performance of models with probability output
+  l = sum(crossentropy.(m.(gpu.(xs)), gpu.(ys))) #pass to GPU and get cost
   Flux.truncate!(m)
   return l
 end
 
-opt = ADAM(0.01)
+opt = ADAM(0.01) #use the ADAM optimiser with learning rate of 0.01
 tx, ty = (Xs[5], Ys[5])
 evalcb = () -> @show loss(tx, ty)
 
+# # 4. Train model
 Flux.train!(loss, params(m), zip(Xs, Ys), opt,
-            cb = throttle(evalcb, 30))
-
-# Sampling
+            cb = throttle(evalcb, 30)) #timeout for 30 secs
 
+# # 5. Sample from input.txt and test model
+# Compose a 1000-char long verse in the style of Shakespeare!
 function sample(m, alphabet, len)
-  m = cpu(m)
+  m = cpu(m) #use cpu as gpu offers minimal acc for seq models
   Flux.reset!(m)
   buf = IOBuffer()
-  c = rand(alphabet)
+  c = rand(alphabet) #take random input char token
   for i = 1:len
     write(buf, c)
+    #Compose like Shakespeare char-by-char! :
     c = wsample(alphabet, m(onehot(c, alphabet)).data)
   end
-  return String(take!(buf))
+  return String(take!(buf)) #get results from last LSTM hidden state
 end
 
+#Print results
 sample(m, alphabet, 1000) |> println