wojzaremba · kylefoley76 · Apr 10, 2015 · Apr 14, 2015 · Apr 14, 2015 · Apr 14, 2015
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+*.mat
diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
-Long Short Term Memory Units
+Long Short Term Memory Units (original README)
 ============================
 This is self-contained package to train a language model on word level Penn Tree Bank dataset. 
 It achieves 115 perplexity for a small model in 1h, and 81 perplexity for a big model in 
@@ -8,3 +8,16 @@ a different company).
 
 
 More information: http://arxiv.org/pdf/1409.2329v4.pdf
+
+For the Deep Learning NYU spring 2015 course
+==========================
+Modifications to the original code:
+
++ Made functions global and put the main part outside of a function, for easier interactive sessions.
++ Added a4\_communication\_loop.lua as an example of stdin/stdout communication.
++ Added character-preprocessed train and validation ptb set in data/.
++ Modified data.lua so we can all easily load the data in the same way and agree on the dictionary. 
++ Added a simple script a4\_vocab.lua that loads the data and prints the character-level vocabulary (which is the vocabulary that will also be used in grading).
++ Added a4\_grading.py so you can test how your program's performance will be automatically evaluated.
+
+For more information, see the assignment instructions pdf.
diff --git a/a4_communication_loop.lua b/a4_communication_loop.lua
@@ -0,0 +1,47 @@
+stringx = require('pl.stringx')
+require 'io'
+
+-- Read one query line from stdin and validate it.
+-- Expected shape: "<count> word1 word2 ...", where every word must be 'foo'.
+-- Returns the token list produced by stringx.split; tokens[1] is the count
+-- exactly as it was typed (still a string).
+-- Raises a table whose `code` field tells the caller what went wrong:
+--   {code="EOF"}              io.read returned nil (no more input)
+--   {code="init"}             the first token does not parse as a number
+--   {code="vocab", word=w}    some later token w is not 'foo'
+function readline()
+  local line = io.read("*line")
+  if line == nil then error({code="EOF"}) end
+  local tokens = stringx.split(line)
+  if tonumber(tokens[1]) == nil then error({code="init"}) end
+  for i = 2, #tokens do
+    if tokens[i] ~= 'foo' then error({code="vocab", word = tokens[i]}) end
+  end
+  return tokens
+end
+
+-- Prompt / answer loop: runs until stdin is exhausted.
+while true do
+  print("Query: len word1 word2 etc")
+  -- When readline raises, pcall returns false plus the raised value.
+  local ok, line = pcall(readline)
+  if not ok then
+    if line.code == "EOF" then
+      break -- end loop
+    elseif line.code == "vocab" then
+      print("Word not in vocabulary, only 'foo' is in vocabulary: ", line.word)
+    elseif line.code == "init" then
+      print("Start with a number")
+    else
+      print(line)
+      print("Failed, try again")
+    end
+  else
+    print("Thanks, I will print foo " .. line[1] .. " more times")
+    -- line[1] is still the raw token; convert it explicitly rather than
+    -- relying on the numeric for loop to coerce a string limit.
+    local count = tonumber(line[1])
+    for i = 1, count do io.write('foo ') end
+    io.write('\n')
+  end
+end
diff --git a/a4_grading.py b/a4_grading.py
@@ -0,0 +1,67 @@
+import subprocess
+import numpy as np
+
+# Factor applied to the average NLL before exponentiating into the reported
+# perplexity.
+# NOTE(review): presumably the average number of characters per word -- confirm.
+NLL_SCALE = 5.6
+
+
+def read_tokens(path):
+    """Return the tokens of the file at `path`, with each newline turned into '<eos>'."""
+    with open(path) as f:
+        return f.read().replace('\n', '<eos>').split()
+
+
+traintxt = read_tokens('data/ptb.char.train.txt')
+# Ids are handed out in order of first appearance in the training text.
+vocab_map = {}
+vocab_ix = 0
+for c in traintxt:
+    if c not in vocab_map:
+        vocab_map[c] = vocab_ix # note: zero-based vs lua: 1-based
+        vocab_ix = vocab_ix + 1
+
+#CHECK THAT THIS DICTIONARY IS THE SAME AS GENERATED BY LUA
+#lua_map = dict(tup.split() for tup in subprocess.check_output(['luajit','a4_vocab.lua']).split("\n") if tup)
+#for c,ix in vocab_map.iteritems():
+    #assert ix == int(lua_map[c]) - 1
+
+evaltxt = read_tokens('data/ptb.char.valid.txt')
+cmd = './run.sh'
+
+proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout = subprocess.PIPE)
+
+nll = 0
+fails = 0
+
+print "Flushing proc output until OK GO received"
+while True:
+    ret = proc.stdout.readline()
+    if not ret:
+        # readline() yields '' only at end of stream: the program closed its
+        # stdout without the handshake, so polling again would never end.
+        raise RuntimeError("%s closed its output before printing OK GO" % cmd)
+    print " -- ", ret.strip()
+    if ret.strip() == "OK GO":
+        break
+
+print "Start test loop"
+
+# Send token i, read one line of log-probabilities, and score token i+1 with it.
+for i in range(len(evaltxt)-1):
+    proc.stdin.write(evaltxt[i]+'\n')
+    proc.stdin.flush()  # make sure the query is delivered before waiting for the reply
+    try:
+        logprobs = map(float, proc.stdout.readline().split())
+        assert len(logprobs) == len(vocab_map), "Unexpected response length %d"%len(logprobs)
+        np.testing.assert_almost_equal(np.exp(logprobs).sum(), 1, decimal=4, err_msg="Nice try :)")
+        nll -= logprobs[vocab_map[evaltxt[i+1]]]
+    except Exception as e:
+        # A bad reply is counted and reported, but the run keeps going.
+        fails += 1
+        print "Fail #", fails, "errmsg:", str(e)
+    if i%1000 == 0:
+        avgnll  = nll / (i+1)
+        perp = np.exp(NLL_SCALE * avgnll)
+        print("%d\tPerplexity: %.6f\tavgnll: %.6f\tfails: %d" % (i, perp, avgnll,fails))
diff --git a/a4_vocab.lua b/a4_vocab.lua
@@ -0,0 +1,10 @@
+require 'torch'
+require 'base'
+ptb = require('data')
+-- Load the training split before printing; presumably this is what
+-- populates ptb.vocab_map -- TODO confirm against data.lua.
+traindata = ptb.traindataset(20)
+-- Print one "<key> <value>" line per vocab_map entry; pairs() order is unspecified.
+for token, index in pairs(ptb.vocab_map) do
+  print(string.format("%s %s", token, index))
+end
diff --git a/data.lua b/data.lua
@@ -11,6 +11,14 @@ local file = require('pl.file')
 
 local ptb_path = "./data/"
 
+local trainfn = ptb_path .. "ptb.train.txt"  -- read by traindataset
+local testfn  = ptb_path .. "ptb.test.txt"  -- read by testdataset, which returns nothing when this is nil
+local validfn = ptb_path .. "ptb.valid.txt"  -- read by validdataset
+--[[ Disabled alternative that points at the ptb.char.* files; it defines no test file.
+local trainfn = ptb_path .. "ptb.char.train.txt"
+local validfn = ptb_path .. "ptb.char.valid.txt"
+--]]
+
 local vocab_idx = 0
 local vocab_map = {}
 
@@ -31,7 +39,7 @@ local function load_data(fname)
    local data = file.read(fname)
    data = stringx.replace(data, '\n', '<eos>')
    data = stringx.split(data)
-   print(string.format("Loading %s, size of data = %d", fname, #data))
+   --print(string.format("Loading %s, size of data = %d", fname, #data))
    local x = torch.zeros(#data)
    for i = 1, #data do
       if vocab_map[data[i]] == nil then
@@ -43,26 +51,29 @@ local function load_data(fname)
    return x
 end
 
-local function traindataset(batch_size)
-   local x = load_data(ptb_path .. "ptb.train.txt")
+local function traindataset(batch_size, char)  -- NOTE(review): `char` is never read in this body
+   local x = load_data(trainfn)  -- trainfn: module-level path of the training file
    x = replicate(x, batch_size)
    return x
 end
 
 -- Intentionally we repeat dimensions without offseting.
 -- Pass over this batch corresponds to the fully sequential processing.
 local function testdataset(batch_size)
-   local x = load_data(ptb_path .. "ptb.test.txt")
-   x = x:resize(x:size(1), 1):expand(x:size(1), batch_size)
-   return x
+   if testfn then  -- with no test path configured, fall through and return nothing
+      local x = load_data(testfn)
+      x = x:resize(x:size(1), 1):expand(x:size(1), batch_size)  -- single column expanded to batch_size columns
+      return x
+   end
 end
 
 local function validdataset(batch_size)
-   local x = load_data(ptb_path .. "ptb.valid.txt")
+   local x = load_data(validfn)  -- validfn: module-level path of the validation file
    x = replicate(x, batch_size)
    return x
 end
 
 return {traindataset=traindataset,
         testdataset=testdataset,
-        validdataset=validdataset}
+        validdataset=validdataset,
+        vocab_map=vocab_map}  -- also export the vocabulary table that load_data consults