From 1c08ba8f19f85a4f4fdb9febcdcaee0a2ac6858d Mon Sep 17 00:00:00 2001
From: Frederic <nouiz@nouiz.org>
Date: Thu, 8 Jan 2015 13:14:01 -0500
Subject: [PATCH 01/27] better loading of imdb dataset

---
 code/imdb.py | 36 ++++++++++++++++++++++++++++++++----
 1 file changed, 32 insertions(+), 4 deletions(-)

diff --git a/code/imdb.py b/code/imdb.py
index f98c9601..0aaf641f 100644
--- a/code/imdb.py
+++ b/code/imdb.py
@@ -42,7 +42,33 @@ def prepare_data(seqs, labels, maxlen=None):
     return x, x_mask, labels
 
 
-def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1):
+def get_dataset_file(dataset, default_dataset, origin):
+    '''Resolve `dataset` to a path, downloading the default file if absent.
+
+    A bare file name that is not an existing local file is redirected to
+    ../data relative to this module when it exists there or is named
+    `default_dataset`; a still-missing default is fetched from `origin`.
+    '''
+    data_dir, data_file = os.path.split(dataset)
+    if data_dir == "" and not os.path.isfile(dataset):
+        # Bare file name not found locally: try <module dir>/../data.
+        new_path = os.path.join(
+            os.path.split(__file__)[0],
+            "..",
+            "data",
+            dataset
+        )
+        if os.path.isfile(new_path) or data_file == default_dataset:
+            dataset = new_path
+
+    if (not os.path.isfile(dataset)) and data_file == default_dataset:
+        import urllib
+        print 'Downloading data from %s' % origin
+        urllib.urlretrieve(origin, dataset)
+    return dataset
+
+
+def load_data(path="imdb.pkl.gz", n_words=100000, valid_portion=0.1):
     ''' Loads the dataset
 
     :type dataset: string
@@ -53,10 +79,12 @@ def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1):
     # LOAD DATA #
     #############
 
-    print '... loading data'
-
     # Load the dataset
-    f = open(path, 'rb')
+    path = get_dataset_file(
+        path, "imdb.pkl.gz",
+        "http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl.gz")
+
+    f = gzip.open(path, 'rb')
     train_set = cPickle.load(f)
     test_set = cPickle.load(f)
     f.close()

From 859e9c3c49a03b868ce5eb94ad6a0165ce574d40 Mon Sep 17 00:00:00 2001
From: Frederic <nouiz@nouiz.org>
Date: Thu, 8 Jan 2015 13:51:37 -0500
Subject: [PATCH 02/27] don't use eval anymore

---
 code/lstm.py | 35 ++++++++++++++++++-----------------
 1 file changed, 18 insertions(+), 17 deletions(-)

diff --git a/code/lstm.py b/code/lstm.py
index 2b57d8be..24f1959a 100644
--- a/code/lstm.py
+++ b/code/lstm.py
@@ -107,14 +107,10 @@ def init_tparams(params):
         tparams[kk] = theano.shared(params[kk], name=kk)
     return tparams
 
-layers = {'ff': ('param_init_fflayer', 'fflayer'),
-          'rconv': ('param_init_rconv', 'rconv_layer'),
-          'lstm': ('param_init_lstm', 'lstm_layer')}
-
 
 def get_layer(name):
     fns = layers[name]
-    return (eval(fns[0]), eval(fns[1]))
+    return fns
 
 
 def param_init_fflayer(options, params, prefix='ff'):
@@ -129,7 +125,7 @@ def param_init_fflayer(options, params, prefix='ff'):
 def fflayer(tparams, state_below, options, prefix='rconv', **kwargs):
     pre_act = (tensor.dot(state_below,
                           tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')])
-    return eval(options['activ'])(pre_act)
+    return options['activ'](pre_act)
 
 
 def ortho_weight(ndim):
@@ -229,7 +225,7 @@ def _step(m_, p_):
         ls_ = ps_
         ps_ = tensor.dot(ps_, tparams[_p(prefix, 'U')])
         pl_ = tensor.dot(p_, tparams[_p(prefix, 'W')])
-        newact = eval(options['activ'])(ps_+pl_+tparams[_p(prefix, 'b')])
+        newact = options['activ'](ps_+pl_+tparams[_p(prefix, 'b')])
 
         # gater
         gt_ = (tensor.dot(ls_, tparams[_p(prefix, 'GU')]) +
@@ -284,6 +280,11 @@ def _grab_root(seqlen, one_sample, prev_sample):
     return roots
 
 
+layers = {'ff': (param_init_fflayer, fflayer),
+          'rconv': (param_init_rconv, rconv_layer),
+          'lstm': (param_init_lstm, lstm_layer)}
+
+
 def adadelta(lr, tparams, grads, x, mask, y, cost):
     zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
                                   name='%s_grad' % k)
@@ -432,21 +433,21 @@ def train(dim_proj=100,
           patience=10,
           max_epochs=5000,
           dispFreq=100,
-          activ='lambda x: tensor.tanh(x)',
+          activ=tensor.tanh,
           decay_c=0.,
           lrate=0.01,
           n_words=100000,
           data_sym=False,
-          optimizer='rmsprop',
-          encoder='rconv',
-          saveto='model.npz',
+          optimizer=rmsprop,
+          encoder='lstm',
+          saveto='lstm_model.npz',
           noise_std=0.,
           validFreq=1000,
           saveFreq=1000,  # save the parameters after every saveFreq updates
           maxlen=50,
           batch_size=16,
           valid_batch_size=16,
-          dataset='sentiment140',
+          dataset='imdb',
           use_dropout=False):
 
     # Model options
@@ -481,8 +482,8 @@ def train(dim_proj=100,
     f_grad = theano.function([x, mask, y], grads)
 
     lr = tensor.scalar(name='lr')
-    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads,
-                                              x, mask, y, cost)
+    f_grad_shared, f_update = optimizer(lr, tparams, grads,
+                                        x, mask, y, cost)
 
     print 'Optimization'
 
@@ -618,12 +619,12 @@ def main(job_id, params):
 
 if __name__ == '__main__':
     main(0, {
-        'model': ['model_lstm.npz'],
+        'model': ['lstm_model.npz'],
         'encoder': ['lstm'],
         'dim-proj': [128],
         'n-words': [10000],
-        'optimizer': ['adadelta'],
-        'activ': ['lambda x: tensor.tanh(x)'],
+        'optimizer': [adadelta],  # adadelta and rmsprop avail
+        'activ': [tensor.tanh],  # The activation function from Theano.
         'decay-c': [0.],
         'use-dropout': [1],
         'learning-rate': [0.0001]})

From e57dd0371cd2e1d204660c807f8321b5892b6803 Mon Sep 17 00:00:00 2001
From: Frederic <nouiz@nouiz.org>
Date: Thu, 8 Jan 2015 13:52:11 -0500
Subject: [PATCH 03/27] force floatX=float32, as otherwise there is a problem.
 The learning rate 0.005 gets cast to float64

---
 code/lstm.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/code/lstm.py b/code/lstm.py
index 24f1959a..ca35616f 100644
--- a/code/lstm.py
+++ b/code/lstm.py
@@ -618,6 +618,10 @@ def main(job_id, params):
     return validerr
 
 if __name__ == '__main__':
+
+    # We must have floatX=float32 for this tutorial to work correctly.
+    theano.config.floatX = "float32"
+
     main(0, {
         'model': ['lstm_model.npz'],
         'encoder': ['lstm'],

From e85246f0899857f648e6eb46627b89a98391456f Mon Sep 17 00:00:00 2001
From: Frederic <nouiz@nouiz.org>
Date: Thu, 8 Jan 2015 15:18:29 -0500
Subject: [PATCH 04/27] lstm: add comments

---
 code/lstm.py | 42 +++++++++++++++++++++++++++---------------
 1 file changed, 27 insertions(+), 15 deletions(-)

diff --git a/code/lstm.py b/code/lstm.py
index ca35616f..dedc0401 100644
--- a/code/lstm.py
+++ b/code/lstm.py
@@ -44,11 +44,17 @@ def get_dataset(name):
 
 
 def zipp(params, tparams):
+    """
+    When we reload the model. Needed for the GPU stuff.
+    """
     for kk, vv in params.iteritems():
         tparams[kk].set_value(vv)
 
 
 def unzip(zipped):
+    """
+    When we pickle the model. Needed for the GPU stuff.
+    """
     new_params = OrderedDict()
     for kk, vv in zipped.iteritems():
         new_params[kk] = vv.get_value()
@@ -79,7 +85,6 @@ def init_params(options):
     randn = numpy.random.rand(options['n_words'],
                               options['dim_proj'])
     params['Wemb'] = (0.01 * randn).astype('float32')
-    # rconv
     params = get_layer(options['encoder'])[0](options,
                                               params,
                                               prefix=options['encoder'])
@@ -122,7 +127,7 @@ def param_init_fflayer(options, params, prefix='ff'):
     return params
 
 
-def fflayer(tparams, state_below, options, prefix='rconv', **kwargs):
+def fflayer(tparams, state_below, options, prefix='ff', **kwargs):
     pre_act = (tensor.dot(state_below,
                           tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')])
     return options['activ'](pre_act)
@@ -396,6 +401,9 @@ def build_model(tparams, options):
 
 
 def pred_probs(f_pred_prob, prepare_data, data, iterator, verbose=False):
+    """ If you want to use a trained model, this is useful to compute
+    the probabilities of new examples.
+    """
     n_samples = len(data[0])
     probs = numpy.zeros((n_samples, 2)).astype('float32')
 
@@ -416,6 +424,11 @@ def pred_probs(f_pred_prob, prepare_data, data, iterator, verbose=False):
 
 
 def pred_error(f_pred, prepare_data, data, iterator, verbose=False):
+    """
+    Just compute the error
+    f_pred: Theano fct computing the prediction
+    prepare_data: usual prepare_data for that dataset.
+    """
     valid_err = 0
     for _, valid_index in iterator:
         x, mask, y = prepare_data([data[0][t] for t in valid_index],
@@ -430,19 +443,18 @@ def pred_error(f_pred, prepare_data, data, iterator, verbose=False):
 
 
 def train(dim_proj=100,
-          patience=10,
+          patience=10,  # number of epoch to wait before early stop if no progress
           max_epochs=5000,
-          dispFreq=100,
+          dispFreq=100,  # display to stdout the training progress every N updates
           activ=tensor.tanh,
-          decay_c=0.,
-          lrate=0.01,
-          n_words=100000,
-          data_sym=False,
-          optimizer=rmsprop,
-          encoder='lstm',
+          decay_c=0.,  # weight decay for the classifier
+          lrate=0.01,  # learning rate for sgd (not used for adadelta and rmsprop)
+          n_words=100000,  # wocabulary size
+          optimizer=adadelta,
+          encoder='lstm',# can be removed must be lstm.
           saveto='lstm_model.npz',
           noise_std=0.,
-          validFreq=1000,
+          validFreq=1000, # after 1000
           saveFreq=1000,  # save the parameters after every saveFreq updates
           maxlen=50,
           batch_size=16,
@@ -478,7 +490,7 @@ def train(dim_proj=100,
 
     f_cost = theano.function([x, mask, y], cost)
 
-    grads = tensor.grad(cost, wrt=itemlist(tparams))
+    grads = tensor.grad(cost, wrt=tparams.values())
     f_grad = theano.function([x, mask, y], grads)
 
     lr = tensor.scalar(name='lr')
@@ -627,8 +639,8 @@ def main(job_id, params):
         'encoder': ['lstm'],
         'dim-proj': [128],
         'n-words': [10000],
-        'optimizer': [adadelta],  # adadelta and rmsprop avail
+        'optimizer': [adadelta],  # sgd, adadelta and rmsprop available
         'activ': [tensor.tanh],  # The activation function from Theano.
-        'decay-c': [0.],
-        'use-dropout': [1],
+        'decay-c': [0.], #
+        'use-dropout': [1],  # if disable slightly faster, but worst test error.
         'learning-rate': [0.0001]})

From 4eeee9825aa12a42f2f60c29699890668397c774 Mon Sep 17 00:00:00 2001
From: Frederic <nouiz@nouiz.org>
Date: Thu, 8 Jan 2015 15:18:57 -0500
Subject: [PATCH 05/27] lstm: remove rconv code

---
 code/lstm.py | 85 ++--------------------------------------------------
 1 file changed, 2 insertions(+), 83 deletions(-)

diff --git a/code/lstm.py b/code/lstm.py
index dedc0401..181b3578 100644
--- a/code/lstm.py
+++ b/code/lstm.py
@@ -203,90 +203,9 @@ def _step(m_, x_, h_, c_):
     return rval[0]
 
 
-def param_init_rconv(options, params, prefix='rconv'):
-    params[_p(prefix, 'W')] = ortho_weight(options['dim_proj'])
-    params[_p(prefix, 'U')] = ortho_weight(options['dim_proj'])
-    b = numpy.zeros((options['dim_proj'],)).astype('float32')
-    params[_p(prefix, 'b')] = b
-    gw = 0.01 * numpy.random.randn(options['dim_proj'], 3).astype('float32')
-    params[_p(prefix, 'GW')] = gw
-    gu = 0.01 * numpy.random.randn(options['dim_proj'], 3).astype('float32')
-    params[_p(prefix, 'GU')] = gu
-    params[_p(prefix, 'Gb')] = numpy.zeros((3,)).astype('float32')
-
-    return params
-
-
-def rconv_layer(tparams, state_below, options, prefix='rconv', mask=None):
-    nsteps = state_below.shape[0]
-
-    assert mask is not None
-
-    def _step(m_, p_):
-        l_ = p_
-        # new activation
-        ps_ = tensor.zeros_like(p_)
-        ps_ = tensor.set_subtensor(ps_[1:], p_[:-1])
-        ls_ = ps_
-        ps_ = tensor.dot(ps_, tparams[_p(prefix, 'U')])
-        pl_ = tensor.dot(p_, tparams[_p(prefix, 'W')])
-        newact = options['activ'](ps_+pl_+tparams[_p(prefix, 'b')])
-
-        # gater
-        gt_ = (tensor.dot(ls_, tparams[_p(prefix, 'GU')]) +
-               tensor.dot(l_, tparams[_p(prefix, 'GW')]) +
-               tparams[_p(prefix, 'Gb')])
-        if l_.ndim == 3:
-            gt_shp = gt_.shape
-            gt_ = gt_.reshape((gt_shp[0] * gt_shp[1], gt_shp[2]))
-        gt_ = tensor.nnet.softmax(gt_)
-        if l_.ndim == 3:
-            gt_ = gt_.reshape((gt_shp[0], gt_shp[1], gt_shp[2]))
-
-        if p_.ndim == 3:
-            gn = gt_[:, :, 0].dimshuffle(0, 1, 'x')
-            gl = gt_[:, :, 1].dimshuffle(0, 1, 'x')
-            gr = gt_[:, :, 2].dimshuffle(0, 1, 'x')
-        else:
-            gn = gt_[:, 0].dimshuffle(0, 'x')
-            gl = gt_[:, 1].dimshuffle(0, 'x')
-            gr = gt_[:, 2].dimshuffle(0, 'x')
-
-        act = newact * gn + ls_ * gl + l_ * gr
-
-        if p_.ndim == 3:
-            m_ = m_.dimshuffle('x', 0, 'x')
-        else:
-            m_ = m_.dimshuffle('x', 0)
-        return tensor.switch(m_, act, l_)
-
-    rval, updates = theano.scan(_step,
-                                sequences=[mask[1:]],
-                                outputs_info=[state_below],
-                                name='layer_%s' % prefix,
-                                n_steps=nsteps-1)
-
-    seqlens = tensor.cast(mask.sum(axis=0), 'int64')-1
-    roots = rval[-1]
-
-    if state_below.ndim == 3:
-        def _grab_root(seqlen, one_sample, prev_sample):
-            return one_sample[seqlen]
-
-        dim_proj = options['dim_proj']
-        roots, updates = theano.scan(_grab_root,
-                                     sequences=[seqlens,
-                                                roots.dimshuffle(1, 0, 2)],
-                                     outputs_info=[tensor.alloc(0., dim_proj)],
-                                     name='grab_root_%s' % prefix)
-    else:
-        roots = roots[seqlens]  # there should be only one, so it's fine.
-
-    return roots
-
-
+# ff: Feed Forward (normal neural net), only useful to put after lstm
+#     before the classifier.
 layers = {'ff': (param_init_fflayer, fflayer),
-          'rconv': (param_init_rconv, rconv_layer),
           'lstm': (param_init_lstm, lstm_layer)}
 
 

From 6fe4fa020fc7236ae8c4c69f2d154198ae048591 Mon Sep 17 00:00:00 2001
From: Frederic <nouiz@nouiz.org>
Date: Thu, 8 Jan 2015 16:38:19 -0500
Subject: [PATCH 06/27] Code simplification.

---
 code/lstm.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/code/lstm.py b/code/lstm.py
index 181b3578..f0361512 100644
--- a/code/lstm.py
+++ b/code/lstm.py
@@ -61,10 +61,6 @@ def unzip(zipped):
     return new_params
 
 
-def itemlist(tparams):
-    return [vv for kk, vv in tparams.iteritems()]
-
-
 def dropout_layer(state_before, use_noise, trng):
     proj = tensor.switch(use_noise,
                          (state_before *
@@ -232,7 +228,7 @@ def adadelta(lr, tparams, grads, x, mask, y, cost):
                                      running_grads2)]
     ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
              for ru2, ud in zip(running_up2, updir)]
-    param_up = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)]
+    param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]
 
     f_update = theano.function([lr], [], updates=ru2up+param_up,
                                on_unused_input='ignore')
@@ -266,7 +262,7 @@ def rmsprop(lr, tparams, grads, x, mask, y, cost):
                  for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads,
                                             running_grads2)]
     param_up = [(p, p + udn[1])
-                for p, udn in zip(itemlist(tparams), updir_new)]
+                for p, udn in zip(tparams.values(), updir_new)]
     f_update = theano.function([lr], [], updates=updir_new+param_up,
                                on_unused_input='ignore')
 
@@ -280,7 +276,7 @@ def sgd(lr, tparams, grads, x, mask, y, cost):
 
     f_grad_shared = theano.function([x, mask, y], cost, updates=gsup)
 
-    pup = [(p, p - lr * g) for p, g in zip(itemlist(tparams), gshared)]
+    pup = [(p, p - lr * g) for p, g in zip(tparams.values(), gshared)]
     f_update = theano.function([lr], [], updates=pup)
 
     return f_grad_shared, f_update

From 6b7d587a38c6ada08266ad51dcc336d819ffc0d9 Mon Sep 17 00:00:00 2001
From: Frederic <nouiz@nouiz.org>
Date: Fri, 9 Jan 2015 12:00:37 -0500
Subject: [PATCH 07/27] Fix typo, add docstring, add timing, remove useless
 printing

---
 code/imdb.py |  9 +++++++++
 code/lstm.py | 29 ++++++++++++++++++-----------
 2 files changed, 27 insertions(+), 11 deletions(-)

diff --git a/code/imdb.py b/code/imdb.py
index 0aaf641f..73e2d7b7 100644
--- a/code/imdb.py
+++ b/code/imdb.py
@@ -11,6 +11,15 @@
 
 
 def prepare_data(seqs, labels, maxlen=None):
+    """Create the matrices from the datasets.
+
+    This pad each sequence to the same lenght: the lenght of the
+    longuest sequence or maxlen.
+
+    if maxlen is set, we will cut all sequence to this maximum
+    lenght.
+
+    """
     # x: a list of sentences
     lengths = [len(s) for s in seqs]
 
diff --git a/code/lstm.py b/code/lstm.py
index f0361512..e3c21f5e 100644
--- a/code/lstm.py
+++ b/code/lstm.py
@@ -1,16 +1,17 @@
 '''
 Build a tweet sentiment analyzer
 '''
-import theano
-import theano.tensor as tensor
-from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
-
-import cPickle as pkl
-import numpy
+from collections import OrderedDict
 import copy
+import cPickle as pkl
 import random
+import sys
+import time
 
-from collections import OrderedDict
+import numpy
+import theano
+import theano.tensor as tensor
+from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
 
 import imdb
 
@@ -364,7 +365,7 @@ def train(dim_proj=100,
           activ=tensor.tanh,
           decay_c=0.,  # weight decay for the classifier
           lrate=0.01,  # learning rate for sgd (not used for adadelta and rmsprop)
-          n_words=100000,  # wocabulary size
+          n_words=100000,  # vocabulary size
           optimizer=adadelta,
           encoder='lstm',# can be removed must be lstm.
           saveto='lstm_model.npz',
@@ -432,6 +433,7 @@ def train(dim_proj=100,
 
     uidx = 0
     estop = False
+    start_time = time.clock()
     for eidx in xrange(max_epochs):
         n_samples = 0
 
@@ -502,9 +504,11 @@ def train(dim_proj=100,
 
         if estop:
             break
-
+    end_time = time.clock()
     if best_p is not None:
         zipp(best_p, tparams)
+    else:
+        best_p = unzip(tparams)
 
     use_noise.set_value(0.)
     train_err = pred_error(f_pred, prepare_data, train, kf)
@@ -518,12 +522,15 @@ def train(dim_proj=100,
                 valid_err=valid_err, test_err=test_err,
                 history_errs=history_errs, **params)
 
+    print 'The code run for %d epochs, with %f epochs/sec' % (
+        uidx, 1. * uidx / (end_time - start_time))
+    print >> sys.stderr, ('The code for file ' +
+                          os.path.split(__file__)[1] +
+                          ' ran for %.1fs' % ((end_time - start_time)))
     return train_err, valid_err, test_err
 
 
 def main(job_id, params):
-    print ('Anything printed here will end up in the output directory'
-           'for job #%d' % job_id)
     print params
     use_dropout = True if params['use-dropout'][0] else False
     trainerr, validerr, testerr = train(saveto=params['model'][0],

From 1d12bee18115e0a150d2ed92780cb3884b42ff88 Mon Sep 17 00:00:00 2001
From: Frederic <nouiz@nouiz.org>
Date: Fri, 9 Jan 2015 13:38:08 -0500
Subject: [PATCH 08/27] use the uncompressed version of imdb. This takes 1s to
 load instead of 45s

---
 code/imdb.py     | 10 +++++++---
 code/lstm.py     |  2 +-
 data/download.sh |  2 +-
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/code/imdb.py b/code/imdb.py
index 73e2d7b7..1bcc83bb 100644
--- a/code/imdb.py
+++ b/code/imdb.py
@@ -77,7 +77,7 @@ def get_dataset_file(dataset, default_dataset, origin):
     return dataset
 
 
-def load_data(path="imdb.pkl.gz", n_words=100000, valid_portion=0.1):
+def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1):
     ''' Loads the dataset
 
     :type dataset: string
@@ -91,9 +91,13 @@ def load_data(path="imdb.pkl.gz", n_words=100000, valid_portion=0.1):
     # Load the dataset
     path = get_dataset_file(
         path, "imdb.pkl.gz",
-        "http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl.gz")
+        "http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl")
+
+    if path.endswith(".gz"):
+        f = gzip.open(path, 'rb')
+    else:
+        f = open(path, 'rb')
 
-    f = gzip.open(path, 'rb')
     train_set = cPickle.load(f)
     test_set = cPickle.load(f)
     f.close()
diff --git a/code/lstm.py b/code/lstm.py
index e3c21f5e..c5e2bd98 100644
--- a/code/lstm.py
+++ b/code/lstm.py
@@ -471,7 +471,7 @@ def train(dim_proj=100,
                 else:
                     params = unzip(tparams)
                 numpy.savez(saveto, history_errs=history_errs, **params)
-                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
+                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1)
                 print 'Done'
 
             if numpy.mod(uidx, validFreq) == 0:
diff --git a/data/download.sh b/data/download.sh
index 8a8e9a92..88e48e5a 100755
--- a/data/download.sh
+++ b/data/download.sh
@@ -15,7 +15,7 @@ fi
 
 $DL_CMD http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
 $DL_CMD http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist_py3k.pkl.gz
-$DL_CMD http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl.gz
+$DL_CMD http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl.gz && gunzip imdb.pkl.gz
 $DL_CMD http://www.iro.umontreal.ca/~lisa/deep/data/Nottingham.zip && unzip -u Nottingham.zip
 $DL_CMD http://www.iro.umontreal.ca/~lisa/deep/midi.zip && unzip -u midi.zip -d ../code && echo "extracted Modified Python MIDI package (GPL)"
 $DL_CMD http://www-etud.iro.umontreal.ca/~mesnilgr/atis/atis.fold0.pkl.gz

From f96d201b49a9cc4ff39a32531f1dc186abd6e9b1 Mon Sep 17 00:00:00 2001
From: Frederic <nouiz@nouiz.org>
Date: Fri, 9 Jan 2015 16:41:24 -0500
Subject: [PATCH 09/27] remove import not used

---
 code/imdb.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/code/imdb.py b/code/imdb.py
index 1bcc83bb..c33884d6 100644
--- a/code/imdb.py
+++ b/code/imdb.py
@@ -1,13 +1,10 @@
 import cPickle
 import gzip
 import os
-import sys
-import time
 
 import numpy
 
 import theano
-import theano.tensor as T
 
 
 def prepare_data(seqs, labels, maxlen=None):

From 9942cb826b59773549fa9f36be291ddc94facb1b Mon Sep 17 00:00:00 2001
From: Frederic <nouiz@nouiz.org>
Date: Fri, 9 Jan 2015 16:42:17 -0500
Subject: [PATCH 10/27] add name to fct

---
 code/lstm.py | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/code/lstm.py b/code/lstm.py
index c5e2bd98..23887e06 100644
--- a/code/lstm.py
+++ b/code/lstm.py
@@ -221,7 +221,8 @@ def adadelta(lr, tparams, grads, x, mask, y, cost):
     rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
              for rg2, g in zip(running_grads2, grads)]
 
-    f_grad_shared = theano.function([x, mask, y], cost, updates=zgup+rg2up)
+    f_grad_shared = theano.function([x, mask, y], cost, updates=zgup+rg2up,
+                                    name='adadelta_f_grad_shared')
 
     updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
              for zg, ru2, rg2 in zip(zipped_grads,
@@ -232,7 +233,8 @@ def adadelta(lr, tparams, grads, x, mask, y, cost):
     param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]
 
     f_update = theano.function([lr], [], updates=ru2up+param_up,
-                               on_unused_input='ignore')
+                               on_unused_input='ignore',
+                               name='adadelta_f_update')
 
     return f_grad_shared, f_update
 
@@ -254,7 +256,8 @@ def rmsprop(lr, tparams, grads, x, mask, y, cost):
              for rg2, g in zip(running_grads2, grads)]
 
     f_grad_shared = theano.function([x, mask, y], cost,
-                                    updates=zgup + rgup + rg2up)
+                                    updates=zgup + rgup + rg2up,
+                                    name='rmsprop_f_grad_shared')
 
     updir = [theano.shared(p.get_value() * numpy.float32(0.),
                            name='%s_updir' % k)
@@ -265,7 +268,8 @@ def rmsprop(lr, tparams, grads, x, mask, y, cost):
     param_up = [(p, p + udn[1])
                 for p, udn in zip(tparams.values(), updir_new)]
     f_update = theano.function([lr], [], updates=updir_new+param_up,
-                               on_unused_input='ignore')
+                               on_unused_input='ignore',
+                               name='rmsprop_f_update')
 
     return f_grad_shared, f_update
 
@@ -275,10 +279,12 @@ def sgd(lr, tparams, grads, x, mask, y, cost):
                for k, p in tparams.iteritems()]
     gsup = [(gs, g) for gs, g in zip(gshared, grads)]
 
-    f_grad_shared = theano.function([x, mask, y], cost, updates=gsup)
+    f_grad_shared = theano.function([x, mask, y], cost, updates=gsup,
+                                    name='sgd_f_grad_shared')
 
     pup = [(p, p - lr * g) for p, g in zip(tparams.values(), gshared)]
-    f_update = theano.function([lr], [], updates=pup)
+    f_update = theano.function([lr], [], updates=pup,
+                               name='sgd_f_update')
 
     return f_grad_shared, f_update
 
@@ -308,8 +314,8 @@ def build_model(tparams, options):
 
     pred = tensor.nnet.softmax(tensor.dot(proj, tparams['U'])+tparams['b'])
 
-    f_pred_prob = theano.function([x, mask], pred)
-    f_pred = theano.function([x, mask], pred.argmax(axis=1))
+    f_pred_prob = theano.function([x, mask], pred, name='f_pred_prob')
+    f_pred = theano.function([x, mask], pred.argmax(axis=1), name='f_pred')
 
     cost = -tensor.log(pred[tensor.arange(n_samples), y] + 1e-8).mean()
 
@@ -404,10 +410,10 @@ def train(dim_proj=100,
         weight_decay *= decay_c
         cost += weight_decay
 
-    f_cost = theano.function([x, mask, y], cost)
+    f_cost = theano.function([x, mask, y], cost, name='f_cost')
 
     grads = tensor.grad(cost, wrt=tparams.values())
-    f_grad = theano.function([x, mask, y], grads)
+    f_grad = theano.function([x, mask, y], grads, name='f_grad')
 
     lr = tensor.scalar(name='lr')
     f_grad_shared, f_update = optimizer(lr, tparams, grads,

From 1e6bce295f5a90df5e22cb7168c6b308cb1b7a34 Mon Sep 17 00:00:00 2001
From: Frederic <nouiz@nouiz.org>
Date: Fri, 9 Jan 2015 16:43:55 -0500
Subject: [PATCH 11/27] add comment

---
 code/lstm.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/code/lstm.py b/code/lstm.py
index 23887e06..3d43e3ca 100644
--- a/code/lstm.py
+++ b/code/lstm.py
@@ -368,11 +368,11 @@ def train(dim_proj=100,
           patience=10,  # number of epoch to wait before early stop if no progress
           max_epochs=5000,
           dispFreq=100,  # display to stdout the training progress every N updates
-          activ=tensor.tanh,
+          activ=tensor.tanh,  # The activation function from Theano.
           decay_c=0.,  # weight decay for the classifier
           lrate=0.01,  # learning rate for sgd (not used for adadelta and rmsprop)
           n_words=100000,  # vocabulary size
-          optimizer=adadelta,
+          optimizer=adadelta,    # sgd, adadelta and rmsprop available
           encoder='lstm',# can be removed must be lstm.
           saveto='lstm_model.npz',
           noise_std=0.,
@@ -382,7 +382,8 @@ def train(dim_proj=100,
           batch_size=16,
           valid_batch_size=16,
           dataset='imdb',
-          use_dropout=False):
+          use_dropout=False,  # if False slightly faster, but worst test error
+      ):
 
     # Model options
     model_options = locals().copy()

From 1b17e874e2e6eaa31344e48c09456303cd9a159a Mon Sep 17 00:00:00 2001
From: Frederic <nouiz@nouiz.org>
Date: Fri, 9 Jan 2015 16:45:27 -0500
Subject: [PATCH 12/27] pep8 printing

---
 code/lstm.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/code/lstm.py b/code/lstm.py
index 3d43e3ca..48d012e9 100644
--- a/code/lstm.py
+++ b/code/lstm.py
@@ -372,11 +372,11 @@ def train(dim_proj=100,
           decay_c=0.,  # weight decay for the classifier
           lrate=0.01,  # learning rate for sgd (not used for adadelta and rmsprop)
           n_words=100000,  # vocabulary size
-          optimizer=adadelta,    # sgd, adadelta and rmsprop available
-          encoder='lstm',# can be removed must be lstm.
+          optimizer=adadelta,  # sgd, adadelta and rmsprop available
+          encoder='lstm',  # can be removed must be lstm.
           saveto='lstm_model.npz',
           noise_std=0.,
-          validFreq=1000, # after 1000
+          validFreq=1000,  # after 1000
           saveFreq=1000,  # save the parameters after every saveFreq updates
           maxlen=50,
           batch_size=16,
@@ -529,11 +529,10 @@ def train(dim_proj=100,
                 valid_err=valid_err, test_err=test_err,
                 history_errs=history_errs, **params)
 
-    print 'The code run for %d epochs, with %f epochs/sec' % (
-        uidx, 1. * uidx / (end_time - start_time))
-    print >> sys.stderr, ('The code for file ' +
-                          os.path.split(__file__)[1] +
-                          ' ran for %.1fs' % ((end_time - start_time)))
+    print 'The code run for %d epochs, with %f sec/epochs' % (
+        (eidx + 1), 1. * (eidx + 1) / (end_time - start_time))
+    print >> sys.stderr, ('Training took %.1fs minutes' %
+                          (end_time - start_time))
     return train_err, valid_err, test_err
 
 

From 64eeb12543a88f0f1ab2ea4abd4c584e562efce1 Mon Sep 17 00:00:00 2001
From: Frederic <nouiz@nouiz.org>
Date: Fri, 9 Jan 2015 16:48:42 -0500
Subject: [PATCH 13/27] code simplification

---
 code/lstm.py | 60 +++++++++++++++++++++++-----------------------------
 1 file changed, 26 insertions(+), 34 deletions(-)

diff --git a/code/lstm.py b/code/lstm.py
index 48d012e9..df3e380e 100644
--- a/code/lstm.py
+++ b/code/lstm.py
@@ -374,16 +374,16 @@ def train(dim_proj=100,
           n_words=100000,  # vocabulary size
           optimizer=adadelta,  # sgd, adadelta and rmsprop available
           encoder='lstm',  # can be removed must be lstm.
-          saveto='lstm_model.npz',
+          saveto='lstm_model.npz',  # The best model will be saved there
           noise_std=0.,
           validFreq=1000,  # after 1000
           saveFreq=1000,  # save the parameters after every saveFreq updates
-          maxlen=50,
+          maxlen=50,  # longer sequence get ignored
           batch_size=16,
           valid_batch_size=16,
           dataset='imdb',
           use_dropout=False,  # if False slightly faster, but worst test error
-      ):
+):
 
     # Model options
     model_options = locals().copy()
@@ -536,39 +536,31 @@ def train(dim_proj=100,
     return train_err, valid_err, test_err
 
 
-def main(job_id, params):
-    print params
-    use_dropout = True if params['use-dropout'][0] else False
-    trainerr, validerr, testerr = train(saveto=params['model'][0],
-                                        dim_proj=params['dim-proj'][0],
-                                        n_words=params['n-words'][0],
-                                        decay_c=params['decay-c'][0],
-                                        lrate=params['learning-rate'][0],
-                                        optimizer=params['optimizer'][0],
-                                        activ=params['activ'][0],
-                                        encoder=params['encoder'][0],
-                                        maxlen=600,
-                                        batch_size=16,
-                                        valid_batch_size=16,
-                                        validFreq=10000,
-                                        dispFreq=10,
-                                        saveFreq=100000,
-                                        dataset='imdb',
-                                        use_dropout=use_dropout)
-    return validerr
-
 if __name__ == '__main__':
 
     # We must have floatX=float32 for this tutorial to work correctly.
     theano.config.floatX = "float32"
+    theano.config.scan.allow_gc = False
+
+    # See function train for all possible parameter and there definition.
+    trainerr, validerr, testerr = train(
+        saveto='lstm_model.npz',  # The best model will be saved there
+        dim_proj=128,
+        n_words=10000,
+        decay_c=0,
+        lrate=0.0001,
+        optimizer=sgd,
+        activ=tensor.tanh,
+        encoder='lstm',
+        maxlen=100,  # longer get ignored
+        batch_size=64,
+        valid_batch_size=64,
+        validFreq=10000,
+        dispFreq=10,
+        saveFreq=100000,
+        dataset='imdb',
+        use_dropout=True,
+
+        max_epochs=2,
+    )
 
-    main(0, {
-        'model': ['lstm_model.npz'],
-        'encoder': ['lstm'],
-        'dim-proj': [128],
-        'n-words': [10000],
-        'optimizer': [adadelta],  # sgd, adadelta and rmsprop available
-        'activ': [tensor.tanh],  # The activation function from Theano.
-        'decay-c': [0.], #
-        'use-dropout': [1],  # if disable slightly faster, but worst test error.
-        'learning-rate': [0.0001]})

From 86e5c4b49e5b6c9e2134fc1959339a2f314cfca3 Mon Sep 17 00:00:00 2001
From: Frederic <nouiz@nouiz.org>
Date: Sat, 10 Jan 2015 15:04:36 -0500
Subject: [PATCH 14/27] move sgd and comments

---
 code/lstm.py | 118 ++++++++++++++++++++++++++-------------------------
 1 file changed, 61 insertions(+), 57 deletions(-)

diff --git a/code/lstm.py b/code/lstm.py
index df3e380e..2bb845bb 100644
--- a/code/lstm.py
+++ b/code/lstm.py
@@ -206,6 +206,34 @@ def _step(m_, x_, h_, c_):
           'lstm': (param_init_lstm, lstm_layer)}
 
 
+def sgd(lr, tparams, grads, x, mask, y, cost):
+    """ Stochastic Gradient Descent
+
+    :note: A more complicated version of sgd then needed.  This is
+        done like that for adadelta and rmsprop.
+
+    """
+    # New set of shared variable that will contain the gradient
+    # for a mini-batch.
+    gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
+               for k, p in tparams.iteritems()]
+    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
+
+    # Function that computes gradients for a mini-batch, but do not
+    # updates the weights.
+    f_grad_shared = theano.function([x, mask, y], cost, updates=gsup,
+                                    name='sgd_f_grad_shared')
+
+    pup = [(p, p - lr * g) for p, g in zip(tparams.values(), gshared)]
+
+    # Function that updates the weights from the previously computed
+    # gradient.
+    f_update = theano.function([lr], [], updates=pup,
+                               name='sgd_f_update')
+
+    return f_grad_shared, f_update
+
+
 def adadelta(lr, tparams, grads, x, mask, y, cost):
     zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
                                   name='%s_grad' % k)
@@ -274,21 +302,6 @@ def rmsprop(lr, tparams, grads, x, mask, y, cost):
     return f_grad_shared, f_update
 
 
-def sgd(lr, tparams, grads, x, mask, y, cost):
-    gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
-               for k, p in tparams.iteritems()]
-    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
-
-    f_grad_shared = theano.function([x, mask, y], cost, updates=gsup,
-                                    name='sgd_f_grad_shared')
-
-    pup = [(p, p - lr * g) for p, g in zip(tparams.values(), gshared)]
-    f_update = theano.function([lr], [], updates=pup,
-                               name='sgd_f_update')
-
-    return f_grad_shared, f_update
-
-
 def build_model(tparams, options):
     trng = RandomStreams(1234)
     use_noise = theano.shared(numpy.float32(0.))
@@ -319,7 +332,7 @@ def build_model(tparams, options):
 
     cost = -tensor.log(pred[tensor.arange(n_samples), y] + 1e-8).mean()
 
-    return trng, use_noise, x, mask, y, f_pred_prob, f_pred, cost
+    return use_noise, x, mask, y, f_pred_prob, f_pred, cost
 
 
 def pred_probs(f_pred_prob, prepare_data, data, iterator, verbose=False):
@@ -364,25 +377,29 @@ def pred_error(f_pred, prepare_data, data, iterator, verbose=False):
     return valid_err
 
 
-def train(dim_proj=100,
-          patience=10,  # number of epoch to wait before early stop if no progress
-          max_epochs=5000,
-          dispFreq=100,  # display to stdout the training progress every N updates
-          activ=tensor.tanh,  # The activation function from Theano.
-          decay_c=0.,  # weight decay for the classifier
-          lrate=0.01,  # learning rate for sgd (not used for adadelta and rmsprop)
-          n_words=100000,  # vocabulary size
-          optimizer=adadelta,  # sgd, adadelta and rmsprop available
-          encoder='lstm',  # can be removed must be lstm.
-          saveto='lstm_model.npz',  # The best model will be saved there
-          noise_std=0.,
-          validFreq=1000,  # after 1000
-          saveFreq=1000,  # save the parameters after every saveFreq updates
-          maxlen=50,  # longer sequence get ignored
-          batch_size=16,
-          valid_batch_size=16,
-          dataset='imdb',
-          use_dropout=False,  # if False slightly faster, but worst test error
+def test_lstm(
+    dim_proj=128,  # TODO: What is this
+    patience=10,  # number of epoch to wait before early stop if no progress
+    max_epochs=5000,  # The maximum number of epoch to run
+    dispFreq=10,  # display to stdout the training progress every N updates
+    activ=tensor.tanh,  # The activation function from Theano.
+    decay_c=0.,  # weight decay for the classifier applied to the U weights.
+    lrate=0.0001,  # learning rate for sgd (not used for adadelta and rmsprop)
+    n_words=10000,  # vocabulary size
+    optimizer=sgd,  # sgd, adadelta and rmsprop available
+    encoder='lstm',  # TODO: can be removed must be lstm.
+    saveto='lstm_model.npz',  # The best model will be saved there
+    validFreq=10000,  # after 1000
+    saveFreq=100000,  # save the parameters after every saveFreq updates
+    maxlen=100,  # longer sequence get ignored
+    batch_size=64,
+    valid_batch_size=64,
+    dataset='imdb',
+
+    # Parameter for extra option
+    noise_std=0.,
+    use_dropout=False,  # if False slightly faster, but worst test error
+                        # TODO: This frequently need a bigger model.
 ):
 
     # Model options
@@ -398,10 +415,17 @@ def train(dim_proj=100,
     model_options['ydim'] = ydim
 
     print 'Building model'
+    # This create the initial parameters as numpy ndarrays.
+    # Dict name (string) -> numpy ndarray
     params = init_params(model_options)
+
+    # This create Theano Shared Variable from the parameters.
+    # Dict name (string) -> Theano Tensor Shared Variable
+    # params and tparams have different copy of the weights.
     tparams = init_tparams(params)
 
-    (trng, use_noise, x, mask,
+    # use_noise is for dropout
+    (use_noise, x, mask,
      y, f_pred_prob, f_pred, cost) = build_model(tparams, model_options)
 
     if decay_c > 0.:
@@ -543,24 +567,4 @@ def train(dim_proj=100,
     theano.config.scan.allow_gc = False
 
     # See function train for all possible parameter and there definition.
-    trainerr, validerr, testerr = train(
-        saveto='lstm_model.npz',  # The best model will be saved there
-        dim_proj=128,
-        n_words=10000,
-        decay_c=0,
-        lrate=0.0001,
-        optimizer=sgd,
-        activ=tensor.tanh,
-        encoder='lstm',
-        maxlen=100,  # longer get ignored
-        batch_size=64,
-        valid_batch_size=64,
-        validFreq=10000,
-        dispFreq=10,
-        saveFreq=100000,
-        dataset='imdb',
-        use_dropout=True,
-
-        max_epochs=2,
-    )
-
+    test_lstm(max_epochs=10)

From c480d4eb1523e014ea8bb4579565fffdbd0b0583 Mon Sep 17 00:00:00 2001
From: Frederic <nouiz@nouiz.org>
Date: Sat, 10 Jan 2015 15:07:20 -0500
Subject: [PATCH 15/27] remove fflayers

---
 code/lstm.py | 19 +------------------
 1 file changed, 1 insertion(+), 18 deletions(-)

diff --git a/code/lstm.py b/code/lstm.py
index 2bb845bb..c7f0c85f 100644
--- a/code/lstm.py
+++ b/code/lstm.py
@@ -115,21 +115,6 @@ def get_layer(name):
     return fns
 
 
-def param_init_fflayer(options, params, prefix='ff'):
-    weights = numpy.random.randn(options['dim_proj'], options['dim_proj'])
-    biases = numpy.zeros((options['dim_proj'], ))
-    params[_p(prefix, 'W')] = 0.01 * weights.astype('float32')
-    params[_p(prefix, 'b')] = biases.astype('float32')
-
-    return params
-
-
-def fflayer(tparams, state_below, options, prefix='ff', **kwargs):
-    pre_act = (tensor.dot(state_below,
-                          tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')])
-    return options['activ'](pre_act)
-
-
 def ortho_weight(ndim):
     W = numpy.random.randn(ndim, ndim)
     u, s, v = numpy.linalg.svd(W)
@@ -202,8 +187,7 @@ def _step(m_, x_, h_, c_):
 
 # ff: Feed Forward (normal neural net), only useful to put after lstm
 #     before the classifier.
-layers = {'ff': (param_init_fflayer, fflayer),
-          'lstm': (param_init_lstm, lstm_layer)}
+layers = {'lstm': (param_init_lstm, lstm_layer)}
 
 
 def sgd(lr, tparams, grads, x, mask, y, cost):
@@ -382,7 +366,6 @@ def test_lstm(
     patience=10,  # number of epoch to wait before early stop if no progress
     max_epochs=5000,  # The maximum number of epoch to run
     dispFreq=10,  # display to stdout the training progress every N updates
-    activ=tensor.tanh,  # The activation function from Theano.
     decay_c=0.,  # weight decay for the classifier applied to the U weights.
     lrate=0.0001,  # learning rate for sgd (not used for adadelta and rmsprop)
     n_words=10000,  # vocabulary size

From 2e022a636c62ed7b8f6f536124a22d791e4a179b Mon Sep 17 00:00:00 2001
From: Frederic <nouiz@nouiz.org>
Date: Mon, 12 Jan 2015 11:18:46 -0500
Subject: [PATCH 16/27] small fixes and doc

---
 code/lstm.py | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/code/lstm.py b/code/lstm.py
index c7f0c85f..d23c6e76 100644
--- a/code/lstm.py
+++ b/code/lstm.py
@@ -19,6 +19,9 @@
 
 
 def get_minibatches_idx(n, nb_batches, shuffle=False):
+    """
+    Used to shuffle the dataset at each iteration.
+    """
 
     idx_list = numpy.arange(n, dtype="int32")
 
@@ -381,8 +384,8 @@ def test_lstm(
 
     # Parameter for extra option
     noise_std=0.,
-    use_dropout=False,  # if False slightly faster, but worst test error
-                        # TODO: This frequently need a bigger model.
+    use_dropout=True,  # if False slightly faster, but worst test error
+                       # This frequently need a bigger model.
 ):
 
     # Model options
@@ -502,6 +505,10 @@ def test_lstm(
 
                     best_p = unzip(tparams)
                     bad_counter = 0
+
+                print ('Train ', train_err, 'Valid ', valid_err,
+                       'Test ', test_err)
+
                 if (len(history_errs) > patience and
                     valid_err >= numpy.array(history_errs)[:-patience,
                                                            0].min()):
@@ -511,9 +518,6 @@ def test_lstm(
                         estop = True
                         break
 
-                print ('Train ', train_err, 'Valid ', valid_err,
-                       'Test ', test_err)
-
         print 'Seen %d samples' % n_samples
 
         if estop:
@@ -537,8 +541,8 @@ def test_lstm(
                 history_errs=history_errs, **params)
 
     print 'The code run for %d epochs, with %f sec/epochs' % (
-        (eidx + 1), 1. * (eidx + 1) / (end_time - start_time))
-    print >> sys.stderr, ('Training took %.1fs minutes' %
+        (eidx + 1), (end_time - start_time) / (1. * (eidx + 1)))
+    print >> sys.stderr, ('Training took %.1fs' %
                           (end_time - start_time))
     return train_err, valid_err, test_err
 
@@ -547,6 +551,7 @@ def test_lstm(
 
     # We must have floatX=float32 for this tutorial to work correctly.
     theano.config.floatX = "float32"
+    # The next line is the new Theano default. This is a speed up.
     theano.config.scan.allow_gc = False
 
     # See function train for all possible parameter and there definition.

From 8afe749dd5373b7378170f960215ee1414faaec7 Mon Sep 17 00:00:00 2001
From: Frederic <nouiz@nouiz.org>
Date: Mon, 12 Jan 2015 15:40:58 -0500
Subject: [PATCH 17/27] small update

---
 code/lstm.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/code/lstm.py b/code/lstm.py
index d23c6e76..200b359f 100644
--- a/code/lstm.py
+++ b/code/lstm.py
@@ -291,6 +291,8 @@ def rmsprop(lr, tparams, grads, x, mask, y, cost):
 
 def build_model(tparams, options):
     trng = RandomStreams(1234)
+
+    # Used for dropout.
     use_noise = theano.shared(numpy.float32(0.))
 
     x = tensor.matrix('x', dtype='int64')
@@ -378,8 +380,8 @@ def test_lstm(
     validFreq=10000,  # after 1000
     saveFreq=100000,  # save the parameters after every saveFreq updates
     maxlen=100,  # longer sequence get ignored
-    batch_size=64,
-    valid_batch_size=64,
+    batch_size=64,  # the batch size during training.
+    valid_batch_size=64,  # The batch size during validation
     dataset='imdb',
 
     # Parameter for extra option
@@ -448,12 +450,13 @@ def test_lstm(
     if saveFreq == -1:
         saveFreq = len(train[0])/batch_size
 
-    uidx = 0
-    estop = False
+    uidx = 0  # the number of update done
+    estop = False  # early stop
     start_time = time.clock()
     for eidx in xrange(max_epochs):
         n_samples = 0
 
+        # Get new shuffled index for the training set.
         kf = get_minibatches_idx(len(train[0]), len(train[0])/batch_size,
                                  shuffle=True)
 
@@ -462,10 +465,13 @@ def test_lstm(
             uidx += 1
             use_noise.set_value(1.)
 
+            # Select the random examples for this minibatch
             y = [train[1][t] for t in train_index]
-            x, mask, y = prepare_data([train[0][t]for t in train_index],
-                                      y, maxlen=maxlen)
+            x = [train[0][t]for t in train_index]
 
+            # Get the data in numpy.ndarray formet.
+            # It return something of the shape (minibatch maxlen, n samples)
+            x, mask, y = prepare_data(x, y, maxlen=maxlen)
             if x is None:
                 print 'Minibatch with zero sample under length ', maxlen
                 continue

From 3d9b1ac652a9e26fefd2219644c3cca94cbed42d Mon Sep 17 00:00:00 2001
From: Frederic <nouiz@nouiz.org>
Date: Mon, 12 Jan 2015 15:42:08 -0500
Subject: [PATCH 18/27] fix the display of the number of example seen

---
 code/lstm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/code/lstm.py b/code/lstm.py
index 200b359f..e91fbd84 100644
--- a/code/lstm.py
+++ b/code/lstm.py
@@ -461,7 +461,6 @@ def test_lstm(
                                  shuffle=True)
 
         for _, train_index in kf:
-            n_samples += train_index.shape[0]
             uidx += 1
             use_noise.set_value(1.)
 
@@ -475,6 +474,7 @@ def test_lstm(
             if x is None:
                 print 'Minibatch with zero sample under length ', maxlen
                 continue
+            n_samples += x.shape[1]
 
             cost = f_grad_shared(x, mask, y)
             f_update(lrate)

From 401a99a00ba77c70bdb5ff04e665a7f9978622c9 Mon Sep 17 00:00:00 2001
From: Frederic <nouiz@nouiz.org>
Date: Tue, 13 Jan 2015 08:40:34 -0500
Subject: [PATCH 19/27] Add a way to reload pretrained model

---
 code/lstm.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/code/lstm.py b/code/lstm.py
index e91fbd84..8a7e42fb 100644
--- a/code/lstm.py
+++ b/code/lstm.py
@@ -80,6 +80,9 @@ def _p(pp, name):
 
 
 def init_params(options):
+    """
+    Global (not LSTM) parameter. For the embeding and the classifier.
+    """
     params = OrderedDict()
     # embedding
     randn = numpy.random.rand(options['n_words'],
@@ -125,6 +128,11 @@ def ortho_weight(ndim):
 
 
 def param_init_lstm(options, params, prefix='lstm'):
+    """
+    Init the LSTM parameter:
+
+    :see: init_params
+    """
     W = numpy.concatenate([ortho_weight(options['dim_proj']),
                            ortho_weight(options['dim_proj']),
                            ortho_weight(options['dim_proj']),
@@ -388,6 +396,7 @@ def test_lstm(
     noise_std=0.,
     use_dropout=True,  # if False slightly faster, but worst test error
                        # This frequently need a bigger model.
+    reload_model="",  # Path to a saved model we want to start from.
 ):
 
     # Model options
@@ -407,6 +416,9 @@ def test_lstm(
     # Dict name (string) -> numpy ndarray
     params = init_params(model_options)
 
+    if reload_model:
+        load_params('lstm_model.npz', params)
+
     # This create Theano Shared Variable from the parameters.
     # Dict name (string) -> Theano Tensor Shared Variable
     # params and tparams have different copy of the weights.
@@ -561,4 +573,7 @@ def test_lstm(
     theano.config.scan.allow_gc = False
 
     # See function train for all possible parameter and there definition.
-    test_lstm(max_epochs=10)
+    test_lstm(
+        #reload_model="lstm_model.npz",
+        max_epochs=10,
+    )

From 6b7b7a6cafdd56bd541a3fc14b9fac10a0380600 Mon Sep 17 00:00:00 2001
From: Frederic <nouiz@nouiz.org>
Date: Tue, 13 Jan 2015 08:41:35 -0500
Subject: [PATCH 20/27] use adadelta, sgd does not work.

---
 code/lstm.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/code/lstm.py b/code/lstm.py
index 8a7e42fb..ba733c60 100644
--- a/code/lstm.py
+++ b/code/lstm.py
@@ -375,14 +375,14 @@ def pred_error(f_pred, prepare_data, data, iterator, verbose=False):
 
 
 def test_lstm(
-    dim_proj=128,  # TODO: What is this
+    dim_proj=128,  # word embeding dimension and LSTM number of hidden units.
     patience=10,  # number of epoch to wait before early stop if no progress
     max_epochs=5000,  # The maximum number of epoch to run
     dispFreq=10,  # display to stdout the training progress every N updates
     decay_c=0.,  # weight decay for the classifier applied to the U weights.
     lrate=0.0001,  # learning rate for sgd (not used for adadelta and rmsprop)
     n_words=10000,  # vocabulary size
-    optimizer=sgd,  # sgd, adadelta and rmsprop available
+    optimizer=adadelta,  # sgd, adadelta and rmsprop available, sgd very hard to use, not recommanded (probably need momentum and decay learning rate).
     encoder='lstm',  # TODO: can be removed must be lstm.
     saveto='lstm_model.npz',  # The best model will be saved there
     validFreq=10000,  # after 1000

From 194adadb94c5571c12b5ad7de903612f1ff26968 Mon Sep 17 00:00:00 2001
From: Frederic <nouiz@nouiz.org>
Date: Tue, 13 Jan 2015 09:29:05 -0500
Subject: [PATCH 21/27] Add the script that created the preprocessed imdb
 dataset

---
 code/imdb_preprocess.py | 123 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 123 insertions(+)
 create mode 100644 code/imdb_preprocess.py

diff --git a/code/imdb_preprocess.py b/code/imdb_preprocess.py
new file mode 100644
index 00000000..c20b37b6
--- /dev/null
+++ b/code/imdb_preprocess.py
@@ -0,0 +1,123 @@
+"""
+This script is what created the pickled dataset.
+
+1) You need to download the following file and put it in the same directory as this script.
+https://github.com/moses-smt/mosesdecoder/raw/master/scripts/tokenizer/tokenizer.perl . Give it execution permission.
+
+2) Get the dataset from http://ai.stanford.edu/~amaas/data/sentiment/ and extract it in the current directory.
+
+3) Then run this script.
+"""
+
+dataset_path='/Tmp/bastienf/aclImdb/'
+
+import numpy
+import cPickle as pkl
+
+from collections import OrderedDict
+
+import glob
+import os
+
+from subprocess import Popen, PIPE
+
+# tokenizer.perl is from Moses: https://github.com/moses-smt/mosesdecoder/tree/master/scripts/tokenizer
+tokenizer_cmd = ['./tokenizer.perl', '-l', 'en', '-q', '-']
+
+
+def tokenize(sentences):
+
+    print 'Tokenizing..',
+    text = "\n".join(sentences)
+    tokenizer = Popen(tokenizer_cmd, stdin=PIPE, stdout=PIPE)
+    tok_text, _ = tokenizer.communicate(text)
+    toks = tok_text.split('\n')[:-1]
+    print 'Done'
+
+    return toks
+
+
+def build_dict(path):
+    sentences = []
+    currdir = os.getcwd()
+    os.chdir('%s/pos/' % path)
+    for ff in glob.glob("*.txt"):
+        with open(ff, 'r') as f:
+            sentences.append(f.readline().strip())
+    os.chdir('%s/neg/' % path)
+    for ff in glob.glob("*.txt"):
+        with open(ff, 'r') as f:
+            sentences.append(f.readline().strip())
+    os.chdir(currdir)
+
+    sentences = tokenize(sentences)
+
+    print 'Building dictionary..',
+    wordcount = dict()
+    for ss in sentences:
+        words = ss.strip().lower().split()
+        for w in words:
+            if w not in wordcount:
+                wordcount[w] = 1
+            else:
+                wordcount[w] += 1
+
+    counts = wordcount.values()
+    keys = wordcount.keys()
+
+    sorted_idx = numpy.argsort(counts)[::-1]
+
+    worddict = dict()
+
+    for idx, ss in enumerate(sorted_idx):
+        worddict[keys[ss]] = idx+2  # leave 0 and 1 (UNK)
+
+    print numpy.sum(counts), ' total words ', len(keys), ' unique words'
+
+    return worddict
+
+
+def grab_data(path, dictionary):
+    sentences = []
+    currdir = os.getcwd()
+    os.chdir(path)
+    for ff in glob.glob("*.txt"):
+        with open(ff, 'r') as f:
+            sentences.append(f.readline().strip())
+    os.chdir(currdir)
+    sentences = tokenize(sentences)
+
+    seqs = [None] * len(sentences)
+    for idx, ss in enumerate(sentences):
+        words = ss.strip().lower().split()
+        seqs[idx] = [dictionary[w] if w in dictionary else 1 for w in words]
+
+    return seqs
+
+
+def main():
+    # Get the dataset from http://ai.stanford.edu/~amaas/data/sentiment/
+    path = dataset_path
+    dictionary = build_dict(os.path.join(path, 'train'))
+
+    train_x_pos = grab_data(path+'train/pos', dictionary)
+    train_x_neg = grab_data(path+'train/neg', dictionary)
+    train_x = train_x_pos + train_x_neg
+    train_y = [1] * len(train_x_pos) + [0] * len(train_x_neg)
+
+    test_x_pos = grab_data(path+'test/pos', dictionary)
+    test_x_neg = grab_data(path+'test/neg', dictionary)
+    test_x = test_x_pos + test_x_neg
+    test_y = [1] * len(test_x_pos) + [0] * len(test_x_neg)
+
+    f = open('imdb.pkl', 'wb')
+    pkl.dump((train_x, train_y), f, -1)
+    pkl.dump((test_x, test_y), f, -1)
+    f.close()
+
+    f = open('imdb.dict.pkl', 'wb')
+    pkl.dump(dictionary, f, -1)
+    f.close()
+
+if __name__ == '__main__':
+    main()

From c6fdcff288103b707f640c3f19f51b50eb5ea9ab Mon Sep 17 00:00:00 2001
From: Pierre Luc Carrier <carriepl@bart2.iro.umontreal.ca>
Date: Tue, 13 Jan 2015 11:10:46 -0500
Subject: [PATCH 22/27] Fixed function get_minibatches_idx()

---
 code/lstm.py | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)

diff --git a/code/lstm.py b/code/lstm.py
index ba733c60..1946c0ad 100644
--- a/code/lstm.py
+++ b/code/lstm.py
@@ -18,7 +18,7 @@
 datasets = {'imdb': (imdb.load_data, imdb.prepare_data)}
 
 
-def get_minibatches_idx(n, nb_batches, shuffle=False):
+def get_minibatches_idx(n, minibatch_size, shuffle=False):
     """
     Used to shuffle the dataset at each iteration.
     """
@@ -30,17 +30,16 @@ def get_minibatches_idx(n, nb_batches, shuffle=False):
 
     minibatches = []
     minibatch_start = 0
-    for i in range(nb_batches):
-        if i < n % nb_batches:
-            minibatch_size = n // nb_batches + 1
-        else:
-            minibatch_size = n // nb_batches
-
+    for i in range(n // minibatch_size):
         minibatches.append(idx_list[minibatch_start:
                                     minibatch_start + minibatch_size])
         minibatch_start += minibatch_size
 
-    return zip(range(nb_batches), minibatches)
+    if (minibatch_start != n):
+        # Make a minibatch out of what is left
+        minibatches.append(idx_list[minibatch_start:])
+
+    return zip(range(len(minibatches)), minibatches)
 
 
 def get_dataset(name):
@@ -446,11 +445,9 @@ def test_lstm(
 
     print 'Optimization'
 
-    kf_valid = get_minibatches_idx(len(valid[0]),
-                                   len(valid[0]) / valid_batch_size,
+    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size,
                                    shuffle=True)
-    kf_test = get_minibatches_idx(len(test[0]),
-                                  len(test[0]) / valid_batch_size,
+    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size,
                                   shuffle=True)
 
     history_errs = []
@@ -469,8 +466,7 @@ def test_lstm(
         n_samples = 0
 
         # Get new shuffled index for the training set.
-        kf = get_minibatches_idx(len(train[0]), len(train[0])/batch_size,
-                                 shuffle=True)
+        kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)
 
         for _, train_index in kf:
             uidx += 1

From 04c02d4f889a989a5db6ab51bea81de39d8baa65 Mon Sep 17 00:00:00 2001
From: Pierre Luc Carrier <carriepl@bart2.iro.umontreal.ca>
Date: Tue, 13 Jan 2015 11:27:43 -0500
Subject: [PATCH 23/27] Fixed default dataset value in load_data()

---
 code/imdb.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/code/imdb.py b/code/imdb.py
index c33884d6..085ab3f9 100644
--- a/code/imdb.py
+++ b/code/imdb.py
@@ -87,7 +87,7 @@ def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1):
 
     # Load the dataset
     path = get_dataset_file(
-        path, "imdb.pkl.gz",
+        path, "imdb.pkl",
         "http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl")
 
     if path.endswith(".gz"):

From 74b2e0c75e5a1be863c890510898ea42d2cf14fd Mon Sep 17 00:00:00 2001
From: Frederic <nouiz@nouiz.org>
Date: Tue, 13 Jan 2015 10:34:43 -0500
Subject: [PATCH 24/27] Filter for the max seq len when we load the dataset

---
 code/imdb.py | 23 ++++++++++++++++++++---
 code/lstm.py | 32 +++++++++++++++++---------------
 2 files changed, 37 insertions(+), 18 deletions(-)

diff --git a/code/imdb.py b/code/imdb.py
index 085ab3f9..c9d150e2 100644
--- a/code/imdb.py
+++ b/code/imdb.py
@@ -74,11 +74,19 @@ def get_dataset_file(dataset, default_dataset, origin):
     return dataset
 
 
-def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1):
+def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1, maxlen=None):
     ''' Loads the dataset
 
-    :type dataset: string
-    :param dataset: the path to the dataset (here IMDB)
+    :type path: String
+    :param path: The path to the dataset (here IMDB)
+    :type n_words: int
+    :param n_words: The number of word to keep in the vocabulary.
+        All extra words are set to unknow (1).
+    :type valid_portion: float
+    :param valid_portion: The proportion of the full train set used for
+        the validation set.
+    :type maxlen: None or positive int
+    :param maxlen: the max sequence length we use in the train/valid set.
     '''
 
     #############
@@ -98,6 +106,15 @@ def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1):
     train_set = cPickle.load(f)
     test_set = cPickle.load(f)
     f.close()
+    if maxlen:
+        new_train_set_x = []
+        new_train_set_y = []
+        for x, y in zip(train_set[0], train_set[1]):
+            if len(x) < maxlen:
+                new_train_set_x.append(x)
+                new_train_set_y.append(y)
+        train_set = (new_train_set_x, new_train_set_y)
+        del new_train_set_x, new_train_set_y
 
     # split training set into validation set
     train_set_x, train_set_y = train_set
diff --git a/code/lstm.py b/code/lstm.py
index 1946c0ad..995c91a8 100644
--- a/code/lstm.py
+++ b/code/lstm.py
@@ -373,22 +373,22 @@ def pred_error(f_pred, prepare_data, data, iterator, verbose=False):
     return valid_err
 
 
-def test_lstm(
+def train_lstm(
     dim_proj=128,  # word embeding dimension and LSTM number of hidden units.
-    patience=10,  # number of epoch to wait before early stop if no progress
+    patience=10,  # Number of epoch to wait before early stop if no progress
     max_epochs=5000,  # The maximum number of epoch to run
-    dispFreq=10,  # display to stdout the training progress every N updates
-    decay_c=0.,  # weight decay for the classifier applied to the U weights.
-    lrate=0.0001,  # learning rate for sgd (not used for adadelta and rmsprop)
-    n_words=10000,  # vocabulary size
-    optimizer=adadelta,  # sgd, adadelta and rmsprop available, sgd very hard to use, not recommanded (probably need momentum and decay learning rate).
+    dispFreq=10,  # Display to stdout the training progress every N updates
+    decay_c=0.,  # Weight decay for the classifier applied to the U weights.
+    lrate=0.0001,  # Learning rate for sgd (not used for adadelta and rmsprop)
+    n_words=10000,  # Vocabulary size
+    optimizer=adadelta,  # sgd, adadelta and rmsprop available, sgd very hard to use, not recommanded (probably need momentum and decaying learning rate).
     encoder='lstm',  # TODO: can be removed must be lstm.
     saveto='lstm_model.npz',  # The best model will be saved there
-    validFreq=10000,  # after 1000
-    saveFreq=100000,  # save the parameters after every saveFreq updates
-    maxlen=100,  # longer sequence get ignored
-    batch_size=64,  # the batch size during training.
-    valid_batch_size=64,  # The batch size during validation
+    validFreq=390,  # Compute the validation error after this number of update.
+    saveFreq=1040,  # Save the parameters after every saveFreq updates
+    maxlen=100,  # Sequence longer then this get ignored
+    batch_size=16,  # The batch size during training.
+    valid_batch_size=64,  # The batch size used for validation/test set.
     dataset='imdb',
 
     # Parameter for extra option
@@ -400,11 +400,13 @@ def test_lstm(
 
     # Model options
     model_options = locals().copy()
+    print "model options", model_options
 
     load_data, prepare_data = get_dataset(dataset)
 
     print 'Loading data'
-    train, valid, test = load_data(n_words=n_words, valid_portion=0.01)
+    train, valid, test = load_data(n_words=n_words, valid_portion=0.01,
+                                   maxlen=maxlen)
 
     ydim = numpy.max(train[1])+1
 
@@ -569,7 +571,7 @@ def test_lstm(
     theano.config.scan.allow_gc = False
 
     # See function train for all possible parameter and there definition.
-    test_lstm(
+    train_lstm(
         #reload_model="lstm_model.npz",
-        max_epochs=10,
+        max_epochs=100,
     )

From 43adeff4754cdd15340bb4cf8a7e53c88af58a9f Mon Sep 17 00:00:00 2001
From: Frederic <nouiz@nouiz.org>
Date: Tue, 13 Jan 2015 10:36:47 -0500
Subject: [PATCH 25/27] use a higher valid proportion, to make it move.

---
 code/lstm.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/code/lstm.py b/code/lstm.py
index 995c91a8..6762ef93 100644
--- a/code/lstm.py
+++ b/code/lstm.py
@@ -405,7 +405,7 @@ def train_lstm(
     load_data, prepare_data = get_dataset(dataset)
 
     print 'Loading data'
-    train, valid, test = load_data(n_words=n_words, valid_portion=0.01,
+    train, valid, test = load_data(n_words=n_words, valid_portion=0.05,
                                    maxlen=maxlen)
 
     ydim = numpy.max(train[1])+1
@@ -452,6 +452,9 @@ def train_lstm(
     kf_test = get_minibatches_idx(len(test[0]), valid_batch_size,
                                   shuffle=True)
 
+    print "%d train examples" % len(train[0])
+    print "%d valid examples" % len(valid[0])
+    print "%d test examples" % len(test[0])
     history_errs = []
     best_p = None
     bad_count = 0

From 2da912206d3d9987d3d2d48b0d647405273be6d4 Mon Sep 17 00:00:00 2001
From: Frederic <nouiz@nouiz.org>
Date: Tue, 13 Jan 2015 10:43:40 -0500
Subject: [PATCH 26/27] catch ctrl-C

---
 code/lstm.py | 157 ++++++++++++++++++++++++++-------------------------
 1 file changed, 81 insertions(+), 76 deletions(-)

diff --git a/code/lstm.py b/code/lstm.py
index 6762ef93..431c962f 100644
--- a/code/lstm.py
+++ b/code/lstm.py
@@ -384,8 +384,8 @@ def train_lstm(
     optimizer=adadelta,  # sgd, adadelta and rmsprop available, sgd very hard to use, not recommanded (probably need momentum and decaying learning rate).
     encoder='lstm',  # TODO: can be removed must be lstm.
     saveto='lstm_model.npz',  # The best model will be saved there
-    validFreq=390,  # Compute the validation error after this number of update.
-    saveFreq=1040,  # Save the parameters after every saveFreq updates
+    validFreq=370,  # Compute the validation error after this number of update.
+    saveFreq=1110,  # Save the parameters after every saveFreq updates
     maxlen=100,  # Sequence longer then this get ignored
     batch_size=16,  # The batch size during training.
     valid_batch_size=64,  # The batch size used for validation/test set.
@@ -467,80 +467,85 @@ def train_lstm(
     uidx = 0  # the number of update done
     estop = False  # early stop
     start_time = time.clock()
-    for eidx in xrange(max_epochs):
-        n_samples = 0
-
-        # Get new shuffled index for the training set.
-        kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)
-
-        for _, train_index in kf:
-            uidx += 1
-            use_noise.set_value(1.)
-
-            # Select the random examples for this minibatch
-            y = [train[1][t] for t in train_index]
-            x = [train[0][t]for t in train_index]
-
-            # Get the data in numpy.ndarray formet.
-            # It return something of the shape (minibatch maxlen, n samples)
-            x, mask, y = prepare_data(x, y, maxlen=maxlen)
-            if x is None:
-                print 'Minibatch with zero sample under length ', maxlen
-                continue
-            n_samples += x.shape[1]
-
-            cost = f_grad_shared(x, mask, y)
-            f_update(lrate)
-
-            if numpy.isnan(cost) or numpy.isinf(cost):
-                print 'NaN detected'
-                return 1., 1., 1.
-
-            if numpy.mod(uidx, dispFreq) == 0:
-                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost
-
-            if numpy.mod(uidx, saveFreq) == 0:
-                print 'Saving...',
-
-                if best_p is not None:
-                    params = best_p
-                else:
-                    params = unzip(tparams)
-                numpy.savez(saveto, history_errs=history_errs, **params)
-                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1)
-                print 'Done'
-
-            if numpy.mod(uidx, validFreq) == 0:
-                use_noise.set_value(0.)
-                train_err = pred_error(f_pred, prepare_data, train, kf)
-                valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
-                test_err = pred_error(f_pred, prepare_data, test, kf_test)
-
-                history_errs.append([valid_err, test_err])
-
-                if (uidx == 0 or
-                    valid_err <= numpy.array(history_errs)[:,
-                                                           0].min()):
-
-                    best_p = unzip(tparams)
-                    bad_counter = 0
-
-                print ('Train ', train_err, 'Valid ', valid_err,
-                       'Test ', test_err)
-
-                if (len(history_errs) > patience and
-                    valid_err >= numpy.array(history_errs)[:-patience,
-                                                           0].min()):
-                    bad_counter += 1
-                    if bad_counter > patience:
-                        print 'Early Stop!'
-                        estop = True
-                        break
-
-        print 'Seen %d samples' % n_samples
-
-        if estop:
-            break
+    try:
+        for eidx in xrange(max_epochs):
+            n_samples = 0
+
+            # Get new shuffled index for the training set.
+            kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)
+
+            for _, train_index in kf:
+                uidx += 1
+                use_noise.set_value(1.)
+
+                # Select the random examples for this minibatch
+                y = [train[1][t] for t in train_index]
+                x = [train[0][t]for t in train_index]
+
+                # Get the data in numpy.ndarray format.
+                # This returns something of shape (minibatch maxlen, n samples)
+                x, mask, y = prepare_data(x, y, maxlen=maxlen)
+                if x is None:
+                    print 'Minibatch with zero sample under length ', maxlen
+                    continue
+                n_samples += x.shape[1]
+
+                cost = f_grad_shared(x, mask, y)
+                f_update(lrate)
+
+                if numpy.isnan(cost) or numpy.isinf(cost):
+                    print 'NaN detected'
+                    return 1., 1., 1.
+
+                if numpy.mod(uidx, dispFreq) == 0:
+                    print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost
+
+                if numpy.mod(uidx, saveFreq) == 0:
+                    print 'Saving...',
+
+                    if best_p is not None:
+                        params = best_p
+                    else:
+                        params = unzip(tparams)
+                    numpy.savez(saveto, history_errs=history_errs, **params)
+                    pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1)
+                    print 'Done'
+
+                if numpy.mod(uidx, validFreq) == 0:
+                    use_noise.set_value(0.)
+                    train_err = pred_error(f_pred, prepare_data, train, kf)
+                    valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
+                    test_err = pred_error(f_pred, prepare_data, test, kf_test)
+
+                    history_errs.append([valid_err, test_err])
+
+                    if (uidx == 0 or
+                        valid_err <= numpy.array(history_errs)[:,
+                                                               0].min()):
+
+                        best_p = unzip(tparams)
+                        bad_counter = 0
+
+                    print ('Train ', train_err, 'Valid ', valid_err,
+                           'Test ', test_err)
+
+                    if (len(history_errs) > patience and
+                        valid_err >= numpy.array(history_errs)[:-patience,
+                                                               0].min()):
+                        bad_counter += 1
+                        if bad_counter > patience:
+                            print 'Early Stop!'
+                            estop = True
+                            break
+
+            print 'Seen %d samples' % n_samples
+
+            if estop:
+                break
+
+    except KeyboardInterrupt:
+        print "Training interupted"
+
     end_time = time.clock()
     if best_p is not None:
         zipp(best_p, tparams)

From 5482b180e327165168eb62a47c06e20c7d7425c4 Mon Sep 17 00:00:00 2001
From: Frederic <nouiz@nouiz.org>
Date: Tue, 13 Jan 2015 10:44:01 -0500
Subject: [PATCH 27/27] small clean up

---
 code/lstm.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/code/lstm.py b/code/lstm.py
index 431c962f..00279ce0 100644
--- a/code/lstm.py
+++ b/code/lstm.py
@@ -559,11 +559,9 @@ def train_lstm(
 
     print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err
 
-    params = copy.copy(best_p)
-    numpy.savez(saveto, zipped_params=best_p, train_err=train_err,
+    numpy.savez(saveto, train_err=train_err,
                 valid_err=valid_err, test_err=test_err,
-                history_errs=history_errs, **params)
-
+                history_errs=history_errs, **best_p)
     print 'The code run for %d epochs, with %f sec/epochs' % (
         (eidx + 1), (end_time - start_time) / (1. * (eidx + 1)))
     print >> sys.stderr, ('Training took %.1fs' %