diff --git a/code/imdb.py b/code/imdb.py
index f98c9601..c9d150e2 100644
--- a/code/imdb.py
+++ b/code/imdb.py
@@ -1,16 +1,22 @@
 import cPickle
 import gzip
 import os
-import sys
-import time
 
 import numpy
 
 import theano
-import theano.tensor as T
 
 
 def prepare_data(seqs, labels, maxlen=None):
+    """Create the matrices from the datasets.
+
+    This pads each sequence to the same length: the length of the
+    longest sequence or maxlen.
+
+    If maxlen is set, we will cut all sequences to this maximum
+    length.
+
+    """
     # x: a list of sentences
     lengths = [len(s) for s in seqs]
 
@@ -42,24 +48,73 @@ def prepare_data(seqs, labels, maxlen=None):
     return x, x_mask, labels
 
 
-def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1):
+def get_dataset_file(dataset, default_dataset, origin):
+    """Resolve `dataset` to a file path, downloading it when needed.
+
+    A bare file name that does not exist in the current directory is
+    looked up in the ``../data`` directory relative to this module.
+    When the file is still missing and is named `default_dataset`, it
+    is downloaded from `origin`.  The resolved path is returned.
+
+    """
+    folder, file_name = os.path.split(dataset)
+    is_default = file_name == default_dataset
+
+    if folder == "" and not os.path.isfile(dataset):
+        # Bare name, absent locally: fall back to the data directory.
+        module_dir = os.path.split(__file__)[0]
+        candidate = os.path.join(module_dir, "..", "data", dataset)
+        if is_default or os.path.isfile(candidate):
+            dataset = candidate
+
+    if is_default and not os.path.isfile(dataset):
+        import urllib
+        print 'Downloading data from %s' % origin
+        urllib.urlretrieve(origin, dataset)
+    return dataset
+
+
+def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1, maxlen=None):
     ''' Loads the dataset
 
-    :type dataset: string
-    :param dataset: the path to the dataset (here IMDB)
+    :type path: String
+    :param path: The path to the dataset (here IMDB)
+    :type n_words: int
+    :param n_words: The number of words to keep in the vocabulary.
+        All extra words are set to unknown (1).
+    :type valid_portion: float
+    :param valid_portion: The proportion of the full train set used for
+        the validation set.
+    :type maxlen: None or positive int
+    :param maxlen: the max sequence length we use in the train/valid set.
     '''
 
     #############
     # LOAD DATA #
     #############
 
-    print '... loading data'
-
     # Load the dataset
-    f = open(path, 'rb')
+    path = get_dataset_file(
+        path, "imdb.pkl",
+        "http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl")
+
+    if path.endswith(".gz"):
+        f = gzip.open(path, 'rb')
+    else:
+        f = open(path, 'rb')
+
     train_set = cPickle.load(f)
     test_set = cPickle.load(f)
     f.close()
+    if maxlen:
+        new_train_set_x = []
+        new_train_set_y = []
+        for x, y in zip(train_set[0], train_set[1]):
+            if len(x) < maxlen:
+                new_train_set_x.append(x)
+                new_train_set_y.append(y)
+        train_set = (new_train_set_x, new_train_set_y)
+        del new_train_set_x, new_train_set_y
 
     # split training set into validation set
     train_set_x, train_set_y = train_set
diff --git a/code/imdb_preprocess.py b/code/imdb_preprocess.py
new file mode 100644
index 00000000..c20b37b6
--- /dev/null
+++ b/code/imdb_preprocess.py
@@ -0,0 +1,123 @@
+"""
+This script creates the pickled dataset (imdb.pkl and imdb.dict.pkl).
+
+1) Download the Moses tokenizer and put it in the same directory as this script:
+https://github.com/moses-smt/mosesdecoder/raw/master/scripts/tokenizer/tokenizer.perl . Give it execution permission.
+
+2) Get the dataset from http://ai.stanford.edu/~amaas/data/sentiment/ , extract it and set dataset_path below to the extracted aclImdb directory.
+
+3) Then run this script.
+"""
+
+dataset_path='/Tmp/bastienf/aclImdb/'  # aclImdb root (keep the trailing '/'); edit for your setup
+
+import numpy
+import cPickle as pkl
+
+from collections import OrderedDict
+
+import glob
+import os
+
+from subprocess import Popen, PIPE
+
+# tokenizer.perl is from Moses: https://github.com/moses-smt/mosesdecoder/tree/master/scripts/tokenizer
+tokenizer_cmd = ['./tokenizer.perl', '-l', 'en', '-q', '-']
+
+
+def tokenize(sentences):
+    """Run the Moses tokenizer over `sentences`, one sentence per line."""
+    print 'Tokenizing..',
+    joined = "\n".join(sentences)
+    proc = Popen(tokenizer_cmd, stdin=PIPE, stdout=PIPE)
+    stdout_data, _ = proc.communicate(joined)
+    # Drop what follows the last newline of the tokenizer output.
+    tokenized = stdout_data.split('\n')[:-1]
+    print 'Done'
+    return tokenized
+
+
+def build_dict(path):
+    """Build the word -> index dictionary from the reviews under `path`.
+
+    The first line of every ``*.txt`` file in ``path/pos`` and
+    ``path/neg`` is tokenized and lower-cased.  Words are ranked by
+    decreasing count and numbered from 2 (0 and 1 are reserved).
+
+    Files are globbed by full path instead of chdir-ing into each
+    directory: a relative `path` works and the working directory is
+    never modified.
+
+    :type path: string
+    :param path: directory containing the ``pos`` and ``neg`` directories
+    """
+    sentences = []
+    for polarity in ('pos', 'neg'):
+        for ff in glob.glob(os.path.join(path, polarity, "*.txt")):
+            with open(ff, 'r') as f:
+                sentences.append(f.readline().strip())
+
+    sentences = tokenize(sentences)
+
+    print 'Building dictionary..',
+    wordcount = dict()
+    for ss in sentences:
+        for w in ss.strip().lower().split():
+            wordcount[w] = wordcount.get(w, 0) + 1
+
+    counts = wordcount.values()
+    keys = wordcount.keys()
+    sorted_idx = numpy.argsort(counts)[::-1]
+
+    worddict = dict()
+    for idx, ss in enumerate(sorted_idx):
+        worddict[keys[ss]] = idx+2  # leave 0 and 1 (UNK)
+
+    print numpy.sum(counts), ' total words ', len(keys), ' unique words'
+
+    return worddict
+
+
+def grab_data(path, dictionary):
+    """Return the reviews in `path` as lists of word indices.
+
+    The first line of every ``*.txt`` file in `path` is tokenized and
+    lower-cased; words missing from `dictionary` are mapped to 1 (UNK).
+    Files are globbed by full path, so the process working directory
+    is never changed.
+    """
+    sentences = []
+    for ff in glob.glob(os.path.join(path, "*.txt")):
+        with open(ff, 'r') as f:
+            sentences.append(f.readline().strip())
+    sentences = tokenize(sentences)
+
+    return [[dictionary.get(w, 1) for w in ss.strip().lower().split()]
+            for ss in sentences]
+
+
+def main():
+    """Build the dictionary, then pickle the datasets and the dictionary."""
+    # Get the dataset from http://ai.stanford.edu/~amaas/data/sentiment/
+    path = dataset_path
+    dictionary = build_dict(os.path.join(path, 'train'))
+
+    # os.path.join: `dataset_path` does not need a trailing separator.
+    train_x_pos = grab_data(os.path.join(path, 'train', 'pos'), dictionary)
+    train_x_neg = grab_data(os.path.join(path, 'train', 'neg'), dictionary)
+    train_x = train_x_pos + train_x_neg
+    train_y = [1] * len(train_x_pos) + [0] * len(train_x_neg)
+
+    test_x_pos = grab_data(os.path.join(path, 'test', 'pos'), dictionary)
+    test_x_neg = grab_data(os.path.join(path, 'test', 'neg'), dictionary)
+    test_x = test_x_pos + test_x_neg
+    test_y = [1] * len(test_x_pos) + [0] * len(test_x_neg)
+
+    # `with` closes the files even if pickling raises.
+    with open('imdb.pkl', 'wb') as f:
+        pkl.dump((train_x, train_y), f, -1)
+        pkl.dump((test_x, test_y), f, -1)
+    with open('imdb.dict.pkl', 'wb') as f:
+        pkl.dump(dictionary, f, -1)
+
+if __name__ == '__main__':
+    main()
diff --git a/code/lstm.py b/code/lstm.py
index 2b57d8be..00279ce0 100644
--- a/code/lstm.py
+++ b/code/lstm.py
@@ -1,23 +1,27 @@
 '''
 Build a tweet sentiment analyzer
 '''
-import theano
-import theano.tensor as tensor
-from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
-
-import cPickle as pkl
-import numpy
+from collections import OrderedDict
 import copy
+import cPickle as pkl
 import random
+import sys
+import time
 
-from collections import OrderedDict
+import numpy
+import theano
+import theano.tensor as tensor
+from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
 
 import imdb
 
 datasets = {'imdb': (imdb.load_data, imdb.prepare_data)}
 
 
-def get_minibatches_idx(n, nb_batches, shuffle=False):
+def get_minibatches_idx(n, minibatch_size, shuffle=False):
+    """
+    Used to shuffle the dataset at each iteration.
+    """
 
     idx_list = numpy.arange(n, dtype="int32")
 
@@ -26,17 +30,16 @@ def get_minibatches_idx(n, nb_batches, shuffle=False):
 
     minibatches = []
     minibatch_start = 0
-    for i in range(nb_batches):
-        if i < n % nb_batches:
-            minibatch_size = n // nb_batches + 1
-        else:
-            minibatch_size = n // nb_batches
-
+    for i in range(n // minibatch_size):
         minibatches.append(idx_list[minibatch_start:
                                     minibatch_start + minibatch_size])
         minibatch_start += minibatch_size
 
-    return zip(range(nb_batches), minibatches)
+    if (minibatch_start != n):
+        # Make a minibatch out of what is left
+        minibatches.append(idx_list[minibatch_start:])
+
+    return zip(range(len(minibatches)), minibatches)
 
 
 def get_dataset(name):
@@ -44,21 +47,23 @@ def get_dataset(name):
 
 
 def zipp(params, tparams):
+    """Copy each value of `params` into the same-named entry of `tparams`
+    with set_value().  Used when we reload the model (needed for the GPU).
+    """
     for kk, vv in params.iteritems():
         tparams[kk].set_value(vv)
 
 
 def unzip(zipped):
+    """Return an OrderedDict with the get_value() of every entry of `zipped`.
+    Used when we pickle the model (needed for the GPU).
+    """
     new_params = OrderedDict()
     for kk, vv in zipped.iteritems():
         new_params[kk] = vv.get_value()
     return new_params
 
 
-def itemlist(tparams):
-    return [vv for kk, vv in tparams.iteritems()]
-
-
 def dropout_layer(state_before, use_noise, trng):
     proj = tensor.switch(use_noise,
                          (state_before *
@@ -74,12 +79,14 @@ def _p(pp, name):
 
 
 def init_params(options):
+    """
+    Global (not LSTM) parameters: the embedding and the classifier.
+    """
     params = OrderedDict()
     # embedding
     randn = numpy.random.rand(options['n_words'],
                               options['dim_proj'])
     params['Wemb'] = (0.01 * randn).astype('float32')
-    # rconv
     params = get_layer(options['encoder'])[0](options,
                                               params,
                                               prefix=options['encoder'])
@@ -107,29 +114,10 @@ def init_tparams(params):
         tparams[kk] = theano.shared(params[kk], name=kk)
     return tparams
 
-layers = {'ff': ('param_init_fflayer', 'fflayer'),
-          'rconv': ('param_init_rconv', 'rconv_layer'),
-          'lstm': ('param_init_lstm', 'lstm_layer')}
-
 
 def get_layer(name):
     fns = layers[name]
-    return (eval(fns[0]), eval(fns[1]))
-
-
-def param_init_fflayer(options, params, prefix='ff'):
-    weights = numpy.random.randn(options['dim_proj'], options['dim_proj'])
-    biases = numpy.zeros((options['dim_proj'], ))
-    params[_p(prefix, 'W')] = 0.01 * weights.astype('float32')
-    params[_p(prefix, 'b')] = biases.astype('float32')
-
-    return params
-
-
-def fflayer(tparams, state_below, options, prefix='rconv', **kwargs):
-    pre_act = (tensor.dot(state_below,
-                          tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')])
-    return eval(options['activ'])(pre_act)
+    return fns
 
 
 def ortho_weight(ndim):
@@ -139,6 +127,11 @@ def ortho_weight(ndim):
 
 
 def param_init_lstm(options, params, prefix='lstm'):
+    """
+    Init the LSTM parameter:
+
+    :see: init_params
+    """
     W = numpy.concatenate([ortho_weight(options['dim_proj']),
                            ortho_weight(options['dim_proj']),
                            ortho_weight(options['dim_proj']),
@@ -202,86 +195,37 @@ def _step(m_, x_, h_, c_):
     return rval[0]
 
 
-def param_init_rconv(options, params, prefix='rconv'):
-    params[_p(prefix, 'W')] = ortho_weight(options['dim_proj'])
-    params[_p(prefix, 'U')] = ortho_weight(options['dim_proj'])
-    b = numpy.zeros((options['dim_proj'],)).astype('float32')
-    params[_p(prefix, 'b')] = b
-    gw = 0.01 * numpy.random.randn(options['dim_proj'], 3).astype('float32')
-    params[_p(prefix, 'GW')] = gw
-    gu = 0.01 * numpy.random.randn(options['dim_proj'], 3).astype('float32')
-    params[_p(prefix, 'GU')] = gu
-    params[_p(prefix, 'Gb')] = numpy.zeros((3,)).astype('float32')
-
-    return params
+# Maps an encoder name to its (parameter init function, layer function)
+# pair; get_layer() looks names up here.
+layers = {'lstm': (param_init_lstm, lstm_layer)}
 
 
-def rconv_layer(tparams, state_below, options, prefix='rconv', mask=None):
-    nsteps = state_below.shape[0]
+def sgd(lr, tparams, grads, x, mask, y, cost):
+    """ Stochastic Gradient Descent
 
-    assert mask is not None
+    :note: A more complicated version of sgd than needed, so that it
+        has the same interface as adadelta and rmsprop.
 
-    def _step(m_, p_):
-        l_ = p_
-        # new activation
-        ps_ = tensor.zeros_like(p_)
-        ps_ = tensor.set_subtensor(ps_[1:], p_[:-1])
-        ls_ = ps_
-        ps_ = tensor.dot(ps_, tparams[_p(prefix, 'U')])
-        pl_ = tensor.dot(p_, tparams[_p(prefix, 'W')])
-        newact = eval(options['activ'])(ps_+pl_+tparams[_p(prefix, 'b')])
-
-        # gater
-        gt_ = (tensor.dot(ls_, tparams[_p(prefix, 'GU')]) +
-               tensor.dot(l_, tparams[_p(prefix, 'GW')]) +
-               tparams[_p(prefix, 'Gb')])
-        if l_.ndim == 3:
-            gt_shp = gt_.shape
-            gt_ = gt_.reshape((gt_shp[0] * gt_shp[1], gt_shp[2]))
-        gt_ = tensor.nnet.softmax(gt_)
-        if l_.ndim == 3:
-            gt_ = gt_.reshape((gt_shp[0], gt_shp[1], gt_shp[2]))
-
-        if p_.ndim == 3:
-            gn = gt_[:, :, 0].dimshuffle(0, 1, 'x')
-            gl = gt_[:, :, 1].dimshuffle(0, 1, 'x')
-            gr = gt_[:, :, 2].dimshuffle(0, 1, 'x')
-        else:
-            gn = gt_[:, 0].dimshuffle(0, 'x')
-            gl = gt_[:, 1].dimshuffle(0, 'x')
-            gr = gt_[:, 2].dimshuffle(0, 'x')
-
-        act = newact * gn + ls_ * gl + l_ * gr
-
-        if p_.ndim == 3:
-            m_ = m_.dimshuffle('x', 0, 'x')
-        else:
-            m_ = m_.dimshuffle('x', 0)
-        return tensor.switch(m_, act, l_)
+    """
+    # New set of shared variable that will contain the gradient
+    # for a mini-batch.
+    gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
+               for k, p in tparams.iteritems()]
+    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
 
-    rval, updates = theano.scan(_step,
-                                sequences=[mask[1:]],
-                                outputs_info=[state_below],
-                                name='layer_%s' % prefix,
-                                n_steps=nsteps-1)
+    # Function that computes gradients for a mini-batch, but does not
+    # update the weights.
+    f_grad_shared = theano.function([x, mask, y], cost, updates=gsup,
+                                    name='sgd_f_grad_shared')
 
-    seqlens = tensor.cast(mask.sum(axis=0), 'int64')-1
-    roots = rval[-1]
+    pup = [(p, p - lr * g) for p, g in zip(tparams.values(), gshared)]
 
-    if state_below.ndim == 3:
-        def _grab_root(seqlen, one_sample, prev_sample):
-            return one_sample[seqlen]
-
-        dim_proj = options['dim_proj']
-        roots, updates = theano.scan(_grab_root,
-                                     sequences=[seqlens,
-                                                roots.dimshuffle(1, 0, 2)],
-                                     outputs_info=[tensor.alloc(0., dim_proj)],
-                                     name='grab_root_%s' % prefix)
-    else:
-        roots = roots[seqlens]  # there should be only one, so it's fine.
+    # Function that updates the weights from the previously computed
+    # gradient.
+    f_update = theano.function([lr], [], updates=pup,
+                               name='sgd_f_update')
 
-    return roots
+    return f_grad_shared, f_update
 
 
 def adadelta(lr, tparams, grads, x, mask, y, cost):
@@ -299,7 +243,8 @@ def adadelta(lr, tparams, grads, x, mask, y, cost):
     rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
              for rg2, g in zip(running_grads2, grads)]
 
-    f_grad_shared = theano.function([x, mask, y], cost, updates=zgup+rg2up)
+    f_grad_shared = theano.function([x, mask, y], cost, updates=zgup+rg2up,
+                                    name='adadelta_f_grad_shared')
 
     updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
              for zg, ru2, rg2 in zip(zipped_grads,
@@ -307,10 +252,11 @@ def adadelta(lr, tparams, grads, x, mask, y, cost):
                                      running_grads2)]
     ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
              for ru2, ud in zip(running_up2, updir)]
-    param_up = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)]
+    param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]
 
     f_update = theano.function([lr], [], updates=ru2up+param_up,
-                               on_unused_input='ignore')
+                               on_unused_input='ignore',
+                               name='adadelta_f_update')
 
     return f_grad_shared, f_update
 
@@ -332,7 +278,8 @@ def rmsprop(lr, tparams, grads, x, mask, y, cost):
              for rg2, g in zip(running_grads2, grads)]
 
     f_grad_shared = theano.function([x, mask, y], cost,
-                                    updates=zgup + rgup + rg2up)
+                                    updates=zgup + rgup + rg2up,
+                                    name='rmsprop_f_grad_shared')
 
     updir = [theano.shared(p.get_value() * numpy.float32(0.),
                            name='%s_updir' % k)
@@ -341,28 +288,18 @@ def rmsprop(lr, tparams, grads, x, mask, y, cost):
                  for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads,
                                             running_grads2)]
     param_up = [(p, p + udn[1])
-                for p, udn in zip(itemlist(tparams), updir_new)]
+                for p, udn in zip(tparams.values(), updir_new)]
     f_update = theano.function([lr], [], updates=updir_new+param_up,
-                               on_unused_input='ignore')
-
-    return f_grad_shared, f_update
-
-
-def sgd(lr, tparams, grads, x, mask, y, cost):
-    gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
-               for k, p in tparams.iteritems()]
-    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
-
-    f_grad_shared = theano.function([x, mask, y], cost, updates=gsup)
-
-    pup = [(p, p - lr * g) for p, g in zip(itemlist(tparams), gshared)]
-    f_update = theano.function([lr], [], updates=pup)
+                               on_unused_input='ignore',
+                               name='rmsprop_f_update')
 
     return f_grad_shared, f_update
 
 
 def build_model(tparams, options):
     trng = RandomStreams(1234)
+
+    # Used for dropout.
     use_noise = theano.shared(numpy.float32(0.))
 
     x = tensor.matrix('x', dtype='int64')
@@ -386,15 +323,18 @@ def build_model(tparams, options):
 
     pred = tensor.nnet.softmax(tensor.dot(proj, tparams['U'])+tparams['b'])
 
-    f_pred_prob = theano.function([x, mask], pred)
-    f_pred = theano.function([x, mask], pred.argmax(axis=1))
+    f_pred_prob = theano.function([x, mask], pred, name='f_pred_prob')
+    f_pred = theano.function([x, mask], pred.argmax(axis=1), name='f_pred')
 
     cost = -tensor.log(pred[tensor.arange(n_samples), y] + 1e-8).mean()
 
-    return trng, use_noise, x, mask, y, f_pred_prob, f_pred, cost
+    return use_noise, x, mask, y, f_pred_prob, f_pred, cost
 
 
 def pred_probs(f_pred_prob, prepare_data, data, iterator, verbose=False):
+    """ If you want to use a trained model, this is useful to compute
+    the probabilities of new examples.
+    """
     n_samples = len(data[0])
     probs = numpy.zeros((n_samples, 2)).astype('float32')
 
@@ -415,6 +355,11 @@ def pred_probs(f_pred_prob, prepare_data, data, iterator, verbose=False):
 
 
 def pred_error(f_pred, prepare_data, data, iterator, verbose=False):
+    """
+    Just compute the error
+    f_pred: Theano fct computing the prediction
+    prepare_data: usual prepare_data for that dataset.
+    """
     valid_err = 0
     for _, valid_index in iterator:
         x, mask, y = prepare_data([data[0][t] for t in valid_index],
@@ -428,44 +373,60 @@ def pred_error(f_pred, prepare_data, data, iterator, verbose=False):
     return valid_err
 
 
-def train(dim_proj=100,
-          patience=10,
-          max_epochs=5000,
-          dispFreq=100,
-          activ='lambda x: tensor.tanh(x)',
-          decay_c=0.,
-          lrate=0.01,
-          n_words=100000,
-          data_sym=False,
-          optimizer='rmsprop',
-          encoder='rconv',
-          saveto='model.npz',
-          noise_std=0.,
-          validFreq=1000,
-          saveFreq=1000,  # save the parameters after every saveFreq updates
-          maxlen=50,
-          batch_size=16,
-          valid_batch_size=16,
-          dataset='sentiment140',
-          use_dropout=False):
+def train_lstm(
+    dim_proj=128,  # word embeding dimension and LSTM number of hidden units.
+    patience=10,  # Number of epoch to wait before early stop if no progress
+    max_epochs=5000,  # The maximum number of epoch to run
+    dispFreq=10,  # Display to stdout the training progress every N updates
+    decay_c=0.,  # Weight decay for the classifier applied to the U weights.
+    lrate=0.0001,  # Learning rate for sgd (not used for adadelta and rmsprop)
+    n_words=10000,  # Vocabulary size
+    optimizer=adadelta,  # sgd, adadelta and rmsprop available, sgd very hard to use, not recommanded (probably need momentum and decaying learning rate).
+    encoder='lstm',  # TODO: can be removed must be lstm.
+    saveto='lstm_model.npz',  # The best model will be saved there
+    validFreq=370,  # Compute the validation error after this number of update.
+    saveFreq=1110,  # Save the parameters after every saveFreq updates
+    maxlen=100,  # Sequence longer then this get ignored
+    batch_size=16,  # The batch size during training.
+    valid_batch_size=64,  # The batch size used for validation/test set.
+    dataset='imdb',
+
+    # Parameter for extra option
+    noise_std=0.,
+    use_dropout=True,  # if False slightly faster, but worst test error
+                       # This frequently need a bigger model.
+    reload_model="",  # Path to a saved model we want to start from.
+):
 
     # Model options
     model_options = locals().copy()
+    print "model options", model_options
 
     load_data, prepare_data = get_dataset(dataset)
 
     print 'Loading data'
-    train, valid, test = load_data(n_words=n_words, valid_portion=0.01)
+    train, valid, test = load_data(n_words=n_words, valid_portion=0.05,
+                                   maxlen=maxlen)
 
     ydim = numpy.max(train[1])+1
 
     model_options['ydim'] = ydim
 
     print 'Building model'
+    # This create the initial parameters as numpy ndarrays.
+    # Dict name (string) -> numpy ndarray
     params = init_params(model_options)
+
+    if reload_model:
+        load_params('lstm_model.npz', params)
+
+    # This create Theano Shared Variable from the parameters.
+    # Dict name (string) -> Theano Tensor Shared Variable
+    # params and tparams have different copy of the weights.
     tparams = init_tparams(params)
 
-    (trng, use_noise, x, mask,
+    # use_noise is for dropout
+    (use_noise, x, mask,
      y, f_pred_prob, f_pred, cost) = build_model(tparams, model_options)
 
     if decay_c > 0.:
@@ -475,24 +436,25 @@ def train(dim_proj=100,
         weight_decay *= decay_c
         cost += weight_decay
 
-    f_cost = theano.function([x, mask, y], cost)
+    f_cost = theano.function([x, mask, y], cost, name='f_cost')
 
-    grads = tensor.grad(cost, wrt=itemlist(tparams))
-    f_grad = theano.function([x, mask, y], grads)
+    grads = tensor.grad(cost, wrt=tparams.values())
+    f_grad = theano.function([x, mask, y], grads, name='f_grad')
 
     lr = tensor.scalar(name='lr')
-    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads,
-                                              x, mask, y, cost)
+    f_grad_shared, f_update = optimizer(lr, tparams, grads,
+                                        x, mask, y, cost)
 
     print 'Optimization'
 
-    kf_valid = get_minibatches_idx(len(valid[0]),
-                                   len(valid[0]) / valid_batch_size,
+    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size,
                                    shuffle=True)
-    kf_test = get_minibatches_idx(len(test[0]),
-                                  len(test[0]) / valid_batch_size,
+    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size,
                                   shuffle=True)
 
+    print "%d train examples" % len(train[0])
+    print "%d valid examples" % len(valid[0])
+    print "%d test examples" % len(test[0])
     history_errs = []
     best_p = None
     bad_count = 0
@@ -502,81 +464,93 @@ def train(dim_proj=100,
     if saveFreq == -1:
         saveFreq = len(train[0])/batch_size
 
-    uidx = 0
-    estop = False
-    for eidx in xrange(max_epochs):
-        n_samples = 0
-
-        kf = get_minibatches_idx(len(train[0]), len(train[0])/batch_size,
-                                 shuffle=True)
-
-        for _, train_index in kf:
-            n_samples += train_index.shape[0]
-            uidx += 1
-            use_noise.set_value(1.)
-
-            y = [train[1][t] for t in train_index]
-            x, mask, y = prepare_data([train[0][t]for t in train_index],
-                                      y, maxlen=maxlen)
-
-            if x is None:
-                print 'Minibatch with zero sample under length ', maxlen
-                continue
-
-            cost = f_grad_shared(x, mask, y)
-            f_update(lrate)
-
-            if numpy.isnan(cost) or numpy.isinf(cost):
-                print 'NaN detected'
-                return 1., 1., 1.
-
-            if numpy.mod(uidx, dispFreq) == 0:
-                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost
-
-            if numpy.mod(uidx, saveFreq) == 0:
-                print 'Saving...',
-
-                if best_p is not None:
-                    params = best_p
-                else:
-                    params = unzip(tparams)
-                numpy.savez(saveto, history_errs=history_errs, **params)
-                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
-                print 'Done'
-
-            if numpy.mod(uidx, validFreq) == 0:
-                use_noise.set_value(0.)
-                train_err = pred_error(f_pred, prepare_data, train, kf)
-                valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
-                test_err = pred_error(f_pred, prepare_data, test, kf_test)
-
-                history_errs.append([valid_err, test_err])
-
-                if (uidx == 0 or
-                    valid_err <= numpy.array(history_errs)[:,
-                                                           0].min()):
-
-                    best_p = unzip(tparams)
-                    bad_counter = 0
-                if (len(history_errs) > patience and
-                    valid_err >= numpy.array(history_errs)[:-patience,
-                                                           0].min()):
-                    bad_counter += 1
-                    if bad_counter > patience:
-                        print 'Early Stop!'
-                        estop = True
-                        break
-
-                print ('Train ', train_err, 'Valid ', valid_err,
-                       'Test ', test_err)
-
-        print 'Seen %d samples' % n_samples
-
-        if estop:
-            break
-
+    uidx = 0  # the number of update done
+    estop = False  # early stop
+    start_time = time.clock()
+    try:
+        for eidx in xrange(max_epochs):
+            n_samples = 0
+
+            # Get new shuffled index for the training set.
+            kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)
+
+            for _, train_index in kf:
+                uidx += 1
+                use_noise.set_value(1.)
+
+                # Select the random examples for this minibatch
+                y = [train[1][t] for t in train_index]
+                x = [train[0][t]for t in train_index]
+
+                # Get the data in numpy.ndarray format.
+                # It returns something of the shape (minibatch maxlen, n samples)
+                x, mask, y = prepare_data(x, y, maxlen=maxlen)
+                if x is None:
+                    print 'Minibatch with zero sample under length ', maxlen
+                    continue
+                n_samples += x.shape[1]
+
+                cost = f_grad_shared(x, mask, y)
+                f_update(lrate)
+
+                if numpy.isnan(cost) or numpy.isinf(cost):
+                    print 'NaN detected'
+                    return 1., 1., 1.
+
+                if numpy.mod(uidx, dispFreq) == 0:
+                    print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost
+
+                if numpy.mod(uidx, saveFreq) == 0:
+                    print 'Saving...',
+
+                    if best_p is not None:
+                        params = best_p
+                    else:
+                        params = unzip(tparams)
+                    numpy.savez(saveto, history_errs=history_errs, **params)
+                    pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1)
+                    print 'Done'
+
+                if numpy.mod(uidx, validFreq) == 0:
+                    use_noise.set_value(0.)
+                    train_err = pred_error(f_pred, prepare_data, train, kf)
+                    valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
+                    test_err = pred_error(f_pred, prepare_data, test, kf_test)
+
+                    history_errs.append([valid_err, test_err])
+
+                    if (uidx == 0 or
+                        valid_err <= numpy.array(history_errs)[:,
+                                                               0].min()):
+
+                        best_p = unzip(tparams)
+                        bad_counter = 0
+
+                    print ('Train ', train_err, 'Valid ', valid_err,
+                           'Test ', test_err)
+
+                    if (len(history_errs) > patience and
+                        valid_err >= numpy.array(history_errs)[:-patience,
+                                                               0].min()):
+                        bad_counter += 1
+                        if bad_counter > patience:
+                            print 'Early Stop!'
+                            estop = True
+                            break
+
+            print 'Seen %d samples' % n_samples
+
+            if estop:
+                break
+
+    except KeyboardInterrupt:
+        print "Training interupted"
+
+    end_time = time.clock()
     if best_p is not None:
         zipp(best_p, tparams)
+    else:
+        best_p = unzip(tparams)
 
     use_noise.set_value(0.)
     train_err = pred_error(f_pred, prepare_data, train, kf)
@@ -585,45 +559,25 @@ def train(dim_proj=100,
 
     print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err
 
-    params = copy.copy(best_p)
-    numpy.savez(saveto, zipped_params=best_p, train_err=train_err,
+    numpy.savez(saveto, train_err=train_err,
                 valid_err=valid_err, test_err=test_err,
-                history_errs=history_errs, **params)
-
+                history_errs=history_errs, **best_p)
+    print 'The code run for %d epochs, with %f sec/epochs' % (
+        (eidx + 1), (end_time - start_time) / (1. * (eidx + 1)))
+    print >> sys.stderr, ('Training took %.1fs' %
+                          (end_time - start_time))
     return train_err, valid_err, test_err
 
 
-def main(job_id, params):
-    print ('Anything printed here will end up in the output directory'
-           'for job #%d' % job_id)
-    print params
-    use_dropout = True if params['use-dropout'][0] else False
-    trainerr, validerr, testerr = train(saveto=params['model'][0],
-                                        dim_proj=params['dim-proj'][0],
-                                        n_words=params['n-words'][0],
-                                        decay_c=params['decay-c'][0],
-                                        lrate=params['learning-rate'][0],
-                                        optimizer=params['optimizer'][0],
-                                        activ=params['activ'][0],
-                                        encoder=params['encoder'][0],
-                                        maxlen=600,
-                                        batch_size=16,
-                                        valid_batch_size=16,
-                                        validFreq=10000,
-                                        dispFreq=10,
-                                        saveFreq=100000,
-                                        dataset='imdb',
-                                        use_dropout=use_dropout)
-    return validerr
-
 if __name__ == '__main__':
-    main(0, {
-        'model': ['model_lstm.npz'],
-        'encoder': ['lstm'],
-        'dim-proj': [128],
-        'n-words': [10000],
-        'optimizer': ['adadelta'],
-        'activ': ['lambda x: tensor.tanh(x)'],
-        'decay-c': [0.],
-        'use-dropout': [1],
-        'learning-rate': [0.0001]})
+
+    # We must have floatX=float32 for this tutorial to work correctly.
+    theano.config.floatX = "float32"
+    # The next line is the new Theano default. This is a speed up.
+    theano.config.scan.allow_gc = False
+
+    # See function train_lstm for all possible parameters and their definition.
+    train_lstm(
+        #reload_model="lstm_model.npz",
+        max_epochs=100,
+    )
diff --git a/data/download.sh b/data/download.sh
index 8a8e9a92..88e48e5a 100755
--- a/data/download.sh
+++ b/data/download.sh
@@ -15,7 +15,7 @@ fi
 
 $DL_CMD http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
 $DL_CMD http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist_py3k.pkl.gz
-$DL_CMD http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl.gz
+$DL_CMD http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl.gz && gunzip imdb.pkl.gz
 $DL_CMD http://www.iro.umontreal.ca/~lisa/deep/data/Nottingham.zip && unzip -u Nottingham.zip
 $DL_CMD http://www.iro.umontreal.ca/~lisa/deep/midi.zip && unzip -u midi.zip -d ../code && echo "extracted Modified Python MIDI package (GPL)"
 $DL_CMD http://www-etud.iro.umontreal.ca/~mesnilgr/atis/atis.fold0.pkl.gz