From 1c08ba8f19f85a4f4fdb9febcdcaee0a2ac6858d Mon Sep 17 00:00:00 2001 From: Frederic Date: Thu, 8 Jan 2015 13:14:01 -0500 Subject: [PATCH 01/27] better loading of imdb dataset --- code/imdb.py | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/code/imdb.py b/code/imdb.py index f98c9601..0aaf641f 100644 --- a/code/imdb.py +++ b/code/imdb.py @@ -42,7 +42,33 @@ def prepare_data(seqs, labels, maxlen=None): return x, x_mask, labels -def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1): +def get_dataset_file(dataset, default_dataset, origin): + '''Look for it as if it was a full path, if not, try local file, + if not try in the data directory. + + Download dataset if it is not present + + ''' + data_dir, data_file = os.path.split(dataset) + if data_dir == "" and not os.path.isfile(dataset): + # Check if dataset is in the data directory. + new_path = os.path.join( + os.path.split(__file__)[0], + "..", + "data", + dataset + ) + if os.path.isfile(new_path) or data_file == default_dataset: + dataset = new_path + + if (not os.path.isfile(dataset)) and data_file == default_dataset: + import urllib + print 'Downloading data from %s' % origin + urllib.urlretrieve(origin, dataset) + return dataset + + +def load_data(path="imdb.pkl.gz", n_words=100000, valid_portion=0.1): ''' Loads the dataset :type dataset: string @@ -53,10 +79,12 @@ def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1): # LOAD DATA # ############# - print '... loading data' - # Load the dataset - f = open(path, 'rb') + path = get_dataset_file( + path, "imdb.pkl.gz", + "http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl.gz") + + f = gzip.open(path, 'rb') train_set = cPickle.load(f) test_set = cPickle.load(f) f.close() From 859e9c3c49a03b868ce5eb94ad6a0165ce574d40 Mon Sep 17 00:00:00 2001 From: Frederic Date: Thu, 8 Jan 2015 13:51:37 -0500 Subject: [PATCH 02/27] don't use eval anymore --- code/lstm.py | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/code/lstm.py b/code/lstm.py index 2b57d8be..24f1959a 100644 --- a/code/lstm.py +++ b/code/lstm.py @@ -107,14 +107,10 @@ def init_tparams(params): tparams[kk] = theano.shared(params[kk], name=kk) return tparams -layers = {'ff': ('param_init_fflayer', 'fflayer'), - 'rconv': ('param_init_rconv', 'rconv_layer'), - 'lstm': ('param_init_lstm', 'lstm_layer')} - def get_layer(name): fns = layers[name] - return (eval(fns[0]), eval(fns[1])) + return fns def param_init_fflayer(options, params, prefix='ff'): @@ -129,7 +125,7 @@ def param_init_fflayer(options, params, prefix='ff'): def fflayer(tparams, state_below, options, prefix='rconv', **kwargs): pre_act = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')]) - return eval(options['activ'])(pre_act) + return options['activ'](pre_act) def ortho_weight(ndim): @@ -229,7 +225,7 @@ def _step(m_, p_): ls_ = ps_ ps_ = tensor.dot(ps_, tparams[_p(prefix, 'U')]) pl_ = tensor.dot(p_, tparams[_p(prefix, 'W')]) - newact = eval(options['activ'])(ps_+pl_+tparams[_p(prefix, 'b')]) + newact = options['activ'](ps_+pl_+tparams[_p(prefix, 'b')]) # gater gt_ = (tensor.dot(ls_, tparams[_p(prefix, 'GU')]) + @@ -284,6 +280,11 @@ def _grab_root(seqlen, one_sample, prev_sample): return roots +layers = {'ff': (param_init_fflayer, fflayer), + 'rconv': (param_init_rconv, rconv_layer), + 'lstm': (param_init_lstm, lstm_layer)} + + def adadelta(lr, tparams, grads, x, mask, y, cost): zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_grad' % k) @@ -432,21 +433,21 @@ def train(dim_proj=100, patience=10, max_epochs=5000, dispFreq=100, - activ='lambda x: tensor.tanh(x)', + activ=tensor.tanh, decay_c=0., lrate=0.01, n_words=100000, data_sym=False, - optimizer='rmsprop', - encoder='rconv', - saveto='model.npz', + optimizer=rmsprop, + encoder='lstm', + saveto='lstm_model.npz', noise_std=0., validFreq=1000, saveFreq=1000, # save the parameters after every saveFreq updates maxlen=50, batch_size=16, valid_batch_size=16, - dataset='sentiment140', + dataset='imdb', use_dropout=False): # Model options @@ -481,8 +482,8 @@ def train(dim_proj=100, f_grad = theano.function([x, mask, y], grads) lr = tensor.scalar(name='lr') - f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, - x, mask, y, cost) + f_grad_shared, f_update = optimizer(lr, tparams, grads, + x, mask, y, cost) print 'Optimization' @@ -618,12 +619,12 @@ def main(job_id, params): if __name__ == '__main__': main(0, { - 'model': ['model_lstm.npz'], + 'model': ['lstm_model.npz'], 'encoder': ['lstm'], 'dim-proj': [128], 'n-words': [10000], - 'optimizer': ['adadelta'], - 'activ': ['lambda x: tensor.tanh(x)'], + 'optimizer': [adadelta], # adadelta and rmsprop avail + 'activ': [tensor.tanh], # The activation function from Theano. 'decay-c': [0.], 'use-dropout': [1], 'learning-rate': [0.0001]}) From e57dd0371cd2e1d204660c807f8321b5892b6803 Mon Sep 17 00:00:00 2001 From: Frederic Date: Thu, 8 Jan 2015 13:52:11 -0500 Subject: [PATCH 03/27] force floatX=float32, as otherwise there is problem. The learning rate 0.005 get cast to float64 --- code/lstm.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/code/lstm.py b/code/lstm.py index 24f1959a..ca35616f 100644 --- a/code/lstm.py +++ b/code/lstm.py @@ -618,6 +618,10 @@ def main(job_id, params): return validerr if __name__ == '__main__': + + # We must have floatX=float32 for this tutorial to work correctly. + theano.config.floatX = "float32" + main(0, { 'model': ['lstm_model.npz'], 'encoder': ['lstm'], From e85246f0899857f648e6eb46627b89a98391456f Mon Sep 17 00:00:00 2001 From: Frederic Date: Thu, 8 Jan 2015 15:18:29 -0500 Subject: [PATCH 04/27] lstm: add comments --- code/lstm.py | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/code/lstm.py b/code/lstm.py index ca35616f..dedc0401 100644 --- a/code/lstm.py +++ b/code/lstm.py @@ -44,11 +44,17 @@ def get_dataset(name): def zipp(params, tparams): + """ + When we reload the model. Needed for the GPU stuff. + """ for kk, vv in params.iteritems(): tparams[kk].set_value(vv) def unzip(zipped): + """ + When we pickle the model. Needed for the GPU stuff. + """ new_params = OrderedDict() for kk, vv in zipped.iteritems(): new_params[kk] = vv.get_value() @@ -79,7 +85,6 @@ def init_params(options): randn = numpy.random.rand(options['n_words'], options['dim_proj']) params['Wemb'] = (0.01 * randn).astype('float32') - # rconv params = get_layer(options['encoder'])[0](options, params, prefix=options['encoder']) @@ -122,7 +127,7 @@ def param_init_fflayer(options, params, prefix='ff'): return params -def fflayer(tparams, state_below, options, prefix='rconv', **kwargs): +def fflayer(tparams, state_below, options, prefix='ff', **kwargs): pre_act = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')]) return options['activ'](pre_act) @@ -396,6 +401,9 @@ def build_model(tparams, options): def pred_probs(f_pred_prob, prepare_data, data, iterator, verbose=False): + """ If you want to use a trained model, this is useful to compute + the probabilities of new examples. + """ n_samples = len(data[0]) probs = numpy.zeros((n_samples, 2)).astype('float32') @@ -416,6 +424,11 @@ def pred_probs(f_pred_prob, prepare_data, data, iterator, verbose=False): def pred_error(f_pred, prepare_data, data, iterator, verbose=False): + """ + Just compute the error + f_pred: Theano fct computing the prediction + prepare_data: usual prepare_data for that dataset. + """ valid_err = 0 for _, valid_index in iterator: x, mask, y = prepare_data([data[0][t] for t in valid_index], @@ -430,19 +443,18 @@ def pred_error(f_pred, prepare_data, data, iterator, verbose=False): def train(dim_proj=100, - patience=10, + patience=10, # number of epoch to wait before early stop if no progress max_epochs=5000, - dispFreq=100, + dispFreq=100, # display to stdout the training progress every N updates activ=tensor.tanh, - decay_c=0., - lrate=0.01, - n_words=100000, - data_sym=False, - optimizer=rmsprop, - encoder='lstm', + decay_c=0., # weight decay for the classifier + lrate=0.01, # learning rate for sgd (not used for adadelta and rmsprop) + n_words=100000, # wocabulary size + optimizer=adadelta, + encoder='lstm',# can be removed must be lstm. saveto='lstm_model.npz', noise_std=0., - validFreq=1000, + validFreq=1000, # after 1000 saveFreq=1000, # save the parameters after every saveFreq updates maxlen=50, batch_size=16, @@ -478,7 +490,7 @@ def train(dim_proj=100, f_cost = theano.function([x, mask, y], cost) - grads = tensor.grad(cost, wrt=itemlist(tparams)) + grads = tensor.grad(cost, wrt=tparams.values()) f_grad = theano.function([x, mask, y], grads) lr = tensor.scalar(name='lr') @@ -627,8 +639,8 @@ def main(job_id, params): 'encoder': ['lstm'], 'dim-proj': [128], 'n-words': [10000], - 'optimizer': [adadelta], # adadelta and rmsprop avail + 'optimizer': [adadelta], # sgd, adadelta and rmsprop available 'activ': [tensor.tanh], # The activation function from Theano. - 'decay-c': [0.], - 'use-dropout': [1], + 'decay-c': [0.], # + 'use-dropout': [1], # if disable slightly faster, but worst test error. 'learning-rate': [0.0001]}) From 4eeee9825aa12a42f2f60c29699890668397c774 Mon Sep 17 00:00:00 2001 From: Frederic Date: Thu, 8 Jan 2015 15:18:57 -0500 Subject: [PATCH 05/27] lstm: remove rconv code --- code/lstm.py | 85 ++-------------------------------------------------- 1 file changed, 2 insertions(+), 83 deletions(-) diff --git a/code/lstm.py b/code/lstm.py index dedc0401..181b3578 100644 --- a/code/lstm.py +++ b/code/lstm.py @@ -203,90 +203,9 @@ def _step(m_, x_, h_, c_): return rval[0] -def param_init_rconv(options, params, prefix='rconv'): - params[_p(prefix, 'W')] = ortho_weight(options['dim_proj']) - params[_p(prefix, 'U')] = ortho_weight(options['dim_proj']) - b = numpy.zeros((options['dim_proj'],)).astype('float32') - params[_p(prefix, 'b')] = b - gw = 0.01 * numpy.random.randn(options['dim_proj'], 3).astype('float32') - params[_p(prefix, 'GW')] = gw - gu = 0.01 * numpy.random.randn(options['dim_proj'], 3).astype('float32') - params[_p(prefix, 'GU')] = gu - params[_p(prefix, 'Gb')] = numpy.zeros((3,)).astype('float32') - - return params - - -def rconv_layer(tparams, state_below, options, prefix='rconv', mask=None): - nsteps = state_below.shape[0] - - assert mask is not None - - def _step(m_, p_): - l_ = p_ - # new activation - ps_ = tensor.zeros_like(p_) - ps_ = tensor.set_subtensor(ps_[1:], p_[:-1]) - ls_ = ps_ - ps_ = tensor.dot(ps_, tparams[_p(prefix, 'U')]) - pl_ = tensor.dot(p_, tparams[_p(prefix, 'W')]) - newact = options['activ'](ps_+pl_+tparams[_p(prefix, 'b')]) - - # gater - gt_ = (tensor.dot(ls_, tparams[_p(prefix, 'GU')]) + - tensor.dot(l_, tparams[_p(prefix, 'GW')]) + - tparams[_p(prefix, 'Gb')]) - if l_.ndim == 3: - gt_shp = gt_.shape - gt_ = gt_.reshape((gt_shp[0] * gt_shp[1], gt_shp[2])) - gt_ = tensor.nnet.softmax(gt_) - if l_.ndim == 3: - gt_ = gt_.reshape((gt_shp[0], gt_shp[1], gt_shp[2])) - - if p_.ndim == 3: - gn = gt_[:, :, 0].dimshuffle(0, 1, 'x') - gl = gt_[:, :, 1].dimshuffle(0, 1, 'x') - gr = gt_[:, :, 2].dimshuffle(0, 1, 'x') - else: - gn = gt_[:, 0].dimshuffle(0, 'x') - gl = gt_[:, 1].dimshuffle(0, 'x') - gr = gt_[:, 2].dimshuffle(0, 'x') - - act = newact * gn + ls_ * gl + l_ * gr - - if p_.ndim == 3: - m_ = m_.dimshuffle('x', 0, 'x') - else: - m_ = m_.dimshuffle('x', 0) - return tensor.switch(m_, act, l_) - - rval, updates = theano.scan(_step, - sequences=[mask[1:]], - outputs_info=[state_below], - name='layer_%s' % prefix, - n_steps=nsteps-1) - - seqlens = tensor.cast(mask.sum(axis=0), 'int64')-1 - roots = rval[-1] - - if state_below.ndim == 3: - def _grab_root(seqlen, one_sample, prev_sample): - return one_sample[seqlen] - - dim_proj = options['dim_proj'] - roots, updates = theano.scan(_grab_root, - sequences=[seqlens, - roots.dimshuffle(1, 0, 2)], - outputs_info=[tensor.alloc(0., dim_proj)], - name='grab_root_%s' % prefix) - else: - roots = roots[seqlens] # there should be only one, so it's fine. - - return roots - - +# ff: Feed Forward (normal neural net), only useful to put after lstm +# before the classifier. layers = {'ff': (param_init_fflayer, fflayer), - 'rconv': (param_init_rconv, rconv_layer), 'lstm': (param_init_lstm, lstm_layer)} From 6fe4fa020fc7236ae8c4c69f2d154198ae048591 Mon Sep 17 00:00:00 2001 From: Frederic Date: Thu, 8 Jan 2015 16:38:19 -0500 Subject: [PATCH 06/27] Code simplification. --- code/lstm.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/code/lstm.py b/code/lstm.py index 181b3578..f0361512 100644 --- a/code/lstm.py +++ b/code/lstm.py @@ -61,10 +61,6 @@ def unzip(zipped): return new_params -def itemlist(tparams): - return [vv for kk, vv in tparams.iteritems()] - - def dropout_layer(state_before, use_noise, trng): proj = tensor.switch(use_noise, (state_before * @@ -232,7 +228,7 @@ def adadelta(lr, tparams, grads, x, mask, y, cost): running_grads2)] ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2)) for ru2, ud in zip(running_up2, updir)] - param_up = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)] + param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)] f_update = theano.function([lr], [], updates=ru2up+param_up, on_unused_input='ignore') @@ -266,7 +262,7 @@ def rmsprop(lr, tparams, grads, x, mask, y, cost): for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads, running_grads2)] param_up = [(p, p + udn[1]) - for p, udn in zip(itemlist(tparams), updir_new)] + for p, udn in zip(tparams.values(), updir_new)] f_update = theano.function([lr], [], updates=updir_new+param_up, on_unused_input='ignore') @@ -280,7 +276,7 @@ def sgd(lr, tparams, grads, x, mask, y, cost): f_grad_shared = theano.function([x, mask, y], cost, updates=gsup) - pup = [(p, p - lr * g) for p, g in zip(itemlist(tparams), gshared)] + pup = [(p, p - lr * g) for p, g in zip(tparams.values(), gshared)] f_update = theano.function([lr], [], updates=pup) return f_grad_shared, f_update From 6b7d587a38c6ada08266ad51dcc336d819ffc0d9 Mon Sep 17 00:00:00 2001 From: Frederic Date: Fri, 9 Jan 2015 12:00:37 -0500 Subject: [PATCH 07/27] Fix typo, add docstring, add timming, remove useless printing --- code/imdb.py | 9 +++++++++ code/lstm.py | 29 ++++++++++++++++++----------- 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/code/imdb.py b/code/imdb.py index 0aaf641f..73e2d7b7 100644 --- a/code/imdb.py +++ b/code/imdb.py @@ -11,6 +11,15 @@ def prepare_data(seqs, labels, maxlen=None): + """Create the matrices from the datasets. + + This pad each sequence to the same lenght: the lenght of the + longuest sequence or maxlen. + + if maxlen is set, we will cut all sequence to this maximum + lenght. + + """ # x: a list of sentences lengths = [len(s) for s in seqs] diff --git a/code/lstm.py b/code/lstm.py index f0361512..e3c21f5e 100644 --- a/code/lstm.py +++ b/code/lstm.py @@ -1,16 +1,17 @@ ''' Build a tweet sentiment analyzer ''' -import theano -import theano.tensor as tensor -from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams - -import cPickle as pkl -import numpy +from collections import OrderedDict import copy +import cPickle as pkl import random +import sys +import time -from collections import OrderedDict +import numpy +import theano +import theano.tensor as tensor +from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams import imdb @@ -364,7 +365,7 @@ def train(dim_proj=100, activ=tensor.tanh, decay_c=0., # weight decay for the classifier lrate=0.01, # learning rate for sgd (not used for adadelta and rmsprop) - n_words=100000, # wocabulary size + n_words=100000, # vocabulary size optimizer=adadelta, encoder='lstm',# can be removed must be lstm. saveto='lstm_model.npz', @@ -432,6 +433,7 @@ def train(dim_proj=100, uidx = 0 estop = False + start_time = time.clock() for eidx in xrange(max_epochs): n_samples = 0 @@ -502,9 +504,11 @@ def train(dim_proj=100, if estop: break - + end_time = time.clock() if best_p is not None: zipp(best_p, tparams) + else: + best_p = unzip(tparams) use_noise.set_value(0.) train_err = pred_error(f_pred, prepare_data, train, kf) @@ -518,12 +522,15 @@ def train(dim_proj=100, valid_err=valid_err, test_err=test_err, history_errs=history_errs, **params) + print 'The code run for %d epochs, with %f epochs/sec' % ( + uidx, 1. * uidx / (end_time - start_time)) + print >> sys.stderr, ('The code for file ' + + os.path.split(__file__)[1] + + ' ran for %.1fs' % ((end_time - start_time))) return train_err, valid_err, test_err def main(job_id, params): - print ('Anything printed here will end up in the output directory' - 'for job #%d' % job_id) print params use_dropout = True if params['use-dropout'][0] else False trainerr, validerr, testerr = train(saveto=params['model'][0], From 1d12bee18115e0a150d2ed92780cb3884b42ff88 Mon Sep 17 00:00:00 2001 From: Frederic Date: Fri, 9 Jan 2015 13:38:08 -0500 Subject: [PATCH 08/27] use the not compressed version of imdb. This take 1s to load instead of 45s --- code/imdb.py | 10 +++++++--- code/lstm.py | 2 +- data/download.sh | 2 +- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/code/imdb.py b/code/imdb.py index 73e2d7b7..1bcc83bb 100644 --- a/code/imdb.py +++ b/code/imdb.py @@ -77,7 +77,7 @@ def get_dataset_file(dataset, default_dataset, origin): return dataset -def load_data(path="imdb.pkl.gz", n_words=100000, valid_portion=0.1): +def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1): ''' Loads the dataset :type dataset: string @@ -91,9 +91,13 @@ def load_data(path="imdb.pkl.gz", n_words=100000, valid_portion=0.1): # Load the dataset path = get_dataset_file( path, "imdb.pkl.gz", - "http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl.gz") + "http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl") + + if path.endswith(".gz"): + f = gzip.open(path, 'rb') + else: + f = open(path, 'rb') - f = gzip.open(path, 'rb') train_set = cPickle.load(f) test_set = cPickle.load(f) f.close() diff --git a/code/lstm.py b/code/lstm.py index e3c21f5e..c5e2bd98 100644 --- a/code/lstm.py +++ b/code/lstm.py @@ -471,7 +471,7 @@ def train(dim_proj=100, else: params = unzip(tparams) numpy.savez(saveto, history_errs=history_errs, **params) - pkl.dump(model_options, open('%s.pkl' % saveto, 'wb')) + pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1) print 'Done' if numpy.mod(uidx, validFreq) == 0: diff --git a/data/download.sh b/data/download.sh index 8a8e9a92..88e48e5a 100755 --- a/data/download.sh +++ b/data/download.sh @@ -15,7 +15,7 @@ fi $DL_CMD http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz $DL_CMD http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist_py3k.pkl.gz -$DL_CMD http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl.gz +$DL_CMD http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl.gz && gunzip imdb.pkl.gz $DL_CMD http://www.iro.umontreal.ca/~lisa/deep/data/Nottingham.zip && unzip -u Nottingham.zip $DL_CMD http://www.iro.umontreal.ca/~lisa/deep/midi.zip && unzip -u midi.zip -d ../code && echo "extracted Modified Python MIDI package (GPL)" $DL_CMD http://www-etud.iro.umontreal.ca/~mesnilgr/atis/atis.fold0.pkl.gz From f96d201b49a9cc4ff39a32531f1dc186abd6e9b1 Mon Sep 17 00:00:00 2001 From: Frederic Date: Fri, 9 Jan 2015 16:41:24 -0500 Subject: [PATCH 09/27] remove import not used --- code/imdb.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/code/imdb.py b/code/imdb.py index 1bcc83bb..c33884d6 100644 --- a/code/imdb.py +++ b/code/imdb.py @@ -1,13 +1,10 @@ import cPickle import gzip import os -import sys -import time import numpy import theano -import theano.tensor as T def prepare_data(seqs, labels, maxlen=None): From 9942cb826b59773549fa9f36be291ddc94facb1b Mon Sep 17 00:00:00 2001 From: Frederic Date: Fri, 9 Jan 2015 16:42:17 -0500 Subject: [PATCH 10/27] add name to fct --- code/lstm.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/code/lstm.py b/code/lstm.py index c5e2bd98..23887e06 100644 --- a/code/lstm.py +++ b/code/lstm.py @@ -221,7 +221,8 @@ def adadelta(lr, tparams, grads, x, mask, y, cost): rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) for rg2, g in zip(running_grads2, grads)] - f_grad_shared = theano.function([x, mask, y], cost, updates=zgup+rg2up) + f_grad_shared = theano.function([x, mask, y], cost, updates=zgup+rg2up, + name='adadelta_f_grad_shared') updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg for zg, ru2, rg2 in zip(zipped_grads, @@ -232,7 +233,8 @@ def adadelta(lr, tparams, grads, x, mask, y, cost): param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)] f_update = theano.function([lr], [], updates=ru2up+param_up, - on_unused_input='ignore') + on_unused_input='ignore', + name='adadelta_f_update') return f_grad_shared, f_update @@ -254,7 +256,8 @@ def rmsprop(lr, tparams, grads, x, mask, y, cost): for rg2, g in zip(running_grads2, grads)] f_grad_shared = theano.function([x, mask, y], cost, - updates=zgup + rgup + rg2up) + updates=zgup + rgup + rg2up, + name='rmsprop_f_grad_shared') updir = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_updir' % k) @@ -265,7 +268,8 @@ def rmsprop(lr, tparams, grads, x, mask, y, cost): param_up = [(p, p + udn[1]) for p, udn in zip(tparams.values(), updir_new)] f_update = theano.function([lr], [], updates=updir_new+param_up, - on_unused_input='ignore') + on_unused_input='ignore', + name='rmsprop_f_update') return f_grad_shared, f_update @@ -275,10 +279,12 @@ def sgd(lr, tparams, grads, x, mask, y, cost): for k, p in tparams.iteritems()] gsup = [(gs, g) for gs, g in zip(gshared, grads)] - f_grad_shared = theano.function([x, mask, y], cost, updates=gsup) + f_grad_shared = theano.function([x, mask, y], cost, updates=gsup, + name='sgd_f_grad_shared') pup = [(p, p - lr * g) for p, g in zip(tparams.values(), gshared)] - f_update = theano.function([lr], [], updates=pup) + f_update = theano.function([lr], [], updates=pup, + name='sgd_f_update') return f_grad_shared, f_update @@ -308,8 +314,8 @@ def build_model(tparams, options): pred = tensor.nnet.softmax(tensor.dot(proj, tparams['U'])+tparams['b']) - f_pred_prob = theano.function([x, mask], pred) - f_pred = theano.function([x, mask], pred.argmax(axis=1)) + f_pred_prob = theano.function([x, mask], pred, name='f_pred_prob') + f_pred = theano.function([x, mask], pred.argmax(axis=1), name='f_pred') cost = -tensor.log(pred[tensor.arange(n_samples), y] + 1e-8).mean() @@ -404,10 +410,10 @@ def train(dim_proj=100, weight_decay *= decay_c cost += weight_decay - f_cost = theano.function([x, mask, y], cost) + f_cost = theano.function([x, mask, y], cost, name='f_cost') grads = tensor.grad(cost, wrt=tparams.values()) - f_grad = theano.function([x, mask, y], grads) + f_grad = theano.function([x, mask, y], grads, name='f_grad') lr = tensor.scalar(name='lr') f_grad_shared, f_update = optimizer(lr, tparams, grads, From 1e6bce295f5a90df5e22cb7168c6b308cb1b7a34 Mon Sep 17 00:00:00 2001 From: Frederic Date: Fri, 9 Jan 2015 16:43:55 -0500 Subject: [PATCH 11/27] add comment --- code/lstm.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/code/lstm.py b/code/lstm.py index 23887e06..3d43e3ca 100644 --- a/code/lstm.py +++ b/code/lstm.py @@ -368,11 +368,11 @@ def train(dim_proj=100, patience=10, # number of epoch to wait before early stop if no progress max_epochs=5000, dispFreq=100, # display to stdout the training progress every N updates - activ=tensor.tanh, + activ=tensor.tanh, # The activation function from Theano. decay_c=0., # weight decay for the classifier lrate=0.01, # learning rate for sgd (not used for adadelta and rmsprop) n_words=100000, # vocabulary size - optimizer=adadelta, + optimizer=adadelta, # sgd, adadelta and rmsprop available encoder='lstm',# can be removed must be lstm. saveto='lstm_model.npz', noise_std=0., @@ -382,7 +382,8 @@ def train(dim_proj=100, batch_size=16, valid_batch_size=16, dataset='imdb', - use_dropout=False): + use_dropout=False, # if False slightly faster, but worst test error + ): # Model options model_options = locals().copy() From 1b17e874e2e6eaa31344e48c09456303cd9a159a Mon Sep 17 00:00:00 2001 From: Frederic Date: Fri, 9 Jan 2015 16:45:27 -0500 Subject: [PATCH 12/27] pep8 printing --- code/lstm.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/code/lstm.py b/code/lstm.py index 3d43e3ca..48d012e9 100644 --- a/code/lstm.py +++ b/code/lstm.py @@ -372,11 +372,11 @@ def train(dim_proj=100, decay_c=0., # weight decay for the classifier lrate=0.01, # learning rate for sgd (not used for adadelta and rmsprop) n_words=100000, # vocabulary size - optimizer=adadelta, # sgd, adadelta and rmsprop available - encoder='lstm',# can be removed must be lstm. + optimizer=adadelta, # sgd, adadelta and rmsprop available + encoder='lstm', # can be removed must be lstm. saveto='lstm_model.npz', noise_std=0., - validFreq=1000, # after 1000 + validFreq=1000, # after 1000 saveFreq=1000, # save the parameters after every saveFreq updates maxlen=50, batch_size=16, @@ -529,11 +529,10 @@ def train(dim_proj=100, valid_err=valid_err, test_err=test_err, history_errs=history_errs, **params) - print 'The code run for %d epochs, with %f epochs/sec' % ( - uidx, 1. * uidx / (end_time - start_time)) - print >> sys.stderr, ('The code for file ' + - os.path.split(__file__)[1] + - ' ran for %.1fs' % ((end_time - start_time))) + print 'The code run for %d epochs, with %f sec/epochs' % ( + (eidx + 1), 1. * (eidx + 1) / (end_time - start_time)) + print >> sys.stderr, ('Training took %.1fs minutes' % + (end_time - start_time)) return train_err, valid_err, test_err From 64eeb12543a88f0f1ab2ea4abd4c584e562efce1 Mon Sep 17 00:00:00 2001 From: Frederic Date: Fri, 9 Jan 2015 16:48:42 -0500 Subject: [PATCH 13/27] code simplification --- code/lstm.py | 60 +++++++++++++++++++++++----------------------------- 1 file changed, 26 insertions(+), 34 deletions(-) diff --git a/code/lstm.py b/code/lstm.py index 48d012e9..df3e380e 100644 --- a/code/lstm.py +++ b/code/lstm.py @@ -374,16 +374,16 @@ def train(dim_proj=100, n_words=100000, # vocabulary size optimizer=adadelta, # sgd, adadelta and rmsprop available encoder='lstm', # can be removed must be lstm. - saveto='lstm_model.npz', + saveto='lstm_model.npz', # The best model will be saved there noise_std=0., validFreq=1000, # after 1000 saveFreq=1000, # save the parameters after every saveFreq updates - maxlen=50, + maxlen=50, # longer sequence get ignored batch_size=16, valid_batch_size=16, dataset='imdb', use_dropout=False, # if False slightly faster, but worst test error - ): +): # Model options model_options = locals().copy() @@ -536,39 +536,31 @@ def train(dim_proj=100, return train_err, valid_err, test_err -def main(job_id, params): - print params - use_dropout = True if params['use-dropout'][0] else False - trainerr, validerr, testerr = train(saveto=params['model'][0], - dim_proj=params['dim-proj'][0], - n_words=params['n-words'][0], - decay_c=params['decay-c'][0], - lrate=params['learning-rate'][0], - optimizer=params['optimizer'][0], - activ=params['activ'][0], - encoder=params['encoder'][0], - maxlen=600, - batch_size=16, - valid_batch_size=16, - validFreq=10000, - dispFreq=10, - saveFreq=100000, - dataset='imdb', - use_dropout=use_dropout) - return validerr - if __name__ == '__main__': # We must have floatX=float32 for this tutorial to work correctly. theano.config.floatX = "float32" + theano.config.scan.allow_gc = False + + # See function train for all possible parameter and there definition. + trainerr, validerr, testerr = train( + saveto='lstm_model.npz', # The best model will be saved there + dim_proj=128, + n_words=10000, + decay_c=0, + lrate=0.0001, + optimizer=sgd, + activ=tensor.tanh, + encoder='lstm', + maxlen=100, # longer get ignored + batch_size=64, + valid_batch_size=64, + validFreq=10000, + dispFreq=10, + saveFreq=100000, + dataset='imdb', + use_dropout=True, + + max_epochs=2, + ) - main(0, { - 'model': ['lstm_model.npz'], - 'encoder': ['lstm'], - 'dim-proj': [128], - 'n-words': [10000], - 'optimizer': [adadelta], # sgd, adadelta and rmsprop available - 'activ': [tensor.tanh], # The activation function from Theano. - 'decay-c': [0.], # - 'use-dropout': [1], # if disable slightly faster, but worst test error. - 'learning-rate': [0.0001]}) From 86e5c4b49e5b6c9e2134fc1959339a2f314cfca3 Mon Sep 17 00:00:00 2001 From: Frederic Date: Sat, 10 Jan 2015 15:04:36 -0500 Subject: [PATCH 14/27] move sgd and comments --- code/lstm.py | 118 ++++++++++++++++++++++++++------------------------- 1 file changed, 61 insertions(+), 57 deletions(-) diff --git a/code/lstm.py b/code/lstm.py index df3e380e..2bb845bb 100644 --- a/code/lstm.py +++ b/code/lstm.py @@ -206,6 +206,34 @@ def _step(m_, x_, h_, c_): 'lstm': (param_init_lstm, lstm_layer)} +def sgd(lr, tparams, grads, x, mask, y, cost): + """ Stochastic Gradient Descent + + :note: A more complicated version of sgd then needed. This is + done like that for adadelta and rmsprop. + + """ + # New set of shared variable that will contain the gradient + # for a mini-batch. + gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k) + for k, p in tparams.iteritems()] + gsup = [(gs, g) for gs, g in zip(gshared, grads)] + + # Function that computes gradients for a mini-batch, but do not + # updates the weights. + f_grad_shared = theano.function([x, mask, y], cost, updates=gsup, + name='sgd_f_grad_shared') + + pup = [(p, p - lr * g) for p, g in zip(tparams.values(), gshared)] + + # Function that updates the weights from the previously computed + # gradient. + f_update = theano.function([lr], [], updates=pup, + name='sgd_f_update') + + return f_grad_shared, f_update + + def adadelta(lr, tparams, grads, x, mask, y, cost): zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_grad' % k) @@ -274,21 +302,6 @@ def rmsprop(lr, tparams, grads, x, mask, y, cost): return f_grad_shared, f_update -def sgd(lr, tparams, grads, x, mask, y, cost): - gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k) - for k, p in tparams.iteritems()] - gsup = [(gs, g) for gs, g in zip(gshared, grads)] - - f_grad_shared = theano.function([x, mask, y], cost, updates=gsup, - name='sgd_f_grad_shared') - - pup = [(p, p - lr * g) for p, g in zip(tparams.values(), gshared)] - f_update = theano.function([lr], [], updates=pup, - name='sgd_f_update') - - return f_grad_shared, f_update - - def build_model(tparams, options): trng = RandomStreams(1234) use_noise = theano.shared(numpy.float32(0.)) @@ -319,7 +332,7 @@ def build_model(tparams, options): cost = -tensor.log(pred[tensor.arange(n_samples), y] + 1e-8).mean() - return trng, use_noise, x, mask, y, f_pred_prob, f_pred, cost + return use_noise, x, mask, y, f_pred_prob, f_pred, cost def pred_probs(f_pred_prob, prepare_data, data, iterator, verbose=False): @@ -364,25 +377,29 @@ def pred_error(f_pred, prepare_data, data, iterator, verbose=False): return valid_err -def train(dim_proj=100, - patience=10, # number of epoch to wait before early stop if no progress - max_epochs=5000, - dispFreq=100, # display to stdout the training progress every N updates - activ=tensor.tanh, # The activation function from Theano. - decay_c=0., # weight decay for the classifier - lrate=0.01, # learning rate for sgd (not used for adadelta and rmsprop) - n_words=100000, # vocabulary size - optimizer=adadelta, # sgd, adadelta and rmsprop available - encoder='lstm', # can be removed must be lstm. - saveto='lstm_model.npz', # The best model will be saved there - noise_std=0., - validFreq=1000, # after 1000 - saveFreq=1000, # save the parameters after every saveFreq updates - maxlen=50, # longer sequence get ignored - batch_size=16, - valid_batch_size=16, - dataset='imdb', - use_dropout=False, # if False slightly faster, but worst test error +def test_lstm( + dim_proj=128, # TODO: What is this + patience=10, # number of epoch to wait before early stop if no progress + max_epochs=5000, # The maximum number of epoch to run + dispFreq=10, # display to stdout the training progress every N updates + activ=tensor.tanh, # The activation function from Theano. + decay_c=0., # weight decay for the classifier applied to the U weights. + lrate=0.0001, # learning rate for sgd (not used for adadelta and rmsprop) + n_words=10000, # vocabulary size + optimizer=sgd, # sgd, adadelta and rmsprop available + encoder='lstm', # TODO: can be removed must be lstm. + saveto='lstm_model.npz', # The best model will be saved there + validFreq=10000, # after 1000 + saveFreq=100000, # save the parameters after every saveFreq updates + maxlen=100, # longer sequence get ignored + batch_size=64, + valid_batch_size=64, + dataset='imdb', + + # Parameter for extra option + noise_std=0., + use_dropout=False, # if False slightly faster, but worst test error + # TODO: This frequently need a bigger model. ): # Model options @@ -398,10 +415,17 @@ def train(dim_proj=100, model_options['ydim'] = ydim print 'Building model' + # This create the initial parameters as numpy ndarrays. + # Dict name (string) -> numpy ndarray params = init_params(model_options) + + # This create Theano Shared Variable from the parameters. + # Dict name (string) -> Theano Tensor Shared Variable + # params and tparams have different copy of the weights. tparams = init_tparams(params) - (trng, use_noise, x, mask, + # use_noise is for dropout + (use_noise, x, mask, y, f_pred_prob, f_pred, cost) = build_model(tparams, model_options) if decay_c > 0.: @@ -543,24 +567,4 @@ def train(dim_proj=100, theano.config.scan.allow_gc = False # See function train for all possible parameter and there definition. - trainerr, validerr, testerr = train( - saveto='lstm_model.npz', # The best model will be saved there - dim_proj=128, - n_words=10000, - decay_c=0, - lrate=0.0001, - optimizer=sgd, - activ=tensor.tanh, - encoder='lstm', - maxlen=100, # longer get ignored - batch_size=64, - valid_batch_size=64, - validFreq=10000, - dispFreq=10, - saveFreq=100000, - dataset='imdb', - use_dropout=True, - - max_epochs=2, - ) - + test_lstm(max_epochs=10) From c480d4eb1523e014ea8bb4579565fffdbd0b0583 Mon Sep 17 00:00:00 2001 From: Frederic Date: Sat, 10 Jan 2015 15:07:20 -0500 Subject: [PATCH 15/27] remove fflayers --- code/lstm.py | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/code/lstm.py b/code/lstm.py index 2bb845bb..c7f0c85f 100644 --- a/code/lstm.py +++ b/code/lstm.py @@ -115,21 +115,6 @@ def get_layer(name): return fns -def param_init_fflayer(options, params, prefix='ff'): - weights = numpy.random.randn(options['dim_proj'], options['dim_proj']) - biases = numpy.zeros((options['dim_proj'], )) - params[_p(prefix, 'W')] = 0.01 * weights.astype('float32') - params[_p(prefix, 'b')] = biases.astype('float32') - - return params - - -def fflayer(tparams, state_below, options, prefix='ff', **kwargs): - pre_act = (tensor.dot(state_below, - tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')]) - return options['activ'](pre_act) - - def ortho_weight(ndim): W = numpy.random.randn(ndim, ndim) u, s, v = numpy.linalg.svd(W) @@ -202,8 +187,7 @@ def _step(m_, x_, h_, c_): # ff: Feed Forward (normal neural net), only useful to put after lstm # before the classifier. -layers = {'ff': (param_init_fflayer, fflayer), - 'lstm': (param_init_lstm, lstm_layer)} +layers = {'lstm': (param_init_lstm, lstm_layer)} def sgd(lr, tparams, grads, x, mask, y, cost): @@ -382,7 +366,6 @@ def test_lstm( patience=10, # number of epoch to wait before early stop if no progress max_epochs=5000, # The maximum number of epoch to run dispFreq=10, # display to stdout the training progress every N updates - activ=tensor.tanh, # The activation function from Theano. decay_c=0., # weight decay for the classifier applied to the U weights. lrate=0.0001, # learning rate for sgd (not used for adadelta and rmsprop) n_words=10000, # vocabulary size From 2e022a636c62ed7b8f6f536124a22d791e4a179b Mon Sep 17 00:00:00 2001 From: Frederic Date: Mon, 12 Jan 2015 11:18:46 -0500 Subject: [PATCH 16/27] small fixes and doc --- code/lstm.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/code/lstm.py b/code/lstm.py index c7f0c85f..d23c6e76 100644 --- a/code/lstm.py +++ b/code/lstm.py @@ -19,6 +19,9 @@ def get_minibatches_idx(n, nb_batches, shuffle=False): + """ + Used to shuffle the dataset at each iteration. + """ idx_list = numpy.arange(n, dtype="int32") @@ -381,8 +384,8 @@ def test_lstm( # Parameter for extra option noise_std=0., - use_dropout=False, # if False slightly faster, but worst test error - # TODO: This frequently need a bigger model. + use_dropout=True, # if False slightly faster, but worst test error + # This frequently need a bigger model. ): # Model options @@ -502,6 +505,10 @@ def test_lstm( best_p = unzip(tparams) bad_counter = 0 + + print ('Train ', train_err, 'Valid ', valid_err, + 'Test ', test_err) + if (len(history_errs) > patience and valid_err >= numpy.array(history_errs)[:-patience, 0].min()): @@ -511,9 +518,6 @@ def test_lstm( estop = True break - print ('Train ', train_err, 'Valid ', valid_err, - 'Test ', test_err) - print 'Seen %d samples' % n_samples if estop: @@ -537,8 +541,8 @@ def test_lstm( history_errs=history_errs, **params) print 'The code run for %d epochs, with %f sec/epochs' % ( - (eidx + 1), 1. * (eidx + 1) / (end_time - start_time)) - print >> sys.stderr, ('Training took %.1fs minutes' % + (eidx + 1), (end_time - start_time) / (1. * (eidx + 1))) + print >> sys.stderr, ('Training took %.1fs' % (end_time - start_time)) return train_err, valid_err, test_err @@ -547,6 +551,7 @@ def test_lstm( # We must have floatX=float32 for this tutorial to work correctly. theano.config.floatX = "float32" + # The next line is the new Theano default. This is a speed up. theano.config.scan.allow_gc = False # See function train for all possible parameter and there definition. From 8afe749dd5373b7378170f960215ee1414faaec7 Mon Sep 17 00:00:00 2001 From: Frederic Date: Mon, 12 Jan 2015 15:40:58 -0500 Subject: [PATCH 17/27] small update --- code/lstm.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/code/lstm.py b/code/lstm.py index d23c6e76..200b359f 100644 --- a/code/lstm.py +++ b/code/lstm.py @@ -291,6 +291,8 @@ def rmsprop(lr, tparams, grads, x, mask, y, cost): def build_model(tparams, options): trng = RandomStreams(1234) + + # Used for dropout. use_noise = theano.shared(numpy.float32(0.)) x = tensor.matrix('x', dtype='int64') @@ -378,8 +380,8 @@ def test_lstm( validFreq=10000, # after 1000 saveFreq=100000, # save the parameters after every saveFreq updates maxlen=100, # longer sequence get ignored - batch_size=64, - valid_batch_size=64, + batch_size=64, # the batch size during training. + valid_batch_size=64, # The batch size during validation dataset='imdb', # Parameter for extra option @@ -448,12 +450,13 @@ def test_lstm( if saveFreq == -1: saveFreq = len(train[0])/batch_size - uidx = 0 - estop = False + uidx = 0 # the number of update done + estop = False # early stop start_time = time.clock() for eidx in xrange(max_epochs): n_samples = 0 + # Get new shuffled index for the training set. kf = get_minibatches_idx(len(train[0]), len(train[0])/batch_size, shuffle=True) @@ -462,10 +465,13 @@ def test_lstm( uidx += 1 use_noise.set_value(1.) + # Select the random examples for this minibatch y = [train[1][t] for t in train_index] - x, mask, y = prepare_data([train[0][t]for t in train_index], - y, maxlen=maxlen) + x = [train[0][t]for t in train_index] + # Get the data in numpy.ndarray formet. + # It return something of the shape (minibatch maxlen, n samples) + x, mask, y = prepare_data(x, y, maxlen=maxlen) if x is None: print 'Minibatch with zero sample under length ', maxlen continue From 3d9b1ac652a9e26fefd2219644c3cca94cbed42d Mon Sep 17 00:00:00 2001 From: Frederic Date: Mon, 12 Jan 2015 15:42:08 -0500 Subject: [PATCH 18/27] fix the display of the number of example seen --- code/lstm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/lstm.py b/code/lstm.py index 200b359f..e91fbd84 100644 --- a/code/lstm.py +++ b/code/lstm.py @@ -461,7 +461,6 @@ def test_lstm( shuffle=True) for _, train_index in kf: - n_samples += train_index.shape[0] uidx += 1 use_noise.set_value(1.) @@ -475,6 +474,7 @@ def test_lstm( if x is None: print 'Minibatch with zero sample under length ', maxlen continue + n_samples += x.shape[1] cost = f_grad_shared(x, mask, y) f_update(lrate) From 401a99a00ba77c70bdb5ff04e665a7f9978622c9 Mon Sep 17 00:00:00 2001 From: Frederic Date: Tue, 13 Jan 2015 08:40:34 -0500 Subject: [PATCH 19/27] Add a way to reload pretrained model --- code/lstm.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/code/lstm.py b/code/lstm.py index e91fbd84..8a7e42fb 100644 --- a/code/lstm.py +++ b/code/lstm.py @@ -80,6 +80,9 @@ def _p(pp, name): def init_params(options): + """ + Global (not LSTM) parameter. For the embeding and the classifier. + """ params = OrderedDict() # embedding randn = numpy.random.rand(options['n_words'], @@ -125,6 +128,11 @@ def ortho_weight(ndim): def param_init_lstm(options, params, prefix='lstm'): + """ + Init the LSTM parameter: + + :see: init_params + """ W = numpy.concatenate([ortho_weight(options['dim_proj']), ortho_weight(options['dim_proj']), ortho_weight(options['dim_proj']), @@ -388,6 +396,7 @@ def test_lstm( noise_std=0., use_dropout=True, # if False slightly faster, but worst test error # This frequently need a bigger model. + reload_model="", # Path to a saved model we want to start from. ): # Model options @@ -407,6 +416,9 @@ def test_lstm( # Dict name (string) -> numpy ndarray params = init_params(model_options) + if reload_model: + load_params('lstm_model.npz', params) + # This create Theano Shared Variable from the parameters. # Dict name (string) -> Theano Tensor Shared Variable # params and tparams have different copy of the weights. @@ -561,4 +573,7 @@ def test_lstm( theano.config.scan.allow_gc = False # See function train for all possible parameter and there definition. - test_lstm(max_epochs=10) + test_lstm( + #reload_model="lstm_model.npz", + max_epochs=10, + ) From 6b7b7a6cafdd56bd541a3fc14b9fac10a0380600 Mon Sep 17 00:00:00 2001 From: Frederic Date: Tue, 13 Jan 2015 08:41:35 -0500 Subject: [PATCH 20/27] use adadelta, sgd do not work. --- code/lstm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/code/lstm.py b/code/lstm.py index 8a7e42fb..ba733c60 100644 --- a/code/lstm.py +++ b/code/lstm.py @@ -375,14 +375,14 @@ def pred_error(f_pred, prepare_data, data, iterator, verbose=False): def test_lstm( - dim_proj=128, # TODO: What is this + dim_proj=128, # word embeding dimension and LSTM number of hidden units. patience=10, # number of epoch to wait before early stop if no progress max_epochs=5000, # The maximum number of epoch to run dispFreq=10, # display to stdout the training progress every N updates decay_c=0., # weight decay for the classifier applied to the U weights. lrate=0.0001, # learning rate for sgd (not used for adadelta and rmsprop) n_words=10000, # vocabulary size - optimizer=sgd, # sgd, adadelta and rmsprop available + optimizer=adadelta, # sgd, adadelta and rmsprop available, sgd very hard to use, not recommanded (probably need momentum and decay learning rate). encoder='lstm', # TODO: can be removed must be lstm. saveto='lstm_model.npz', # The best model will be saved there validFreq=10000, # after 1000 From 194adadb94c5571c12b5ad7de903612f1ff26968 Mon Sep 17 00:00:00 2001 From: Frederic Date: Tue, 13 Jan 2015 09:29:05 -0500 Subject: [PATCH 21/27] Add the script that created the preprocessed imdb dataset --- code/imdb_preprocess.py | 123 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 code/imdb_preprocess.py diff --git a/code/imdb_preprocess.py b/code/imdb_preprocess.py new file mode 100644 index 00000000..c20b37b6 --- /dev/null +++ b/code/imdb_preprocess.py @@ -0,0 +1,123 @@ +""" +This script is what created the dataset pickled. + +1) You need to download this file and put it in the same directory as this file. +https://github.com/moses-smt/mosesdecoder/raw/master/scripts/tokenizer/tokenizer.perl . Give it execution permission. + +2) Get the dataset from http://ai.stanford.edu/~amaas/data/sentiment/ and extract it in the current directory. + +3) Then run this script. +""" + +dataset_path='/Tmp/bastienf/aclImdb/' + +import numpy +import cPickle as pkl + +from collections import OrderedDict + +import glob +import os + +from subprocess import Popen, PIPE + +# tokenizer.perl is from Moses: https://github.com/moses-smt/mosesdecoder/tree/master/scripts/tokenizer +tokenizer_cmd = ['./tokenizer.perl', '-l', 'en', '-q', '-'] + + +def tokenize(sentences): + + print 'Tokenizing..', + text = "\n".join(sentences) + tokenizer = Popen(tokenizer_cmd, stdin=PIPE, stdout=PIPE) + tok_text, _ = tokenizer.communicate(text) + toks = tok_text.split('\n')[:-1] + print 'Done' + + return toks + + +def build_dict(path): + sentences = [] + currdir = os.getcwd() + os.chdir('%s/pos/' % path) + for ff in glob.glob("*.txt"): + with open(ff, 'r') as f: + sentences.append(f.readline().strip()) + os.chdir('%s/neg/' % path) + for ff in glob.glob("*.txt"): + with open(ff, 'r') as f: + sentences.append(f.readline().strip()) + os.chdir(currdir) + + sentences = tokenize(sentences) + + print 'Building dictionary..', + wordcount = dict() + for ss in sentences: + words = ss.strip().lower().split() + for w in words: + if w not in wordcount: + wordcount[w] = 1 + else: + wordcount[w] += 1 + + counts = wordcount.values() + keys = wordcount.keys() + + sorted_idx = numpy.argsort(counts)[::-1] + + worddict = dict() + + for idx, ss in enumerate(sorted_idx): + worddict[keys[ss]] = idx+2 # leave 0 and 1 (UNK) + + print numpy.sum(counts), ' total words ', len(keys), ' unique words' + + return worddict + + +def grab_data(path, dictionary): + sentences = [] + currdir = os.getcwd() + os.chdir(path) + for ff in glob.glob("*.txt"): + with open(ff, 'r') as f: + sentences.append(f.readline().strip()) + os.chdir(currdir) + sentences = tokenize(sentences) + + seqs = [None] * len(sentences) + for idx, ss in enumerate(sentences): + words = ss.strip().lower().split() + seqs[idx] = [dictionary[w] if w in dictionary else 1 for w in words] + + return seqs + + +def main(): + # Get the dataset from http://ai.stanford.edu/~amaas/data/sentiment/ + path = dataset_path + dictionary = build_dict(os.path.join(path, 'train')) + + train_x_pos = grab_data(path+'train/pos', dictionary) + train_x_neg = grab_data(path+'train/neg', dictionary) + train_x = train_x_pos + train_x_neg + train_y = [1] * len(train_x_pos) + [0] * len(train_x_neg) + + test_x_pos = grab_data(path+'test/pos', dictionary) + test_x_neg = grab_data(path+'test/neg', dictionary) + test_x = test_x_pos + test_x_neg + test_y = [1] * len(test_x_pos) + [0] * len(test_x_neg) + + f = open('imdb.pkl', 'wb') + pkl.dump((train_x, train_y), f, -1) + pkl.dump((test_x, test_y), f, -1) + f.close() + + f = open('imdb.dict.pkl', 'wb') + pkl.dump(dictionary, f, -1) + f.close() + +if __name__ == '__main__': + main() From c6fdcff288103b707f640c3f19f51b50eb5ea9ab Mon Sep 17 00:00:00 2001 From: Pierre Luc Carrier Date: Tue, 13 Jan 2015 11:10:46 -0500 Subject: [PATCH 22/27] Fixed function get_minibatches_idx() --- code/lstm.py | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/code/lstm.py b/code/lstm.py index ba733c60..1946c0ad 100644 --- a/code/lstm.py +++ b/code/lstm.py @@ -18,7 +18,7 @@ datasets = {'imdb': (imdb.load_data, imdb.prepare_data)} -def get_minibatches_idx(n, nb_batches, shuffle=False): +def get_minibatches_idx(n, minibatch_size, shuffle=False): """ Used to shuffle the dataset at each iteration. """ @@ -30,17 +30,16 @@ def get_minibatches_idx(n, nb_batches, shuffle=False): minibatches = [] minibatch_start = 0 - for i in range(nb_batches): - if i < n % nb_batches: - minibatch_size = n // nb_batches + 1 - else: - minibatch_size = n // nb_batches - + for i in range(n // minibatch_size): minibatches.append(idx_list[minibatch_start: minibatch_start + minibatch_size]) minibatch_start += minibatch_size - return zip(range(nb_batches), minibatches) + if (minibatch_start != n): + # Make a minibatch out of what is left + minibatches.append(idx_list[minibatch_start:]) + + return zip(range(len(minibatches)), minibatches) def get_dataset(name): @@ -446,11 +445,9 @@ def test_lstm( print 'Optimization' - kf_valid = get_minibatches_idx(len(valid[0]), - len(valid[0]) / valid_batch_size, + kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size, shuffle=True) - kf_test = get_minibatches_idx(len(test[0]), - len(test[0]) / valid_batch_size, + kf_test = get_minibatches_idx(len(test[0]), valid_batch_size, shuffle=True) history_errs = [] @@ -469,8 +466,7 @@ def test_lstm( n_samples = 0 # Get new shuffled index for the training set. - kf = get_minibatches_idx(len(train[0]), len(train[0])/batch_size, - shuffle=True) + kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True) for _, train_index in kf: uidx += 1 From 04c02d4f889a989a5db6ab51bea81de39d8baa65 Mon Sep 17 00:00:00 2001 From: Pierre Luc Carrier Date: Tue, 13 Jan 2015 11:27:43 -0500 Subject: [PATCH 23/27] Fixed default dataset value in load_data() --- code/imdb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/imdb.py b/code/imdb.py index c33884d6..085ab3f9 100644 --- a/code/imdb.py +++ b/code/imdb.py @@ -87,7 +87,7 @@ def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1): # Load the dataset path = get_dataset_file( - path, "imdb.pkl.gz", + path, "imdb.pkl", "http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl") if path.endswith(".gz"): From 74b2e0c75e5a1be863c890510898ea42d2cf14fd Mon Sep 17 00:00:00 2001 From: Frederic Date: Tue, 13 Jan 2015 10:34:43 -0500 Subject: [PATCH 24/27] Filter for the max seq len when we load the dataset --- code/imdb.py | 23 ++++++++++++++++++++--- code/lstm.py | 32 +++++++++++++++++--------------- 2 files changed, 37 insertions(+), 18 deletions(-) diff --git a/code/imdb.py b/code/imdb.py index 085ab3f9..c9d150e2 100644 --- a/code/imdb.py +++ b/code/imdb.py @@ -74,11 +74,19 @@ def get_dataset_file(dataset, default_dataset, origin): return dataset -def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1): +def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1, maxlen=None): ''' Loads the dataset - :type dataset: string - :param dataset: the path to the dataset (here IMDB) + :type path: String + :param path: The path to the dataset (here IMDB) + :type n_words: int + :param n_words: The number of word to keep in the vocabulary. + All extra words are set to unknow (1). + :type valid_portion: float + :param valid_portion: The proportion of the full train set used for + the validation set. + :type maxlen: None or positive int + :param maxlen: the max sequence length we use in the train/valid set. ''' ############# @@ -98,6 +106,15 @@ def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1): train_set = cPickle.load(f) test_set = cPickle.load(f) f.close() + if maxlen: + new_train_set_x = [] + new_train_set_y = [] + for x, y in zip(train_set[0], train_set[1]): + if len(x) < maxlen: + new_train_set_x.append(x) + new_train_set_y.append(y) + train_set = (new_train_set_x, new_train_set_y) + del new_train_set_x, new_train_set_y # split training set into validation set train_set_x, train_set_y = train_set diff --git a/code/lstm.py b/code/lstm.py index 1946c0ad..995c91a8 100644 --- a/code/lstm.py +++ b/code/lstm.py @@ -373,22 +373,22 @@ def pred_error(f_pred, prepare_data, data, iterator, verbose=False): return valid_err -def test_lstm( +def train_lstm( dim_proj=128, # word embeding dimension and LSTM number of hidden units. - patience=10, # number of epoch to wait before early stop if no progress + patience=10, # Number of epoch to wait before early stop if no progress max_epochs=5000, # The maximum number of epoch to run - dispFreq=10, # display to stdout the training progress every N updates - decay_c=0., # weight decay for the classifier applied to the U weights. - lrate=0.0001, # learning rate for sgd (not used for adadelta and rmsprop) - n_words=10000, # vocabulary size - optimizer=adadelta, # sgd, adadelta and rmsprop available, sgd very hard to use, not recommanded (probably need momentum and decay learning rate). + dispFreq=10, # Display to stdout the training progress every N updates + decay_c=0., # Weight decay for the classifier applied to the U weights. + lrate=0.0001, # Learning rate for sgd (not used for adadelta and rmsprop) + n_words=10000, # Vocabulary size + optimizer=adadelta, # sgd, adadelta and rmsprop available, sgd very hard to use, not recommanded (probably need momentum and decaying learning rate). encoder='lstm', # TODO: can be removed must be lstm. saveto='lstm_model.npz', # The best model will be saved there - validFreq=10000, # after 1000 - saveFreq=100000, # save the parameters after every saveFreq updates - maxlen=100, # longer sequence get ignored - batch_size=64, # the batch size during training. - valid_batch_size=64, # The batch size during validation + validFreq=390, # Compute the validation error after this number of update. + saveFreq=1040, # Save the parameters after every saveFreq updates + maxlen=100, # Sequence longer then this get ignored + batch_size=16, # The batch size during training. + valid_batch_size=64, # The batch size used for validation/test set. dataset='imdb', # Parameter for extra option @@ -400,11 +400,13 @@ def test_lstm( # Model options model_options = locals().copy() + print "model options", model_options load_data, prepare_data = get_dataset(dataset) print 'Loading data' - train, valid, test = load_data(n_words=n_words, valid_portion=0.01) + train, valid, test = load_data(n_words=n_words, valid_portion=0.01, + maxlen=maxlen) ydim = numpy.max(train[1])+1 @@ -569,7 +571,7 @@ def test_lstm( theano.config.scan.allow_gc = False # See function train for all possible parameter and there definition. - test_lstm( + train_lstm( #reload_model="lstm_model.npz", - max_epochs=10, + max_epochs=100, ) From 43adeff4754cdd15340bb4cf8a7e53c88af58a9f Mon Sep 17 00:00:00 2001 From: Frederic Date: Tue, 13 Jan 2015 10:36:47 -0500 Subject: [PATCH 25/27] use an higher valid proportion, to make it move. --- code/lstm.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/code/lstm.py b/code/lstm.py index 995c91a8..6762ef93 100644 --- a/code/lstm.py +++ b/code/lstm.py @@ -405,7 +405,7 @@ def train_lstm( load_data, prepare_data = get_dataset(dataset) print 'Loading data' - train, valid, test = load_data(n_words=n_words, valid_portion=0.01, + train, valid, test = load_data(n_words=n_words, valid_portion=0.05, maxlen=maxlen) ydim = numpy.max(train[1])+1 @@ -452,6 +452,9 @@ def train_lstm( kf_test = get_minibatches_idx(len(test[0]), valid_batch_size, shuffle=True) + print "%d train examples" % len(train[0]) + print "%d valid examples" % len(valid[0]) + print "%d test examples" % len(test[0]) history_errs = [] best_p = None bad_count = 0 From 2da912206d3d9987d3d2d48b0d647405273be6d4 Mon Sep 17 00:00:00 2001 From: Frederic Date: Tue, 13 Jan 2015 10:43:40 -0500 Subject: [PATCH 26/27] catch ctrl-C --- code/lstm.py | 157 ++++++++++++++++++++++++++------------------------- 1 file changed, 81 insertions(+), 76 deletions(-) diff --git a/code/lstm.py b/code/lstm.py index 6762ef93..431c962f 100644 --- a/code/lstm.py +++ b/code/lstm.py @@ -384,8 +384,8 @@ def train_lstm( optimizer=adadelta, # sgd, adadelta and rmsprop available, sgd very hard to use, not recommanded (probably need momentum and decaying learning rate). encoder='lstm', # TODO: can be removed must be lstm. saveto='lstm_model.npz', # The best model will be saved there - validFreq=390, # Compute the validation error after this number of update. - saveFreq=1040, # Save the parameters after every saveFreq updates + validFreq=370, # Compute the validation error after this number of update. + saveFreq=1110, # Save the parameters after every saveFreq updates maxlen=100, # Sequence longer then this get ignored batch_size=16, # The batch size during training. valid_batch_size=64, # The batch size used for validation/test set. @@ -467,80 +467,85 @@ def train_lstm( uidx = 0 # the number of update done estop = False # early stop start_time = time.clock() - for eidx in xrange(max_epochs): - n_samples = 0 - - # Get new shuffled index for the training set. - kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True) - - for _, train_index in kf: - uidx += 1 - use_noise.set_value(1.) - - # Select the random examples for this minibatch - y = [train[1][t] for t in train_index] - x = [train[0][t]for t in train_index] - - # Get the data in numpy.ndarray formet. - # It return something of the shape (minibatch maxlen, n samples) - x, mask, y = prepare_data(x, y, maxlen=maxlen) - if x is None: - print 'Minibatch with zero sample under length ', maxlen - continue - n_samples += x.shape[1] - - cost = f_grad_shared(x, mask, y) - f_update(lrate) - - if numpy.isnan(cost) or numpy.isinf(cost): - print 'NaN detected' - return 1., 1., 1. - - if numpy.mod(uidx, dispFreq) == 0: - print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost - - if numpy.mod(uidx, saveFreq) == 0: - print 'Saving...', - - if best_p is not None: - params = best_p - else: - params = unzip(tparams) - numpy.savez(saveto, history_errs=history_errs, **params) - pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1) - print 'Done' - - if numpy.mod(uidx, validFreq) == 0: - use_noise.set_value(0.) - train_err = pred_error(f_pred, prepare_data, train, kf) - valid_err = pred_error(f_pred, prepare_data, valid, kf_valid) - test_err = pred_error(f_pred, prepare_data, test, kf_test) - - history_errs.append([valid_err, test_err]) - - if (uidx == 0 or - valid_err <= numpy.array(history_errs)[:, - 0].min()): - - best_p = unzip(tparams) - bad_counter = 0 - - print ('Train ', train_err, 'Valid ', valid_err, - 'Test ', test_err) - - if (len(history_errs) > patience and - valid_err >= numpy.array(history_errs)[:-patience, - 0].min()): - bad_counter += 1 - if bad_counter > patience: - print 'Early Stop!' - estop = True - break - - print 'Seen %d samples' % n_samples - - if estop: - break + try: + for eidx in xrange(max_epochs): + n_samples = 0 + + # Get new shuffled index for the training set. + kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True) + + for _, train_index in kf: + uidx += 1 + use_noise.set_value(1.) + + # Select the random examples for this minibatch + y = [train[1][t] for t in train_index] + x = [train[0][t]for t in train_index] + + # Get the data in numpy.ndarray formet. + # It return something of the shape (minibatch maxlen, n samples) + x, mask, y = prepare_data(x, y, maxlen=maxlen) + if x is None: + print 'Minibatch with zero sample under length ', maxlen + continue + n_samples += x.shape[1] + + cost = f_grad_shared(x, mask, y) + f_update(lrate) + + if numpy.isnan(cost) or numpy.isinf(cost): + print 'NaN detected' + return 1., 1., 1. + + if numpy.mod(uidx, dispFreq) == 0: + print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost + + if numpy.mod(uidx, saveFreq) == 0: + print 'Saving...', + + if best_p is not None: + params = best_p + else: + params = unzip(tparams) + numpy.savez(saveto, history_errs=history_errs, **params) + pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1) + print 'Done' + + if numpy.mod(uidx, validFreq) == 0: + use_noise.set_value(0.) + train_err = pred_error(f_pred, prepare_data, train, kf) + valid_err = pred_error(f_pred, prepare_data, valid, kf_valid) + test_err = pred_error(f_pred, prepare_data, test, kf_test) + + history_errs.append([valid_err, test_err]) + + if (uidx == 0 or + valid_err <= numpy.array(history_errs)[:, + 0].min()): + + best_p = unzip(tparams) + bad_counter = 0 + + print ('Train ', train_err, 'Valid ', valid_err, + 'Test ', test_err) + + if (len(history_errs) > patience and + valid_err >= numpy.array(history_errs)[:-patience, + 0].min()): + bad_counter += 1 + if bad_counter > patience: + print 'Early Stop!' + estop = True + break + + print 'Seen %d samples' % n_samples + + if estop: + break + + except KeyboardInterrupt: + print "Training interupted" + end_time = time.clock() if best_p is not None: zipp(best_p, tparams) From 5482b180e327165168eb62a47c06e20c7d7425c4 Mon Sep 17 00:00:00 2001 From: Frederic Date: Tue, 13 Jan 2015 10:44:01 -0500 Subject: [PATCH 27/27] small clean up --- code/lstm.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/code/lstm.py b/code/lstm.py index 431c962f..00279ce0 100644 --- a/code/lstm.py +++ b/code/lstm.py @@ -559,11 +559,9 @@ def train_lstm( print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err - params = copy.copy(best_p) - numpy.savez(saveto, zipped_params=best_p, train_err=train_err, + numpy.savez(saveto, train_err=train_err, valid_err=valid_err, test_err=test_err, - history_errs=history_errs, **params) - + history_errs=history_errs, **best_p) print 'The code run for %d epochs, with %f sec/epochs' % ( (eidx + 1), (end_time - start_time) / (1. * (eidx + 1))) print >> sys.stderr, ('Training took %.1fs' %