Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
1c08ba8
better loading of imdb dataset
nouiz Jan 8, 2015
859e9c3
don't use eval anymore
nouiz Jan 8, 2015
e57dd03
force floatX=float32, as otherwise there is problem. The learning rat…
nouiz Jan 8, 2015
e85246f
lstm: add comments
nouiz Jan 8, 2015
4eeee98
lstm: remove rconv code
nouiz Jan 8, 2015
6fe4fa0
Code simplification.
nouiz Jan 8, 2015
6b7d587
Fix typo, add docstring, add timming, remove useless printing
nouiz Jan 9, 2015
1d12bee
use the not compressed version of imdb. This take 1s to load instead …
nouiz Jan 9, 2015
f96d201
remove import not used
nouiz Jan 9, 2015
9942cb8
add name to fct
nouiz Jan 9, 2015
1e6bce2
add comment
nouiz Jan 9, 2015
1b17e87
pep8 printing
nouiz Jan 9, 2015
64eeb12
code simplification
nouiz Jan 9, 2015
86e5c4b
move sgd and comments
nouiz Jan 10, 2015
c480d4e
remove fflayers
nouiz Jan 10, 2015
2e022a6
small fixes and doc
nouiz Jan 12, 2015
8afe749
small update
nouiz Jan 12, 2015
3d9b1ac
fix the display of the number of example seen
nouiz Jan 12, 2015
401a99a
Add a way to reload pretrained model
nouiz Jan 13, 2015
6b7b7a6
use adadelta, sgd do not work.
nouiz Jan 13, 2015
194adad
Add the script that created the preprocessed imdb dataset
nouiz Jan 13, 2015
c6fdcff
Fixed function get_minibatches_idx()
carriepl Jan 13, 2015
04c02d4
Fixed default dataset value in load_data()
carriepl Jan 13, 2015
74b2e0c
Filter for the max seq len when we load the dataset
nouiz Jan 13, 2015
43adeff
use an higher valid proportion, to make it move.
nouiz Jan 13, 2015
2da9122
catch ctrl-C
nouiz Jan 13, 2015
5482b18
small clean up
nouiz Jan 13, 2015
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 64 additions & 9 deletions code/imdb.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,22 @@
import cPickle
import gzip
import os
import sys
import time

import numpy

import theano
import theano.tensor as T


def prepare_data(seqs, labels, maxlen=None):
"""Create the matrices from the datasets.

This pads each sequence to the same length: the length of the
longest sequence or maxlen.

If maxlen is set, we will cut all sequences to this maximum
length.

"""
# x: a list of sentences
lengths = [len(s) for s in seqs]

Expand Down Expand Up @@ -42,24 +48,73 @@ def prepare_data(seqs, labels, maxlen=None):
return x, x_mask, labels


def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1):
def get_dataset_file(dataset, default_dataset, origin):
    '''Resolve the path of a dataset file, downloading it if needed.

    Look for it as if it was a full path, if not, try local file,
    if not try in the data directory (``<this module's dir>/../data``).
    The data directory path is used when a file exists there, or when
    the file name is the default dataset name.

    Download the dataset if it is not present and its file name is
    ``default_dataset``.

    :type dataset: string
    :param dataset: path or bare file name of the dataset.
    :type default_dataset: string
    :param default_dataset: file name for which a download is allowed.
    :type origin: string
    :param origin: URL the default dataset is downloaded from.
    :return: the resolved path. It may not exist when the file is
        missing and its name is not ``default_dataset``.
    '''
    data_dir, data_file = os.path.split(dataset)
    if data_dir == "" and not os.path.isfile(dataset):
        # Check if dataset is in the data directory.
        new_path = os.path.join(
            os.path.split(__file__)[0],
            "..",
            "data",
            dataset
        )
        if os.path.isfile(new_path) or data_file == default_dataset:
            dataset = new_path

    if (not os.path.isfile(dataset)) and data_file == default_dataset:
        # urlretrieve lives in urllib.request on Python 3 and in urllib
        # on Python 2.
        try:
            from urllib.request import urlretrieve
        except ImportError:
            from urllib import urlretrieve

        # The download target directory may not exist yet.
        target_dir = os.path.dirname(dataset)
        if target_dir and not os.path.isdir(target_dir):
            os.makedirs(target_dir)

        print('Downloading data from %s' % origin)
        # Download to a temporary name and rename on success, so that an
        # interrupted download never leaves a truncated file that a later
        # call would mistake for the complete dataset.
        partial = dataset + ".part"
        try:
            urlretrieve(origin, partial)
        except Exception:
            if os.path.isfile(partial):
                os.remove(partial)
            raise
        os.rename(partial, dataset)
    return dataset


def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1, maxlen=None):
''' Loads the dataset

:type dataset: string
:param dataset: the path to the dataset (here IMDB)
:type path: String
:param path: The path to the dataset (here IMDB)
:type n_words: int
:param n_words: The number of words to keep in the vocabulary.
All extra words are set to unknown (1).
:type valid_portion: float
:param valid_portion: The proportion of the full train set used for
the validation set.
:type maxlen: None or positive int
:param maxlen: the max sequence length we use in the train/valid set.
'''

#############
# LOAD DATA #
#############

print '... loading data'

# Load the dataset
f = open(path, 'rb')
path = get_dataset_file(
path, "imdb.pkl",
"http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl")

if path.endswith(".gz"):
f = gzip.open(path, 'rb')
else:
f = open(path, 'rb')

train_set = cPickle.load(f)
test_set = cPickle.load(f)
f.close()
if maxlen:
new_train_set_x = []
new_train_set_y = []
for x, y in zip(train_set[0], train_set[1]):
if len(x) < maxlen:
new_train_set_x.append(x)
new_train_set_y.append(y)
train_set = (new_train_set_x, new_train_set_y)
del new_train_set_x, new_train_set_y

# split training set into validation set
train_set_x, train_set_y = train_set
Expand Down
123 changes: 123 additions & 0 deletions code/imdb_preprocess.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
"""
This script created the pickled dataset.

1) Download this file and put it in the same directory as this script:
https://github.com/moses-smt/mosesdecoder/raw/master/scripts/tokenizer/tokenizer.perl . Give it execute permission.

2) Get the dataset from http://ai.stanford.edu/~amaas/data/sentiment/ and extract it in the current directory (the script reads it from `dataset_path`, defined below).

3) Then run this script.
"""

# Directory of the extracted aclImdb dataset. main() reads the train/ and
# test/ sub-directories (each with pos/ and neg/) from this location.
dataset_path='/Tmp/bastienf/aclImdb/'

import numpy
import cPickle as pkl

from collections import OrderedDict

import glob
import os

from subprocess import Popen, PIPE

# tokenizer.perl is from Moses: https://github.com/moses-smt/mosesdecoder/tree/master/scripts/tokenizer
# Command line run by tokenize(). The script is looked up in the current
# working directory. The options presumably select English (-l en), quiet
# mode (-q) and stdin input (-) -- confirm against the Moses tokenizer usage.
tokenizer_cmd = ['./tokenizer.perl', '-l', 'en', '-q', '-']


def tokenize(sentences):

print 'Tokenizing..',
text = "\n".join(sentences)
tokenizer = Popen(tokenizer_cmd, stdin=PIPE, stdout=PIPE)
tok_text, _ = tokenizer.communicate(text)
toks = tok_text.split('\n')[:-1]
print 'Done'

return toks


def build_dict(path):
sentences = []
currdir = os.getcwd()
os.chdir('%s/pos/' % path)
for ff in glob.glob("*.txt"):
with open(ff, 'r') as f:
sentences.append(f.readline().strip())
os.chdir('%s/neg/' % path)
for ff in glob.glob("*.txt"):
with open(ff, 'r') as f:
sentences.append(f.readline().strip())
os.chdir(currdir)

sentences = tokenize(sentences)

print 'Building dictionary..',
wordcount = dict()
for ss in sentences:
words = ss.strip().lower().split()
for w in words:
if w not in wordcount:
wordcount[w] = 1
else:
wordcount[w] += 1

counts = wordcount.values()
keys = wordcount.keys()

sorted_idx = numpy.argsort(counts)[::-1]

worddict = dict()

for idx, ss in enumerate(sorted_idx):
worddict[keys[ss]] = idx+2 # leave 0 and 1 (UNK)

print numpy.sum(counts), ' total words ', len(keys), ' unique words'

return worddict


def grab_data(path, dictionary):
    """Convert the reviews in ``path`` to sequences of word indices.

    The first line of every ``*.txt`` file in ``path`` is read,
    tokenized with tokenize(), lower-cased and split on whitespace.

    :type path: string
    :param path: directory containing the ``*.txt`` review files.
    :type dictionary: dict
    :param dictionary: word -> index mapping.
    :return: list with one list of indices per review; words missing
        from ``dictionary`` are mapped to 1.
    """
    sentences = []
    # Glob with the full path instead of os.chdir(): the working
    # directory is process-wide state and was not restored when reading
    # a file raised.
    for ff in glob.glob(os.path.join(path, '*.txt')):
        with open(ff, 'r') as f:
            sentences.append(f.readline().strip())
    sentences = tokenize(sentences)

    seqs = []
    for ss in sentences:
        words = ss.strip().lower().split()
        seqs.append([dictionary.get(w, 1) for w in words])

    return seqs


def main():
    """Preprocess the dataset and write the pickled outputs.

    Builds the dictionary from the training reviews, converts the
    train and test reviews to index sequences (label 1 for ``pos``,
    0 for ``neg``) and writes, in the current directory:

    - ``imdb.pkl``: two consecutive pickles, ``(train_x, train_y)``
      then ``(test_x, test_y)``;
    - ``imdb.dict.pkl``: the word -> index dictionary.
    """
    # Get the dataset from http://ai.stanford.edu/~amaas/data/sentiment/
    path = dataset_path
    # os.path.join works whether or not dataset_path ends with a
    # separator, unlike plain string concatenation.
    train_dir = os.path.join(path, 'train')
    test_dir = os.path.join(path, 'test')

    dictionary = build_dict(train_dir)

    train_x_pos = grab_data(os.path.join(train_dir, 'pos'), dictionary)
    train_x_neg = grab_data(os.path.join(train_dir, 'neg'), dictionary)
    train_x = train_x_pos + train_x_neg
    train_y = [1] * len(train_x_pos) + [0] * len(train_x_neg)

    test_x_pos = grab_data(os.path.join(test_dir, 'pos'), dictionary)
    test_x_neg = grab_data(os.path.join(test_dir, 'neg'), dictionary)
    test_x = test_x_pos + test_x_neg
    test_y = [1] * len(test_x_pos) + [0] * len(test_x_neg)

    # -1 selects the highest pickle protocol available. The files are
    # closed even if dumping raises.
    with open('imdb.pkl', 'wb') as f:
        pkl.dump((train_x, train_y), f, -1)
        pkl.dump((test_x, test_y), f, -1)

    with open('imdb.dict.pkl', 'wb') as f:
        pkl.dump(dictionary, f, -1)

# Run the preprocessing only when this file is executed as a script.
if __name__ == '__main__':
    main()
Loading