Skip to content

Commit 1c08ba8

Browse files
committed
better loading of imdb dataset
1 parent be59a2b commit 1c08ba8

1 file changed

Lines changed: 32 additions & 4 deletions

File tree

code/imdb.py

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,33 @@ def prepare_data(seqs, labels, maxlen=None):
4242
return x, x_mask, labels
4343

4444

45-
def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1):
45+
def get_dataset_file(dataset, default_dataset, origin):
    '''Resolve the location of a dataset file, downloading it if needed.

    Resolution order:

    1. ``dataset`` is used as given when it has a directory component or
       already names an existing file.
    2. Otherwise (bare file name, not found as given) the file is looked
       up in the ``data`` directory next to this module's parent
       directory (``<module dir>/../data/<dataset>``).  That location is
       selected when the file exists there, or when the file name equals
       ``default_dataset`` (so it becomes the download target).
    3. If the resolved path still does not exist and the file name equals
       ``default_dataset``, the file is downloaded from ``origin``.

    :type dataset: string
    :param dataset: path or bare file name of the dataset

    :type default_dataset: string
    :param default_dataset: file name of the dataset that may be
        downloaded automatically

    :type origin: string
    :param origin: URL the default dataset is downloaded from

    :return: the resolved path.  The file is not guaranteed to exist
        when ``dataset`` is not the default dataset.
    '''
    data_dir, data_file = os.path.split(dataset)
    if data_dir == "" and not os.path.isfile(dataset):
        # Check if dataset is in the data directory.
        new_path = os.path.join(
            os.path.split(__file__)[0],
            "..",
            "data",
            dataset
        )
        if os.path.isfile(new_path) or data_file == default_dataset:
            dataset = new_path

    if (not os.path.isfile(dataset)) and data_file == default_dataset:
        # The download target directory (typically "../data") may not
        # exist yet; urlretrieve does not create it and would fail.
        target_dir = os.path.dirname(dataset)
        if target_dir and not os.path.isdir(target_dir):
            os.makedirs(target_dir)

        # urlretrieve lives in urllib on Python 2 and in urllib.request
        # on Python 3; support both without a new dependency.
        try:
            from urllib.request import urlretrieve
        except ImportError:
            from urllib import urlretrieve

        # Single-argument print(...) behaves identically on Python 2 and 3.
        print('Downloading data from %s' % origin)
        urlretrieve(origin, dataset)
    return dataset
69+
70+
71+
def load_data(path="imdb.pkl.gz", n_words=100000, valid_portion=0.1):
4672
''' Loads the dataset
4773
4874
:type dataset: string
@@ -53,10 +79,12 @@ def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1):
5379
# LOAD DATA #
5480
#############
5581

56-
print '... loading data'
57-
5882
# Load the dataset
59-
f = open(path, 'rb')
83+
path = get_dataset_file(
84+
path, "imdb.pkl.gz",
85+
"http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl.gz")
86+
87+
f = gzip.open(path, 'rb')
6088
train_set = cPickle.load(f)
6189
test_set = cPickle.load(f)
6290
f.close()

0 commit comments

Comments
 (0)