Commit 2da564c

Add files via upload
1 parent 8da5b7a commit 2da564c

3 files changed

Lines changed: 553 additions & 0 deletions

Lines changed: 169 additions & 0 deletions
@@ -0,0 +1,169 @@
# author - Richard Liao
# Dec 26 2016
import numpy as np
import pandas as pd
import cPickle
from collections import defaultdict
import re

from bs4 import BeautifulSoup

import sys
import os

os.environ['KERAS_BACKEND'] = 'theano'

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Merge, Dropout
from keras.models import Model

MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

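# The caps above: each review is padded/truncated to MAX_SEQUENCE_LENGTH
# tokens, the tokenizer keeps only the MAX_NB_WORDS most frequent words,
# GloVe vectors are 100-dimensional, and 20% of the data is held out.
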
def clean_str(string):
    """
    Tokenization/string cleaning for dataset.
    Every dataset is lower cased.
    """
    string = re.sub(r"\\", "", string)
    string = re.sub(r"\'", "", string)
    string = re.sub(r"\"", "", string)
    return string.strip().lower()

data_train = pd.read_csv('~/Testground/data/imdb/labeledTrainData.tsv', sep='\t')
print data_train.shape

texts = []
labels = []

for idx in range(data_train.review.shape[0]):
    text = BeautifulSoup(data_train.review[idx])
    texts.append(clean_str(text.get_text().encode('ascii', 'ignore')))
    labels.append(data_train.sentiment[idx])

tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

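# data is now a (num_reviews, MAX_SEQUENCE_LENGTH) integer matrix; by default
# pad_sequences zero-pads shorter reviews at the front and truncates longer
# ones from the front as well.
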
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

print('Number of positive and negative reviews in training and validation set')
print y_train.sum(axis=0)
print y_val.sum(axis=0)

GLOVE_DIR = "/ext/home/analyst/Testground/data/glove"
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Total %s word vectors in Glove 6B 100d.' % len(embeddings_index))

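# Build the embedding matrix indexed by the tokenizer's word_index (indices
# start at 1, so row 0 stays unused); rows are randomly initialized, so words
# without a GloVe vector keep random values rather than zeros.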
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in the GloVe index keep their random initialization
        embedding_matrix[i] = embedding_vector

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

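# Simple CNN: three stacked conv/pool blocks over the padded word sequence.
# With MAX_SEQUENCE_LENGTH = 1000, the feature map entering the last pool is
# 35 steps long, so MaxPooling1D(35) reduces it to one vector per filter.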
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
l_cov1 = Conv1D(128, 5, activation='relu')(embedded_sequences)
l_pool1 = MaxPooling1D(5)(l_cov1)
l_cov2 = Conv1D(128, 5, activation='relu')(l_pool1)
l_pool2 = MaxPooling1D(5)(l_cov2)
l_cov3 = Conv1D(128, 5, activation='relu')(l_pool2)
l_pool3 = MaxPooling1D(35)(l_cov3)  # global max pooling
l_flat = Flatten()(l_pool3)
l_dense = Dense(128, activation='relu')(l_flat)
preds = Dense(2, activation='softmax')(l_dense)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

print("model fitting - simplified convolutional neural network")
model.summary()
model.fit(x_train, y_train, validation_data=(x_val, y_val),
          nb_epoch=10, batch_size=128)

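# Rebuild the embedding matrix and layer so the second model starts from the
# pretrained GloVe weights instead of the embeddings just trained above.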
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in the GloVe index keep their random initialization
        embedding_matrix[i] = embedding_vector

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

# applying a more complex convolutional approach
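# Three parallel Conv1D branches with window sizes 3, 4 and 5 scan the same
# embedded sequence; each branch is max-pooled and the branch outputs are
# concatenated along the time axis before a second conv/pool stack.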
convs = []
filter_sizes = [3, 4, 5]

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

for fsz in filter_sizes:
    l_conv = Conv1D(nb_filter=128, filter_length=fsz, activation='relu')(embedded_sequences)
    l_pool = MaxPooling1D(5)(l_conv)
    convs.append(l_pool)

l_merge = Merge(mode='concat', concat_axis=1)(convs)
l_cov1 = Conv1D(128, 5, activation='relu')(l_merge)
l_pool1 = MaxPooling1D(5)(l_cov1)
l_cov2 = Conv1D(128, 5, activation='relu')(l_pool1)
l_pool2 = MaxPooling1D(30)(l_cov2)
l_flat = Flatten()(l_pool2)
l_dense = Dense(128, activation='relu')(l_flat)
preds = Dense(2, activation='softmax')(l_dense)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

print("model fitting - more complex convolutional neural network")
model.summary()
model.fit(x_train, y_train, validation_data=(x_val, y_val),
          nb_epoch=20, batch_size=50)
Lines changed: 206 additions & 0 deletions
@@ -0,0 +1,206 @@
# author - Richard Liao
# Dec 26 2016
import numpy as np
import pandas as pd
import cPickle
from collections import defaultdict
import re

from bs4 import BeautifulSoup

import sys
import os

os.environ['KERAS_BACKEND'] = 'theano'

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Merge, Dropout, LSTM, GRU, Bidirectional, TimeDistributed
from keras.models import Model

from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializations

MAX_SENT_LENGTH = 100
MAX_SENTS = 15
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

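# Hierarchical input: each review is modeled as up to MAX_SENTS sentences of
# up to MAX_SENT_LENGTH words each, instead of one flat word sequence.
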
def clean_str(string):
    """
    Tokenization/string cleaning for dataset.
    Every dataset is lower cased.
    """
    string = re.sub(r"\\", "", string)
    string = re.sub(r"\'", "", string)
    string = re.sub(r"\"", "", string)
    return string.strip().lower()

data_train = pd.read_csv('~/Testground/data/imdb/labeledTrainData.tsv', sep='\t')
print data_train.shape

from nltk import tokenize

reviews = []
labels = []
texts = []

for idx in range(data_train.review.shape[0]):
    text = BeautifulSoup(data_train.review[idx])
    text = clean_str(text.get_text().encode('ascii', 'ignore'))
    texts.append(text)
    sentences = tokenize.sent_tokenize(text)
    reviews.append(sentences)

    labels.append(data_train.sentiment[idx])

tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)

data = np.zeros((len(texts), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')

for i, sentences in enumerate(reviews):
    for j, sent in enumerate(sentences):
        if j < MAX_SENTS:
            wordTokens = text_to_word_sequence(sent)
            k = 0
            for _, word in enumerate(wordTokens):
                if k < MAX_SENT_LENGTH and tokenizer.word_index[word] < MAX_NB_WORDS:
                    data[i, j, k] = tokenizer.word_index[word]
                    k = k + 1

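# data now holds word indices in a 3D tensor (review, sentence, word);
# index 0 doubles as padding because the tokenizer numbers words from 1.
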
word_index = tokenizer.word_index
print('Total %s unique tokens.' % len(word_index))

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

print('Number of positive and negative reviews in training and validation set')
print y_train.sum(axis=0)
print y_val.sum(axis=0)

GLOVE_DIR = "/ext/home/analyst/Testground/data/glove"
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Total %s word vectors.' % len(embeddings_index))

embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in the GloVe index keep their random initialization
        embedding_matrix[i] = embedding_vector

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SENT_LENGTH,
                            trainable=True)

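# Hierarchical LSTM: a bidirectional LSTM encodes each sentence into a 200-d
# vector (sentEncoder); TimeDistributed applies that encoder to every sentence
# of a review, and a second bidirectional LSTM reads the sentence vectors.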
sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
l_lstm = Bidirectional(LSTM(100))(embedded_sequences)
sentEncoder = Model(sentence_input, l_lstm)

review_input = Input(shape=(MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
review_encoder = TimeDistributed(sentEncoder)(review_input)
l_lstm_sent = Bidirectional(LSTM(100))(review_encoder)
preds = Dense(2, activation='softmax')(l_lstm_sent)
model = Model(review_input, preds)

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

print("model fitting - Hierarchical LSTM")
print model.summary()
model.fit(x_train, y_train, validation_data=(x_val, y_val),
          nb_epoch=10, batch_size=50)

# building Hierarchical Attention network
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in the GloVe index keep their random initialization
        embedding_matrix[i] = embedding_vector

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SENT_LENGTH,
                            trainable=True)

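# Word-level attention layer: scores each timestep as tanh(x . W), normalizes
# the scores with a softmax over the time axis, and returns the attention-
# weighted sum of the inputs. The dimshuffle calls are Theano-specific,
# matching the KERAS_BACKEND set at the top of the file.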
class AttLayer(Layer):
    def __init__(self, **kwargs):
        self.init = initializations.get('normal')
        #self.input_spec = [InputSpec(ndim=3)]
        super(AttLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3
        #self.W = self.init((input_shape[-1],1))
        self.W = self.init((input_shape[-1],))
        #self.input_spec = [InputSpec(shape=input_shape)]
        self.trainable_weights = [self.W]
        super(AttLayer, self).build(input_shape)  # be sure you call this somewhere!

    def call(self, x, mask=None):
        # one attention score per timestep
        eij = K.tanh(K.dot(x, self.W))

        # softmax over the time axis
        ai = K.exp(eij)
        weights = ai / K.sum(ai, axis=1).dimshuffle(0, 'x')

        # attention-weighted sum of the timestep vectors
        weighted_input = x * weights.dimshuffle(0, 1, 'x')
        return weighted_input.sum(axis=1)

    def get_output_shape_for(self, input_shape):
        return (input_shape[0], input_shape[-1])

sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
l_lstm = Bidirectional(GRU(100, return_sequences=True))(embedded_sequences)
l_dense = TimeDistributed(Dense(200))(l_lstm)
l_att = AttLayer()(l_dense)
sentEncoder = Model(sentence_input, l_att)

review_input = Input(shape=(MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
review_encoder = TimeDistributed(sentEncoder)(review_input)
l_lstm_sent = Bidirectional(GRU(100, return_sequences=True))(review_encoder)
l_dense_sent = TimeDistributed(Dense(200))(l_lstm_sent)
l_att_sent = AttLayer()(l_dense_sent)
preds = Dense(2, activation='softmax')(l_att_sent)
model = Model(review_input, preds)

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

print("model fitting - Hierarchical attention network")
model.fit(x_train, y_train, validation_data=(x_val, y_val),
          nb_epoch=10, batch_size=50)
