# author - Richard Liao
# Dec 26 2016
import numpy as np
import pandas as pd
import re
import os

from bs4 import BeautifulSoup
from nltk import tokenize

os.environ['KERAS_BACKEND'] = 'theano'

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding
from keras.layers import Dense, Input
from keras.layers import LSTM, GRU, Bidirectional, TimeDistributed
from keras.models import Model

from keras import backend as K
from keras.engine.topology import Layer
from keras import initializations
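
# NOTE: this script targets Python 2 and the Keras 1.x API
# (keras.initializations, nb_words=/nb_epoch= arguments) on the Theano
# backend set above; it will not run unmodified on Keras 2+.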

MAX_SENT_LENGTH = 100
MAX_SENTS = 15
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2
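
# Each review is capped at MAX_SENTS sentences of MAX_SENT_LENGTH words, so
# every document becomes a fixed-size (MAX_SENTS, MAX_SENT_LENGTH) grid of
# word indices; this word -> sentence -> document hierarchy mirrors the
# two-level encoders built below.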

def clean_str(string):
    """
    String cleaning for the dataset: strip backslashes and quote
    characters, then trim and lower-case, e.g.
    clean_str('He said: "Don\\'t!"') -> 'he said: dont!'
    """
    string = re.sub(r"\\", "", string)
    string = re.sub(r"\'", "", string)
    string = re.sub(r"\"", "", string)
    return string.strip().lower()

data_train = pd.read_csv('~/Testground/data/imdb/labeledTrainData.tsv', sep='\t')
print data_train.shape

reviews = []
labels = []
texts = []

for idx in range(data_train.review.shape[0]):
    # strip HTML markup from the raw review, then normalize the text
    text = BeautifulSoup(data_train.review[idx], "html.parser")
    text = clean_str(text.get_text().encode('ascii', 'ignore'))
    texts.append(text)
    # split each review into sentences for the hierarchical representation
    sentences = tokenize.sent_tokenize(text)
    reviews.append(sentences)

    labels.append(data_train.sentiment[idx])

tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)

data = np.zeros((len(texts), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')

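# Fill the tensor with word indices: sentence j of review i contributes its
# first MAX_SENT_LENGTH in-vocabulary words; all remaining positions stay 0
# (the padding index), so no explicit pad_sequences call is needed.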
for i, sentences in enumerate(reviews):
    for j, sent in enumerate(sentences):
        if j < MAX_SENTS:
            wordTokens = text_to_word_sequence(sent)
            k = 0
            for word in wordTokens:
                if k < MAX_SENT_LENGTH and tokenizer.word_index[word] < MAX_NB_WORDS:
                    data[i, j, k] = tokenizer.word_index[word]
                    k += 1

word_index = tokenizer.word_index
print('Total %s unique tokens.' % len(word_index))

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor: %s' % str(data.shape))
print('Shape of label tensor: %s' % str(labels.shape))

# shuffle before carving off the validation split so it is a random sample
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

print('Number of positive and negative reviews in training and validation set')
# column sums of the one-hot labels give the per-class counts
print y_train.sum(axis=0)
print y_val.sum(axis=0)

GLOVE_DIR = "/ext/home/analyst/Testground/data/glove"
embeddings_index = {}
# each line of the GloVe file is: word v1 v2 ... v100
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Total %s word vectors.' % len(embeddings_index))
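
# NOTE: the 100-dimensional GloVe file must match EMBEDDING_DIM above;
# substituting e.g. glove.6B.200d.txt requires EMBEDDING_DIM = 200.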

embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words without a GloVe vector keep their random initialization
        embedding_matrix[i] = embedding_vector

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SENT_LENGTH,
                            trainable=True)
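
# Hierarchical LSTM baseline: a bidirectional LSTM encodes each sentence into
# a single vector; TimeDistributed applies that sentence encoder to every
# sentence of a review, and a second bidirectional LSTM summarizes the
# resulting sequence of sentence vectors into one review representation.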
sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
l_lstm = Bidirectional(LSTM(100))(embedded_sequences)
sentEncoder = Model(sentence_input, l_lstm)

review_input = Input(shape=(MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
review_encoder = TimeDistributed(sentEncoder)(review_input)
l_lstm_sent = Bidirectional(LSTM(100))(review_encoder)
preds = Dense(2, activation='softmax')(l_lstm_sent)
model = Model(review_input, preds)

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

print("model fitting - Hierarchical LSTM")
print model.summary()
model.fit(x_train, y_train, validation_data=(x_val, y_val),
          nb_epoch=10, batch_size=50)

# building Hierarchical Attention network
# the embedding matrix and layer are rebuilt so the HAN starts from fresh
# GloVe weights rather than the ones just fine-tuned by the LSTM baseline
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words without a GloVe vector keep their random initialization
        embedding_matrix[i] = embedding_vector

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SENT_LENGTH,
                            trainable=True)

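# Attention layer in the spirit of Yang et al. (2016): score each timestep's
# hidden state h_t against a learned context vector w,
#     e_t = tanh(h_t . w),    a_t = exp(e_t) / sum_s exp(e_s),
# and return the weighted sum  sum_t a_t * h_t. This is a simplified variant
# of the paper's formulation (no bias term, tanh applied to the scalar score).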
class AttLayer(Layer):
    def __init__(self, **kwargs):
        self.init = initializations.get('normal')
        super(AttLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        # expects input of shape (samples, steps, features)
        assert len(input_shape) == 3
        # one weight per feature: the attention context vector
        self.W = self.init((input_shape[-1],))
        self.trainable_weights = [self.W]
        super(AttLayer, self).build(input_shape)

    def call(self, x, mask=None):
        # unnormalized attention score for every timestep
        eij = K.tanh(K.dot(x, self.W))

        # softmax over the timestep axis (dimshuffle is Theano-specific,
        # matching the KERAS_BACKEND set above)
        ai = K.exp(eij)
        weights = ai / K.sum(ai, axis=1).dimshuffle(0, 'x')

        # attention-weighted sum of the timestep features
        weighted_input = x * weights.dimshuffle(0, 1, 'x')
        return weighted_input.sum(axis=1)

    def get_output_shape_for(self, input_shape):
        return (input_shape[0], input_shape[-1])
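
# HAN wiring: at each level, bidirectional GRU states pass through a
# TimeDistributed Dense projection before attention pooling, first over the
# words of each sentence, then over the sentences of each review.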
sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
l_lstm = Bidirectional(GRU(100, return_sequences=True))(embedded_sequences)
l_dense = TimeDistributed(Dense(200))(l_lstm)
l_att = AttLayer()(l_dense)
sentEncoder = Model(sentence_input, l_att)

review_input = Input(shape=(MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
review_encoder = TimeDistributed(sentEncoder)(review_input)
l_lstm_sent = Bidirectional(GRU(100, return_sequences=True))(review_encoder)
l_dense_sent = TimeDistributed(Dense(200))(l_lstm_sent)
l_att_sent = AttLayer()(l_dense_sent)
preds = Dense(2, activation='softmax')(l_att_sent)
model = Model(review_input, preds)

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

print("model fitting - Hierarchical attention network")
model.fit(x_train, y_train, validation_data=(x_val, y_val),
          nb_epoch=10, batch_size=50)