import numpy as np
from keras.layers import Dropout, Dense, GRU, Embedding
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn import metrics
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

# Default location of the pre-trained GloVe vectors (one token followed by its
# coefficients per line). Override via the ``glove_path`` argument of
# ``loadData_Tokenizer`` on machines where the file lives elsewhere.
DEFAULT_GLOVE_PATH = "C:\\Users\\kamran\\Documents\\GitHub\\RMDL\\Examples\\Glove\\glove.6B.50d.txt"


def loadData_Tokenizer(X_train, X_test, MAX_NB_WORDS=75000,
                       MAX_SEQUENCE_LENGTH=500,
                       glove_path=DEFAULT_GLOVE_PATH):
    """Tokenize and pad the train/test texts and load GloVe word vectors.

    The tokenizer is fitted on the train and test texts together, every
    document is converted to a sequence of word ids and padded/truncated to
    ``MAX_SEQUENCE_LENGTH``, and the padded matrix is split back into its
    train and test parts.

    Args:
        X_train: Sequence of training documents (strings).
        X_test: Sequence of test documents (strings).
        MAX_NB_WORDS: ``num_words`` passed to the Keras ``Tokenizer``.
        MAX_SEQUENCE_LENGTH: ``maxlen`` passed to ``pad_sequences``.
        glove_path: Path of the GloVe text file to read; each line must be a
            token followed by whitespace-separated float coefficients.

    Returns:
        Tuple ``(X_train, X_test, word_index, embeddings_index)`` where the
        first two items are the padded id matrices, ``word_index`` is the
        tokenizer's word -> id mapping and ``embeddings_index`` maps each
        GloVe token to its float32 coefficient vector.
    """
    # Seeds NumPy's global RNG; the random embedding initialisation done later
    # in Build_Model_RNN_Text draws from this same generator.
    np.random.seed(7)

    n_train = len(X_train)
    # A plain list avoids np.concatenate building a fixed-width unicode array
    # sized by the longest document.
    texts = list(X_train) + list(X_test)

    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    padded = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    print('Found %s unique tokens.' % len(word_index))
    print(padded.shape)

    X_train = padded[:n_train]
    X_test = padded[n_train:]

    embeddings_index = {}
    skipped = 0
    with open(glove_path, encoding="utf8") as glove_file:
        for line in glove_file:
            values = line.split()
            if not values:
                # Blank line: nothing to parse.
                continue
            try:
                coefs = np.asarray(values[1:], dtype='float32')
            except ValueError:
                # Malformed line: skip it rather than storing the previous
                # word's vector under this word.
                skipped += 1
                continue
            embeddings_index[values[0]] = coefs
    if skipped:
        print('Skipped %s malformed embedding lines.' % skipped)
    print('Total %s word vectors.' % len(embeddings_index))
    return (X_train, X_test, word_index, embeddings_index)


def Build_Model_RNN_Text(word_index, embeddings_index, nclasses,
                         MAX_SEQUENCE_LENGTH=500, EMBEDDING_DIM=50,
                         dropout=0.5):
    """Build and compile a stacked-GRU text classifier.

    The network is an ``Embedding`` layer initialised from
    ``embeddings_index``, followed by three GRU layers that return sequences
    (each followed by ``Dropout``), one final GRU layer and a softmax
    ``Dense`` output layer. It is compiled with
    ``sparse_categorical_crossentropy`` loss and the ``adam`` optimizer.

    Args:
        word_index: Mapping word -> integer id (as produced by the tokenizer).
        embeddings_index: Mapping word -> pre-trained embedding vector.
        nclasses: Number of output classes.
        MAX_SEQUENCE_LENGTH: Length of the padded input sequences.
        EMBEDDING_DIM: Embedding width; must equal the length of the vectors
            in ``embeddings_index``.
        dropout: Rate used by the ``Dropout`` layers between GRU layers.

    Returns:
        The compiled Keras ``Sequential`` model.

    Raises:
        ValueError: If a pre-trained vector's length differs from
            ``EMBEDDING_DIM``.
    """
    model = Sequential()
    hidden_layer = 3
    gru_node = 256

    vocab_size = len(word_index) + 1
    # Rows start as uniform random values; words without a pre-trained vector
    # keep that random initialisation.
    embedding_matrix = np.random.random((vocab_size, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is None:
            continue
        if len(embedding_vector) != EMBEDDING_DIM:
            raise ValueError(
                "Embedding vector for %r has length %d but EMBEDDING_DIM is "
                "%d. Please make sure EMBEDDING_DIM matches the embedding "
                "file (GloVe)."
                % (word, len(embedding_vector), EMBEDDING_DIM))
        embedding_matrix[i] = embedding_vector

    model.add(Embedding(vocab_size,
                        EMBEDDING_DIM,
                        weights=[embedding_matrix],
                        input_length=MAX_SEQUENCE_LENGTH,
                        trainable=True))

    print(gru_node)
    for _ in range(hidden_layer):
        model.add(GRU(gru_node, return_sequences=True, recurrent_dropout=0.2))
        model.add(Dropout(dropout))
    # The last recurrent layer emits only its final state for the classifier.
    model.add(GRU(gru_node, recurrent_dropout=0.2))
    model.add(Dense(nclasses, activation='softmax'))

    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model


# Train and evaluate the GRU classifier on the 20 Newsgroups dataset.
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')
X_train = newsgroups_train.data
X_test = newsgroups_test.data
y_train = newsgroups_train.target
y_test = newsgroups_test.target

X_train_Glove, X_test_Glove, word_index, embeddings_index = loadData_Tokenizer(X_train, X_test)

model_RNN = Build_Model_RNN_Text(word_index, embeddings_index, 20)
model_RNN.summary()

model_RNN.fit(X_train_Glove, y_train,
              validation_data=(X_test_Glove, y_test),
              epochs=20,
              batch_size=128,
              verbose=2)

# Sequential.predict_classes is not available in recent Keras releases; taking
# the argmax of the softmax output yields the same class ids.
predicted = np.argmax(model_RNN.predict(X_test_Glove), axis=-1)

print(metrics.classification_report(y_test, predicted))