from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Conv1D, MaxPooling1D
from keras.datasets import imdb
from sklearn.datasets import fetch_20newsgroups
import numpy as np
from sklearn import metrics
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


def loadData_Tokenizer(X_train, X_test, MAX_NB_WORDS=75000,
                       MAX_SEQUENCE_LENGTH=500,
                       glove_path="glove.6B.100d.txt"):
    """Tokenize train/test texts and load pre-trained GloVe vectors.

    The tokenizer is fit on the concatenation of train and test so both
    splits share one vocabulary; sequences are padded/truncated to
    MAX_SEQUENCE_LENGTH.

    Args:
        X_train, X_test: iterables of raw document strings.
        MAX_NB_WORDS: vocabulary cap passed to the Keras Tokenizer.
        MAX_SEQUENCE_LENGTH: fixed length of each padded sequence.
        glove_path: path to a GloVe text file ("word v1 v2 ..." per line).
            Default preserves the original script's expectation of
            glove.6B.100d.txt in the working directory.

    Returns:
        (X_train_padded, X_test_padded, word_index, embeddings_index)
        where word_index maps token -> integer id and embeddings_index
        maps word -> np.float32 vector.
    """
    np.random.seed(7)
    text = np.array(np.concatenate((X_train, X_test), axis=0))

    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)
    word_index = tokenizer.word_index
    text = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    print('Found %s unique tokens.' % len(word_index))
    print(text.shape)

    # Split the padded matrix back into the original train/test halves.
    # (The original code built an identity index with np.arange and a
    # commented-out shuffle; that no-op has been removed.)
    X_train = text[0:len(X_train), ]
    X_test = text[len(X_train):, ]

    embeddings_index = {}
    # NOTE: portable relative path — the original used ".\glove.6B.100d.txt",
    # which only resolves on Windows (backslash is a literal char on POSIX).
    with open(glove_path, encoding="utf8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            try:
                coefs = np.asarray(values[1:], dtype='float32')
            except ValueError:
                # Malformed line: skip it entirely. The original
                # `except: pass` fell through and stored the previous
                # word's (stale) vector under this word.
                continue
            embeddings_index[word] = coefs
    print('Total %s word vectors.' % len(embeddings_index))
    return (X_train, X_test, word_index, embeddings_index)


def Build_Model_RCNN_Text(word_index, embeddings_index, nclasses,
                          MAX_SEQUENCE_LENGTH=500, EMBEDDING_DIM=100):
    """Build a recurrent-convolutional text classifier (Conv1D stack + LSTMs).

    Args:
        word_index: token -> id mapping from the tokenizer.
        embeddings_index: word -> vector mapping (e.g. GloVe).
        nclasses: number of output classes.
        MAX_SEQUENCE_LENGTH: input sequence length.
        EMBEDDING_DIM: dimensionality of the embedding vectors; must match
            the vectors in embeddings_index.

    Returns:
        A compiled Keras Sequential model (sparse categorical
        cross-entropy, Adam).

    Raises:
        ValueError: if EMBEDDING_DIM does not match the vector length in
            embeddings_index.
    """
    kernel_size = 2
    filters = 256
    pool_size = 2
    gru_node = 256

    # Words absent from embeddings_index keep a RANDOM initialization
    # (not zeros) — acceptable because the embedding layer is trainable.
    embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            if len(embedding_matrix[i]) != len(embedding_vector):
                # Raise instead of exit(1) so callers can handle/report it.
                raise ValueError(
                    "could not broadcast input array from shape %s into shape"
                    " %s. Please make sure your EMBEDDING_DIM is equal to the"
                    " embedding_vector file (GloVe)."
                    % (len(embedding_matrix[i]), len(embedding_vector)))
            embedding_matrix[i] = embedding_vector

    model = Sequential()
    model.add(Embedding(len(word_index) + 1,
                        EMBEDDING_DIM,
                        weights=[embedding_matrix],
                        input_length=MAX_SEQUENCE_LENGTH,
                        trainable=True))
    model.add(Dropout(0.25))
    # Four Conv1D + MaxPooling1D stages progressively shorten the sequence.
    for _ in range(4):
        model.add(Conv1D(filters, kernel_size, activation='relu'))
        model.add(MaxPooling1D(pool_size=pool_size))
    # Three stacked LSTMs returning sequences, then a final summarizing LSTM.
    model.add(LSTM(gru_node, return_sequences=True, recurrent_dropout=0.2))
    model.add(LSTM(gru_node, return_sequences=True, recurrent_dropout=0.2))
    model.add(LSTM(gru_node, return_sequences=True, recurrent_dropout=0.2))
    model.add(LSTM(gru_node, recurrent_dropout=0.2))
    model.add(Dense(1024, activation='relu'))
    model.add(Dense(nclasses))
    model.add(Activation('softmax'))

    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model


if __name__ == "__main__":
    # Driver: train/evaluate the RCNN on the 20-newsgroups dataset.
    # Guarded so importing this module no longer triggers a download
    # and a full training run.
    newsgroups_train = fetch_20newsgroups(subset='train')
    newsgroups_test = fetch_20newsgroups(subset='test')
    X_train = newsgroups_train.data
    X_test = newsgroups_test.data
    y_train = newsgroups_train.target
    y_test = newsgroups_test.target

    X_train_Glove, X_test_Glove, word_index, embeddings_index = \
        loadData_Tokenizer(X_train, X_test)

    model_RCNN = Build_Model_RCNN_Text(word_index, embeddings_index, 20)
    model_RCNN.summary()
    model_RCNN.fit(X_train_Glove, y_train,
                   validation_data=(X_test_Glove, y_test),
                   epochs=15,
                   batch_size=128,
                   verbose=2)

    predicted = model_RCNN.predict(X_test_Glove)
    predicted = np.argmax(predicted, axis=1)
    print(metrics.classification_report(y_test, predicted))