1+ from keras .layers import Dropout , Dense ,Input ,Embedding ,Flatten , MaxPooling1D , Conv1D
2+ from keras .models import Sequential ,Model
3+ from sklearn .feature_extraction .text import TfidfVectorizer
4+ import numpy as np
5+ from sklearn import metrics
6+ from keras .preprocessing .text import Tokenizer
7+ from keras .preprocessing .sequence import pad_sequences
8+ from sklearn .datasets import fetch_20newsgroups
9+ from keras .layers .merge import Concatenate
10+
11+
def loadData_Tokenizer(X_train, X_test, MAX_NB_WORDS=75000, MAX_SEQUENCE_LENGTH=500,
                       glove_path="C:\\Users\\kamran\\Documents\\GitHub\\RMDL\\Examples\\Glove\\glove.6B.50d.txt"):
    """Tokenize and pad the train/test texts and load GloVe word vectors.

    Args:
        X_train: iterable of training documents (str).
        X_test: iterable of test documents (str).
        MAX_NB_WORDS: vocabulary-size cap passed to the Keras Tokenizer.
        MAX_SEQUENCE_LENGTH: length each integer sequence is padded/truncated to.
        glove_path: path to a GloVe text file (one "word v1 v2 ..." per line).
            Defaults to the original hard-coded location for backward
            compatibility.

    Returns:
        Tuple ``(X_train, X_test, word_index, embeddings_index)`` where
        X_train/X_test are padded integer matrices, word_index maps
        word -> integer index, and embeddings_index maps word -> float32
        numpy vector.
    """
    np.random.seed(7)
    # The tokenizer must see train and test together so the word index is
    # shared; the split is restored below by slicing at len(X_train).
    text = np.array(np.concatenate((X_train, X_test), axis=0))
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)
    word_index = tokenizer.word_index
    text = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    print('Found %s unique tokens.' % len(word_index))
    indices = np.arange(text.shape[0])
    # np.random.shuffle(indices)  # deliberately disabled: shuffling here
    # would desynchronize the rows from y_train/y_test held by the caller.
    text = text[indices]
    print(text.shape)
    X_train = text[0:len(X_train), ]
    X_test = text[len(X_train):, ]

    embeddings_index = {}
    # BUG FIX: the original used a bare `except: pass` and then stored
    # `coefs` unconditionally, so a malformed GloVe line silently recorded
    # the PREVIOUS word's vector (or raised NameError on the first line).
    # Malformed lines are now skipped. `with` also guarantees the file is
    # closed even if parsing raises.
    with open(glove_path, encoding="utf8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            try:
                coefs = np.asarray(values[1:], dtype='float32')
            except ValueError:
                continue
            embeddings_index[word] = coefs
    print('Total %s word vectors.' % len(embeddings_index))
    return (X_train, X_test, word_index, embeddings_index)
41+
42+
43+
def Build_Model_CNN_Text(word_index, embeddings_index, nclasses, MAX_SEQUENCE_LENGTH=500, EMBEDDING_DIM=50, dropout=0.5):
    """Build a multi-filter-width 1D-CNN text classifier on GloVe embeddings.

    Parallel Conv1D branches with kernel sizes 2..6 are applied to the
    embedded sequence, concatenated, passed through two more conv/pool
    stages, then two dense layers and a softmax head.

    Args:
        word_index: dict mapping word -> integer index (from the tokenizer).
        embeddings_index: dict mapping word -> embedding vector (GloVe).
        nclasses: number of output classes.
        MAX_SEQUENCE_LENGTH: input sequence length.
        EMBEDDING_DIM: dimensionality of the embedding vectors; must match
            the vectors in `embeddings_index`.
        dropout: dropout rate used after conv and dense layers.

    Returns:
        A compiled Keras `Model` (sparse_categorical_crossentropy / adam).

    Raises:
        ValueError: if a vector in `embeddings_index` does not have
            EMBEDDING_DIM entries.
    """
    # Rows for words absent from GloVe keep their random initialization;
    # the layer is trainable so they can be learned during fitting.
    embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            if len(embedding_matrix[i]) != len(embedding_vector):
                # FIX: the original printed a message and called exit(1),
                # killing the whole interpreter. Raise a catchable error
                # instead; the external behavior for valid inputs is unchanged.
                raise ValueError(
                    "could not broadcast input array from shape %d into shape %d; "
                    "please make sure EMBEDDING_DIM is equal to the dimension of "
                    "the embedding_vector file (GloVe)"
                    % (len(embedding_matrix[i]), len(embedding_vector)))
            embedding_matrix[i] = embedding_vector

    embedding_layer = Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True)

    # Parallel convolutional branches with kernel sizes 2..(layer+1).
    layer = 5
    print("Filter ", layer)
    filter_sizes = [fl + 2 for fl in range(layer)]

    node = 128
    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    convs = []
    for fsz in filter_sizes:
        l_conv = Conv1D(node, kernel_size=fsz, activation='relu')(embedded_sequences)
        l_pool = MaxPooling1D(5)(l_conv)
        convs.append(l_pool)

    l_merge = Concatenate(axis=1)(convs)
    l_cov1 = Conv1D(node, 5, activation='relu')(l_merge)
    l_cov1 = Dropout(dropout)(l_cov1)
    l_pool1 = MaxPooling1D(5)(l_cov1)
    l_cov2 = Conv1D(node, 5, activation='relu')(l_pool1)
    l_cov2 = Dropout(dropout)(l_cov2)
    l_pool2 = MaxPooling1D(30)(l_cov2)
    l_flat = Flatten()(l_pool2)
    l_dense = Dense(1024, activation='relu')(l_flat)
    l_dense = Dropout(dropout)(l_dense)
    l_dense = Dense(512, activation='relu')(l_dense)
    l_dense = Dropout(dropout)(l_dense)
    preds = Dense(nclasses, activation='softmax')(l_dense)

    # NOTE: the original created an unused `Sequential()` model here that was
    # immediately shadowed by this functional Model; the dead object is removed.
    model = Model(sequence_input, preds)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model
115+
116+
117+
118+
119+
120+
# ---------------------------------------------------------------------------
# Demo: train the CNN on the 20 Newsgroups corpus and report test metrics.
# ---------------------------------------------------------------------------
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target

# Shared tokenization + GloVe lookup for both splits.
X_train_Glove, X_test_Glove, word_index, embeddings_index = loadData_Tokenizer(X_train, X_test)

# 20 Newsgroups has 20 target classes.
model_CNN = Build_Model_CNN_Text(word_index, embeddings_index, 20)
model_CNN.summary()

model_CNN.fit(X_train_Glove, y_train,
              validation_data=(X_test_Glove, y_test),
              epochs=15,
              batch_size=128,
              verbose=2)

# Convert softmax probabilities to hard class predictions.
predicted = np.argmax(model_CNN.predict(X_test_Glove), axis=1)

print(metrics.classification_report(y_test, predicted))