Skip to content

Commit 9f40d7d

Browse files
authored
Add files via upload
1 parent 6a4f607 commit 9f40d7d

9 files changed

Lines changed: 477 additions & 0 deletions

File tree

code/CNN.py

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
from keras.layers import Dropout, Dense,Input,Embedding,Flatten, MaxPooling1D, Conv1D
2+
from keras.models import Sequential,Model
3+
from sklearn.feature_extraction.text import TfidfVectorizer
4+
import numpy as np
5+
from sklearn import metrics
6+
from keras.preprocessing.text import Tokenizer
7+
from keras.preprocessing.sequence import pad_sequences
8+
from sklearn.datasets import fetch_20newsgroups
9+
from keras.layers.merge import Concatenate
10+
11+
12+
def loadData_Tokenizer(X_train, X_test, MAX_NB_WORDS=75000, MAX_SEQUENCE_LENGTH=500,
                       glove_path="C:\\Users\\kamran\\Documents\\GitHub\\RMDL\\Examples\\Glove\\glove.6B.50d.txt"):
    """
    Tokenize and pad the train/test texts and load the GloVe embedding index.

    X_train, X_test: iterables of raw document strings.
    MAX_NB_WORDS: vocabulary cap passed to the Keras Tokenizer.
    MAX_SEQUENCE_LENGTH: padded length of every output sequence.
    glove_path: path to a GloVe text file (word followed by its vector per
        line). Parameterized so callers are not tied to the original
        hard-coded Windows path; the default preserves old behavior.

    Returns (X_train, X_test, word_index, embeddings_index) where the first
    two are int sequence matrices of shape (n_docs, MAX_SEQUENCE_LENGTH).
    """
    np.random.seed(7)
    # Fit one shared vocabulary over train + test so indices are consistent.
    text = np.concatenate((X_train, X_test), axis=0)
    text = np.array(text)
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)
    word_index = tokenizer.word_index
    text = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    print('Found %s unique tokens.' % len(word_index))
    indices = np.arange(text.shape[0])
    # np.random.shuffle(indices)  # shuffling deliberately disabled so the
    # train/test boundary below stays valid
    text = text[indices]
    print(text.shape)
    X_train = text[0:len(X_train), ]
    X_test = text[len(X_train):, ]

    embeddings_index = {}
    # BUG FIX: the original used a bare `except: pass` and then assigned
    # `coefs` unconditionally — a malformed line either raised NameError
    # (first line) or silently mapped the word to the PREVIOUS word's vector.
    # Malformed lines are now skipped. `with` guarantees the file is closed
    # even on error (the original leaked the handle on exceptions).
    with open(glove_path, encoding="utf8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            try:
                coefs = np.asarray(values[1:], dtype='float32')
            except ValueError:
                continue  # skip lines whose vector cannot be parsed
            embeddings_index[word] = coefs
    print('Total %s word vectors.' % len(embeddings_index))
    return (X_train, X_test, word_index, embeddings_index)
41+
42+
43+
44+
def Build_Model_CNN_Text(word_index, embeddings_index, nclasses, MAX_SEQUENCE_LENGTH=500, EMBEDDING_DIM=50, dropout=0.5):
    """
    Build a multi-branch 1D CNN for text classification (functional API).

    word_index: dict mapping word -> integer index (from the Tokenizer).
    embeddings_index: dict mapping word -> pre-trained embedding vector,
        as produced by loadData_Tokenizer.
    nclasses: number of output classes (softmax width).
    MAX_SEQUENCE_LENGTH: length of the padded integer input sequences.
    EMBEDDING_DIM: embedding dimensionality; must equal the vector length
        in `embeddings_index` (e.g. 50 for glove.6B.50d).
    dropout: dropout rate used throughout the dense/conv stack.

    Raises ValueError when a GloVe vector length differs from EMBEDDING_DIM.
    Returns a compiled Keras Model expecting integer label targets
    (sparse_categorical_crossentropy).
    """
    # NOTE: the original created an unused `model = Sequential()` here that
    # was immediately shadowed by the functional Model below; removed.

    # Random init; rows for words with a pre-trained vector are overwritten,
    # the rest stay random and are fine-tuned (trainable=True below).
    embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # BUG FIX: was print(...) + exit(1), which killed the whole
            # interpreter; raise instead so callers can handle it.
            if len(embedding_matrix[i]) != len(embedding_vector):
                raise ValueError(
                    "could not broadcast input array from shape %d into shape %d;"
                    " please make sure EMBEDDING_DIM matches the embedding"
                    " vector file (GloVe)"
                    % (len(embedding_matrix[i]), len(embedding_vector)))
            embedding_matrix[i] = embedding_vector

    embedding_layer = Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True)

    # Parallel convolution branches with kernel sizes 2..(layer+1).
    convs = []
    filter_sizes = []
    layer = 5
    print("Filter ", layer)
    for fl in range(0, layer):
        filter_sizes.append((fl + 2))

    node = 128  # filters per convolution
    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    for fsz in filter_sizes:
        l_conv = Conv1D(node, kernel_size=fsz, activation='relu')(embedded_sequences)
        l_pool = MaxPooling1D(5)(l_conv)
        convs.append(l_pool)

    # Concatenate the branch outputs along the time axis, then a deeper
    # conv/pool stack followed by two dense layers.
    l_merge = Concatenate(axis=1)(convs)
    l_cov1 = Conv1D(node, 5, activation='relu')(l_merge)
    l_cov1 = Dropout(dropout)(l_cov1)
    l_pool1 = MaxPooling1D(5)(l_cov1)
    l_cov2 = Conv1D(node, 5, activation='relu')(l_pool1)
    l_cov2 = Dropout(dropout)(l_cov2)
    l_pool2 = MaxPooling1D(30)(l_cov2)
    l_flat = Flatten()(l_pool2)
    l_dense = Dense(1024, activation='relu')(l_flat)
    l_dense = Dropout(dropout)(l_dense)
    l_dense = Dense(512, activation='relu')(l_dense)
    l_dense = Dropout(dropout)(l_dense)
    preds = Dense(nclasses, activation='softmax')(l_dense)
    model = Model(sequence_input, preds)

    # Integer labels, hence the sparse loss.
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model
115+
116+
117+
118+
119+
120+
121+
# ---------------------------------------------------------------------------
# Example driver: train the text CNN on the 20 Newsgroups corpus.
# ---------------------------------------------------------------------------
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target

# Tokenize/pad both splits and load the GloVe embedding index.
X_train_Glove, X_test_Glove, word_index, embeddings_index = loadData_Tokenizer(X_train, X_test)

# 20 Newsgroups has 20 target classes.
model_CNN = Build_Model_CNN_Text(word_index, embeddings_index, 20)
model_CNN.summary()

model_CNN.fit(X_train_Glove, y_train,
              validation_data=(X_test_Glove, y_test),
              epochs=15,
              batch_size=128,
              verbose=2)

# Softmax probabilities -> hard class predictions.
predicted = np.argmax(model_CNN.predict(X_test_Glove), axis=1)

print(metrics.classification_report(y_test, predicted))

code/DNN.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
from keras.layers import Dropout, Dense
2+
from keras.models import Sequential
3+
from sklearn.feature_extraction.text import TfidfVectorizer
4+
import numpy as np
5+
from sklearn import metrics
6+
7+
8+
def TFIDF(X_train, X_test, MAX_NB_WORDS=75000):
    """
    Vectorize raw train/test texts into dense tf-idf feature matrices.

    X_train, X_test: iterables of raw document strings.
    MAX_NB_WORDS: cap on the tf-idf vocabulary size.

    Returns (train_features, test_features) as dense numpy arrays; the
    vectorizer is fitted on the training split only.
    """
    vectorizer = TfidfVectorizer(max_features=MAX_NB_WORDS)
    train_features = vectorizer.fit_transform(X_train).toarray()
    test_features = vectorizer.transform(X_test).toarray()
    print("tf-idf with", str(np.array(train_features).shape[1]), "features")
    return (train_features, test_features)
14+
15+
16+
def Build_Model_DNN_Text(shape, nClasses, dropout=0.5):
    """
    Build a fully-connected deep network for text classification.

    shape: size of the input feature space (e.g. tf-idf width).
    nClasses: number of output classes (softmax width).
    dropout: dropout rate applied after every dense layer.

    Returns a compiled Keras Sequential model expecting integer label
    targets (sparse_categorical_crossentropy).
    """
    node = 512    # units per hidden layer
    nLayers = 4   # hidden layers beyond the input projection

    model = Sequential()
    model.add(Dense(node, input_dim=shape, activation='relu'))
    model.add(Dropout(dropout))
    for _ in range(nLayers):
        model.add(Dense(node, input_dim=node, activation='relu'))
        model.add(Dropout(dropout))
    model.add(Dense(nClasses, activation='softmax'))

    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model
39+
40+
41+
# ---------------------------------------------------------------------------
# Example driver: train the DNN on the 20 Newsgroups corpus.
# ---------------------------------------------------------------------------
from sklearn.datasets import fetch_20newsgroups

newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')
X_train = newsgroups_train.data
X_test = newsgroups_test.data
y_train = newsgroups_train.target
y_test = newsgroups_test.target

X_train_tfidf, X_test_tfidf = TFIDF(X_train, X_test)

# 20 Newsgroups has 20 target classes.
model_DNN = Build_Model_DNN_Text(X_train_tfidf.shape[1], 20)
model_DNN.summary()
# BUG FIX: removed a stray debugging `exit(1)` that was left here after
# model_DNN.summary() — it aborted the script before training ever ran.
model_DNN.fit(X_train_tfidf, y_train,
              validation_data=(X_test_tfidf, y_test),
              epochs=10,
              batch_size=128,
              verbose=2)

# `predict_classes` is deprecated/removed in modern Keras; argmax over the
# softmax output is equivalent and matches the CNN.py driver.
predicted = np.argmax(model_DNN.predict(X_test_tfidf), axis=1)

print(metrics.classification_report(y_test, predicted))

code/Decision_Tree.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Decision-tree baseline for 20 Newsgroups text classification.
from sklearn import tree
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.datasets import fetch_20newsgroups

# Load the standard train/test split of the corpus.
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target

# Bag-of-words counts -> tf-idf weighting -> decision tree.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', tree.DecisionTreeClassifier()),
])

text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)

print(metrics.classification_report(y_test, predicted))

code/K-nearest_Neighbor.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# k-nearest-neighbors baseline for 20 Newsgroups text classification.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.datasets import fetch_20newsgroups

# Load the standard train/test split of the corpus.
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target

# Bag-of-words counts -> tf-idf weighting -> k-NN (default k=5).
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', KNeighborsClassifier()),
])

text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)

print(metrics.classification_report(y_test, predicted))

code/MultinomialNB.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Multinomial naive-Bayes baseline for 20 Newsgroups text classification.
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.datasets import fetch_20newsgroups

# Load the standard train/test split of the corpus.
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target

# Bag-of-words counts -> tf-idf weighting -> multinomial naive Bayes.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)

print(metrics.classification_report(y_test, predicted))

0 commit comments

Comments
 (0)