Skip to content

Commit c77014b

Browse files
authored
Add files via upload
1 parent b389b5b commit c77014b

5 files changed

Lines changed: 270 additions & 19 deletions

File tree

code/Bagging.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# Bagging ensemble of KNN base learners evaluated on the 20 Newsgroups corpus.
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.datasets import fetch_20newsgroups

# Standard train/test split of 20 Newsgroups (downloads on first use).
train_split = fetch_20newsgroups(subset='train')
test_split = fetch_20newsgroups(subset='test')
X_train, y_train = train_split.data, train_split.target
X_test, y_test = test_split.data, test_split.target

# Raw counts -> tf-idf weighting -> bagged KNN classifier.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', BaggingClassifier(KNeighborsClassifier())),
])
text_clf.fit(X_train, y_train)

# Per-class precision/recall/F1 on the held-out split.
predicted = text_clf.predict(X_test)
print(metrics.classification_report(y_test, predicted))

code/Boost.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Gradient-boosted decision trees evaluated on the 20 Newsgroups corpus.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.datasets import fetch_20newsgroups

# Standard train/test split of 20 Newsgroups (downloads on first use).
train_split = fetch_20newsgroups(subset='train')
test_split = fetch_20newsgroups(subset='test')
X_train, y_train = train_split.data, train_split.target
X_test, y_test = test_split.data, test_split.target

# Raw counts -> tf-idf weighting -> gradient boosting (50 trees, verbose log).
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', GradientBoostingClassifier(n_estimators=50, verbose=2)),
])
text_clf.fit(X_train, y_train)

# Per-class precision/recall/F1 on the held-out split.
predicted = text_clf.predict(X_test)
print(metrics.classification_report(y_test, predicted))

code/CNN.py

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from keras.layers import Dropout, Dense,Input,Embedding,Flatten, MaxPooling1D, Conv1D
1+
from keras.layers import Dropout, Dense,Input,Embedding,Flatten, AveragePooling2D, Conv2D,Reshape
22
from keras.models import Sequential,Model
33
from sklearn.feature_extraction.text import TfidfVectorizer
44
import numpy as np
@@ -26,7 +26,7 @@ def loadData_Tokenizer(X_train, X_test,MAX_NB_WORDS=75000,MAX_SEQUENCE_LENGTH=50
2626
X_train = text[0:len(X_train), ]
2727
X_test = text[len(X_train):, ]
2828
embeddings_index = {}
29-
f = open("C:\\Users\\kamran\\Documents\\GitHub\\RMDL\\Examples\\Glove\\glove.6B.50d.txt", encoding="utf8")
29+
f = open("C:\\Users\\kamran\\Documents\\GitHub\\RMDL\\Examples\\Glove\\glove.6B.100d.txt", encoding="utf8")
3030
for line in f:
3131
values = line.split()
3232
word = values[0]
@@ -41,7 +41,7 @@ def loadData_Tokenizer(X_train, X_test,MAX_NB_WORDS=75000,MAX_SEQUENCE_LENGTH=50
4141

4242

4343

44-
def Build_Model_CNN_Text(word_index, embeddings_index, nclasses, MAX_SEQUENCE_LENGTH=500, EMBEDDING_DIM=50, dropout=0.5):
44+
def Build_Model_CNN_Text(word_index, embeddings_index, nclasses, MAX_SEQUENCE_LENGTH=500, EMBEDDING_DIM=100, dropout=0.5):
4545

4646
"""
4747
def buildModel_CNN(word_index, embeddings_index, nclasses, MAX_SEQUENCE_LENGTH=500, EMBEDDING_DIM=50, dropout=0.5):
@@ -78,30 +78,29 @@ def buildModel_CNN(word_index, embeddings_index, nclasses, MAX_SEQUENCE_LENGTH=5
7878
layer = 5
7979
print("Filter ",layer)
8080
for fl in range(0,layer):
81-
filter_sizes.append((fl+2))
81+
filter_sizes.append((fl+2,fl+2))
8282

8383
node = 128
8484
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
8585
embedded_sequences = embedding_layer(sequence_input)
86+
emb = Reshape((500,10, 10), input_shape=(500,100))(embedded_sequences)
8687

8788
for fsz in filter_sizes:
88-
l_conv = Conv1D(node, kernel_size=fsz, activation='relu')(embedded_sequences)
89-
l_pool = MaxPooling1D(5)(l_conv)
89+
l_conv = Conv2D(node, padding="same", kernel_size=fsz, activation='relu')(emb)
90+
l_pool = AveragePooling2D(pool_size=(5,1), padding="same")(l_conv)
9091
#l_pool = Dropout(0.25)(l_pool)
9192
convs.append(l_pool)
9293

9394
l_merge = Concatenate(axis=1)(convs)
94-
l_cov1 = Conv1D(node, 5, activation='relu')(l_merge)
95-
l_cov1 = Dropout(dropout)(l_cov1)
96-
l_pool1 = MaxPooling1D(5)(l_cov1)
97-
l_cov2 = Conv1D(node, 5, activation='relu')(l_pool1)
98-
l_cov2 = Dropout(dropout)(l_cov2)
99-
l_pool2 = MaxPooling1D(30)(l_cov2)
100-
l_flat = Flatten()(l_pool2)
101-
l_dense = Dense(1024, activation='relu')(l_flat)
102-
l_dense = Dropout(dropout)(l_dense)
103-
l_dense = Dense(512, activation='relu')(l_dense)
95+
l_cov1 = Conv2D(node, (5,5), padding="same", activation='relu')(l_merge)
96+
l_cov1 = AveragePooling2D(pool_size=(5,2), padding="same")(l_cov1)
97+
l_cov2 = Conv2D(node, (5,5), padding="same", activation='relu')(l_cov1)
98+
l_pool2 = AveragePooling2D(pool_size=(5,2), padding="same")(l_cov2)
99+
l_cov2 = Dropout(dropout)(l_pool2)
100+
l_flat = Flatten()(l_cov2)
101+
l_dense = Dense(128, activation='relu')(l_flat)
104102
l_dense = Dropout(dropout)(l_dense)
103+
105104
preds = Dense(nclasses, activation='softmax')(l_dense)
106105
model = Model(sequence_input, preds)
107106

@@ -115,8 +114,8 @@ def buildModel_CNN(word_index, embeddings_index, nclasses, MAX_SEQUENCE_LENGTH=5
115114

116115

117116

118-
119-
117+
from sklearn.datasets import fetch_20newsgroups
118+
from RMDL import text_feature_extraction as txt
120119

121120
newsgroups_train = fetch_20newsgroups(subset='train')
122121
newsgroups_test = fetch_20newsgroups(subset='test')
@@ -125,6 +124,7 @@ def buildModel_CNN(word_index, embeddings_index, nclasses, MAX_SEQUENCE_LENGTH=5
125124
y_train = newsgroups_train.target
126125
y_test = newsgroups_test.target
127126

127+
128128
X_train_Glove,X_test_Glove, word_index,embeddings_index = loadData_Tokenizer(X_train,X_test)
129129

130130

@@ -135,7 +135,7 @@ def buildModel_CNN(word_index, embeddings_index, nclasses, MAX_SEQUENCE_LENGTH=5
135135

136136
model_CNN.fit(X_train_Glove, y_train,
137137
validation_data=(X_test_Glove, y_test),
138-
epochs=15,
138+
epochs=1000,
139139
batch_size=128,
140140
verbose=2)
141141

code/CRF.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
import nltk
2+
import sklearn_crfsuite
3+
from sklearn_crfsuite import metrics
4+
# CoNLL-2002 Spanish NER corpus: each sentence is a list of
# (token, POS-tag, IOB-label) tuples.
conll2002 = nltk.corpus.conll2002
conll2002.fileids()
train_sents = list(conll2002.iob_sents('esp.train'))
test_sents = list(conll2002.iob_sents('esp.testb'))
7+
def word2features(sent, i):
    """Build the CRF feature dict for token *i* of *sent*.

    Each element of *sent* is a (token, POS-tag, label) tuple; the label is
    not used here.  Features describe the current token plus its immediate
    neighbours, with BOS/EOS flags at the sentence boundaries.
    """
    token, tag = sent[i][0], sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
        'postag': tag,
        'postag[:2]': tag[:2],
    }

    if i == 0:
        features['BOS'] = True  # sentence start: no left neighbour
    else:
        prev_tok, prev_tag = sent[i - 1][0], sent[i - 1][1]
        features.update({
            '-1:word.lower()': prev_tok.lower(),
            '-1:word.istitle()': prev_tok.istitle(),
            '-1:word.isupper()': prev_tok.isupper(),
            '-1:postag': prev_tag,
            '-1:postag[:2]': prev_tag[:2],
        })

    if i == len(sent) - 1:
        features['EOS'] = True  # sentence end: no right neighbour
    else:
        next_tok, next_tag = sent[i + 1][0], sent[i + 1][1]
        features.update({
            '+1:word.lower()': next_tok.lower(),
            '+1:word.istitle()': next_tok.istitle(),
            '+1:word.isupper()': next_tok.isupper(),
            '+1:postag': next_tag,
            '+1:postag[:2]': next_tag[:2],
        })

    return features
49+
50+
51+
def sent2features(sent):
    """Return the feature dict for every token position of *sent*."""
    return [word2features(sent, idx) for idx, _ in enumerate(sent)]
53+
54+
def sent2labels(sent):
    """Return the gold IOB labels of *sent*, in token order."""
    return [lab for _tok, _pos, lab in sent]
56+
57+
def sent2tokens(sent):
    """Return the surface tokens of *sent*, in order."""
    return [tok for tok, _pos, _lab in sent]
59+
60+
# Vectorise both splits: one feature-dict sequence and one label sequence
# per sentence.
X_train = list(map(sent2features, train_sents))
y_train = list(map(sent2labels, train_sents))
X_test = list(map(sent2features, test_sents))
y_test = list(map(sent2labels, test_sents))

# Linear-chain CRF trained with L-BFGS and elastic-net regularisation.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,  # L1 penalty
    c2=0.1,  # L2 penalty
    max_iterations=100,
    all_possible_transitions=True,
)
crf.fit(X_train, y_train)

# Flat (token-level) classification report over all test sentences.
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, digits=3
))

code/RCNN.py

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
from keras.preprocessing import sequence
2+
from keras.models import Sequential
3+
from keras.layers import Dense, Dropout, Activation
4+
from keras.layers import Embedding
5+
from keras.layers import LSTM
6+
from keras.layers import Conv1D, MaxPooling1D
7+
from keras.datasets import imdb
8+
from sklearn.datasets import fetch_20newsgroups
9+
import numpy as np
10+
from sklearn import metrics
11+
from keras.preprocessing.text import Tokenizer
12+
from keras.preprocessing.sequence import pad_sequences
13+
14+
def loadData_Tokenizer(X_train, X_test, MAX_NB_WORDS=75000, MAX_SEQUENCE_LENGTH=500,
                       glove_path="C:\\Users\\kamran\\Documents\\GitHub\\RMDL\\Examples\\Glove\\glove.6B.50d.txt"):
    """Tokenize train+test text into padded id sequences and load GloVe vectors.

    Args:
        X_train, X_test: iterables of raw document strings.
        MAX_NB_WORDS: vocabulary cap handed to the Keras Tokenizer.
        MAX_SEQUENCE_LENGTH: fixed length the id sequences are padded to.
        glove_path: path to a GloVe text file (default preserves the original
            hard-coded location, so existing callers are unaffected).

    Returns:
        (X_train_ids, X_test_ids, word_index, embeddings_index) where
        embeddings_index maps word -> vector (np.float32 array).
    """
    np.random.seed(7)
    # Fit one tokenizer over the combined corpus so train and test share ids.
    text = np.concatenate((X_train, X_test), axis=0)
    text = np.array(text)
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)
    word_index = tokenizer.word_index
    text = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    print('Found %s unique tokens.' % len(word_index))
    indices = np.arange(text.shape[0])
    # np.random.shuffle(indices)  # shuffling disabled: keeps rows aligned with labels
    text = text[indices]
    print(text.shape)
    # len(X_train) still refers to the original document count here, so the
    # split point is correct even though X_train is rebound.
    X_train = text[0:len(X_train), ]
    X_test = text[len(X_train):, ]

    embeddings_index = {}
    # 'with' guarantees the file is closed (the original leaked the handle).
    with open(glove_path, encoding="utf8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            try:
                coefs = np.asarray(values[1:], dtype='float32')
            except ValueError:
                # Malformed line: skip it.  The original bare `except: pass`
                # fell through and mapped this word to the PREVIOUS word's
                # vector (or raised NameError if the first line was bad).
                continue
            embeddings_index[word] = coefs
    print('Total %s word vectors.' % len(embeddings_index))
    return (X_train, X_test, word_index, embeddings_index)
43+
44+
45+
def Build_Model_RCNN_Text(word_index, embeddings_index, nclasses, MAX_SEQUENCE_LENGTH=500, EMBEDDING_DIM=50):
    """Assemble the RCNN text classifier.

    Architecture: GloVe-initialised embedding -> dropout -> four identical
    Conv1D/MaxPooling1D stages -> three sequence-returning LSTMs -> one
    summarising LSTM -> dense relu -> softmax over *nclasses*.
    Compiled with sparse categorical cross-entropy and Adam.
    """
    kernel_size = 2
    filters = 256
    pool_size = 2
    gru_node = 256

    # Embedding matrix: random init, overwritten with a GloVe vector for
    # every word the pre-trained index knows about.
    embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is None:
            continue  # unseen word: keep its random initialisation
        if len(embedding_matrix[i]) != len(embedding_vector):
            # Dimension mismatch between EMBEDDING_DIM and the GloVe file
            # is unrecoverable here; abort with the diagnostic message.
            print("could not broadcast input array from shape", str(len(embedding_matrix[i])),
                  "into shape", str(len(embedding_vector)), " Please make sure your"
                  " EMBEDDING_DIM is equal to embedding_vector file ,GloVe,")
            exit(1)
        embedding_matrix[i] = embedding_vector

    model = Sequential()
    model.add(Embedding(len(word_index) + 1,
                        EMBEDDING_DIM,
                        weights=[embedding_matrix],
                        input_length=MAX_SEQUENCE_LENGTH,
                        trainable=True))
    model.add(Dropout(0.25))
    # Four identical convolution/pooling stages.
    for _ in range(4):
        model.add(Conv1D(filters, kernel_size, activation='relu'))
        model.add(MaxPooling1D(pool_size=pool_size))
    # Three stacked sequence-returning LSTMs, then a final summarising LSTM.
    for _ in range(3):
        model.add(LSTM(gru_node, return_sequences=True, recurrent_dropout=0.2))
    model.add(LSTM(gru_node, recurrent_dropout=0.2))
    model.add(Dense(1024, activation='relu'))
    model.add(Dense(nclasses))
    model.add(Activation('softmax'))

    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model
95+
96+
# Train and evaluate the RCNN on 20 Newsgroups (20 classes).
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target

# Padded token-id matrices plus the GloVe lookup tables.
X_train_Glove, X_test_Glove, word_index, embeddings_index = loadData_Tokenizer(X_train, X_test)

model_RCNN = Build_Model_RCNN_Text(word_index, embeddings_index, 20)
model_RCNN.summary()

model_RCNN.fit(X_train_Glove, y_train,
               validation_data=(X_test_Glove, y_test),
               epochs=15,
               batch_size=128,
               verbose=2)

# argmax over the softmax outputs gives the predicted class ids.
predicted = np.argmax(model_RCNN.predict(X_test_Glove), axis=1)
print(metrics.classification_report(y_test, predicted))

0 commit comments

Comments
 (0)