-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsvm.py
More file actions
78 lines (62 loc) · 2.5 KB
/
svm.py
File metadata and controls
78 lines (62 loc) · 2.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
import csv
import re
from sklearn.linear_model import LogisticRegression
#from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
#from sklearn.naive_bayes import MultinomialNB
from scipy.sparse import *
# Load the training texts, the training labels and the test texts.
#
# Every CSV file has a header row (skipped); the value of interest is read
# from the second column (index 1) of each remaining row.  Training and test
# texts go through the same clean-up so that the features extracted from them
# are comparable.

# True only on Python 2, where ``str`` is the byte-string type and the csv
# module expects files opened in binary mode.
_PY2 = str is bytes

# Compiled once rather than once per row.
_URL_PATTERN = re.compile(r"http\S+")
_DIGITS_PATTERN = re.compile(r"\d+")


def _open_csv(path):
    """Open *path* in the mode the running interpreter's csv module needs."""
    if _PY2:
        return open(path, "rb")
    # Python 3: csv wants a text stream opened with newline="".  UTF-8 with
    # strict decoding matches what TfidfVectorizer(decode_error='strict')
    # applies to byte input by default.
    return open(path, newline="", encoding="utf-8")


def _clean_text(text):
    """Return *text* without URLs, digits or spaces, lowercased."""
    text = _URL_PATTERN.sub("", text)
    text = _DIGITS_PATTERN.sub("", text)
    return text.replace(" ", "").lower()


def _read_second_column(path, transform=None):
    """Return column 1 of every data row of the CSV file at *path*.

    The first row is treated as a header and skipped.  When *transform* is
    given it is applied to each value before it is collected.
    """
    with _open_csv(path) as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        next(reader, None)  # skip header of file
        if transform is None:
            return [row[1] for row in reader]
        return [transform(row[1]) for row in reader]


train_x = _read_second_column("data_set/train_set_x.csv", _clean_text)
train_y = _read_second_column("data_set/train_set_y.csv")
# The test texts get exactly the same clean-up as the training texts
# (including URL removal).
test_x = _read_second_column("data_set/test_set_x.csv", _clean_text)
#tfidf preprocessing
# Character-level TF-IDF.  The vocabulary and the IDF weights are learned
# from the training texts only.
vec = TfidfVectorizer(decode_error='strict',analyzer='char',min_df=0)
train_x=vec.fit_transform(train_x)
# Characters that make up the learned vocabulary, in column order.
features = vec.get_feature_names()
# Project the test texts with the vectorizer fitted above.  Calling
# fit_transform on the test set (even with a fixed vocabulary) would
# re-estimate the IDF weights from the test data, so the test features would
# be scaled differently from the ones the classifier is trained on.
test_x = vec.transform(test_x)
#print(test_x)
#print("train_x is a matrix with size : ",train_x.shape[0],train_x.shape[1])
#print("train_y is an array with size: ",len(train_y))
# Train an RBF-kernel support vector classifier on the training features.
# The sparse TF-IDF matrix is expanded to a dense array before fitting.
from sklearn.svm import SVC

# Alternatives the author left disabled here:
#   KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
#   LogisticRegression(penalty='l2', C=1)   (fitted on the sparse train_x)
classifier = SVC(random_state=0, kernel='rbf').fit(train_x.toarray(), train_y)

# Predict a label for every test row, then turn the result into a plain
# Python list for the output step.
test_y_pred = classifier.predict(test_x.toarray())
test_y_pred_temp = test_y_pred.tolist()
# Write the predictions as a two-column CSV: a running row index ("Id")
# starting at 0 and the predicted label ("Category").
# NOTE(review): the file is still called kNN_output.csv although the model
# above is an SVC; the name is kept so existing consumers keep working.
if str is bytes:
    # Python 2: ``str`` is the byte type, so binary mode accepts it as-is.
    output = open("kNN_output.csv", 'wb')
else:
    # Python 3: a binary file rejects ``str``.  Text mode with newline=""
    # disables newline translation, so the bytes on disk match what the
    # binary-mode write produces on Python 2.
    output = open("kNN_output.csv", "w", newline="", encoding="utf-8")
with output:
    output.write("Id,Category\n")
    # One batched write instead of four tiny writes per row.
    output.writelines(
        "%d,%s\n" % (i, label) for i, label in enumerate(test_y_pred_temp)
    )