-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathKaggle_Dataset_Tabular_Data
More file actions
207 lines (153 loc) · 7.73 KB
/
Kaggle_Dataset_Tabular_Data
File metadata and controls
207 lines (153 loc) · 7.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
#!pip install opendatasets --quiet #Used for Kaggle Datasets
#!pip install torchsummary --quiet
#!pip install scikit-learn --quiet
import opendatasets as od
# Fetch the rice-type-classification dataset from Kaggle into the working
# directory. NOTE(review): opendatasets typically prompts for Kaggle
# credentials when run interactively — confirm before running headless.
od.download('https://www.kaggle.com/datasets/mssmartypants/rice-type-classification/croissant/download')
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
from torchsummary import summary
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)
# Load the rice dataset, drop rows with missing values and the non-predictive id column.
data_df = pd.read_csv("rice-type-classification/riceClassification.csv")
data_df.dropna(inplace=True)
data_df = data_df.drop("id", axis=1)
data_df.head(5)
print(data_df.shape)
print(data_df["Class"].unique())
print(data_df["Class"].value_counts())
original_df = data_df.copy()
# Scale each column by its largest absolute value (vectorized equivalent of a
# per-column loop). NOTE(review): the max is taken over ALL rows before the
# train/test split — mild data leakage; confirm this is acceptable.
data_df = data_df / data_df.abs().max()
data_df.head(5)
# Features X = every column except the last; target Y = the last column ("Class").
X = np.array(data_df.iloc[:, :-1])
Y = np.array(data_df.iloc[:, -1])
X.shape, Y.shape
# 70/30 train/holdout split, then the holdout is halved into test and validation.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)
print(f"X train:{X_train.shape}, X test:{X_test.shape}, y train:{y_train.shape}, y test:{y_test.shape}")
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5)
print(f"X test:{X_test.shape}, X val:{X_val.shape}, y test:{y_test.shape}, y val:{y_val.shape}")
# Wrap numpy feature/label arrays as a PyTorch Dataset of float32 tensors.
class dataset(Dataset):
    """Holds X (features) and Y (labels) as float32 tensors on the active device."""
    def __init__(self, X, Y):
        # Same device choice as the script header: GPU if visible, else CPU.
        dev = "cuda" if torch.cuda.is_available() else "cpu"
        self.X = torch.tensor(X, dtype=torch.float32).to(dev)
        self.Y = torch.tensor(Y, dtype=torch.float32).to(dev)

    def __len__(self):
        # Number of rows (samples).
        return len(self.X)

    def __getitem__(self, index):
        # One sample: (feature row, label).
        return self.X[index], self.Y[index]
# Wrap each split in a Dataset, then in a DataLoader serving shuffled batches of 32.
training_data, validation_data, testing_data = (
    dataset(features, labels)
    for features, labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)
train_dataloader, validation_dataloader, testing_dataloader = (
    DataLoader(ds, batch_size=32, shuffle=True)
    for ds in (training_data, validation_data, testing_data)
)
# Width of the single hidden layer — a hyperparameter / design choice.
# It gives the network 10 dimensions in which to transform the input before
# the output layer. Note: for two chained nn.Linear layers, the first layer's
# out_features must equal the second layer's in_features.
HIDDEN_NEURONS = 10
class MyModel(nn.Module):
    """Binary classifier: Linear -> Linear -> Sigmoid, producing P(class == 1).

    The original hard-coded the input width to the module-global ``X.shape[1]``
    and the hidden width to ``HIDDEN_NEURONS``. Both are now optional
    parameters that default to those globals, so existing ``MyModel()`` callers
    are unchanged while the network becomes reusable for any input width.
    """
    def __init__(self, in_features=None, hidden_neurons=None):
        super().__init__()
        if in_features is None:
            in_features = X.shape[1]        # fall back to the script's feature matrix
        if hidden_neurons is None:
            hidden_neurons = HIDDEN_NEURONS  # fall back to the script's hyperparameter
        self.input_layer = nn.Linear(in_features, hidden_neurons)
        self.linear = nn.Linear(hidden_neurons, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return a (batch, 1) tensor of probabilities in (0, 1)."""
        x = self.input_layer(x)
        x = self.linear(x)
        x = self.sigmoid(x)   # squash the single logit to a probability
        return x
# Instantiate the network on the chosen device and print its layer summary.
model = MyModel().to(device)
summary(model, (X.shape[1],))
# Binary cross-entropy compares the sigmoid probability with the 0/1 label;
# it yields the scalar loss we backpropagate through, and lower is better.
criterion = nn.BCELoss()
# Adam with the common default learning rate.
optimizer = Adam(model.parameters(), lr=1e-3)
# Per-epoch histories used for the plots at the end of the script.
total_loss_train_plot = []           # mean training loss per epoch
total_loss_validation_plot = []      # mean validation loss per epoch
total_accuracy_train_plot = []       # training accuracy (%) per epoch
total_accuracy_validation_plot = []  # validation accuracy (%) per epoch
epochs = 10
for epoch in range(epochs):
    # ---- training pass ----
    model.train()  # no-op for this architecture, but correct once dropout/BN is added
    total_accuracy_train = 0
    total_loss_train = 0
    for inputs, labels in train_dataloader:
        optimizer.zero_grad()  # clear stale gradients before this batch's backward pass
        prediction = model(inputs).squeeze(1)  # (batch,) sigmoid probabilities
        batch_loss = criterion(prediction, labels)
        total_loss_train += batch_loss.item()  # .item() detaches the scalar
        # round() turns a probability into a hard 0/1 prediction
        total_accuracy_train += (prediction.round() == labels).sum().item()
        batch_loss.backward()
        optimizer.step()
    # ---- validation pass (inference only, so no gradients) ----
    model.eval()
    total_accuracy_validation = 0
    total_loss_validation = 0
    with torch.no_grad():
        for inputs, labels in validation_dataloader:
            prediction = model(inputs).squeeze(1)
            batch_loss = criterion(prediction, labels)
            total_loss_validation += batch_loss.item()
            total_accuracy_validation += (prediction.round() == labels).sum().item()
    # Compute each per-epoch average ONCE and reuse it for both the plot
    # histories and the prints (the original recomputed every value twice).
    train_loss = round(total_loss_train / len(train_dataloader), 4)
    val_loss = round(total_loss_validation / len(validation_dataloader), 4)
    train_acc = round(total_accuracy_train / len(training_data) * 100, 4)
    val_acc = round(total_accuracy_validation / len(validation_data) * 100, 4)
    total_loss_train_plot.append(train_loss)
    total_loss_validation_plot.append(val_loss)
    total_accuracy_train_plot.append(train_acc)
    total_accuracy_validation_plot.append(val_acc)
    print(f"Epoch no. {epoch+1}")
    print(f"Train Loss: {train_loss} ")
    print(f"Train Accuracy: {train_acc}")
    print(f"Validation Loss: {val_loss} ")
    print(f"Validation Accuracy: {val_acc}")
    print("="*35)
# Evaluate the trained model on the held-out test set.
with torch.no_grad():  # inference only — no gradient bookkeeping needed
    total_loss_test = 0
    total_accuracy_test = 0
    for inputs, labels in testing_dataloader:  # batches of (features, labels)
        prediction = model(inputs).squeeze(1)  # (batch,) sigmoid probabilities
        total_loss_test += criterion(prediction, labels).item()
        # hard 0/1 predictions compared against the true labels
        total_accuracy_test += (prediction.round() == labels).sum().item()
print(f" Test Accuracy:{round(total_accuracy_test/len(testing_data) * 100, 4)}")
# Plot loss (left) and accuracy (right) curves for training vs validation.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
axs[0].plot(total_loss_train_plot, label='Training Loss')
axs[0].plot(total_loss_validation_plot, label='Validation Loss')
axs[0].set_title("Training and Validation loss over epochs")
# BUG FIX: the original called set_title() three times per axis, so the axis
# labels overwrote the title; axis labels belong to set_xlabel/set_ylabel.
axs[0].set_xlabel("Epochs")
axs[0].set_ylabel("Loss")
axs[0].set_ylim([0, 0.06])
axs[0].legend()
axs[1].plot(total_accuracy_train_plot, label='Training Accuracy')
axs[1].plot(total_accuracy_validation_plot, label='Validation Accuracy')
axs[1].set_title("Training and Validation Accuracy over epochs")
axs[1].set_xlabel("Epochs")
axs[1].set_ylabel("Accuracy")
axs[1].set_ylim([98, 99])
axs[1].legend()