I'm trying to implement an MNIST classifier with a DNN written in plain NumPy, but the results I get are quite strange.
Within a given epoch the model only predicts the digit '0' correctly and gets every other digit wrong. In fact, in each epoch it predicts one specific digit for every input, and that digit changes from epoch to epoch.
This is how I get the dataset.
from sklearn.datasets import fetch_openml
from keras.utils.np_utils import to_categorical
import numpy as np
from sklearn.model_selection import train_test_split
import time
x, y = fetch_openml('mnist_784', version=1, return_X_y=True)
x = (x/255.).astype('float32')
y = to_categorical(y)
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.15, random_state=42)
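Just as a sanity check on the preprocessing, the shapes at this point should look like this (the split sizes follow from test_size=0.15 on MNIST's 70,000 samples; the values in the comments are what I expect, not printed output):

# Quick shape check on the prepared data:
print(x_train.shape, y_train.shape)  # (59500, 784) (59500, 10)
print(x_val.shape, y_val.shape)      # (10500, 784) (10500, 10)
print(y_train[0])                    # a one-hot vector of length 10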
This is my model: a DNN with two hidden layers, ReLU activations, a softmax output, and cross-entropy loss. I'm not really sure whether my backpropagation is correct; I think something is wrong here (see the sketch of how I understand the ReLU derivative, right after the training code below).
import numpy as np

class NN():
    def __init__(self, input_size, hidden_1_size, hidden_2_size, output_size):
        self.input_data = np.random.randn(1, input_size)

        self.w1 = np.random.randn(input_size, hidden_1_size)
        self.b1 = np.random.randn(1, hidden_1_size)

        self.w2 = np.random.randn(hidden_1_size, hidden_2_size)
        self.b2 = np.random.randn(1, hidden_2_size)

        self.w3 = np.random.randn(hidden_2_size, output_size)
        self.b3 = np.random.randn(1, output_size)

    def Sigmoid(self, z):
        return np.clip(1 / (1.0 + np.exp(-z)), 1e-8, 1 - (1e-7))

    def Softmax(self, z):
        y_logit = np.exp(z - np.max(z, 1, keepdims=True))
        y = y_logit / np.sum(y_logit, 1, keepdims=True)
        return y

    def Relu(self, z):
        return np.maximum(z, 0)

    def acc_test(self, input_data):
        tmp_h1 = self.Relu(input_data.dot(self.w1) + self.b1)
        tmp_h2 = self.Relu(self.h1_out.dot(self.w2) + self.b2)
        tmp_out = self.Softmax(self.h2_out.dot(self.w3) + self.b3)
        return tmp_out

    # Feed Placeholder
    def forward(self, input_data):
        self.input_data = input_data
        self.h1_out = self.Relu(input_data.dot(self.w1) + self.b1)
        self.h2_out = self.Relu(self.h1_out.dot(self.w2) + self.b2)
        self.output_layer = self.Softmax(self.h2_out.dot(self.w3) + self.b3)

    # Backward Propagation
    def backward(self, target):
        # cross-entropy loss derivative
        Loss_to_z_grad = (self.output_layer - target)  # correct

        self.b3_grad = Loss_to_z_grad
        self.w3_grad = self.h2_out.T.dot(Loss_to_z_grad)  # correct

        Activation_2_grad = Loss_to_z_grad.dot(self.w3.T)  # correct
        Activation_2_grad[Activation_2_grad < 0] = 0

        self.b2_grad = Activation_2_grad
        self.w2_grad = self.h1_out.T.dot(Activation_2_grad)

        Activation_1_grad = Activation_2_grad.dot(self.w2.T)
        Activation_1_grad[Activation_1_grad < 0] = 0

        self.b1_grad = Activation_1_grad
        self.w1_grad = self.input_data.T.dot(Activation_1_grad)

    # Update Weights
    def update(self, learning_rate=1e-06):
        self.w1 = self.w1 - learning_rate * self.w1_grad
        self.b1 = self.b1 - learning_rate * self.b1_grad

        self.w2 = self.w2 - learning_rate * self.w2_grad
        self.b2 = self.b2 - learning_rate * self.b2_grad

        self.w3 = self.w3 - learning_rate * self.w3_grad
        self.b3 = self.b3 - learning_rate * self.b3_grad

    # Loss Function
    def cross_entropy(self, Y, Y_prediction):
        return -(np.matmul(Y, np.log(Y_prediction)) + np.matmul((1 - Y), np.log(1 - Y_prediction)))

    def print_accuracy(self):
        correct = 0
        loss = 0
        for i in range(y_val.shape[0]):
            self.acc_test(x_val[i])
            index = self.output_layer
            one_hot = 0
            for check in range(y_val[i].shape[0]):
                if y_val[i][check] == 1:
                    one_hot = check
                    break
            if np.argmax(index) == one_hot:
                correct += 1
                # print('correct: ', check)
            # else:
            #     print('incorrect: ', check)
        print('accuracy = ', correct / y_val.shape[0])
import random

mnist_nn = NN(input_size=784, hidden_1_size=200, hidden_2_size=200, output_size=10)

for i in range(1000):
    for j in range(2000):
        index = random.randint(0, x_train.shape[0] - 1)
        mnist_nn.forward(x_train[[index]])
        mnist_nn.backward(y_train[index])
        mnist_nn.update()
    print(i)
    mnist_nn.print_accuracy()
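The part I'm least sure about is how I apply the ReLU derivative inside backward(). My understanding is that the mask should come from the forward activations (zero the gradient wherever the activation was <= 0) rather than from the sign of the gradient itself. A small sketch of what I think the textbook version looks like (relu_backward is a helper name I made up, not something in my code):

# Sketch: ReLU backward as I understand it should work.
# The gradient is zeroed where the corresponding forward activation was <= 0,
# not where the gradient itself happens to be negative.
def relu_backward(upstream_grad, forward_out):
    grad = upstream_grad.copy()
    grad[forward_out <= 0] = 0
    return grad

# Used in backward() this would look roughly like:
# Activation_2_grad = relu_backward(Loss_to_z_grad.dot(self.w3.T), self.h2_out)
# Activation_1_grad = relu_backward(Activation_2_grad.dot(self.w2.T), self.h1_out)

The reason I think the mask should use the activations is that the ReLU derivative is 1 only where the pre-activation was positive and 0 elsewhere.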
The accuracy is terribly low since the model can only predict one digit. I've seen this question, Neural network always predicts the same class, and I did change ReLU to leaky ReLU, but it doesn't really help.
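Concretely, the leaky variant I swapped in was roughly along these lines (the 0.01 slope is illustrative; the point is just that negative inputs are no longer hard-zeroed):

# Leaky ReLU variant tried in place of Relu:
def LeakyRelu(self, z):
    return np.where(z > 0, z, 0.01 * z)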
I think the dataset should be fine because I used the same data to train a DNN in PyTorch and it worked. Also, the weights and biases are initialized with random values.
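One way I'm thinking of verifying the backward pass is a numerical gradient check on a single sample, something like this sketch (numeric_check, the eps value, and the loss formula are my own additions, not code I've run against the class yet):

import numpy as np

def numeric_check(nn, x_sample, y_sample, eps=1e-5, row=0, col=0):
    # Analytic gradient from my backward() for one entry of w3.
    nn.forward(x_sample)
    nn.backward(y_sample)
    analytic = nn.w3_grad[row, col]

    # Cross-entropy loss of the current network on this one sample.
    def loss():
        nn.forward(x_sample)
        return -np.sum(y_sample * np.log(nn.output_layer + 1e-12))

    # Central finite difference on the same w3 entry.
    original = nn.w3[row, col]
    nn.w3[row, col] = original + eps
    loss_plus = loss()
    nn.w3[row, col] = original - eps
    loss_minus = loss()
    nn.w3[row, col] = original  # restore the weight

    numeric = (loss_plus - loss_minus) / (2 * eps)
    print('analytic:', analytic, 'numeric:', numeric)

I'd call it as numeric_check(mnist_nn, x_train[[0]], y_train[0]) and expect the two printed numbers to match closely if backward() is right.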