I am following the book Grokking Deep Learning (Ch. 8, code here) to build a NumPy neural network that classifies MNIST digits with ~82% test accuracy. But when I modify the network to work on a synthetic dataset, it jumps to a particular train accuracy right at the start of training (the exact value depends on the hidden layer size and alpha) and stays there. Please check:
import numpy as np
import sys
from sklearn import datasets
from sklearn.model_selection import train_test_split

X, y = datasets.make_classification(n_samples=10000, n_features=5, n_classes=4,
                                    n_clusters_per_class=1, shuffle=True, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
def relu(x):
    return (x >= 0) * x  # returns x if x > 0, 0 otherwise

def relu2deriv(output):
    return output >= 0  # returns 1 where the ReLU output is >= 0, 0 otherwise
def onehot(arr):
    one_hot_labels = np.zeros((len(arr), 4))
    for i, l in enumerate(arr):
        one_hot_labels[i][l] = 1
    return one_hot_labels
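# quick sanity check of my own (not from the book):
# onehot(np.array([2])) returns array([[0., 0., 1., 0.]])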
y_train = onehot(y_train)
y_test = onehot(y_test)
alpha, iterations, hidden_size = (0.002, 300, 10)
weights_0_1 = 0.2*np.random.random((5, hidden_size)) - 0.1
weights_1_2 = 0.2*np.random.random((hidden_size, 4)) - 0.1
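# (Aside: np.random isn't seeded anywhere, so the dropout masks and this
# initialization vary between runs; the output below is from one such run.)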
for j in range(iterations):
    error, correct_cnt = (0.0, 0)
    for i in range(len(X_train)):
        layer_0 = X_train[i:i+1]
        layer_1 = relu(np.dot(layer_0, weights_0_1))
        dropout_mask = np.random.randint(2, size=layer_1.shape)
        layer_1 *= dropout_mask * 2
        layer_2 = np.dot(layer_1, weights_1_2)
        error += np.sum((y_train[i:i+1] - layer_2) ** 2)
        correct_cnt += int(np.argmax(layer_2) == np.argmax(y_train[i:i+1]))
        layer_2_delta = (y_train[i:i+1] - layer_2)
        layer_1_delta = layer_2_delta.dot(weights_1_2.T) * relu2deriv(layer_1)
        layer_1_delta *= dropout_mask
        weights_1_2 += alpha * layer_1.T.dot(layer_2_delta)
        weights_0_1 += alpha * layer_0.T.dot(layer_1_delta)
    if (j % 1 == 0):  # can be set to any reporting interval
        test_error = 0.0
        test_correct_cnt = 0
        for i in range(len(X_test)):
            layer_0 = X_test[i:i+1]
            layer_1 = relu(np.dot(layer_0, weights_0_1))
            layer_2 = np.dot(layer_1, weights_1_2)
            test_error += np.sum((y_test[i:i+1] - layer_2) ** 2)
            test_correct_cnt += int(np.argmax(layer_2) == np.argmax(y_test[i:i+1]))
        sys.stdout.write("\n" +
                         "I:" + str(j) +
                         " Test-Err:" + str(test_error / float(len(X_test)))[0:5] +
                         " Test-Acc:" + str(test_correct_cnt / float(len(X_test))) +
                         " Train-Err:" + str(error / float(len(X_train)))[0:5] +
                         " Train-Acc:" + str(correct_cnt / float(len(X_train))))
Output:
I:0 Test-Err:0.470 Test-Acc:0.812 Train-Err:0.704 Train-Acc:0.572
I:1 Test-Err:0.452 Test-Acc:0.811 Train-Err:0.574 Train-Acc:0.626625
I:2 Test-Err:0.445 Test-Acc:0.814 Train-Err:0.571 Train-Acc:0.61425
...
I:297 Test-Err:0.470 Test-Acc:0.7685 Train-Err:0.613 Train-Acc:0.6045
I:298 Test-Err:0.492 Test-Acc:0.785 Train-Err:0.612 Train-Acc:0.60525
I:299 Test-Err:0.478 Test-Acc:0.778 Train-Err:0.614 Train-Acc:0.60725
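One difference I noticed (not sure whether it's relevant): if I remember the book's MNIST code correctly, it divides the pixel values by 255 so every input lies in [0, 1], whereas make_classification produces roughly zero-mean features that aren't bounded like that. A quick way to check, using the same X_train as in the script above:

print("min:", X_train.min(), "max:", X_train.max())
print("per-feature mean:", X_train.mean(axis=0))
print("per-feature std:", X_train.std(axis=0))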
What's going on? How can this network learn the MNIST dataset but not this one?