Pytorch and Keras different behaviour

Question

I've implemented VGG16 from scratch on both Keras and Pytorch to make sure that the implementation is exactly the same. I'm training it on CIFAR10 and the results are too different.

Here's my implementation in keras:

(x_train,y_train),(x_test,y_test)=cifar10.load_data()
x_train = x_train / 255
x_test = x_test / 255

y_train=np_utils.to_categorical(y_train,10)
y_test=np_utils.to_categorical(y_test,10)

model = Sequential()
model.add(Conv2D(input_shape=(32,32,3),filters=64,kernel_size=(3,3),padding="same"))
model.add(ReLU())
model.add(Conv2D(filters=64,kernel_size=(3,3),padding="same"))
model.add(ReLU())
model.add(MaxPool2D(pool_size=(2,2),strides=(2,2)))

model.add(Conv2D(filters=128, kernel_size=(3,3), padding="same"))
model.add(ReLU())
model.add(Conv2D(filters=128, kernel_size=(3,3), padding="same"))
model.add(ReLU())
model.add(MaxPool2D(pool_size=(2,2),strides=(2,2)))

model.add(Conv2D(filters=256, kernel_size=(3,3), padding="same"))
model.add(ReLU())
model.add(Conv2D(filters=256, kernel_size=(3,3), padding="same"))
model.add(ReLU())
model.add(Conv2D(filters=256, kernel_size=(3,3), padding="same"))
model.add(ReLU())
model.add(MaxPool2D(pool_size=(2,2),strides=(2,2)))

model.add(Conv2D(filters=512, kernel_size=(3,3), padding="same"))
model.add(ReLU())
model.add(Conv2D(filters=512, kernel_size=(3,3), padding="same"))
model.add(ReLU())
model.add(Conv2D(filters=512, kernel_size=(3,3), padding="same"))
model.add(ReLU())
model.add(MaxPool2D(pool_size=(2,2),strides=(2,2)))

model.add(Conv2D(filters=512, kernel_size=(3,3), padding="same"))
model.add(ReLU())
model.add(Conv2D(filters=512, kernel_size=(3,3), padding="same"))
model.add(ReLU())
model.add(Conv2D(filters=512, kernel_size=(3,3), padding="same"))
model.add(MaxPool2D(pool_size=(2,2),strides=(2,2)))

model.add(Flatten())

model.add(Dense(units=4096,activation="relu"))
model.add(Dense(units=4096,activation="relu"))
model.add(Dense(units=10, activation="softmax"))

opt = Adam(lr=0.0001, epsilon=1e-06)
model.compile(optimizer=opt, loss=keras.losses.categorical_crossentropy, metrics=['accuracy'])

hist = model.fit(x_train, y_train,
    epochs=50,
    batch_size=100,
    validation_data=(x_test, y_test)
    )

And the same in pytorch:

num_epochs = 50
num_classes = 10
batch_size = 100
learning_rate = 0.0001

trans = transforms.ToTensor()
train_dataset = torchvision.datasets.CIFAR10(root="./dataset_pytorch", train=True, download=True, transform=trans)
test_dataset = torchvision.datasets.CIFAR10(root="./dataset_pytorch", train=False, download=True, transform=trans)

train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

class ConvNet(nn.Module):
    def __init__(self):
        super(ConvNet, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.layer2 = nn.Sequential(
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.layer3 = nn.Sequential(
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.layer4 = nn.Sequential(
            nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.layer5 = nn.Sequential(
            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.fc1 = nn.Sequential(
            nn.Linear(512, 4096),
            nn.ReLU())
        self.fc2 = nn.Sequential(
            nn.Linear(4096, 4096),
            nn.ReLU())
        self.fcout = nn.Sequential(
            nn.Linear(4096, 10),
            nn.Softmax())

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = self.layer5(out)
        out = out.reshape(out.size(0), -1)
        out = self.fc1(out)
        out = self.fc2(out)
        out = self.fcout(out)
        return out

def weights_init(m):
    if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight.data)
        nn.init.zeros_(m.bias.data)

model = ConvNet()
model.apply(weights_init)

device = torch.device("cuda:0")#"cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, eps=1e-6)

# Train the model
total_step = len(train_loader)
for epoch in range(num_epochs):
    total = 0
    correct = 0
    for i, (images, labels) in enumerate(train_loader):
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        # Run the forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backprop and perform Adam optimisation
        loss.backward()
        optimizer.step()

        # Track the accuracy
        total += labels.size(0)
        _, predicted = torch.max(outputs.data, 1)
        correct += (predicted == labels).sum().item()

    print("Train")
    print('Epoch [{}/{}], Accuracy: {:.2f}%'
          .format(epoch + 1, num_epochs, (correct / total) * 100))


    total_test = 0
    correct_test = 0
    for i, (images, labels) in enumerate(test_loader):
        images = images.to(device)
        labels = labels.to(device)

        # Run the forward pass
        outputs = model(images)

        # Track the accuracy
        total_test += labels.size(0)
        _, predicted = torch.max(outputs.data, 1)
        correct_test += (predicted == labels).sum().item()
    print("Test")
    print('Epoch [{}/{}], Accuracy: {:.2f}%'
          .format(epoch + 1, num_epochs, (correct_test / total_test) * 100))

As you can see, I've adjusted it to use the same initializers in the network and parameters in the optimizer. However, the results are too different, as you can see in the following image.

Here's another execution of these same networks

And I've tried also adding some batch normalization after every layer (between layer and activation). It kind of solves the problem, but not completely. Here's the plot:

Is there any reasonable explanation to this behavior or have I made any mistake in the code?

Just by looking at the plot it seems like the learning rate of the pytorch model is too high. It could be that the adam implementations of keras and torch differ from each other (so that the same nominal learning rate is an order of magnitude off), or that weight initialization differed greatly. Why don't you run repeated experiments and plot the average accuracy over epochs? — Florian Drawitsch, Sep 08 '20 at 12:18
I thought about it but the learning rate is already quite low isn't it? I guess you may be right about adam implementation, or there may be some numerical instability in pytorch implementation. I wanted to make sure that there were no mistakes in my code, do you see any mistake or difference between the implementations? — Josemafuen, Sep 08 '20 at 13:32
In PyTorch, you should [**not** use softmax together with CrossEntropyLoss](https://stackoverflow.com/a/55675428/4228275). — Berriel, Sep 08 '20 at 13:33
Beriel is right, if you compute softmax after the output layer, you should use `torch.nn.NLLLoss` instead. Remove the softmax if you want to use CrossEntropyLoss. — Florian Drawitsch, Sep 08 '20 at 14:37
Yes, that's the correct answer, please put it as an answer so I can mark it as correct. — Josemafuen, Sep 08 '20 at 16:26

Pytorch and Keras different behaviour

0 Answers0