I am following the D2L book and one of the exercises on the GoogLeNet chapter is to take the model from the Rethinking the Inception Architecture for Computer Vision paper so I did that and basically recreated the Inception V2 architecture based on the Inception V1 architecture presented in the D2L book, the only thing I didn't include was the grid reduction described in the paper. I expected this to improve results on my models but if anything it made them worse. I was testing the models using the D2L code presented in the GoogLeNet chapter on the fashion_mnist dataset and got a test accuracy of 0.908 using Inception V1 however on my V2 implementation from the paper I only got test acc of 0.885 and that was the highest I could get. I expected the V2 model to be better so I don't really know where I went wrong, if I made an error in my code or what but any insights into the V2 architecture or from the paper would be greatly appreciated.
This is the code for the first 3 inception blocks (from figure 5 in the paper)
class InceptionB1(nn.Module):
# `c1`--`c4` are the number of output channels for each path
def __init__(self, in_channels, c1, c2, c3, c4, **kwargs):
super(InceptionB1, self).__init__(**kwargs)
# Path 1 is a single 1 x 1 convolutional layer
self.p1_1 = nn.Conv2d(in_channels, c1, kernel_size=1)
# Path 2 is a 1 x 1 convolutional layer followed by a 3 x 3
# convolutional layer
self.p2_1 = nn.Conv2d(in_channels, c2[0], kernel_size=1)
self.p2_2 = nn.Conv2d(c2[0], c2[1], kernel_size=3, padding=1)
# Path 3 is a 1 x 1 convolutional layer followed by 2 3x3 convs (factorization)
# convolutional layer
self.p3_1 = nn.Conv2d(in_channels, c3[0], kernel_size=1)
self.p3_2 = nn.Conv2d(c3[0], c3[1], kernel_size=3, padding=1)
# Replacing path3_2 with modified path using factorization through smaller convolutions
self.p3_3 = nn.Conv2d(c3[1], c3[1], kernel_size=3, padding = 1)
# Path 4 is a 3 x 3 maximum pooling layer followed by a 1 x 1
# convolutional layer
self.p4_1 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
self.p4_2 = nn.Conv2d(in_channels, c4, kernel_size=1)
def forward(self, x):
p1 = F.relu(self.p1_1(x))
p2 = F.relu(self.p2_2(F.relu(self.p2_1(x))))
p3 = F.relu(self.p3_3(self.p3_2(F.relu(self.p3_1(x)))))
p4 = F.relu(self.p4_2(self.p4_1(x)))
# Concatenate the outputs on the channel dimension
return torch.cat((p1, p2, p3, p4), dim=1)
This is my code for the middle 5 blocks (figure 6 in the paper)
class InceptionB2(nn.Module):
# `c1`--`c4` are the number of output channels for each path
def __init__(self, in_channels, c1, c2, c3, c4, **kwargs):
super(InceptionB2, self).__init__(**kwargs)
# Path 1 is a single 1 x 1 convolutional layer
self.p1_1 = nn.Conv2d(in_channels, c1, kernel_size=1)
# Path 2 is a 1 x 1 convolutional layer followed by a 1 x 3 then 3x1 conv
# convolutional layer
self.p2_1 = nn.Conv2d(in_channels, c2[0], kernel_size=1)
self.p2_2 = nn.Conv2d(c2[0], c2[1], kernel_size=(1,3), padding=1)
self.p2_3 = nn.Conv2d(c2[1], c2[1], kernel_size=(3,1), padding=0)
# Path 3 is a 1 x 1 conv layer followed by a 1x3 conv then 3x1 then 1x3 then 3x1
# convolutional layer
self.p3_1 = nn.Conv2d(in_channels, c3[0], kernel_size=1)
self.p3_2 = nn.Conv2d(c3[0], c3[1], kernel_size=(1,3), padding=1)
self.p3_3 = nn.Conv2d(c3[1], c3[1], kernel_size=(3,1), padding=0)
self.p3_4 = nn.Conv2d(c3[1], c3[1], kernel_size=(1,3), padding=0)
self.p3_5 = nn.Conv2d(c3[1], c3[1], kernel_size=(3,1), padding=1)
# Path 4 is a 3 x 3 maximum pooling layer followed by a 1 x 1
# convolutional layer
self.p4_1 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
self.p4_2 = nn.Conv2d(in_channels, c4, kernel_size=1)
def forward(self, x):
p1 = F.relu(self.p1_1(x))
p2 = F.relu(self.p2_3(F.relu(self.p2_2(F.relu(self.p2_1(x))))))
p3 = F.relu(self.p3_5(F.relu(self.p3_4(F.relu(self.p3_3(F.relu(self.p3_2(F.relu(self.p3_1(x))))))))))
p4 = F.relu(self.p4_2(self.p4_1(x)))
# Concatenate the outputs on the channel dimension
return torch.cat((p1, p2, p3, p4), dim=1)
This is my code for the last 2 blocks (figure 7 in the paper)
class InceptionB3(nn.Module):
# `c1`--`c4` are the number of output channels for each path
def __init__(self, in_channels, c1, c2, c3, c4, **kwargs):
super(InceptionB3, self).__init__(**kwargs)
# Path 1 is a single 1 x 1 convolutional layer
self.p1_1 = nn.Conv2d(in_channels, c1, kernel_size=1)
# Path 2 is a 1 x 1 convolutional layer followed by a 1x3 and 3x1
# convolutional layer
self.p2_1 = nn.Conv2d(in_channels, c2[0], kernel_size=1)
self.p2_2_1 = nn.Conv2d(c2[0], c2[1], kernel_size=(1,3), padding=0)
self.p2_2_2 = nn.Conv2d(c2[0], c2[1], kernel_size=(3,1), padding=0)
# Path 3 is a 1 x 1 convolutional layer followed by a 3x3 then 1x3 and 3x1
# convolutional layer
self.p3_1 = nn.Conv2d(in_channels, c3[0], kernel_size=1)
self.p3_2 = nn.Conv2d(c3[0], c3[1], kernel_size=3, padding=1)
self.p3_3_1 = nn.Conv2d(c3[1], c3[1], kernel_size=(1,3), padding=0)
self.p3_3_2 = nn.Conv2d(c3[1], c3[1], kernel_size=(3,1), padding=0)
# Path 4 is a 3 x 3 maximum pooling layer followed by a 1 x 1
# convolutional layer
self.p4_1 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
self.p4_2 = nn.Conv2d(in_channels, c4, kernel_size=1)
def forward(self, x):
p1 = F.relu(self.p1_1(x))
p2_1 = F.relu(self.p2_2_1(F.relu(self.p2_1(x))))
p2_2 = F.relu(elf.p2_2_2(F.relu(self.p2_1(x))))
p2 = torch.matmul(p2_1, p2_2)
p3_1 = F.relu(self.p3_2(F.relu(self.p3_1(x))))
p3_2_1 = F.relu(self.p3_3_1(p3_1))
p3_2_2 = F.relu(sself.p3_3_2(p3_1))
p3 = torch.matmul(p3_2_1, p3_2_2)
p4 = F.relu(self.p4_2(self.p4_1(x)))
# Concatenate the outputs on the channel dimension
return torch.cat((p1, p2, p3, p4), dim=1)
Here is the link to the D2L chapter https://d2l.ai/chapter_convolutional-modern/googlenet.html