
I am following the D2L book, and one of the exercises in the GoogLeNet chapter is to adapt the model using the Rethinking the Inception Architecture for Computer Vision paper. I did that and essentially recreated the Inception V2 architecture on top of the Inception V1 architecture presented in the D2L book; the only thing I did not include was the grid reduction described in the paper. I expected this to improve my results, but if anything it made them worse. I tested both models with the training code from the D2L GoogLeNet chapter on the Fashion-MNIST dataset: Inception V1 reached a test accuracy of 0.908, while my V2 implementation topped out at 0.885. Since I expected the V2 model to be better, I don't know where I went wrong, whether it is an error in my code or something else. Any insights into the V2 architecture or the paper would be greatly appreciated. The block definitions and my training setup are below.

This is the code for the first 3 inception blocks (from figure 5 in the paper)

import torch
from torch import nn
from torch.nn import functional as F


class InceptionB1(nn.Module):
    # `c1`--`c4` are the number of output channels for each path
    def __init__(self, in_channels, c1, c2, c3, c4, **kwargs):
        super(InceptionB1, self).__init__(**kwargs)
        # Path 1 is a single 1 x 1 convolutional layer
        self.p1_1 = nn.Conv2d(in_channels, c1, kernel_size=1)
        # Path 2 is a 1 x 1 convolutional layer followed by a 3 x 3
        # convolutional layer
        self.p2_1 = nn.Conv2d(in_channels, c2[0], kernel_size=1)
        self.p2_2 = nn.Conv2d(c2[0], c2[1], kernel_size=3, padding=1)
        # Path 3 is a 1 x 1 convolutional layer followed by two 3 x 3
        # convolutional layers (the 5 x 5 convolution factorized into two 3 x 3s)
        self.p3_1 = nn.Conv2d(in_channels, c3[0], kernel_size=1)
        self.p3_2 = nn.Conv2d(c3[0], c3[1], kernel_size=3, padding=1)
        # Second 3 x 3 convolution of the factorized 5 x 5 path
        self.p3_3 = nn.Conv2d(c3[1], c3[1], kernel_size=3, padding=1)
        # Path 4 is a 3 x 3 maximum pooling layer followed by a 1 x 1
        # convolutional layer
        self.p4_1 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
        self.p4_2 = nn.Conv2d(in_channels, c4, kernel_size=1)

    def forward(self, x):
        p1 = F.relu(self.p1_1(x))
        p2 = F.relu(self.p2_2(F.relu(self.p2_1(x))))
        p3 = F.relu(self.p3_3(self.p3_2(F.relu(self.p3_1(x)))))
        p4 = F.relu(self.p4_2(self.p4_1(x)))
        # Concatenate the outputs on the channel dimension
        return torch.cat((p1, p2, p3, p4), dim=1)
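
For context, the reason I stacked two 3 x 3 convolutions in path 3 is the factorization argument from the paper: two 3 x 3 layers cover the same 5 x 5 receptive field with fewer parameters. A quick sanity check of the parameter counts (the channel count 64 here is just an arbitrary example, not a value from my model):

conv5 = nn.Conv2d(64, 64, kernel_size=5, padding=2)
conv3a = nn.Conv2d(64, 64, kernel_size=3, padding=1)
conv3b = nn.Conv2d(64, 64, kernel_size=3, padding=1)

def n_params(m):
    return sum(p.numel() for p in m.parameters())

print(n_params(conv5))                      # 64 * 64 * 25 + 64 = 102464
print(n_params(conv3a) + n_params(conv3b))  # 2 * (64 * 64 * 9 + 64) = 73856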

This is my code for the middle 5 blocks (figure 6 in the paper)

class InceptionB2(nn.Module):
    # `c1`--`c4` are the number of output channels for each path
    def __init__(self, in_channels, c1, c2, c3, c4, **kwargs):
        super(InceptionB2, self).__init__(**kwargs)
        # Path 1 is a single 1 x 1 convolutional layer
        self.p1_1 = nn.Conv2d(in_channels, c1, kernel_size=1)
        # Path 2 is a 1 x 1 convolutional layer followed by a 1 x 3 and then a 3 x 1
        # convolutional layer (the 3 x 3 factorized into asymmetric convolutions)
        self.p2_1 = nn.Conv2d(in_channels, c2[0], kernel_size=1)
        self.p2_2 = nn.Conv2d(c2[0], c2[1], kernel_size=(1,3), padding=1)
        self.p2_3 = nn.Conv2d(c2[1], c2[1], kernel_size=(3,1), padding=0)
        # Path 3 is a 1 x 1 convolutional layer followed by 1 x 3, 3 x 1,
        # 1 x 3, and 3 x 1 convolutional layers
        self.p3_1 = nn.Conv2d(in_channels, c3[0], kernel_size=1)
        self.p3_2 = nn.Conv2d(c3[0], c3[1], kernel_size=(1,3), padding=1)
        self.p3_3 = nn.Conv2d(c3[1], c3[1], kernel_size=(3,1), padding=0)
        self.p3_4 = nn.Conv2d(c3[1], c3[1], kernel_size=(1,3), padding=0)
        self.p3_5 = nn.Conv2d(c3[1], c3[1], kernel_size=(3,1), padding=1)
        # Path 4 is a 3 x 3 maximum pooling layer followed by a 1 x 1
        # convolutional layer
        self.p4_1 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
        self.p4_2 = nn.Conv2d(in_channels, c4, kernel_size=1)

    def forward(self, x):
        p1 = F.relu(self.p1_1(x))
        p2 = F.relu(self.p2_3(F.relu(self.p2_2(F.relu(self.p2_1(x))))))
        p3 = F.relu(self.p3_5(F.relu(self.p3_4(F.relu(self.p3_3(F.relu(self.p3_2(F.relu(self.p3_1(x))))))))))
        p4 = F.relu(self.p4_2(self.p4_1(x)))
        # Concatenate the outputs on the channel dimension
        return torch.cat((p1, p2, p3, p4), dim=1)

This is my code for the last 2 blocks (figure 7 in the paper)

class InceptionB3(nn.Module):
    # `c1`--`c4` are the number of output channels for each path
    def __init__(self, in_channels, c1, c2, c3, c4, **kwargs):
        super(InceptionB3, self).__init__(**kwargs)
        # Path 1 is a single 1 x 1 convolutional layer
        self.p1_1 = nn.Conv2d(in_channels, c1, kernel_size=1)
        # Path 2 is a 1 x 1 convolutional layer followed by parallel 1 x 3 and
        # 3 x 1 convolutional layers
        self.p2_1 = nn.Conv2d(in_channels, c2[0], kernel_size=1)
        self.p2_2_1 = nn.Conv2d(c2[0], c2[1], kernel_size=(1,3), padding=0)
        self.p2_2_2 = nn.Conv2d(c2[0], c2[1], kernel_size=(3,1), padding=0)
        # Path 3 is a 1 x 1 convolutional layer followed by a 3 x 3 convolution and
        # then parallel 1 x 3 and 3 x 1 convolutional layers
        self.p3_1 = nn.Conv2d(in_channels, c3[0], kernel_size=1)
        self.p3_2 = nn.Conv2d(c3[0], c3[1], kernel_size=3, padding=1)
        self.p3_3_1 = nn.Conv2d(c3[1], c3[1], kernel_size=(1,3), padding=0)
        self.p3_3_2 = nn.Conv2d(c3[1], c3[1], kernel_size=(3,1), padding=0)
        # Path 4 is a 3 x 3 maximum pooling layer followed by a 1 x 1
        # convolutional layer
        self.p4_1 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
        self.p4_2 = nn.Conv2d(in_channels, c4, kernel_size=1)

    def forward(self, x):
        p1 = F.relu(self.p1_1(x))

        p2_1 = F.relu(self.p2_2_1(F.relu(self.p2_1(x))))
        p2_2 = F.relu(self.p2_2_2(F.relu(self.p2_1(x))))
        p2 = torch.matmul(p2_1, p2_2)

        p3_1 = F.relu(self.p3_2(F.relu(self.p3_1(x))))
        p3_2_1 = F.relu(self.p3_3_1(p3_1))
        p3_2_2 = F.relu(self.p3_3_2(p3_1))
        p3 = torch.matmul(p3_2_1, p3_2_2)

        p4 = F.relu(self.p4_2(self.p4_1(x)))
        # Concatenate the outputs on the channel dimension
        return torch.cat((p1, p2, p3, p4), dim=1)
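
To check that the four paths of each block stay compatible for the channel concatenation, I ran a quick shape check along these lines (the channel splits are made-up numbers for the test, not the ones I actually use in the network):

X = torch.rand(1, 192, 28, 28)
b1 = InceptionB1(192, 64, (96, 128), (16, 32), 32)
b2 = InceptionB2(256, 64, (96, 128), (16, 32), 32)
b3 = InceptionB3(256, 64, (96, 128), (16, 32), 32)
# Each block should keep the 28 x 28 spatial size and output
# c1 + c2[1] + c3[1] + c4 channels (256 with the numbers above)
print(b1(X).shape)
print(b2(b1(X)).shape)
print(b3(b2(b1(X))).shape)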

Here is the link to the D2L chapter: https://d2l.ai/chapter_convolutional-modern/googlenet.html
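
For completeness, this is roughly how I train both models, using the same training call and hyperparameters as that chapter. The stem and the channel splits below are placeholders to keep the sketch short, not the stage layout I actually use (that follows the chapter's GoogLeNet, with my V2 blocks swapped in):

from d2l import torch as d2l

net = nn.Sequential(
    nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3), nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
    InceptionB1(64, 64, (96, 128), (16, 32), 32),
    InceptionB2(256, 64, (96, 128), (16, 32), 32),
    InceptionB3(256, 64, (96, 128), (16, 32), 32),
    nn.AdaptiveAvgPool2d((1, 1)), nn.Flatten(),
    nn.Linear(256, 10))

lr, num_epochs, batch_size = 0.1, 10, 128
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=96)
d2l.train_ch6(net, train_iter, test_iter, num_epochs, lr, d2l.try_gpu())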

This is the training graph for the V1 model:

This is the training graph for the V2 model:

Ethan