I am doing some experiments where I split the CIFAR-10 dataset into two halves, such that each half contains five random classes. I trained on one half with the bvlc_alexnet architecture. Since that half contains only five classes, I changed num_output to 5 and made some other small tweaks to the network. When I inspect the log file, I find that the loss increases to around 80 and the test accuracy stays at 0.

However, when I change num_output to 10, training proceeds normally, i.e. the loss steadily decreases, and it results in a test accuracy of about 70%.

How can this be explained?

train_val.prototxt

name: "AlexNet"
layer {
  name: "data"
  type: "Data"
  top: "data"
  top: "label"
  include {
    phase: TRAIN
  }
  transform_param {
    mirror: true
    crop_size: 25
  }
  data_param {
    source: "/home/apples/caffe/cifar/cifarA/cifar_A_train_lmdb"
    batch_size: 256
    backend: LMDB
  }
}
layer {
  name: "data"
  type: "Data"
  top: "data"
  top: "label"
  include {
    phase: TEST
  }
  transform_param {
    mirror: false
    crop_size: 25
  }
  data_param {
    source: "/home/apples/caffe/cifar/cifarA/cifar_A_val_lmdb"
    batch_size: 100
    backend: LMDB
  }
}
layer {
  name: "conv1"
  type: "Convolution"
  bottom: "data"
  top: "conv1"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  convolution_param {
    num_output: 96
    kernel_size: 11
    stride: 2
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
layer {
  name: "relu1"
  type: "ReLU"
  bottom: "conv1"
  top: "conv1"
}
layer {
  name: "norm1"
  type: "LRN"
  bottom: "conv1"
  top: "norm1"
  lrn_param {
    local_size: 5
    alpha: 0.0001
    beta: 0.75
  }
}
layer {
  name: "pool1"
  type: "Pooling"
  bottom: "norm1"
  top: "pool1"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
}
layer {
  name: "conv2"
  type: "Convolution"
  bottom: "pool1"
  top: "conv2"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  convolution_param {
    num_output: 256
    pad: 2
    kernel_size: 5
    group: 2
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0.1
    }
  }
}
layer {
  name: "relu2"
  type: "ReLU"
  bottom: "conv2"
  top: "conv2"
}
layer {
  name: "norm2"
  type: "LRN"
  bottom: "conv2"
  top: "norm2"
  lrn_param {
    local_size: 5
    alpha: 0.0001
    beta: 0.75
  }
}
layer {
  name: "pool2"
  type: "Pooling"
  bottom: "norm2"
  top: "pool2"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
}
layer {
  name: "conv3"
  type: "Convolution"
  bottom: "pool2"
  top: "conv3"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  convolution_param {
    num_output: 384
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
layer {
  name: "relu3"
  type: "ReLU"
  bottom: "conv3"
  top: "conv3"
}
layer {
  name: "conv4"
  type: "Convolution"
  bottom: "conv3"
  top: "conv4"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  convolution_param {
    num_output: 384
    pad: 1
    kernel_size: 3
    group: 2
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0.1
    }
  }
}
layer {
  name: "relu4"
  type: "ReLU"
  bottom: "conv4"
  top: "conv4"
}
layer {
  name: "conv5"
  type: "Convolution"
  bottom: "conv4"
  top: "conv5"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    group: 2
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0.1
    }
  }
}
layer {
  name: "relu5"
  type: "ReLU"
  bottom: "conv5"
  top: "conv5"
}
layer {
  name: "pool5"
  type: "Pooling"
  bottom: "conv5"
  top: "pool5"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
}
layer {
  name: "fc6"
  type: "InnerProduct"
  bottom: "pool5"
  top: "fc6"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  inner_product_param {
    num_output: 4096
    weight_filler {
      type: "gaussian"
      std: 0.005
    }
    bias_filler {
      type: "constant"
      value: 0.1
    }
  }
}
layer {
  name: "relu6"
  type: "ReLU"
  bottom: "fc6"
  top: "fc6"
}
layer {
  name: "drop6"
  type: "Dropout"
  bottom: "fc6"
  top: "fc6"
  dropout_param {
    dropout_ratio: 0.5
  }
}
layer {
  name: "fc7"
  type: "InnerProduct"
  bottom: "fc6"
  top: "fc7"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  inner_product_param {
    num_output: 4096
    weight_filler {
      type: "gaussian"
      std: 0.005
    }
    bias_filler {
      type: "constant"
      value: 0.1
    }
  }
}
layer {
  name: "relu7"
  type: "ReLU"
  bottom: "fc7"
  top: "fc7"
}
layer {
  name: "drop7"
  type: "Dropout"
  bottom: "fc7"
  top: "fc7"
  dropout_param {
    dropout_ratio: 0.5
  }
}
layer {
  name: "fc8_mnist"
  type: "InnerProduct"
  bottom: "fc7"
  top: "fc8_mnist"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  inner_product_param {
    num_output: 5
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
layer {
  name: "accuracy"
  type: "Accuracy"
  bottom: "fc8_mnist"
  bottom: "label"
  top: "accuracy"
  include {
    phase: TEST
  }
}
layer {
  name: "loss"
  type: "SoftmaxWithLoss"
  bottom: "fc8_mnist"
  bottom: "label"
  top: "loss"
}

This split contains the classes 0, 4, 5, 6 and 8. I used the create_imagenet.sh script to create the lmdb files.

Sample of the train.txt

0/attack_aircraft_s_001759.png 0
0/propeller_plane_s_001689.png 0
4/fallow_deer_s_000021.png 4
4/alces_alces_s_000686.png 4
5/toy_spaniel_s_000327.png 5
5/toy_spaniel_s_000511.png 5
6/bufo_viridis_s_000502.png 6
6/bufo_viridis_s_001005.png 6
8/passenger_ship_s_000236.png 8
8/passenger_ship_s_000853.png 8

Sample of the val.txt

0/attack_aircraft_s_000002.png 0
0/propeller_plane_s_000006.png 0
4/fallow_deer_s_000001.png 4
4/alces_alces_s_000012.png 4
5/toy_spaniel_s_000020.png 5
6/bufo_viridis_s_000016.png 6
8/passenger_ship_s_000060.png 8
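
A quick way to sanity-check the list files is to print the distinct labels they contain and compare that set against num_output in the prototxt. Below is a minimal sketch in Python; the file name train.txt refers to the listing above.

# Minimal sketch: list the distinct labels in a Caffe image-list file
# so they can be compared against num_output in train_val.prototxt.
labels = set()
with open("train.txt") as f:
    for line in f:
        # each line is "<relative path> <integer label>"
        labels.add(int(line.rsplit(" ", 1)[1]))
print(sorted(labels))  # for this split: [0, 4, 5, 6, 8]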
apples-oranges
  • It's unclear based on your description, but I presume the training partition and the validation partition contain the same 5 of 10 classes, correct? If your partition logic was bad and you have samples in validation (test) from classes that are not seen in training, you'd get a high test error. Test error would be 100% (0 accuracy) if you did a class-wise split between your partitions. – svohara Mar 31 '16 at 23:22
  • Thanks for your response. Correct, training and validation contain the same classes. The only thing I changed was the `num_output` from 5 to 10. Could it be a bug in Caffe? – apples-oranges Mar 31 '16 at 23:27
  • What is the accuracy when you use the training set as the test image set too? – Anoop K. Prabhu Apr 01 '16 at 06:26
  • For the split of labels 5-9, did you change the labels to 0-4, or have you left the labels unchanged? – Shai Apr 01 '16 at 07:00
  • @AnoopK.Prabhu When I use the training data as test data, Caffe seems to freeze at this step `I0401 13:03:42.787312 24045 net.cpp:411] data -> label` (I did it with both `num_output` 5 and 10.) – apples-oranges Apr 01 '16 at 11:08
  • @Shai I left the labels unchanged. I edited my post; you can see what my train and val text files look like for this particular split. – apples-oranges Apr 01 '16 at 11:09
  • It seems like something is not right with the data labels. It could be that the script creating the lmdb files, or Caffe itself, expects sequential labels. You might try labeling sequentially as Shai suggests. This is just speculation, but I would try it if it's not too difficult. – cgarner Apr 01 '16 at 12:46
  • Ok I will try that, and report my results. – apples-oranges Apr 01 '16 at 13:57
  • @Shai I changed the labels to sequential order, and then `num_output: 5` works correctly. However, `num_output: 10` *also* works in this case. This still shouldn't be the case, right? – apples-oranges Apr 01 '16 at 16:55

1 Answer


As was pointed out in the comments, Caffe expects the labels to be integers between 0 and num_classes - 1. In your case, since you set num_output to 5, Caffe creates five output neurons in the last layer. When you then ask it to predict class 6 or 8, you are asking it to maximize the output of a neuron that does not exist, which it obviously cannot do.

Now, when you relabel your data and keep num_output at 5, you are doing the correct thing, and it therefore works. When you set num_output to 10, the network still works, because it now has 10 output neurons, which is more than enough to classify five classes. It learns that the classes 5 to 9 are never present and should never be predicted, and it adjusts the weights so that those output neurons always return very small values. Note, however, that a network's outputs are noisy, so it may still occasionally predict a class that was never shown to it; I would therefore expect a network with num_output larger than the actual number of classes to perform slightly worse than one with the correct num_output.
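
To keep num_output: 5, the labels in train.txt and val.txt would have to be remapped to the contiguous range 0-4 before rebuilding the LMDBs. A minimal sketch in Python, assuming the classes 0, 4, 5, 6 and 8 from the question (the ".remapped" output file names are illustrative):

# Minimal sketch: remap the original CIFAR-10 labels (0, 4, 5, 6, 8)
# to the contiguous range 0..4 expected when num_output is 5.
mapping = {0: 0, 4: 1, 5: 2, 6: 3, 8: 4}

for name in ("train.txt", "val.txt"):
    with open(name) as src, open(name + ".remapped", "w") as dst:
        for line in src:
            path, label = line.rsplit(" ", 1)
            dst.write("%s %d\n" % (path, mapping[int(label)]))

The remapped list files can then be fed to create_imagenet.sh, as in the question, to regenerate the LMDBs.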

Ishamael