
I'm trying to implement a Siamese network in Caffe composed of two ImageNet-style branches that don't share weights. The idea is to give each branch an image and, at the end, compute the distance between them as a measure of similarity; my prototxt is below. My main question is: what should I set my "num_output" to? I have only 2 classes for training: 0 for when the images are not alike, and 1 for when they are similar.

name: "Siamese_ImageNet"
layers {
  name: "data"
  type: IMAGE_DATA
  top: "data"
  top: "label"
  image_data_param {
    source: "train1.txt"
    batch_size: 32
    new_height: 256
    new_width: 256
  }
  include: { phase: TRAIN }
}
layers {
  name: "data"
  type: IMAGE_DATA
  top: "data"
  top: "label"
  image_data_param {
    source: "test1.txt"
    batch_size: 32
    new_height: 256
    new_width: 256
  }
  include: { phase: TEST }
}

layers {
  name: "data_p"
  type: IMAGE_DATA
  top: "data_p"
  top: "label_p"
  image_data_param {
    source: "train2.txt"
    batch_size: 32
    new_height: 256
    new_width: 256
  }
  include: { phase: TRAIN }
}
layers {
  name: "data_p"
  type: IMAGE_DATA
  top: "data_p"
  top: "label_p"
  image_data_param {
    source: "test2.txt"
    batch_size: 32
    new_height: 256
    new_width: 256
  }
  include: { phase: TEST }
}


layers {
  name: "conv1"
  type: CONVOLUTION
  bottom: "data"
  top: "conv1"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 96
    kernel_size: 11
    stride: 4
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
layers {
  name: "relu1"
  type: RELU
  bottom: "conv1"
  top: "conv1"
}
layers {
  name: "pool1"
  type: POOLING
  bottom: "conv1"
  top: "pool1"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
}
layers {
  name: "norm1"
  type: LRN
  bottom: "pool1"
  top: "norm1"
  lrn_param {
    local_size: 5
    alpha: 0.0001
    beta: 0.75
  }
}
layers {
  name: "conv2"
  type: CONVOLUTION
  bottom: "norm1"
  top: "conv2"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 256
    pad: 2
    kernel_size: 5
    group: 2
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 1
    }
  }
}
layers {
  name: "relu2"
  type: RELU
  bottom: "conv2"
  top: "conv2"
}
layers {
  name: "pool2"
  type: POOLING
  bottom: "conv2"
  top: "pool2"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
}
layers {
  name: "norm2"
  type: LRN
  bottom: "pool2"
  top: "norm2"
  lrn_param {
    local_size: 5
    alpha: 0.0001
    beta: 0.75
  }
}
layers {
  name: "conv3"
  type: CONVOLUTION
  bottom: "norm2"
  top: "conv3"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 384
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
layers {
  name: "relu3"
  type: RELU
  bottom: "conv3"
  top: "conv3"
}
layers {
  name: "conv4"
  type: CONVOLUTION
  bottom: "conv3"
  top: "conv4"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 384
    pad: 1
    kernel_size: 3
    group: 2
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 1
    }
  }
}
layers {
  name: "relu4"
  type: RELU
  bottom: "conv4"
  top: "conv4"
}
layers {
  name: "conv5"
  type: CONVOLUTION
  bottom: "conv4"
  top: "conv5"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    group: 2
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 1
    }
  }
}
layers {
  name: "relu5"
  type: RELU
  bottom: "conv5"
  top: "conv5"
}
layers {
  name: "pool5"
  type: POOLING
  bottom: "conv5"
  top: "pool5"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
}
layers {
  name: "fc6"
  type: INNER_PRODUCT
  bottom: "pool5"
  top: "fc6"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  inner_product_param {
    num_output: 4096
    weight_filler {
      type: "gaussian"
      std: 0.005
    }
    bias_filler {
      type: "constant"
      value: 1
    }
  }
}
layers {
  name: "relu6"
  type: RELU
  bottom: "fc6"
  top: "fc6"
}
layers {
  name: "drop6"
  type: DROPOUT
  bottom: "fc6"
  top: "fc6"
  dropout_param {
    dropout_ratio: 0.5
  }
}
layers {
  name: "fc7"
  type: INNER_PRODUCT
  bottom: "fc6"
  top: "fc7"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  inner_product_param {
    num_output: 2
    weight_filler {
      type: "gaussian"
      std: 0.005
    }
    bias_filler {
      type: "constant"
      value: 1
    }
  }
}
layers {
  name: "relu7"
  type: RELU
  bottom: "fc7"
  top: "fc7"
}
layers {
  name: "drop7"
  type: DROPOUT
  bottom: "fc7"
  top: "fc7"
  dropout_param {
    dropout_ratio: 0.5
  }
}

layers {
  name: "conv1_p"
  type: CONVOLUTION
  bottom: "data_p"
  top: "conv1_p"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 96
    kernel_size: 11
    stride: 4
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
layers {
  name: "relu1_p"
  type: RELU
  bottom: "conv1_p"
  top: "conv1_p"
}
layers {
  name: "pool1_p"
  type: POOLING
  bottom: "conv1_p"
  top: "pool1_p"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
}
layers {
  name: "norm1_p"
  type: LRN
  bottom: "pool1_p"
  top: "norm1_p"
  lrn_param {
    local_size: 5
    alpha: 0.0001
    beta: 0.75
  }
}
layers {
  name: "conv2_p"
  type: CONVOLUTION
  bottom: "norm1_p"
  top: "conv2_p"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 256
    pad: 2
    kernel_size: 5
    group: 2
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 1
    }
  }
}
layers {
  name: "relu2_p"
  type: RELU
  bottom: "conv2_p"
  top: "conv2_p"
}
layers {
  name: "pool2_p"
  type: POOLING
  bottom: "conv2_p"
  top: "pool2_p"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
}
layers {
  name: "norm2_p"
  type: LRN
  bottom: "pool2_p"
  top: "norm2_p"
  lrn_param {
    local_size: 5
    alpha: 0.0001
    beta: 0.75
  }
}
layers {
  name: "conv3_p"
  type: CONVOLUTION
  bottom: "norm2_p"
  top: "conv3_p"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 384
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
layers {
  name: "relu3_p"
  type: RELU
  bottom: "conv3_p"
  top: "conv3_p"
}
layers {
  name: "conv4_p"
  type: CONVOLUTION
  bottom: "conv3_p"
  top: "conv4_p"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 384
    pad: 1
    kernel_size: 3
    group: 2
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 1
    }
  }
}
layers {
  name: "relu4_p"
  type: RELU
  bottom: "conv4_p"
  top: "conv4_p"
}
layers {
  name: "conv5_p"
  type: CONVOLUTION
  bottom: "conv4_p"
  top: "conv5_p"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    group: 2
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 1
    }
  }
}
layers {
  name: "relu5_p"
  type: RELU
  bottom: "conv5_p"
  top: "conv5_p"
}
layers {
  name: "pool5_p"
  type: POOLING
  bottom: "conv5_p"
  top: "pool5_p"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
}
layers {
  name: "fc6_p"
  type: INNER_PRODUCT
  bottom: "pool5_p"
  top: "fc6_p"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  inner_product_param {
    num_output: 4096
    weight_filler {
      type: "gaussian"
      std: 0.005
    }
    bias_filler {
      type: "constant"
      value: 1
    }
  }
}
layers {
  name: "relu6_p"
  type: RELU
  bottom: "fc6_p"
  top: "fc6_p"
}
layers {
  name: "drop6_p"
  type: DROPOUT
  bottom: "fc6_p"
  top: "fc6_p"
  dropout_param {
    dropout_ratio: 0.5
  }
}
layers {
  name: "fc7_p"
  type: INNER_PRODUCT
  bottom: "fc6_p"
  top: "fc7_p"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  inner_product_param {
    num_output: 2
    weight_filler {
      type: "gaussian"
      std: 0.005
    }
    bias_filler {
      type: "constant"
      value: 1
    }
  }
}
layers {
  name: "relu7_p"
  type: RELU
  bottom: "fc7_p"
  top: "fc7_p"
}
layers {
  name: "drop7_p"
  type: DROPOUT
  bottom: "fc7_p"
  top: "fc7_p"
  dropout_param {
    dropout_ratio: 0.5
  }
}

layers {
    name: "loss"
    type: CONTRASTIVE_LOSS
    contrastive_loss_param {
        margin: 1.0
    }
    bottom: "fc7"
    bottom: "fc7_p"
    bottom: "label"
    top: "loss"
}

My training file structure: 0 is dissimilar, 1 is similar

 train1.txt:
 /aer/img1_1.jpg 0
 /aer/img1_2.jpg 1
 /aer/img1_3.jpg 1

 train2.txt:
 /tpd/img2_1.jpg 0
 /tpd/img2_2.jpg 1
 /tpd/img2_3.jpg 1
MasterWizard
  • If you have only 2 classes, then 1 output should suffice. – malreddysid Nov 22 '16 at 14:24
  • OK, so I want to make sure: is each iteration a comparison of an image from each data layer? Or are they being trained separately as individual classes? Do you know where I could read more about this variant? All I can find is the shared-weights type. – MasterWizard Nov 22 '16 at 14:32
  • If you are using contrastive loss you might want num_output to be much higher. – Shai Nov 22 '16 at 15:16
  • What `num_output` are you referring to exactly? Does your model produce any errors? – Jonathan Nov 23 '16 at 15:29
  • In fc7's and fc7_p's inner products. Please check below; I think I am understanding something totally different... Should I have a contrastive layer, then 2 softmax layers? – MasterWizard Nov 23 '16 at 15:59

3 Answers


What should I set my "num_output"?

Before deciding what to set num_output to, let's explain what it means. You can view the two sides of the Siamese network, data -> fc7 and data_p -> fc7_p, as two feature extractors. Each one extracts a feature vector, e.g. fc7 and fc7_p, from the images in its corresponding data layer. So num_output defines the dimension of the extracted feature vector.

During training, the ContrastiveLoss layer tries to minimize the distance between the two extracted feature vectors when the images they represent are similar (label == 1), and to maximize the distance when they are dissimilar (label == 0). Namely, the smaller the distance between the feature vectors, the more similar the images are.
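
For intuition, here is a rough numpy sketch of the quantity the layer optimizes (a sketch only; see Caffe's ContrastiveLossLayer source for the exact expression your version uses):

import numpy as np

def contrastive_loss(feat, feat_p, label, margin=1.0):
    # feat, feat_p: (N, num_output) feature batches from the two branches (fc7, fc7_p)
    # label:        (N,) array with 1 = similar pair, 0 = dissimilar pair
    d = np.linalg.norm(feat - feat_p, axis=1)               # Euclidean distance per pair
    pull = label * d ** 2                                   # similar pairs: shrink the distance
    push = (1 - label) * np.maximum(margin - d, 0.0) ** 2   # dissimilar pairs: push apart, up to the margin
    return np.mean(pull + push) / 2.0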

So what is the optimal dimension of the feature vector to best capture the information indicating similarity? Or, what should you set num_output to? There may not be an exact value; it depends on the encoding quality of the feature extractor (you can view the feature as a code for the image) and on how hard it is to recognize the similarity of the images. So basically, if the network (feature extractor) is deep and the similarity is not too hard to recognize, you can choose a relatively small num_output, e.g. 200, because a deeper network may encode the feature well and make it more discriminative. If not, you can try a larger value, e.g. 500 or 1000, or try a more complex network.

If you want to try a MultinomialLogisticLoss instead of the ContrastiveLoss layer, you should first fuse the 2 feature vectors fc7 and fc7_p into 1 using a layer like CONCAT, and then feed the result into a SOFTMAX_LOSS layer, like this:

...#original layers
layers {
  name: "concat"
  type: CONCAT
  bottom: "fc7"
  bottom: "fc7_p"  
  top: "fc_concat" # concatenate fc7 and fc7_p along channel axis
}
layer {
  name: "fc_cls"
  type: INNER_PRODUCT
  bottom: "fc_concat"
  top: "fc_cls"
  param {
    lr_mult: 1
  }
  param {
    lr_mult: 2
  }
  inner_product_param {
    num_output: 2 # a binary classification problem in this case
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
    }
  }
}
layer {
  name: "accuracy"
  type: ACCURACY
  bottom: "fc_cls"
  bottom: "label"
  top: "accuracy"
  include {
    phase: TEST
  }
}
layer {
  name: "loss"
  type: SOFTMAX_LOSS
  bottom: "fc_cls"
  bottom: "label"
  top: "loss"
}
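
At deploy time you would drop the loss and accuracy layers and read the prediction from fc_cls. A rough pycaffe sketch of that path (the deploy prototxt and caffemodel file names below are placeholders, and the input blobs are assumed to match the data layers in the question):

import caffe

# Placeholder file names -- substitute your own deploy prototxt and trained weights.
net = caffe.Net('siamese_deploy.prototxt', 'siamese_train_iter_10000.caffemodel', caffe.TEST)

def predict(img, img_p):
    # img, img_p: (3, 256, 256) arrays preprocessed the same way as in training
    net.blobs['data'].data[...] = img
    net.blobs['data_p'].data[...] = img_p
    net.forward()
    scores = net.blobs['fc_cls'].data[0]  # raw 2-way scores; argmax gives the hard decision
    return int(scores.argmax())           # 0 = dissimilar, 1 = similar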

Update

Which is the best method to implement in order to compare similarity and use for deployment, Contrastive Loss or Softmax Loss?

Softmax Loss is simple and easy to deploy, but it can only give you a binary prediction, namely similar or dissimilar. The probability distribution over the 2 classes (similar, dissimilar) it produces is often too hard (non-uniform), e.g. [0.9*, 0.0*], [0.0*, 0.9*], ..., which in many cases will not reflect the true degree of input similarity well.

With Contrastive Loss you get a discriminative feature vector for each image. You can use the vectors to compute a probability of similarity, as the CVPR 2005 paper Learning a Similarity Metric Discriminatively, with Application to Face Verification did in Section 4.1 (the key point is to compute a multivariate normal density using the feature vectors generated from images belonging to the same subject). You can also apply a threshold to the distance to control the false positive rate and the false negative rate of the model, and obtain an ROC curve to better evaluate it.
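
A minimal pycaffe sketch of that deploy path (the file names are placeholders again; the blob names follow the prototxt in the question):

import numpy as np
import caffe

# Placeholder file names -- substitute your own deploy prototxt and trained weights.
net = caffe.Net('siamese_deploy.prototxt', 'siamese_train_iter_10000.caffemodel', caffe.TEST)

def pair_distance(img, img_p):
    # img, img_p: (3, 256, 256) arrays preprocessed the same way as in training
    net.blobs['data'].data[...] = img
    net.blobs['data_p'].data[...] = img_p
    net.forward()
    f = net.blobs['fc7'].data[0].copy()      # feature vector from the first branch
    f_p = net.blobs['fc7_p'].data[0].copy()  # feature vector from the second branch
    return np.linalg.norm(f - f_p)

# Decision rule: threshold the distance, e.g. against the training margin of 1.0.
# Sweeping the threshold trades false positives against false negatives (the ROC curve):
# is_similar = pair_distance(img, img_p) < 1.0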

By the way, to explore more CNN architectures for predicting similarity, you can refer to the CVPR 2015 paper Learning to Compare Image Patches via Convolutional Neural Networks.

Dale
  • But which is the best method to implement in order to compare similarity and use for deployment, Contrastive Loss or Softmax Loss? – MasterWizard Nov 26 '16 at 12:41
  • @AhmedNassar I've updated my answer for the question. Please see it. – Dale Nov 26 '16 at 17:48
  • Thank you, this is very informative! I think I am on the right track. Should my deploy prototxt be the same as the training prototxt? – MasterWizard Nov 28 '16 at 10:52
  • @MasterWizard Generally, when deploying you should remove the loss layer from the prototxt. If you use a softmax loss, you can remove the loss and add an `ARGMAX` layer, and it will give you the predicted label. For Contrastive Loss, you can remove it directly from the prototxt and the output will be the feature vector; then in your testbed/app you can process the vector to get the similarity. – Dale Nov 28 '16 at 13:55
  • Thank you greatly. Any link to a tutorial on feature vectors with Euclidean distance in pycaffe, or similar? – MasterWizard Nov 28 '16 at 13:59
  • @MasterWizard This post http://stackoverflow.com/q/1401712/6281477 should be what you need. : ) – Dale Nov 28 '16 at 14:04
  • Oh cool, so just put the feature vectors fc7 and fc7_p into numpy arrays and use any Euclidean distance function? Thanks again – MasterWizard Nov 28 '16 at 14:08
  • @MasterWizard Yes. Compute the Euclidean distance of `fc7` and `fc7_p` and compare it with the `margin` field's value (default `1`). (You can set its value in the Contrastive layer in the prototxt before training.) – Dale Nov 28 '16 at 14:17
  • Is there a logical reason why my fc7 and fc7_p vectors are arrays full of zeros, but my fc6 and fc6_p contain values? – MasterWizard Nov 29 '16 at 16:39
  • @MasterWizard Did your model converge in training? Did you check that the weights in `fc7` and `fc7_p` are not all zeros? And you'd better scale your image data to `[0, 1]` by adding `transform_param { scale: 0.00390625 }` to your data layer. – Dale Nov 30 '16 at 01:36
  • Supposedly I'm not supposed to share weights between the two networks, so should I remove "conv1_w" and "conv1_b"? – MasterWizard Nov 30 '16 at 09:28
  • @MasterWizard Why wouldn't you share the weights? If you won't, you should remove the "conv1_w", etc. – Dale Nov 30 '16 at 10:00
  • What is weird is that fc7's params/weights are available, but its feature vectors are all zeros. – MasterWizard Nov 30 '16 at 14:41
  • You might find [this thread](http://stackoverflow.com/q/40510706/1714410) useful for debugging. – Shai Dec 01 '16 at 07:19

Just to correct Dale's great answer above for Caffe's ultra-sensitive syntax, for noobs who get stuck like myself, here are a few corrections (layers to layer, some quotes, removal of comments, and valid capitalization):

layer {
  name: "concat"
  type: "Concat"
  bottom: "fc7"
  bottom: "fc7_p"  
  top: "fc_concat"
}
layer {
  name: "fc_cls"
  type: "InnerProduct"
  bottom: "fc_concat"
  top: "fc_cls"
  param {
    lr_mult: 1
  }
  param {
    lr_mult: 2
  }
  inner_product_param {
    num_output: 2
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
    }
  }
}
layer {
  name: "accuracy"
  type: "Accuracy"
  bottom: "fc_cls"
  bottom: "label"
  top: "accuracy"
  include {
    phase: TEST
  }
}
layer {
  name: "loss"
  type: "SoftmaxWithLoss"
  bottom: "fc_cls"
  bottom: "label"
  top: "loss"
}
Joel Teply
  • @Dale It seems like you are using the old Caffe prototxt syntax. – Shai Dec 11 '16 at 06:58
  • @Shai yes and I wanted to keep consistent with the question. – Dale Dec 11 '16 at 11:24
  • I was wondering about that; good call. Do you have a good resource on strategies for improving Siamese performance? I can get up to 86% accuracy with some 40x40 patches, but I need to get into the mid-90s. Making the network deeper doesn't seem to add much. – Joel Teply Dec 11 '16 at 23:29

I believe num_output defines the dimension of the extracted feature vector, and the extracted features can then be used to compute the L2 distance. If the L2 distance is greater than the margin (1.0 in the prototxt above), the pair belongs to different classes; if it is close to 0, the images are similar. The rest of Dale's answer is perfect.

cpwah