15 changes: 15 additions & 0 deletions include/caffe/loss_layers.hpp
@@ -696,6 +696,14 @@ template <typename Dtype> class SoftmaxLayer;
template <typename Dtype>
class SoftmaxWithLossLayer : public LossLayer<Dtype> {
public:
/**
* @param param provides LossParameter loss_param, with options:
* - ignore_label (optional)
* Specify a label value that should be ignored when computing the loss.
* - normalize (optional, default true)
 *    If true, the loss is normalized by the number of (non-ignored) labels
 *    present; otherwise the loss is summed over spatial locations and divided
 *    only by the batch size.
*/
explicit SoftmaxWithLossLayer(const LayerParameter& param)
: LossLayer<Dtype>(param),
softmax_layer_(new SoftmaxLayer<Dtype>(param)) {}
@@ -758,6 +766,13 @@ class SoftmaxWithLossLayer<Dtype> {
vector<Blob<Dtype>*> softmax_bottom_vec_;
/// top vector holder used in call to the underlying SoftmaxLayer::Forward
vector<Blob<Dtype>*> softmax_top_vec_;
/// Whether to ignore instances with a certain label.
bool has_ignore_label_;
/// The label indicating that an instance should be ignored.
int ignore_label_;
/// Whether to normalize the loss by the total number of values present
/// (otherwise just by the batch size).
bool normalize_;
};

} // namespace caffe
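For context, here is a minimal prototxt sketch of how these new options could be set on a softmax loss layer. It is not part of the patch: the layer and blob names and the ignore value 255 are illustrative, and it assumes the V1 layers syntax Caffe uses at this point.

layers {
  name: "loss"
  type: SOFTMAX_LOSS
  bottom: "score"
  bottom: "label"
  top: "loss"
  loss_param {
    ignore_label: 255   # hypothetical "void" label to skip
    normalize: false    # divide by batch size only
  }
}

With normalize: false the reported loss is divided only by the batch size, so its magnitude grows with the number of spatial locations per image.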
44 changes: 35 additions & 9 deletions src/caffe/layers/softmax_loss_layer.cpp
@@ -17,6 +17,13 @@ void SoftmaxWithLossLayer<Dtype>::LayerSetUp(
softmax_top_vec_.clear();
softmax_top_vec_.push_back(&prob_);
softmax_layer_->SetUp(softmax_bottom_vec_, softmax_top_vec_);

has_ignore_label_ =
this->layer_param_.loss_param().has_ignore_label();
if (has_ignore_label_) {
ignore_label_ = this->layer_param_.loss_param().ignore_label();
}
normalize_ = this->layer_param_.loss_param().normalize();
}

template <typename Dtype>
@@ -40,27 +47,34 @@ void SoftmaxWithLossLayer<Dtype>::Forward_cpu(
int num = prob_.num();
int dim = prob_.count() / num;
int spatial_dim = prob_.height() * prob_.width();
int count = 0;
Dtype loss = 0;
for (int i = 0; i < num; ++i) {
for (int j = 0; j < spatial_dim; j++) {
const int label_value = static_cast<int>(label[i * spatial_dim + j]);
if (has_ignore_label_ && label_value == ignore_label_) {
continue;
}
DCHECK_GE(label_value, 0);
DCHECK_LT(label_value, prob_.channels());
loss -= log(std::max(prob_data[i * dim + label_value * spatial_dim + j],
                     Dtype(FLT_MIN)));
++count;
}
}
if (normalize_) {
  top[0]->mutable_cpu_data()[0] = loss / count;
} else {
  top[0]->mutable_cpu_data()[0] = loss / num;
}
if (top.size() == 2) {
top[1]->ShareData(prob_);
}
}

template <typename Dtype>
void SoftmaxWithLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
if (propagate_down[1]) {
LOG(FATAL) << this->type_name()
<< " Layer cannot backpropagate to label inputs.";
@@ -73,15 +87,27 @@ void SoftmaxWithLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
int num = prob_.num();
int dim = prob_.count() / num;
int spatial_dim = prob_.height() * prob_.width();
int count = 0;
for (int i = 0; i < num; ++i) {
for (int j = 0; j < spatial_dim; ++j) {
const int label_value = static_cast<int>(label[i * spatial_dim + j]);
if (has_ignore_label_ && label_value == ignore_label_) {
  for (int c = 0; c < bottom[0]->channels(); ++c) {
    bottom_diff[i * dim + c * spatial_dim + j] = 0;
  }
} else {
  bottom_diff[i * dim + label_value * spatial_dim + j] -= 1;
  ++count;
}
}
}
// Scale gradient
const Dtype loss_weight = top[0]->cpu_diff()[0];
if (normalize_) {
  caffe_scal(prob_.count(), loss_weight / count, bottom_diff);
} else {
  caffe_scal(prob_.count(), loss_weight / num, bottom_diff);
}
}
}

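As a sketch of what the forward and backward passes above compute (the notation is mine, not from the patch): let V be the set of non-ignored locations (i, j), y_ij the label at that location, p the softmax output, and N the batch size (num). Then

L = -\frac{1}{M} \sum_{(i,j) \in V} \log p_{i,\, y_{ij},\, j},
\qquad
M = \begin{cases} |V| & \text{if normalize} \\ N & \text{otherwise.} \end{cases}

The backward pass forms the usual softmax cross-entropy gradient (probabilities minus the one-hot label), zeroes it across all channels at ignored locations, and applies the same 1/M scaling times the loss weight; the count variable in the code is |V|.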
14 changes: 13 additions & 1 deletion src/caffe/proto/caffe.proto
@@ -206,7 +206,7 @@ message NetStateRule {
// NOTE
// Update the next available ID when you add a new LayerParameter field.
//
// LayerParameter next available ID: 43 (last added: loss_param)
message LayerParameter {
repeated string bottom = 2; // the name of the bottom blobs
repeated string top = 3; // the name of the top blobs
@@ -331,6 +331,9 @@ message LayerParameter {
// Parameters for data pre-processing.
optional TransformationParameter transform_param = 36;

// Parameters shared by loss layers.
optional LossParameter loss_param = 42;

// Note: certain layers may have more than one computational engine
// for their implementation. These layers include an Engine type and
// engine parameter for selecting the implementation.
@@ -361,6 +364,15 @@ message TransformationParameter {
repeated float mean_value = 5;
}

// Message that stores parameters shared by loss layers
message LossParameter {
// If specified, ignore instances with the given label.
optional int32 ignore_label = 1;
// If true, normalize each batch across all instances (including spatial
// dimensions, but not ignored instances); else, divide by batch size only.
optional bool normalize = 2 [default = true];
}

// Message that stores parameters used by AccuracyLayer
message AccuracyParameter {
// When computing accuracy, count as correct by comparing the true label to
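The tests below exercise these same fields through the generated protobuf API. A hedged C++ sketch of that pattern outside the test harness (the include path is the standard generated header, and 255 is again only an illustrative ignore value):

#include "caffe/proto/caffe.pb.h"

int main() {
  // Skip instances labeled 255 and divide the loss by batch size only,
  // via LossParameter's ignore_label and normalize fields.
  caffe::LayerParameter layer_param;
  layer_param.mutable_loss_param()->set_ignore_label(255);
  layer_param.mutable_loss_param()->set_normalize(false);
  return 0;
}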
48 changes: 47 additions & 1 deletion src/caffe/test/test_softmax_with_loss_layer.cpp
@@ -3,6 +3,7 @@
#include <cstring>
#include <vector>

#include "boost/scoped_ptr.hpp"
#include "gtest/gtest.h"

#include "caffe/blob.hpp"
@@ -13,6 +14,8 @@
#include "caffe/test/test_caffe_main.hpp"
#include "caffe/test/test_gradient_check_util.hpp"

using boost::scoped_ptr;

namespace caffe {

template <typename TypeParam>
@@ -50,7 +53,6 @@ class SoftmaxWithLossLayerTest : public MultiDeviceTest<TypeParam> {

TYPED_TEST_CASE(SoftmaxWithLossLayerTest, TestDtypesAndDevices);


TYPED_TEST(SoftmaxWithLossLayerTest, TestGradient) {
typedef typename TypeParam::Dtype Dtype;
LayerParameter layer_param;
@@ -61,4 +63,48 @@ TYPED_TEST(SoftmaxWithLossLayerTest, TestGradient) {
this->blob_top_vec_, 0);
}

TYPED_TEST(SoftmaxWithLossLayerTest, TestForwardIgnoreLabel) {
typedef typename TypeParam::Dtype Dtype;
LayerParameter layer_param;
layer_param.mutable_loss_param()->set_normalize(false);
// First, compute the loss with all labels
scoped_ptr<SoftmaxWithLossLayer<Dtype> > layer(
new SoftmaxWithLossLayer<Dtype>(layer_param));
layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
Dtype full_loss = this->blob_top_loss_->cpu_data()[0];
// Now, accumulate the loss, ignoring each label in {0, ..., 4} in turn.
Dtype accum_loss = 0;
for (int label = 0; label < 5; ++label) {
layer_param.mutable_loss_param()->set_ignore_label(label);
layer.reset(new SoftmaxWithLossLayer<Dtype>(layer_param));
layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
accum_loss += this->blob_top_loss_->cpu_data()[0];
}
// Each label is ignored in exactly one of the five runs, so every instance
// contributes to four of them: the accumulated loss should be four times
// the full loss.
EXPECT_NEAR(4 * full_loss, accum_loss, 1e-4);
}

TYPED_TEST(SoftmaxWithLossLayerTest, TestGradientIgnoreLabel) {
typedef typename TypeParam::Dtype Dtype;
LayerParameter layer_param;
// labels are in {0, ..., 4}, so we'll ignore about a fifth of them
layer_param.mutable_loss_param()->set_ignore_label(0);
SoftmaxWithLossLayer<Dtype> layer(layer_param);
GradientChecker<Dtype> checker(1e-2, 1e-2, 1701);
checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
this->blob_top_vec_, 0);
}

TYPED_TEST(SoftmaxWithLossLayerTest, TestGradientUnnormalized) {
typedef typename TypeParam::Dtype Dtype;
LayerParameter layer_param;
layer_param.mutable_loss_param()->set_normalize(false);
SoftmaxWithLossLayer<Dtype> layer(layer_param);
GradientChecker<Dtype> checker(1e-2, 1e-2, 1701);
checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
this->blob_top_vec_, 0);
}

} // namespace caffe