BVLC · shelhamer · Jul 29, 2014 · Jun 16, 2014 · Jun 30, 2014 · Jun 18, 2014
diff --git a/include/caffe/util/im2col.hpp b/include/caffe/util/im2col.hpp
@@ -7,23 +7,27 @@ namespace caffe {
 
 template <typename Dtype>
 void im2col_cpu(const Dtype* data_im, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, Dtype* data_col);
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w, const int stride_h,
+    const int stride_w, Dtype* data_col);
 
 template <typename Dtype>
 void col2im_cpu(const Dtype* data_col, const int channels,
-    const int height, const int width, const int psize, const int pad,
-    const int stride, Dtype* data_im);
+    const int height, const int width, const int patch_h, const int patch_w,
+    const int pad_h, const int pad_w, const int stride_h,
+    const int stride_w, Dtype* data_im);
 
 template <typename Dtype>
 void im2col_gpu(const Dtype* data_im, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, Dtype* data_col);
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w, const int stride_h,
+    const int stride_w, Dtype* data_col);
 
 template <typename Dtype>
 void col2im_gpu(const Dtype* data_col, const int channels,
-    const int height, const int width, const int psize, const int pad,
-    const int stride, Dtype* data_im);
+    const int height, const int width, const int patch_h, const int patch_w,
+    const int pad_h, const int pad_w, const int stride_h,
+    const int stride_w, Dtype* data_im);
 
 }  // namespace caffe
 

diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp
@@ -45,11 +45,11 @@ class ConvolutionLayer : public Layer<Dtype> {
   virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom);
 
-  int kernel_size_;
-  int stride_;
+  int kernel_h_, kernel_w_;
+  int stride_h_, stride_w_;
   int num_;
   int channels_;
-  int pad_;
+  int pad_h_, pad_w_;
   int height_;
   int width_;
   int num_output_;
@@ -119,12 +119,12 @@ class Im2colLayer : public Layer<Dtype> {
   virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom);
 
-  int kernel_size_;
-  int stride_;
+  int kernel_h_, kernel_w_;
+  int stride_h_, stride_w_;
   int channels_;
   int height_;
   int width_;
-  int pad_;
+  int pad_h_, pad_w_;
 };
 
 /* InnerProductLayer

diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
@@ -14,10 +14,42 @@ template <typename Dtype>
 void ConvolutionLayer<Dtype>::SetUp(const vector<Blob<Dtype>*>& bottom,
       vector<Blob<Dtype>*>* top) {
   Layer<Dtype>::SetUp(bottom, top);
-  kernel_size_ = this->layer_param_.convolution_param().kernel_size();
-  stride_ = this->layer_param_.convolution_param().stride();
+  ConvolutionParameter conv_param = this->layer_param_.convolution_param();
+  CHECK(!conv_param.has_kernel_size() !=
+      !(conv_param.has_kernel_h() && conv_param.has_kernel_w()))
+      << "Filter size is kernel_size OR kernel_h and kernel_w; not both";
+  CHECK(conv_param.has_kernel_size() ||
+      (conv_param.has_kernel_h() && conv_param.has_kernel_w()))
+      << "For non-square filters both kernel_h and kernel_w are required.";
+  CHECK((!conv_param.has_pad() && conv_param.has_pad_h()
+      && conv_param.has_pad_w())
+      || (!conv_param.has_pad_h() && !conv_param.has_pad_w()))
+      << "pad is pad OR pad_h and pad_w are required.";
+  CHECK((!conv_param.has_stride() && conv_param.has_stride_h()
+      && conv_param.has_stride_w())
+      || (!conv_param.has_stride_h() && !conv_param.has_stride_w()))
+      << "Stride is stride OR stride_h and stride_w are required.";
+  if (conv_param.has_kernel_size()) {
+    kernel_h_ = kernel_w_ = conv_param.kernel_size();
+  } else {
+    kernel_h_ = conv_param.kernel_h();
+    kernel_w_ = conv_param.kernel_w();
+  }
+  CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero.";
+  CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero.";
+  if (!conv_param.has_pad_h()) {
+    pad_h_ = pad_w_ = conv_param.pad();
+  } else {
+    pad_h_ = conv_param.pad_h();
+    pad_w_ = conv_param.pad_w();
+  }
+  if (!conv_param.has_stride_h()) {
+    stride_h_ = stride_w_ = conv_param.stride();
+  } else {
+    stride_h_ = conv_param.stride_h();
+    stride_w_ = conv_param.stride_w();
+  }
   group_ = this->layer_param_.convolution_param().group();
-  pad_ = this->layer_param_.convolution_param().pad();
   num_ = bottom[0]->num();
   channels_ = bottom[0]->channels();
   height_ = bottom[0]->height();
@@ -37,17 +69,18 @@ void ConvolutionLayer<Dtype>::SetUp(const vector<Blob<Dtype>*>& bottom,
   CHECK_EQ(channels_ % group_, 0);
   // The im2col result buffer would only hold one image at a time to avoid
   // overly large memory usage.
-  int height_out = (height_ + 2 * pad_ - kernel_size_) / stride_ + 1;
-  int width_out = (width_ + 2 * pad_ - kernel_size_) / stride_ + 1;
+  int height_out =
+      (height_ + 2 * pad_h_ - kernel_h_) / stride_h_ + 1;
+  int width_out = (width_ + 2 * pad_w_ - kernel_w_) / stride_w_ + 1;
   col_buffer_.Reshape(
-      1, channels_ * kernel_size_ * kernel_size_, height_out, width_out);
+      1, channels_ * kernel_h_ * kernel_w_, height_out, width_out);
   // Set the parameters
   CHECK_EQ(num_output_ % group_, 0)
       << "Number of output should be multiples of group.";
   bias_term_ = this->layer_param_.convolution_param().bias_term();
   // Figure out the dimensions for individual gemms.
   M_ = num_output_ / group_;
-  K_ = channels_ * kernel_size_ * kernel_size_ / group_;
+  K_ = channels_ * kernel_h_ * kernel_w_ / group_;
   N_ = height_out * width_out;
   for (int top_id = 0; top_id < top->size(); ++top_id) {
     (*top)[top_id]->Reshape(num_, num_output_, height_out, width_out);
@@ -63,7 +96,7 @@ void ConvolutionLayer<Dtype>::SetUp(const vector<Blob<Dtype>*>& bottom,
     }
     // Intialize the weight
     this->blobs_[0].reset(new Blob<Dtype>(
-        num_output_, channels_ / group_, kernel_size_, kernel_size_));
+        num_output_, channels_ / group_, kernel_h_, kernel_w_));
     // fill the weights
     shared_ptr<Filler<Dtype> > weight_filler(GetFiller<Dtype>(
         this->layer_param_.convolution_param().weight_filler()));
@@ -99,7 +132,8 @@ Dtype ConvolutionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
     for (int n = 0; n < num_; ++n) {
       // First, im2col
       im2col_cpu(bottom_data + bottom[i]->offset(n), channels_, height_,
-                        width_, kernel_size_, pad_, stride_, col_data);
+          width_, kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_,
+          col_data);
       // Second, innerproduct with groups
       for (int g = 0; g < group_; ++g) {
         caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, K_,
@@ -160,7 +194,8 @@ void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
         // Since we saved memory in the forward pass by not storing all col
         // data, we will need to recompute them.
         im2col_cpu(bottom_data + (*bottom)[i]->offset(n), channels_, height_,
-                   width_, kernel_size_, pad_, stride_, col_data);
+                   width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
+                   stride_h_, stride_w_, col_data);
         // gradient w.r.t. weight. Note that we will accumulate diffs.
         if (this->param_propagate_down_[0]) {
           for (int g = 0; g < group_; ++g) {
@@ -179,8 +214,9 @@ void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
                 (Dtype)0., col_diff + col_offset * g);
           }
           // col2im back to the data
-          col2im_cpu(col_diff, channels_, height_, width_, kernel_size_, pad_,
-              stride_, bottom_diff + (*bottom)[i]->offset(n));
+          col2im_cpu(col_diff, channels_, height_, width_,
+              kernel_h_, kernel_w_, pad_h_, pad_w_,
+              stride_h_, stride_w_, bottom_diff + (*bottom)[i]->offset(n));
         }
       }
     }

diff --git a/src/caffe/layers/conv_layer.cu b/src/caffe/layers/conv_layer.cu
@@ -24,7 +24,8 @@ Dtype ConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     for (int n = 0; n < num_; ++n) {
       // First, im2col
       im2col_gpu(bottom_data + bottom[i]->offset(n), channels_, height_,
-                        width_, kernel_size_, pad_, stride_, col_data);
+          width_, kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_,
+          col_data);
       // Second, innerproduct with groups
       for (int g = 0; g < group_; ++g) {
         caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, K_,
@@ -85,7 +86,8 @@ void ConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
         // Since we saved memory in the forward pass by not storing all col
         // data, we will need to recompute them.
         im2col_gpu(bottom_data + (*bottom)[i]->offset(n), channels_, height_,
-                   width_, kernel_size_, pad_, stride_, col_data);
+                   width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
+                   stride_h_, stride_w_, col_data);
         // gradient w.r.t. weight. Note that we will accumulate diffs.
         if (this->param_propagate_down_[0]) {
           for (int g = 0; g < group_; ++g) {
@@ -104,8 +106,9 @@ void ConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
                 (Dtype)0., col_diff + col_offset * g);
           }
           // col2im back to the data
-          col2im_gpu(col_diff, channels_, height_, width_, kernel_size_, pad_,
-              stride_, bottom_diff + (*bottom)[i]->offset(n));
+          col2im_gpu(col_diff, channels_, height_, width_,
+              kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_,
+              bottom_diff + (*bottom)[i]->offset(n));
         }
       }
     }

diff --git a/src/caffe/layers/im2col_layer.cpp b/src/caffe/layers/im2col_layer.cpp
@@ -13,15 +13,48 @@ template <typename Dtype>
 void Im2colLayer<Dtype>::SetUp(const vector<Blob<Dtype>*>& bottom,
       vector<Blob<Dtype>*>* top) {
   Layer<Dtype>::SetUp(bottom, top);
-  kernel_size_ = this->layer_param_.convolution_param().kernel_size();
-  stride_ = this->layer_param_.convolution_param().stride();
-  pad_ = this->layer_param_.convolution_param().pad();
+  ConvolutionParameter conv_param = this->layer_param_.convolution_param();
+  CHECK(!conv_param.has_kernel_size() !=
+      !(conv_param.has_kernel_h() && conv_param.has_kernel_w()))
+      << "Filter size is kernel_size OR kernel_h and kernel_w; not both";
+  CHECK(conv_param.has_kernel_size() ||
+      (conv_param.has_kernel_h() && conv_param.has_kernel_w()))
+      << "For non-square filters both kernel_h and kernel_w are required.";
+  CHECK((!conv_param.has_pad() && conv_param.has_pad_h()
+      && conv_param.has_pad_w())
+      || (!conv_param.has_pad_h() && !conv_param.has_pad_w()))
+      << "pad is pad OR pad_h and pad_w are required.";
+  CHECK((!conv_param.has_stride() && conv_param.has_stride_h()
+      && conv_param.has_stride_w())
+      || (!conv_param.has_stride_h() && !conv_param.has_stride_w()))
+      << "Stride is stride OR stride_h and stride_w are required.";
+  if (conv_param.has_kernel_size()) {
+    kernel_h_ = kernel_w_ = conv_param.kernel_size();
+  } else {
+    kernel_h_ = conv_param.kernel_h();
+    kernel_w_ = conv_param.kernel_w();
+  }
+  CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero.";
+  CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero.";
+  if (!conv_param.has_pad_h()) {
+    pad_h_ = pad_w_ = conv_param.pad();
+  } else {
+    pad_h_ = conv_param.pad_h();
+    pad_w_ = conv_param.pad_w();
+  }
+  if (!conv_param.has_stride_h()) {
+    stride_h_ = stride_w_ = conv_param.stride();
+  } else {
+    stride_h_ = conv_param.stride_h();
+    stride_w_ = conv_param.stride_w();
+  }
   channels_ = bottom[0]->channels();
   height_ = bottom[0]->height();
   width_ = bottom[0]->width();
-  (*top)[0]->Reshape(bottom[0]->num(), channels_ * kernel_size_ * kernel_size_,
-      (height_ + 2 * pad_ - kernel_size_) / stride_ + 1,
-      (width_ + 2 * pad_ - kernel_size_) / stride_ + 1);
+  (*top)[0]->Reshape(
+      bottom[0]->num(), channels_ * kernel_h_ * kernel_w_,
+      (height_ + 2 * pad_h_ - kernel_h_) / stride_h_ + 1,
+      (width_ + 2 * pad_w_ - kernel_w_) / stride_w_ + 1);
 }
 
 template <typename Dtype>
@@ -31,7 +64,8 @@ Dtype Im2colLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
   Dtype* top_data = (*top)[0]->mutable_cpu_data();
   for (int n = 0; n < bottom[0]->num(); ++n) {
     im2col_cpu(bottom_data + bottom[0]->offset(n), channels_, height_,
-        width_, kernel_size_, pad_, stride_, top_data + (*top)[0]->offset(n));
+        width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
+        stride_h_, stride_w_, top_data + (*top)[0]->offset(n));
   }
   return Dtype(0.);
 }
@@ -43,7 +77,8 @@ void Im2colLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff();
   for (int n = 0; n < top[0]->num(); ++n) {
     col2im_cpu(top_diff + top[0]->offset(n), channels_, height_, width_,
-        kernel_size_, pad_, stride_, bottom_diff + (*bottom)[0]->offset(n));
+        kernel_h_, kernel_w_, pad_h_, pad_w_,
+        stride_h_, stride_w_, bottom_diff + (*bottom)[0]->offset(n));
   }
 }
 

diff --git a/src/caffe/layers/im2col_layer.cu b/src/caffe/layers/im2col_layer.cu
@@ -16,7 +16,8 @@ Dtype Im2colLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
   Dtype* top_data = (*top)[0]->mutable_gpu_data();
   for (int n = 0; n < bottom[0]->num(); ++n) {
     im2col_gpu(bottom_data + bottom[0]->offset(n), channels_, height_,
-        width_, kernel_size_, pad_, stride_, top_data + (*top)[0]->offset(n));
+        width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
+        stride_h_, stride_w_, top_data + (*top)[0]->offset(n));
   }
   return Dtype(0.);
 }
@@ -28,7 +29,8 @@ void Im2colLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   Dtype* bottom_diff = (*bottom)[0]->mutable_gpu_diff();
   for (int n = 0; n < top[0]->num(); ++n) {
     col2im_gpu(top_diff + top[0]->offset(n), channels_, height_, width_,
-        kernel_size_, pad_, stride_, bottom_diff + (*bottom)[0]->offset(n));
+        kernel_h_, kernel_w_, pad_h_, pad_w_,
+        stride_h_, stride_w_, bottom_diff + (*bottom)[0]->offset(n));
   }
 }
 

diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto
@@ -251,10 +251,18 @@ message ConcatParameter {
 message ConvolutionParameter {
   optional uint32 num_output = 1; // The number of outputs for the layer
   optional bool bias_term = 2 [default = true]; // whether to have bias terms
-  optional uint32 pad = 3 [default = 0]; // The padding size
-  optional uint32 kernel_size = 4; // The kernel size
+  // Pad, kernel size, and stride are all given as a single value for equal
+  // dimensions in height and width or as Y, X pairs.
+  optional uint32 pad = 3 [default = 0]; // The padding size (equal in Y, X)
+  optional uint32 pad_h = 9 [default = 0]; // The padding height
+  optional uint32 pad_w = 10 [default = 0]; // The padding width
+  optional uint32 kernel_size = 4; // The kernel size (square)
+  optional uint32 kernel_h = 11; // The kernel height
+  optional uint32 kernel_w = 12; // The kernel width
   optional uint32 group = 5 [default = 1]; // The group size for group conv
-  optional uint32 stride = 6 [default = 1]; // The stride
+  optional uint32 stride = 6 [default = 1]; // The stride (equal in Y, X)
+  optional uint32 stride_h = 13; // The stride height
+  optional uint32 stride_w = 14; // The stride width
   optional FillerParameter weight_filler = 7; // The filler for the weight
   optional FillerParameter bias_filler = 8; // The filler for the bias
 }