Write conv2d and conv3d together (shared GemmConvKernel / GemmConvGradKernel)

f302c6a3 · chengduoZH · ba7db29d · f302c6a3 · f302c6a3 · f302c6a3
6 changed files
--- a/paddle/operators/conv_cudnn_op.cc
+++ b/paddle/operators/conv_cudnn_op.cc
@@ -41,8 +41,8 @@ namespace ops = paddle::operators;
 REGISTER_OP(conv_cudnn, ops::ConvOp, ops::CudnnConvOpMaker, conv_cudnn_grad,
            ops::ConvOpGrad);
-REGISTER_OP_CPU_KERNEL(
+REGISTER_OP_CPU_KERNEL(conv_cudnn,
-    conv_cudnn, ops::GemmConv2DKernel<paddle::platform::CPUPlace, float>);
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
    conv_cudnn_grad,
-    ops::GemmConvGrad2DKernel<paddle::platform::CPUPlace, float>);
+    ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/conv_op.cc
+++ b/paddle/operators/conv_op.cc
@@ -198,12 +198,12 @@ namespace ops = paddle::operators;
 REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad,
            ops::ConvOpGrad);
+REGISTER_OP_CPU_KERNEL(conv2d,
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
-    conv2d, ops::GemmConv2DKernel<paddle::platform::CPUPlace, float>);
+    conv2d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(
-    conv2d_grad, ops::GemmConvGrad2DKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(conv3d,
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
-    conv3d, ops::GemmConv3DKernel<paddle::platform::CPUPlace, float>);
+    conv3d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(
-    conv3d_grad, ops::GemmConvGrad3DKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/conv_op.cu
+++ b/paddle/operators/conv_op.cu
@@ -16,12 +16,12 @@
 namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(conv2d,
+                       ops::GemmConvKernel<paddle::platform::GPUPlace, float>);
 REGISTER_OP_GPU_KERNEL(
-    conv2d, ops::GemmConv2DKernel<paddle::platform::GPUPlace, float>);
+    conv2d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(
-    conv2d_grad, ops::GemmConvGrad2DKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(conv3d,
+                       ops::GemmConvKernel<paddle::platform::GPUPlace, float>);
 REGISTER_OP_GPU_KERNEL(
-    conv3d, ops::GemmConv3DKernel<paddle::platform::GPUPlace, float>);
+    conv3d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(
-    conv3d_grad, ops::GemmConvGrad3DKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/conv_op.h
+++ b/paddle/operators/conv_op.h
@@ -62,7 +62,7 @@ class ConvOpGrad : public framework::OperatorWithKernel {
 };
 template <typename Place, typename T>
-class GemmConv2DKernel : public framework::OpKernel<T> {
+class GemmConvKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    const Tensor* input = context.Input<Tensor>("Input");
@@ -77,49 +77,78 @@ class GemmConv2DKernel : public framework::OpKernel<T> {
    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
    int groups = context.Attr<int>("groups");
-    int batch_size = input->dims()[0];
+    const int batch_size = static_cast<int>(input->dims()[0]);
-    int input_channels = input->dims()[1];
-    int filter_height = filter.dims()[filter.dims().size() - 2];
+    // filter_shape_vec: {k_h, k_w} or {k_d, k_h, k_w}
-    int filter_width = filter.dims()[filter.dims().size() - 1];
+    std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
-    int output_channels = output->dims()[1];
+    filter_shape_vec.erase(filter_shape_vec.begin(),
-    int output_height = output->dims()[2];
+                           filter_shape_vec.begin() + 2);
-    int output_width = output->dims()[3];
+    // output_shape_vec: {o_h, o_w} or {o_d, o_h, o_w}
+    std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
+    output_shape_vec.erase(output_shape_vec.begin(),
+                           output_shape_vec.begin() + 2);
-    math::Im2ColFunctor<math::ColFormat::kCFO, Place, T> im2col;
    // use col_shape in the im2col calculation
-    framework::DDim col_shape = {input_channels / groups, filter_height,
+    // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d,
-                                 filter_width, output_height, output_width};
+    // o_h, o_w}
+    std::vector<int64_t> col_shape_vec;
+    col_shape_vec.push_back(input->dims()[1] / groups);
+    col_shape_vec.insert(col_shape_vec.end(), filter_shape_vec.begin(),
+                         filter_shape_vec.end());
+    col_shape_vec.insert(col_shape_vec.end(), output_shape_vec.begin(),
+                         output_shape_vec.end());
+    framework::DDim col_shape(framework::make_ddim(col_shape_vec));
    // use col_matrix_shape in the gemm calculation
-    framework::DDim col_matrix_shape = {
+    // size: (i_c/g * k_h * k_w, o_h * o_w) or (i_c/g * k_d * k_h * k_w, o_d *
-        input_channels / groups * filter_height * filter_width,
+    // o_h * o_w)
-        output_height * output_width};
+    framework::DDim col_matrix_shape =
+        framework::flatten_to_2d(col_shape, filter_shape_vec.size() + 1);
    Tensor col;
    col.mutable_data<T>(col_shape, context.GetPlace());
    // col_matrix shares the same piece of data with col,
    // but will be reshaped into a two-dimensional matrix shape
    // to call the matrix multiplication interface.
-    Tensor col_matrix = col;
+    Tensor col_matrix;
+    col_matrix.ShareDataWith(col);
    col_matrix.Resize(col_matrix_shape);
-    framework::DDim input_shape = {input->dims()[1], input->dims()[2],
+    framework::DDim input_shape = framework::slice_ddim(
-                                   input->dims()[3]};
+        input->dims(), 1, static_cast<int>(input->dims().size()));
    framework::DDim filter_matrix_shape = {filter.dims()[0],
                                           filter.numel() / filter.dims()[0]};
    filter.Resize(filter_matrix_shape);
-    framework::DDim output_matrix_shape = {output_channels,
+    framework::DDim output_matrix_shape = {
-                                           output_height * output_width};
+        output->dims()[1],
-    // convolution operator: im2col + gemm
+        output->numel() / (output->dims()[0] * output->dims()[1])};
-    int in_step = input_channels / groups;
-    int out_step = output_channels / groups;
+    // convolution operator: im2col(or vol2col) + gemm
+    int in_step = static_cast<int>(input->dims()[1]) / groups;
+    int out_step = static_cast<int>(output->dims()[1]) / groups;
    for (int i = 0; i < batch_size; i++) {
      Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
      Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
      for (int g = 0; g < groups; g++) {
-        // im2col
        Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-        im2col(context.device_context(), in_slice, col, strides[0], strides[1],
-               paddings[0], paddings[0], paddings[1], paddings[1]);
+        if (filter_shape_vec.size() == 2) {
+          // im2col
+          math::Im2ColFunctor<math::ColFormat::kCFO, Place, T> im2col;
+          im2col(context.device_context(), in_slice, col, strides[0],
+                 strides[1], paddings[0], paddings[0], paddings[1],
+                 paddings[1]);
+        } else if (filter_shape_vec.size() == 3) {
+          // vol2col
+          math::Vol2ColFunctor<Place, T> vol2col;
+          vol2col(context.device_context(), in_slice, col, strides[0],
+                  strides[1], strides[2], paddings[0], paddings[1],
+                  paddings[2]);
+        }
        // gemm
        Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
@@ -132,7 +161,7 @@ class GemmConv2DKernel : public framework::OpKernel<T> {
 };
 template <typename Place, typename T>
-class GemmConvGrad2DKernel : public framework::OpKernel<T> {
+class GemmConvGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    const Tensor* input = context.Input<Tensor>("Input");
@@ -142,267 +171,74 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
        context.Output<Tensor>(framework::GradVarName("Input"));
    Tensor* filter_grad =
        context.Output<Tensor>(framework::GradVarName("Filter"));
    // The filter and filter_grad will be reshaped in the calculations,
    // so here use an assignment operation,
    // that avoids modifying the variable in the Scope.
    Tensor filter = *context.Input<Tensor>("Filter");
+    if (!input_grad && !filter_grad) return;
    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
    int groups = context.Attr<int>("groups");
-    int batch_size = input->dims()[0];
+    const int batch_size = static_cast<int>(input->dims()[0]);
-    int input_channels = input->dims()[1];
-    int filter_height = filter.dims()[filter.dims().size() - 2];
-    int filter_width = filter.dims()[filter.dims().size() - 1];
-    int output_channels = output_grad->dims()[1];
-    int output_height = output_grad->dims()[2];
-    int output_width = output_grad->dims()[3];
-    math::Col2ImFunctor<math::ColFormat::kCFO, Place, T> col2im;
-    math::Im2ColFunctor<math::ColFormat::kCFO, Place, T> im2col;
-    // use col_shape in the im2col and col2im calculation
-    framework::DDim col_shape = {input_channels / groups, filter_height,
-                                 filter_width, output_height, output_width};
-    // use col_matrix_shape in the gemm calculation
-    framework::DDim col_matrix_shape = {
-        input_channels / groups * filter_height * filter_width,
-        output_height * output_width};
-    Tensor col;
-    col.mutable_data<T>(col_shape, context.GetPlace());
-    // col_matrix shares the same piece of data with col,
-    // but will be reshaped into a two-dimensional matrix shape
-    // to call the matrix multiplication interface.
-    Tensor col_matrix = col;
-    col_matrix.Resize(col_matrix_shape);
-    framework::DDim input_shape = {input->dims()[1], input->dims()[2],
-                                   input->dims()[3]};
-    framework::DDim output_matrix_shape = {
-        output_grad->dims()[1],
-        output_grad->dims()[2] * output_grad->dims()[3]};
-    framework::DDim filter_matrix_shape = {filter.dims()[0],
-                                           filter.numel() / filter.dims()[0]};
-    filter.Resize(filter_matrix_shape);
-    // convolution backward input operator:  gemm + col2im
-    // convolution backward weight operator: im2col + gemm
-    int in_step = input_channels / groups;
-    int out_step = output_channels / groups;
-    math::SetConstant<Place, T> set_zero;
-    if (input_grad) {
-      input_grad->mutable_data<T>(context.GetPlace());
-      set_zero(context.device_context(), input_grad, static_cast<T>(0));
-      for (int i = 0; i < batch_size; i++) {
-        Tensor out_grad_batch =
-            output_grad->Slice(i, i + 1).Resize(output_matrix_shape);
-        Tensor in_grad_batch = input_grad->Slice(i, i + 1).Resize(input_shape);
-        for (int g = 0; g < groups; g++) {
-          // gemm
-          Tensor out_grad_slice =
-              out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
-          Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-          math::matmul<Place, T>(context.device_context(), filter_slice, true,
-                                 out_grad_slice, false, T(1.0), &col_matrix,
-                                 T(0.0));
-          // col2im
-          Tensor in_grad_slice =
-              in_grad_batch.Slice(g * in_step, (g + 1) * in_step);
-          col2im(context.device_context(), in_grad_slice, col, strides[0],
-                 strides[1], paddings[0], paddings[0], paddings[1],
-                 paddings[1]);
-        }
-      }
-    }
-    if (filter_grad) {
-      filter_grad->mutable_data<T>(context.GetPlace());
-      Tensor filter_grad_ = *filter_grad;
-      filter_grad_.Resize(filter_matrix_shape);
-      set_zero(context.device_context(), filter_grad, static_cast<T>(0));
-      for (int i = 0; i < batch_size; i++) {
-        Tensor out_grad_batch =
-            output_grad->Slice(i, i + 1).Resize(output_matrix_shape);
-        Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
-        for (int g = 0; g < groups; g++) {
-          // im2col
-          Tensor out_grad_slice =
-              out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
-          Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-          im2col(context.device_context(), in_slice, col, strides[0],
-                 strides[1], paddings[0], paddings[0], paddings[1],
-                 paddings[1]);
-          // gemm
-          Tensor filter_grad_slice =
-              filter_grad_.Slice(g * out_step, (g + 1) * out_step);
-          math::matmul<Place, T>(context.device_context(), out_grad_slice,
-                                 false, col_matrix, true, T(1.0),
-                                 &filter_grad_slice, T(1.0));
-        }
-      }
-    }
-  }
-};
-template <typename Place, typename T>
+    // filter_shape_vec: {k_h, k_w} or {k_d, k_h, k_w}
-class GemmConv3DKernel : public framework::OpKernel<T> {
+    std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
- public:
+    filter_shape_vec.erase(filter_shape_vec.begin(),
-  void Compute(const framework::ExecutionContext& context) const override {
+                           filter_shape_vec.begin() + 2);
-    const Tensor* input = context.Input<Tensor>("Input");
-    // The filter will be reshaped in the calculations,
-    // so here use an assignment operation,
-    // that avoids modifying the variable in the Scope.
-    Tensor filter = *context.Input<Tensor>("Filter");
-    Tensor* output = context.Output<Tensor>("Output");
-    output->mutable_data<T>(context.GetPlace());
-    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    // output_shape_vec: {o_h, o_w} or {o_d, o_h, o_w}
-    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    std::vector<int64_t> output_shape_vec(
-    int groups = context.Attr<int>("groups");
+        framework::vectorize(output_grad->dims()));
+    output_shape_vec.erase(output_shape_vec.begin(),
+                           output_shape_vec.begin() + 2);
-    int batch_size = input->dims()[0];
+    // use col_shape in the im2col/col2im (or vol2col/col2vol) calculation
-    int input_channels = input->dims()[1];
+    // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d,
-    int filter_depth = filter.dims()[filter.dims().size() - 3];
+    // o_h, o_w}
-    int filter_height = filter.dims()[filter.dims().size() - 2];
+    std::vector<int64_t> col_shape_vec;
-    int filter_width = filter.dims()[filter.dims().size() - 1];
+    col_shape_vec.push_back(input->dims()[1] / groups);
-    int output_channels = output->dims()[1];
+    col_shape_vec.insert(col_shape_vec.end(), filter_shape_vec.begin(),
-    int output_depth = output->dims()[2];
+                         filter_shape_vec.end());
-    int output_height = output->dims()[3];
+    col_shape_vec.insert(col_shape_vec.end(), output_shape_vec.begin(),
-    int output_width = output->dims()[4];
+                         output_shape_vec.end());
+    framework::DDim col_shape(framework::make_ddim(col_shape_vec));
-    math::Vol2ColFunctor<Place, T> vol2col;
-    // use col_shape in the vol2col calculation
-    framework::DDim col_shape = {input_channels / groups,
-                                 filter_depth,
-                                 filter_height,
-                                 filter_width,
-                                 output_depth,
-                                 output_height,
-                                 output_width};
    // use col_matrix_shape in the gemm calculation
-    framework::DDim col_matrix_shape = {
+    // size: (i_c/g * k_h * k_w, o_h * o_w)
-        input_channels / groups * filter_depth * filter_height * filter_width,
+    // or
-        output_depth * output_height * output_width};
+    // (i_c/g * k_d * k_h * k_w, o_d * o_h * o_w)
-    Tensor col;
+    framework::DDim col_matrix_shape =
-    col.mutable_data<T>(col_shape, context.GetPlace());
+        framework::flatten_to_2d(col_shape, filter_shape_vec.size() + 1);
-    // col_matrix shares the same piece of data with col,
-    // but will be reshaped into a two-dimensional matrix shape
-    // to call the matrix multiplication interface.
-    Tensor col_matrix = col;
-    col_matrix.Resize(col_matrix_shape);
-    framework::DDim input_shape = {
+    framework::DDim input_shape = framework::slice_ddim(
-        input->dims()[1], input->dims()[2], input->dims()[3],
+        input->dims(), 1, static_cast<int>(input->dims().size()));
-        input->dims()[4]};  // channel, depth, height, width
-    framework::DDim filter_matrix_shape = {
+    framework::DDim filter_matrix_shape = {filter.dims()[0],
-        filter.dims()[0],
+                                           filter.numel() / filter.dims()[0]};
-        filter.numel() / filter.dims()[0]};  // filter_out_channel,
-    // filter_in_channel*filter_depth*filter_height*filter_width
    filter.Resize(filter_matrix_shape);
    framework::DDim output_matrix_shape = {
-        output_channels, output_depth * output_height * output_width};
+        output_grad->dims()[1],
+        output_grad->numel() /
-    // convolution operator: vol2col + gemm
+            (output_grad->dims()[0] * output_grad->dims()[1])};
-    int in_step = input_channels / groups;
-    int out_step = output_channels / groups;
-    for (int i = 0; i < batch_size; i++) {
-      Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
-      Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
-      for (int g = 0; g < groups; g++) {
-        // vol2col
-        Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-        vol2col(context.device_context(), in_slice, col, strides[0], strides[1],
-                strides[2], paddings[0], paddings[1], paddings[2]);
-        // gemm
-        Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
-        Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-        math::matmul<Place, T>(context.device_context(), filter_slice, false,
-                               col_matrix, false, T(1.0), &out_slice, T(0.0));
-      }
-    }
-  }
-};
-template <typename Place, typename T>
-class GemmConvGrad3DKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* input = context.Input<Tensor>("Input");
-    const Tensor* output_grad =
-        context.Input<Tensor>(framework::GradVarName("Output"));
-    Tensor* input_grad =
-        context.Output<Tensor>(framework::GradVarName("Input"));
-    Tensor* filter_grad =
-        context.Output<Tensor>(framework::GradVarName("Filter"));
-    // The filter and filter_grad will be reshaped in the calculations,
-    // so here use an assignment operation,
-    // that avoids modifying the variable in the Scope.
-    Tensor filter = *context.Input<Tensor>("Filter");
-    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    int groups = context.Attr<int>("groups");
-    int batch_size = input->dims()[0];
+    // convolution backward input operator:  gemm + col2im(or col2vol)
-    int input_channels = input->dims()[1];
+    // convolution backward weight operator: im2col(or vol2col) + gemm
-    int filter_depth = filter.dims()[filter.dims().size() - 3];
+    int in_step = static_cast<int>(input->dims()[1]) / groups;
-    int filter_height = filter.dims()[filter.dims().size() - 2];
+    int out_step = static_cast<int>(output_grad->dims()[1]) / groups;
-    int filter_width = filter.dims()[filter.dims().size() - 1];
-    int output_channels = output_grad->dims()[1];
-    int output_depth = output_grad->dims()[2];
-    int output_height = output_grad->dims()[3];
-    int output_width = output_grad->dims()[4];
-    math::Col2VolFunctor<Place, T> col2vol;
-    math::Vol2ColFunctor<Place, T> vol2col;
-    // use col_shape in the vol2col and col2vol calculation
-    framework::DDim col_shape = {input_channels / groups,
-                                 filter_depth,
-                                 filter_height,
-                                 filter_width,
-                                 output_depth,
-                                 output_height,
-                                 output_width};
-    // use col_matrix_shape in the gemm calculation
-    framework::DDim col_matrix_shape = {
-        input_channels / groups * filter_depth * filter_height * filter_width,
-        output_depth * output_height * output_width};
    Tensor col;
-    col.mutable_data<T>(col_shape, context.GetPlace());
    // col_matrix shares the same piece of data with col,
    // but will be reshaped into a two-dimensional matrix shape
    // to call the matrix multiplication interface.
-    Tensor col_matrix = col;
+    Tensor col_matrix;
+    col.mutable_data<T>(col_shape, context.GetPlace());
+    col_matrix.ShareDataWith(col);
    col_matrix.Resize(col_matrix_shape);
-    framework::DDim input_shape = {
-        input->dims()[1], input->dims()[2], input->dims()[3],
-        input->dims()[4]};  // channel, depth, height, width
-    framework::DDim output_matrix_shape = {output_grad->dims()[1],
-                                           output_grad->dims()[2] *
-                                               output_grad->dims()[3] *
-                                               output_grad->dims()[4]};
-    framework::DDim filter_matrix_shape = {
-        filter.dims()[0],
-        filter.numel() / filter.dims()[0]};  // filter_out_channel,
-    // filter_in_channel*filter_depth*filter_height*filter_width
-    filter.Resize(filter_matrix_shape);
-    // convolution backward input operator:  gemm + col2vol
-    // convolution backward weight operator: vol2col + gemm
-    int in_step = input_channels / groups;
-    int out_step = output_channels / groups;
    math::SetConstant<Place, T> set_zero;
    if (input_grad) {
@@ -421,16 +257,25 @@ class GemmConvGrad3DKernel : public framework::OpKernel<T> {
          math::matmul<Place, T>(context.device_context(), filter_slice, true,
                                 out_grad_slice, false, T(1.0), &col_matrix,
                                 T(0.0));
+          // col2im (or col2vol)
-          // col2vol
          Tensor in_grad_slice =
              in_grad_batch.Slice(g * in_step, (g + 1) * in_step);
+          if (filter_shape_vec.size() == 2) {
+            math::Col2ImFunctor<math::ColFormat::kCFO, Place, T> col2im;
+            col2im(context.device_context(), in_grad_slice, col, strides[0],
+                   strides[1], paddings[0], paddings[0], paddings[1],
+                   paddings[1]);
+          } else if (filter_shape_vec.size() == 3) {
+            math::Col2VolFunctor<Place, T> col2vol;
            col2vol(context.device_context(), in_grad_slice, col, strides[0],
                    strides[1], strides[2], paddings[0], paddings[1],
                    paddings[2]);
          }
        }
      }
+    }
    if (filter_grad) {
      filter_grad->mutable_data<T>(context.GetPlace());
@@ -443,13 +288,22 @@ class GemmConvGrad3DKernel : public framework::OpKernel<T> {
            output_grad->Slice(i, i + 1).Resize(output_matrix_shape);
        Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
        for (int g = 0; g < groups; g++) {
-          // vol2col
+          // im2col (or vol2col)
          Tensor out_grad_slice =
              out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
          Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
+          if (filter_shape_vec.size() == 2) {
+            math::Im2ColFunctor<math::ColFormat::kCFO, Place, T> im2col;
+            im2col(context.device_context(), in_slice, col, strides[0],
+                   strides[1], paddings[0], paddings[0], paddings[1],
+                   paddings[1]);
+          } else if (filter_shape_vec.size() == 3) {
+            math::Vol2ColFunctor<Place, T> vol2col;
            vol2col(context.device_context(), in_slice, col, strides[0],
                    strides[1], strides[2], paddings[0], paddings[1],
                    paddings[2]);
+          }
          // gemm
          Tensor filter_grad_slice =
@@ -462,6 +316,5 @@ class GemmConvGrad3DKernel : public framework::OpKernel<T> {
    }
  }
 };
 }  // namespace operators
 }  // namespace paddle
--- a/python/paddle/v2/framework/tests/test_conv2d_op.py
+++ b/python/paddle/v2/framework/tests/test_conv2d_op.py
@@ -61,25 +61,23 @@ class TestConv2dOp(OpTest):
    def test_check_grad(self):
        self.check_grad(
-            set(['Input', 'Filter']), 'Output', max_relative_error=0.05)
+            set(['Input', 'Filter']), 'Output', max_relative_error=0.02)
    def test_check_grad_no_filter(self):
        self.check_grad(
            ['Input'],
            'Output',
-            max_relative_error=0.05,
+            max_relative_error=0.02,
            no_grad_set=set(['Filter']))
    def test_check_grad_no_input(self):
        self.check_grad(
            ['Filter'],
            'Output',
-            max_relative_error=0.05,
+            max_relative_error=0.02,
            no_grad_set=set(['Input']))
    def init_test_case(self):
-        # self.groups = 1
-        # self.op_type = "conv2d"
        self.pad = [0, 0]
        self.stride = [1, 1]
        self.dilations = [1, 1]

--- a/python/paddle/v2/framework/tests/test_conv3d_op.py
+++ b/python/paddle/v2/framework/tests/test_conv3d_op.py
@@ -64,20 +64,20 @@ class TestConv3dOp(OpTest):
    def test_check_grad(self):
        self.check_grad(
-            set(['Input', 'Filter']), 'Output', max_relative_error=0.05)
+            set(['Input', 'Filter']), 'Output', max_relative_error=0.03)
    def test_check_grad_no_filter(self):
        self.check_grad(
            ['Input'],
            'Output',
-            max_relative_error=0.05,
+            max_relative_error=0.03,
            no_grad_set=set(['Filter']))
    def test_check_grad_no_input(self):
        self.check_grad(
            ['Filter'],
            'Output',
-            max_relative_error=0.05,
+            max_relative_error=0.03,
            no_grad_set=set(['Input']))
    def init_test_case(self):