Merge pull request #4709 from chengduoZH/Add_conv3d_gemm_op

Add 3D Convolution operator implemented by GEMM.

Merge pull request #4709 from chengduoZH/Add_conv3d_gemm_op
Add 3D Convolution operator implemented by GEMM.
f78731c8 · chengduo · GitHub · c8f1389b · 87eb0714 · f78731c8
8 changed file
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -69,6 +69,13 @@ function(op_library TARGET)
        file(APPEND ${pybind_file} "USE_OP(max_pool2d_with_index);\n")
    endif()

+    # conv_op contains several operators
+    if ("${TARGET}" STREQUAL "conv_op")
+        set(pybind_flag 1)
+        # It's enough to just adding one operator to pybind
+        file(APPEND ${pybind_file} "USE_OP(conv2d);\n")
+    endif()
+
    # conv_transpose_op contains several operators
    if ("${TARGET}" STREQUAL "conv_transpose_op")
        set(pybind_flag 1)
@@ -146,6 +153,7 @@ set(DEPS_OPS
    sum_op
    pool_op
    pool_with_index_op
+    conv_op
    lstm_op
    conv_transpose_op
    nccl_op
@@ -158,6 +166,7 @@ set(DEPS_OPS
 op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
 op_library(cross_entropy_op DEPS cross_entropy)
 op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
+op_library(conv_op DEPS vol2col)
 op_library(sum_op DEPS net_op selected_rows_functor)
 op_library(pool_op DEPS pooling)
 op_library(pool_with_index_op DEPS pooling)

--- a/paddle/operators/conv_cudnn_op.cc
+++ b/paddle/operators/conv_cudnn_op.cc
@@ -12,7 +12,7 @@
   See the License for the specific language governing permissions and
   limitations under the License. */

-#include "paddle/operators/conv2d_op.h"
+#include "paddle/operators/conv_op.h"

 namespace paddle {
 namespace operators {
@@ -38,10 +38,11 @@ class CudnnConvOpMaker : public Conv2DOpMaker {
 }  // namespace paddle

 namespace ops = paddle::operators;
-REGISTER_OP(conv_cudnn, ops::Conv2DOp, ops::CudnnConvOpMaker, conv_cudnn_grad,
-            ops::Conv2DOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    conv_cudnn, ops::GemmConv2DKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP(conv_cudnn, ops::ConvOp, ops::CudnnConvOpMaker, conv_cudnn_grad,
+            ops::ConvOpGrad);
+
+REGISTER_OP_CPU_KERNEL(conv_cudnn,
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
    conv_cudnn_grad,
-    ops::GemmConvGrad2DKernel<paddle::platform::CPUPlace, float>);
+    ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/conv_cudnn_op.cu
+++ b/paddle/operators/conv_cudnn_op.cu
@@ -15,7 +15,7 @@
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/memory/memory.h"
-#include "paddle/operators/conv2d_op.h"
+#include "paddle/operators/conv_op.h"
 #include "paddle/platform/assert.h"
 #include "paddle/platform/cudnn_helper.h"


--- a/paddle/operators/conv2d_op.cc
+++ b/paddle/operators/conv2d_op.cc
@@ -12,18 +12,18 @@
   See the License for the specific language governing permissions and
   limitations under the License. */

-#include "paddle/operators/conv2d_op.h"
+#include "paddle/operators/conv_op.h"

 namespace paddle {
 namespace operators {

-void Conv2DOp::InferShape(framework::InferShapeContext* ctx) const {
+void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
  PADDLE_ENFORCE(ctx->HasInput("Input"),
-                 "Input(Input) of Conv2DOp should not be null.");
+                 "Input(Input) of ConvOp should not be null.");
  PADDLE_ENFORCE(ctx->HasInput("Filter"),
-                 "Input(Filter) of Conv2DOp should not be null.");
+                 "Input(Filter) of ConvOp should not be null.");
  PADDLE_ENFORCE(ctx->HasOutput("Output"),
-                 "Output(Output) of Conv2DOp should not be null.");
+                 "Output(Output) of ConvOp should not be null.");

  auto in_dims = ctx->GetInputDim("Input");
  auto filter_dims = ctx->GetInputDim("Filter");
@@ -33,8 +33,17 @@ void Conv2DOp::InferShape(framework::InferShapeContext* ctx) const {
  int input_channels = in_dims[1];
  int output_channels = filter_dims[0];

-  PADDLE_ENFORCE_EQ(in_dims.size(), 4, "Conv2DOp input should be 4-D.");
-  PADDLE_ENFORCE_EQ(filter_dims.size(), 4, "Conv2DOp filter should be 4-D.");
+  PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5,
+                 "Conv intput should be 4-D or 5-D tensor.");
+  PADDLE_ENFORCE_EQ(
+      in_dims.size(), filter_dims.size(),
+      "Conv input dimension and filter dimension should be the same.");
+  PADDLE_ENFORCE(
+      in_dims.size() - strides.size() == 2U,
+      "Conv input dimension and strides dimension should be consistent.");
+  PADDLE_ENFORCE_EQ(
+      paddings.size(), strides.size(),
+      "Conv paddings dimension and Conv strides dimension should be the same.");
  PADDLE_ENFORCE_EQ(input_channels, filter_dims[1] * groups,
                    "The number of input channels should be equal to filter "
                    "channels * groups.");
@@ -42,12 +51,12 @@ void Conv2DOp::InferShape(framework::InferShapeContext* ctx) const {
      output_channels % groups, 0,
      "The number of output channels should be divided by groups.");

-  auto output_height =
-      OutputSize(in_dims[2], filter_dims[2], paddings[0], strides[0]);
-  auto output_width =
-      OutputSize(in_dims[3], filter_dims[3], paddings[1], strides[1]);
-  ctx->SetOutputDim("Output",
-                    {in_dims[0], filter_dims[0], output_height, output_width});
+  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
+  for (size_t i = 0; i < paddings.size(); ++i) {
+    output_shape.push_back(OutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                      paddings[i], strides[i]));
+  }
+  ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
 }

 Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
@@ -55,19 +64,19 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput(
      "Input",
-      "The input tensor of convolution operator. "
+      "(Tensor) The input tensor of convolution operator. "
      "The format of input tensor is NCHW, where N is batch size, C is the "
-      "number of channels, H is the height of the image, "
-      "and W is the width of the image.");
+      "number of channels, H is the height of the feature, "
+      "and W is the width of the feature.");
  AddInput("Filter",
-           "The filter tensor of convolution operator. "
+           "(Tensor) The filter tensor of convolution operator. "
           "The format of the filter tensor is MCHW, where M is the number of "
           "output image channels, C is the number of input image channels, "
           "H is the height of the filter, and W is the width of the filter. "
           "If the groups attribute is greater than 1, C equals the number of "
           "input image channels divided by the groups.");
  AddOutput("Output",
-            "The output tensor of convolution operator. "
+            "(Tensor) The output tensor of convolution operator. "
            "The format of output tensor is also NCHW.");
  AddAttr<std::vector<int>>("strides", "strides of convolution operator.")
      .SetDefault({1, 1});
@@ -75,7 +84,7 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
      .SetDefault({0, 0});
  AddAttr<int>(
      "groups",
-      "Group size of convolution operator. "
+      "(int default:1), the group size of convolution operator. "
      "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: "
      "when group=2, the first half of the filters is only connected to the "
      "first half of the input channels, while the second half of the filters "
@@ -84,14 +93,91 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
  AddComment(R"DOC(
 Convolution Operator.

-The convolution operation calculates the output based on the input, filter, 
-strides, paddings, and groups parameters. The size of each dimension of the
-parameters is checked in the infer-shape method.
+The convolution operation calculates the output based on the input, filter
+and strides, paddings, groups parameters. The size of each dimension of the
+parameters is checked in the infer-shape.
+Input(Input, Filter) and output(Output) are in NCHW format. Where N is batch
+size, C is the number of channels, H is the height of the feature, and W is
+the width of the feature. Parameters(ksize, strides, paddings) are two elements.
+These two elements represent height and width, respectively.
+The input(X) size and output(Out) size may be different.
+
+Example:
+  Input:
+       Input shape: (N, C_in, H_in, W_in)
+       Filter shape: (C_out, C_in, H_f, W_f)
+  Output:
+       Output shape: (N, C_out, H_out, W_out)
+  where
+       H_out = (H_in - filter_size[0] + 2 * paddings[0]) / strides[0] + 1;
+       W_out = (W_in - filter_size[1] + 2 * paddings[1]) / strides[1] + 1;
+)DOC");
+}

+Conv3DOpMaker::Conv3DOpMaker(framework::OpProto* proto,
+                             framework::OpAttrChecker* op_checker)
+    : OpProtoAndCheckerMaker(proto, op_checker) {
+  AddInput(
+      "Input",
+      "(Tensor) The input tensor of convolution operator. "
+      "The format of input tensor is NCDHW. Where N is batch size, C is the "
+      "number of channels, D is the depth of the feature, H is the height of "
+      "the feature, "
+      "and W is the width of the feature.");
+  AddInput("Filter",
+           "(Tensor) The filter tensor of convolution operator. "
+           "The format of the filter tensor is MCDHW, where M is the number of "
+           "output image channels, C is the number of input image channels, "
+           "D is the depth of the filter, H is the height of the filter, and W "
+           "is the width of the filter."
+           "If the groups attribute is greater than 1, C equals the number of "
+           "input image channels divided by the groups.");
+  AddOutput("Output",
+            "(Tensor) The output tensor of convolution operator."
+            "The format of output tensor is also NCDHW.");
+  AddAttr<std::vector<int>>(
+      "strides",
+      "(vector, default:{0, 0, 0}), the strides of convolution operator.")
+      .SetDefault({1, 1, 1});
+  AddAttr<std::vector<int>>(
+      "paddings",
+      "(vector, default:{0, 0, 0}), the paddings of convolution operator.")
+      .SetDefault({0, 0, 0});
+  AddAttr<int>(
+      "groups",
+      "(int default:1), the group size of convolution operator. "
+      "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: "
+      "when group=2, the first half of the filters is only connected to the "
+      "first half of the input channels, while the second half of the filters "
+      "is only connected to the second half of the input channels.")
+      .SetDefault(1);
+
+  AddComment(R"DOC(
+Convolution3D Operator.
+
+The convolution operation calculates the output based on the input, filter
+and strides, paddings, groups parameters. The size of each dimension of the
+parameters is checked in the infer-shape.
+Input(Input, Filter) and output(Output) are in NCDHW format. Where N is batch
+size, C is the number of channels,D is the depth of the feature, H is the height of
+the feature, and W is the width of the feature. Parameters(ksize, strides, paddings)
+are three elements. These three elements represent depth, height and width, respectively.
+The input(X) size and output(Out) size may be different.
+
+Example:
+  Input:
+       Input shape: (N, C_in, D_in, H_in, W_in)
+       Filter shape: (C_out, C_in, D_f, H_f, W_f)
+  Output:
+       Output shape: (N, C_out, D_out, H_out, W_out)
+  where
+       D_out = (D_in - filter_size[0] + 2 * paddings[0]) / strides[0] + 1;
+       H_out = (H_in - filter_size[1] + 2 * paddings[1]) / strides[1] + 1;
+       W_out = (W_in - filter_size[2] + 2 * paddings[2]) / strides[2] + 1;
 )DOC");
 }

-void Conv2DOpGrad::InferShape(framework::InferShapeContext* ctx) const {
+void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const {
  auto in_dims = ctx->GetInputDim("Input");
  auto filter_dims = ctx->GetInputDim("Filter");
  if (ctx->HasOutput(framework::GradVarName("Input"))) {
@@ -106,10 +192,18 @@ void Conv2DOpGrad::InferShape(framework::InferShapeContext* ctx) const {
 }  // namespace paddle

 namespace ops = paddle::operators;
-REGISTER_OP(conv2d, ops::Conv2DOp, ops::Conv2DOpMaker, conv2d_grad,
-            ops::Conv2DOpGrad);
+REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad,
+            ops::ConvOpGrad);
+namespace ops = paddle::operators;
+REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad,
+            ops::ConvOpGrad);

+REGISTER_OP_CPU_KERNEL(conv2d,
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
-    conv2d, ops::GemmConv2DKernel<paddle::platform::CPUPlace, float>);
+    conv2d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>);
+
+REGISTER_OP_CPU_KERNEL(conv3d,
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
-    conv2d_grad, ops::GemmConvGrad2DKernel<paddle::platform::CPUPlace, float>);
+    conv3d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/conv2d_op.cu
+++ b/paddle/operators/conv2d_op.cu
@@ -12,11 +12,16 @@
   See the License for the specific language governing permissions and
   limitations under the License. */

-#include "paddle/operators/conv2d_op.h"
+#include "paddle/operators/conv_op.h"

 namespace ops = paddle::operators;

+REGISTER_OP_GPU_KERNEL(conv2d,
+                       ops::GemmConvKernel<paddle::platform::GPUPlace, float>);
 REGISTER_OP_GPU_KERNEL(
-    conv2d, ops::GemmConv2DKernel<paddle::platform::GPUPlace, float>);
+    conv2d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>);
+
+REGISTER_OP_GPU_KERNEL(conv3d,
+                       ops::GemmConvKernel<paddle::platform::GPUPlace, float>);
 REGISTER_OP_GPU_KERNEL(
-    conv2d_grad, ops::GemmConvGrad2DKernel<paddle::platform::GPUPlace, float>);
+    conv3d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/conv2d_op.h
+++ b/paddle/operators/conv2d_op.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/math/im2col.h"
 #include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/vol2col.h"

 namespace paddle {
 namespace operators {
@@ -40,14 +41,20 @@ class Conv2DOpMaker : public framework::OpProtoAndCheckerMaker {
                framework::OpAttrChecker* op_checker);
 };

-class Conv2DOp : public framework::OperatorWithKernel {
+class Conv3DOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  Conv3DOpMaker(framework::OpProto* proto,
+                framework::OpAttrChecker* op_checker);
+};
+
+class ConvOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override;
 };

-class Conv2DOpGrad : public framework::OperatorWithKernel {
+class ConvOpGrad : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

@@ -55,7 +62,7 @@ class Conv2DOpGrad : public framework::OperatorWithKernel {
 };

 template <typename Place, typename T>
-class GemmConv2DKernel : public framework::OpKernel<T> {
+class GemmConvKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    const Tensor* input = context.Input<Tensor>("Input");
@@ -70,51 +77,78 @@ class GemmConv2DKernel : public framework::OpKernel<T> {
    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
    int groups = context.Attr<int>("groups");

-    int batch_size = input->dims()[0];
-    int input_channels = input->dims()[1];
-    int filter_height = filter.dims()[filter.dims().size() - 2];
-    int filter_width = filter.dims()[filter.dims().size() - 1];
-    int output_channels = output->dims()[1];
-    int output_height = output->dims()[2];
-    int output_width = output->dims()[3];
-
-    paddle::operators::math::Im2ColFunctor<
-        paddle::operators::math::ColFormat::kCFO, Place, T>
-        im2col;
+    const int batch_size = static_cast<int>(input->dims()[0]);
+
+    // filter_shape_vec: {k_h, k_w} or {k_d, k_h, k_w}
+    std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
+    filter_shape_vec.erase(filter_shape_vec.begin(),
+                           filter_shape_vec.begin() + 2);
+
+    // output_shape_vec: {o_h, o_w} or {o_d, o_h, o_w}
+    std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
+    output_shape_vec.erase(output_shape_vec.begin(),
+                           output_shape_vec.begin() + 2);
+
    // use col_shape in the im2col calculation
-    framework::DDim col_shape = {input_channels / groups, filter_height,
-                                 filter_width, output_height, output_width};
+    // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d,
+    // o_h, o_w}
+    std::vector<int64_t> col_shape_vec;
+    col_shape_vec.push_back(input->dims()[1] / groups);
+    col_shape_vec.insert(col_shape_vec.end(), filter_shape_vec.begin(),
+                         filter_shape_vec.end());
+    col_shape_vec.insert(col_shape_vec.end(), output_shape_vec.begin(),
+                         output_shape_vec.end());
+    framework::DDim col_shape(framework::make_ddim(col_shape_vec));
+
    // use col_matrix_shape in the gemm calculation
-    framework::DDim col_matrix_shape = {
-        input_channels / groups * filter_height * filter_width,
-        output_height * output_width};
+    // size: (i_c/g * k_h * k_w, o_h * o_w) or (i_c/g * k_d * k_h * k_w, o_d *
+    // o_h * o_w)
+    framework::DDim col_matrix_shape =
+        framework::flatten_to_2d(col_shape, filter_shape_vec.size() + 1);
+
    Tensor col;
    col.mutable_data<T>(col_shape, context.GetPlace());
    // col_matrix shares the same piece of data with col,
    // but will be reshaped into a two-dimensional matrix shape
    // to call the matrix multiplication interface.
-    Tensor col_matrix = col;
+    Tensor col_matrix;
+    col_matrix.ShareDataWith(col);
    col_matrix.Resize(col_matrix_shape);

-    framework::DDim input_shape = {input->dims()[1], input->dims()[2],
-                                   input->dims()[3]};
+    framework::DDim input_shape = framework::slice_ddim(
+        input->dims(), 1, static_cast<int>(input->dims().size()));
+
    framework::DDim filter_matrix_shape = {filter.dims()[0],
                                           filter.numel() / filter.dims()[0]};
    filter.Resize(filter_matrix_shape);

-    framework::DDim output_matrix_shape = {output_channels,
-                                           output_height * output_width};
-    // convolution operator: im2col + gemm
-    int in_step = input_channels / groups;
-    int out_step = output_channels / groups;
+    framework::DDim output_matrix_shape = {
+        output->dims()[1],
+        output->numel() / (output->dims()[0] * output->dims()[1])};
+
+    // convolution operator: im2col(or vol2col) + gemm
+    int in_step = static_cast<int>(input->dims()[1]) / groups;
+    int out_step = static_cast<int>(output->dims()[1]) / groups;
+
    for (int i = 0; i < batch_size; i++) {
      Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
      Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
      for (int g = 0; g < groups; g++) {
-        // im2col
        Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-        im2col(context.device_context(), in_slice, col, strides[0], strides[1],
-               paddings[0], paddings[0], paddings[1], paddings[1]);
+
+        if (filter_shape_vec.size() == 2) {
+          // im2col
+          math::Im2ColFunctor<math::ColFormat::kCFO, Place, T> im2col;
+          im2col(context.device_context(), in_slice, col, strides[0],
+                 strides[1], paddings[0], paddings[0], paddings[1],
+                 paddings[1]);
+        } else if (filter_shape_vec.size() == 3) {
+          // vol2col
+          math::Vol2ColFunctor<Place, T> vol2col;
+          vol2col(context.device_context(), in_slice, col, strides[0],
+                  strides[1], strides[2], paddings[0], paddings[1],
+                  paddings[2]);
+        }

        // gemm
        Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
@@ -127,7 +161,7 @@ class GemmConv2DKernel : public framework::OpKernel<T> {
 };

 template <typename Place, typename T>
-class GemmConvGrad2DKernel : public framework::OpKernel<T> {
+class GemmConvGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    const Tensor* input = context.Input<Tensor>("Input");
@@ -137,64 +171,79 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
        context.Output<Tensor>(framework::GradVarName("Input"));
    Tensor* filter_grad =
        context.Output<Tensor>(framework::GradVarName("Filter"));
-
    // The filter and filter_grad will be reshaped in the calculations,
    // so here use an assignment operation,
    // that avoids modifying the variable in the Scope.
    Tensor filter = *context.Input<Tensor>("Filter");

+    if (!input_grad && !filter_grad) return;
+
    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
    int groups = context.Attr<int>("groups");

-    int batch_size = input->dims()[0];
-    int input_channels = input->dims()[1];
-    int filter_height = filter.dims()[filter.dims().size() - 2];
-    int filter_width = filter.dims()[filter.dims().size() - 1];
-    int output_channels = output_grad->dims()[1];
-    int output_height = output_grad->dims()[2];
-    int output_width = output_grad->dims()[3];
-
-    paddle::operators::math::Col2ImFunctor<
-        paddle::operators::math::ColFormat::kCFO, Place, T>
-        col2im;
-    paddle::operators::math::Im2ColFunctor<
-        paddle::operators::math::ColFormat::kCFO, Place, T>
-        im2col;
-    // use col_shape in the im2col and col2im calculation
-    framework::DDim col_shape = {input_channels / groups, filter_height,
-                                 filter_width, output_height, output_width};
+    const int batch_size = static_cast<int>(input->dims()[0]);
+
+    // filter_shape_vec: {k_h, k_w} or {k_d, k_h, k_w}
+    std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
+    filter_shape_vec.erase(filter_shape_vec.begin(),
+                           filter_shape_vec.begin() + 2);
+
+    // output_shape_vec: {o_h, o_w} or {o_d, o_h, o_w}
+    std::vector<int64_t> output_shape_vec(
+        framework::vectorize(output_grad->dims()));
+    output_shape_vec.erase(output_shape_vec.begin(),
+                           output_shape_vec.begin() + 2);
+
+    // use col_shape in the im2col calculation
+    // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d,
+    // o_h, o_w}
+    std::vector<int64_t> col_shape_vec;
+    col_shape_vec.push_back(input->dims()[1] / groups);
+    col_shape_vec.insert(col_shape_vec.end(), filter_shape_vec.begin(),
+                         filter_shape_vec.end());
+    col_shape_vec.insert(col_shape_vec.end(), output_shape_vec.begin(),
+                         output_shape_vec.end());
+    framework::DDim col_shape(framework::make_ddim(col_shape_vec));
+
    // use col_matrix_shape in the gemm calculation
-    framework::DDim col_matrix_shape = {
-        input_channels / groups * filter_height * filter_width,
-        output_height * output_width};
-    Tensor col;
-    col.mutable_data<T>(col_shape, context.GetPlace());
-    // col_matrix shares the same piece of data with col,
-    // but will be reshaped into a two-dimensional matrix shape
-    // to call the matrix multiplication interface.
-    Tensor col_matrix = col;
-    col_matrix.Resize(col_matrix_shape);
+    // size: (i_c/g * k_h * k_w, o_h * o_w)
+    // or
+    // (i_c/g * k_d * k_h * k_w, o_d * o_h * o_w)
+    framework::DDim col_matrix_shape =
+        framework::flatten_to_2d(col_shape, filter_shape_vec.size() + 1);

-    framework::DDim input_shape = {input->dims()[1], input->dims()[2],
-                                   input->dims()[3]};
-    framework::DDim output_matrix_shape = {
-        output_grad->dims()[1],
-        output_grad->dims()[2] * output_grad->dims()[3]};
+    framework::DDim input_shape = framework::slice_ddim(
+        input->dims(), 1, static_cast<int>(input->dims().size()));

    framework::DDim filter_matrix_shape = {filter.dims()[0],
                                           filter.numel() / filter.dims()[0]};
    filter.Resize(filter_matrix_shape);

-    // convolution backward input operator:  gemm + col2im
-    // convolution backward weight operator: im2col + gemm
-    int in_step = input_channels / groups;
-    int out_step = output_channels / groups;
+    framework::DDim output_matrix_shape = {
+        output_grad->dims()[1],
+        output_grad->numel() /
+            (output_grad->dims()[0] * output_grad->dims()[1])};
+
+    // convolution backward input operator:  gemm + col2im(or col2vol)
+    // convolution backward weight operator: im2col(or vol2col) + gemm
+    int in_step = static_cast<int>(input->dims()[1]) / groups;
+    int out_step = static_cast<int>(output_grad->dims()[1]) / groups;
+
+    Tensor col;
+    // col_matrix shares the same piece of data with col,
+    // but will be reshaped into a two-dimensional matrix shape
+    // to call the matrix multiplication interface.
+    Tensor col_matrix;
+    col.mutable_data<T>(col_shape, context.GetPlace());
+    col_matrix.ShareDataWith(col);
+    col_matrix.Resize(col_matrix_shape);
+
+    math::SetConstant<Place, T> set_zero;

    if (input_grad) {
      input_grad->mutable_data<T>(context.GetPlace());
-      auto t = framework::EigenVector<T>::Flatten(*input_grad);
-      t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
+      set_zero(context.device_context(), input_grad, static_cast<T>(0));

      for (int i = 0; i < batch_size; i++) {
        Tensor out_grad_batch =
@@ -208,13 +257,22 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
          math::matmul<Place, T>(context.device_context(), filter_slice, true,
                                 out_grad_slice, false, T(1.0), &col_matrix,
                                 T(0.0));
-
          // col2im
          Tensor in_grad_slice =
              in_grad_batch.Slice(g * in_step, (g + 1) * in_step);
+
+          if (filter_shape_vec.size() == 2) {
+            math::Col2ImFunctor<math::ColFormat::kCFO, Place, T> col2im;
            col2im(context.device_context(), in_grad_slice, col, strides[0],
                   strides[1], paddings[0], paddings[0], paddings[1],
                   paddings[1]);
+
+          } else if (filter_shape_vec.size() == 3) {
+            math::Col2VolFunctor<Place, T> col2vol;
+            col2vol(context.device_context(), in_grad_slice, col, strides[0],
+                    strides[1], strides[2], paddings[0], paddings[1],
+                    paddings[2]);
+          }
        }
      }
    }
@@ -223,8 +281,7 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
      filter_grad->mutable_data<T>(context.GetPlace());
      Tensor filter_grad_ = *filter_grad;
      filter_grad_.Resize(filter_matrix_shape);
-      auto t = framework::EigenVector<T>::Flatten(filter_grad_);
-      t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
+      set_zero(context.device_context(), filter_grad, static_cast<T>(0));

      for (int i = 0; i < batch_size; i++) {
        Tensor out_grad_batch =
@@ -235,9 +292,18 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
          Tensor out_grad_slice =
              out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
          Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
+
+          if (filter_shape_vec.size() == 2) {
+            math::Im2ColFunctor<math::ColFormat::kCFO, Place, T> im2col;
            im2col(context.device_context(), in_slice, col, strides[0],
                   strides[1], paddings[0], paddings[0], paddings[1],
                   paddings[1]);
+          } else if (filter_shape_vec.size() == 3) {
+            math::Vol2ColFunctor<Place, T> vol2col;
+            vol2col(context.device_context(), in_slice, col, strides[0],
+                    strides[1], strides[2], paddings[0], paddings[1],
+                    paddings[2]);
+          }

          // gemm
          Tensor filter_grad_slice =
@@ -250,6 +316,5 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
    }
  }
 };
-
 }  // namespace operators
 }  // namespace paddle
--- a/python/paddle/v2/framework/tests/test_conv2d_op.py
+++ b/python/paddle/v2/framework/tests/test_conv2d_op.py
@@ -61,25 +61,23 @@ class TestConv2dOp(OpTest):

    def test_check_grad(self):
        self.check_grad(
-            set(['Input', 'Filter']), 'Output', max_relative_error=0.05)
+            set(['Input', 'Filter']), 'Output', max_relative_error=0.02)

    def test_check_grad_no_filter(self):
        self.check_grad(
            ['Input'],
            'Output',
-            max_relative_error=0.05,
+            max_relative_error=0.02,
            no_grad_set=set(['Filter']))

    def test_check_grad_no_input(self):
        self.check_grad(
            ['Filter'],
            'Output',
-            max_relative_error=0.05,
+            max_relative_error=0.02,
            no_grad_set=set(['Input']))

    def init_test_case(self):
-        # self.groups = 1
-        # self.op_type = "conv2d"
        self.pad = [0, 0]
        self.stride = [1, 1]
        self.dilations = [1, 1]
@@ -103,6 +101,9 @@ class TestWithGroup(TestConv2dOp):
        self.op_type = "conv2d"


+#----------------Conv2dCudnn----------------
+
+
 class TestCudnn(TestConv2dOp):
    def init_group(self):
        self.groups = 1

--- a/python/paddle/v2/framework/tests/test_conv3d_op.py
+++ b/python/paddle/v2/framework/tests/test_conv3d_op.py
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def conv3d_forward_naive(input, filter, group, conv_param):
+    in_n, in_c, in_d, in_h, in_w = input.shape
+    out_c, f_c, f_d, f_h, f_w = filter.shape
+    assert f_c * group == in_c
+    assert np.mod(out_c, group) == 0
+    sub_out_c = out_c / group
+
+    stride, pad = conv_param['stride'], conv_param['pad']
+    out_d = 1 + (in_d + 2 * pad[0] - f_h) / stride[0]
+    out_h = 1 + (in_h + 2 * pad[1] - f_h) / stride[1]
+    out_w = 1 + (in_w + 2 * pad[2] - f_w) / stride[2]
+    out = np.zeros((in_n, out_c, out_d, out_h, out_w))
+
+    input_pad = np.pad(input, ((0, ), (0, ), (pad[0], ), (pad[1], ),
+                               (pad[2], )),
+                       mode='constant',
+                       constant_values=0)
+    for d in range(out_d):
+        for i in range(out_h):
+            for j in range(out_w):
+                for g in range(group):
+                    input_pad_masked = \
+                        input_pad[:, g * f_c:(g + 1) * f_c,
+                        d * stride[0]:d * stride[0] + f_d,
+                        i * stride[1]:i * stride[1] + f_h,
+                        j * stride[2]:j * stride[2] + f_w]
+                    f_sub = filter[g * sub_out_c:(g + 1) *
+                                   sub_out_c, :, :, :, :]
+                    for k in range(sub_out_c):
+                        out[:, g * sub_out_c + k, d, i, j] = \
+                            np.sum(input_pad_masked * f_sub[k, :, :, :, :],
+                                   axis=(1, 2, 3, 4))
+
+    return out
+
+
+class TestConv3dOp(OpTest):
+    def setUp(self):
+        self.init_group()
+        self.init_op_type()
+        self.init_test_case()
+
+        conv3d_param = {'stride': self.stride, 'pad': self.pad}
+        input = np.random.random(self.input_size).astype("float32")
+        filter = np.random.random(self.filter_size).astype("float32")
+        output = conv3d_forward_naive(input, filter, self.groups,
+                                      conv3d_param).astype("float32")
+
+        self.inputs = {'Input': input, 'Filter': filter}
+        self.attrs = {
+            'strides': self.stride,
+            'paddings': self.pad,
+            'groups': self.groups
+        }
+        self.outputs = {'Output': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(
+            set(['Input', 'Filter']), 'Output', max_relative_error=0.03)
+
+    def test_check_grad_no_filter(self):
+        self.check_grad(
+            ['Input'],
+            'Output',
+            max_relative_error=0.03,
+            no_grad_set=set(['Filter']))
+
+    def test_check_grad_no_input(self):
+        self.check_grad(
+            ['Filter'],
+            'Output',
+            max_relative_error=0.03,
+            no_grad_set=set(['Input']))
+
+    def init_test_case(self):
+        self.pad = [0, 0, 0]
+        self.stride = [1, 1, 1]
+        self.input_size = [2, 3, 4, 4, 4]  # NCDHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] / self.groups
+        self.filter_size = [6, f_c, 3, 3, 3]
+
+    def init_group(self):
+        self.groups = 1
+
+    def init_op_type(self):
+        self.op_type = "conv3d"
+
+
+class TestCase1(TestConv3dOp):
+    def init_test_case(self):
+        self.pad = [1, 1, 1]
+        self.stride = [1, 1, 1]
+        self.input_size = [2, 3, 4, 4, 4]  # NCDHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] / self.groups
+        self.filter_size = [6, f_c, 3, 3, 3]
+
+    def init_group(self):
+        self.groups = 1
+
+    def init_op_type(self):
+        self.op_type = "conv3d"
+
+
+class TestWithGroup1(TestConv3dOp):
+    def init_group(self):
+        self.groups = 3
+
+    def init_op_type(self):
+        self.op_type = "conv3d"
+
+
+class TestWithGroup2(TestCase1):
+    def init_group(self):
+        self.groups = 3
+
+    def init_op_type(self):
+        self.op_type = "conv3d"
+
+
+if __name__ == '__main__':
+    unittest.main()