diff --git a/paddle/fluid/operators/fold_op.cc b/paddle/fluid/operators/fold_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5fd9c70c04e8b09357b77e0fe7531e5908345463
--- /dev/null
+++ b/paddle/fluid/operators/fold_op.cc
@@ -0,0 +1,274 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#include "paddle/fluid/operators/fold_op.h"
+#include "paddle/fluid/operators/unfold_op.h"
+
+namespace paddle {
+namespace operators {
+
+class FoldOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE_EQ(
+        ctx->HasInput("X"), true,
+        platform::errors::NotFound("Input(X) of FoldOp should not be null"));
+    PADDLE_ENFORCE_EQ(
+        ctx->HasOutput("Y"), true,
+        platform::errors::NotFound("Output(Y) of FoldOp should not be null"));
+    auto in_dims = ctx->GetInputDim("X");
+    std::vector<int> output_sizes =
+        ctx->Attrs().Get<std::vector<int>>("output_sizes");
+    std::vector<int> kernel_sizes =
+        ctx->Attrs().Get<std::vector<int>>("kernel_sizes");
+    std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
+    std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+    std::vector<int> dilations =
+        ctx->Attrs().Get<std::vector<int>>("dilations");
+
+    PADDLE_ENFORCE_EQ(
+        output_sizes.size(), 2,
+        platform::errors::InvalidArgument(
+            "It is expected that output_sizes has 2 elements, but got %d",
+            output_sizes.size()));
+    PADDLE_ENFORCE_EQ(
+        kernel_sizes.size(), 2,
+        platform::errors::InvalidArgument(
+            "It is expected that kernel_sizes has 2 elements, but got %d",
+            kernel_sizes.size()));
+    PADDLE_ENFORCE_EQ(
+        strides.size(), 2,
+        platform::errors::InvalidArgument(
+            "It is expected that strides has 2 elements, but got %d",
+            strides.size()));
+    PADDLE_ENFORCE_EQ(
+        paddings.size(), 4,
+        platform::errors::InvalidArgument(
+            "It is expected that paddings has 4 elements, but got %d",
+            paddings.size()));
+    PADDLE_ENFORCE_EQ(
+        dilations.size(), 2,
+        platform::errors::InvalidArgument(
+            "It is expected that dilations has 2 elements, but got %d",
+            dilations.size()));
+
+    int output_height = output_sizes[0];
+    int output_width = output_sizes[1];
+    int kernel_height = kernel_sizes[0];
+    int kernel_width = kernel_sizes[1];
+    int dilation_height = dilations[0];
+    int dilation_width = dilations[1];
+    int stride_height = strides[0];
+    int stride_width = strides[1];
+
+    // check kernel_sizes
+    PADDLE_ENFORCE_GT(kernel_height, 0,
+                      platform::errors::InvalidArgument(
+                          "The `kernel_sizes` should be greater than zero, "
+                          "but received kernel_height: %d kernel_width: %d.",
+                          kernel_sizes[0], kernel_sizes[1]));
+    PADDLE_ENFORCE_GT(kernel_width, 0,
+                      platform::errors::InvalidArgument(
+                          "The `kernel_sizes` should be greater than zero, "
+                          "but received kernel_height: %d kernel_width: %d.",
+                          kernel_sizes[0], kernel_sizes[1]));
+    // check strides
+    PADDLE_ENFORCE_GT(stride_height, 0,
+                      platform::errors::InvalidArgument(
+                          "The `strides` should be greater than zero, "
+                          "but received strides_height: %d strides_width: %d.",
+                          strides[0], strides[1]));
+    PADDLE_ENFORCE_GT(stride_width, 0,
+                      platform::errors::InvalidArgument(
+                          "The `strides` should be greater than zero, "
+                          "but received strides_height: %d strides_width: %d.",
+                          strides[0], strides[1]));
+    // check dilations
+    PADDLE_ENFORCE_GT(
+        dilation_height, 0,
+        platform::errors::InvalidArgument(
+            "The `dilations` should be greater than zero, "
+            "but received dilations_height: %d dilations_width: %d.",
+            dilations[0], dilations[1]));
+    PADDLE_ENFORCE_GT(
+        dilation_width, 0,
+        platform::errors::InvalidArgument(
+            "The `dilations` should be greater than zero, "
+            "but received dilations_height: %d dilations_width: %d.",
+            dilations[0], dilations[1]));
+
+    std::vector<int> out_dims;
+    // batch_size
+    out_dims.push_back(in_dims[0]);
+    // output_plane
+    int output_channels = in_dims[1] / (kernel_width * kernel_height);
+    out_dims.push_back(output_channels);
+
+    int blocks_height = (output_sizes[0] + 2 * paddings[0] -
+                         (dilations[0] * (kernel_sizes[0] - 1) + 1)) /
+                            strides[0] +
+                        1;
+    int blocks_width = (output_sizes[1] + 2 * paddings[1] -
+                        (dilations[1] * (kernel_sizes[1] - 1) + 1)) /
+                           strides[1] +
+                       1;
+
+    // check the number of sliding blocks
+    PADDLE_ENFORCE_GT(
+        blocks_height, 0,
+        platform::errors::InvalidArgument(
+            "The sliding blocks calculated from output_sizes (%d, %d), "
+            "kernel_sizes (%d, %d), strides (%d, %d), dilations (%d, %d) "
+            "is (%d, %d), which should be a positive integer.",
+            output_sizes[0], output_sizes[1], kernel_sizes[0], kernel_sizes[1],
+            strides[0], strides[1], dilations[0], dilations[1], blocks_height,
+            blocks_width));
+
+    PADDLE_ENFORCE_GT(
+        blocks_width, 0,
+        platform::errors::InvalidArgument(
+            "The sliding blocks calculated from output_sizes (%d, %d), "
+            "kernel_sizes (%d, %d), strides (%d, %d), dilations (%d, %d) "
+            "is (%d, %d), which should be a positive integer.",
+            output_sizes[0], output_sizes[1], kernel_sizes[0], kernel_sizes[1],
+            strides[0], strides[1], dilations[0], dilations[1], blocks_height,
+            blocks_width));
+
+    PADDLE_ENFORCE_EQ(
+        blocks_height * blocks_width, in_dims[2],
+        platform::errors::InvalidArgument(
+            "Given output_sizes (%d, %d), kernel_sizes (%d, %d), "
+            "strides (%d, %d) and dilations (%d, %d), the input's "
+            "dimension 2 should match the calculated number of sliding "
+            "blocks %d * %d = %d, but got %d.",
+            output_sizes[0], output_sizes[1], kernel_sizes[0], kernel_sizes[1],
+            strides[0], strides[1], dilations[0], dilations[1], blocks_height,
+            blocks_width, blocks_height * blocks_width, in_dims[2]));
+
+    out_dims.push_back(output_height);
+    out_dims.push_back(output_width);
+    ctx->SetOutputDim("Y", framework::make_ddim(out_dims));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"),
+        ctx.device_context());
+  }
+};
+
+class FoldOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "Tensor, "
+             "the input of fold op. "
+             "The format of X is [N, C_in, L], "
+             "where N is the batch size, C_in is the input channels, "
+             "and L is the length.");
+    AddOutput("Y",
+              "Tensor, "
+              "the output of fold op. "
+              "The format of Y is [N, C_out, output_height, output_width], "
+              "where N is the batch size, "
+              "C_out is the output channels of Y, and output_height and "
+              "output_width are the calculated height and width of the "
+              "output feature map.");
+    AddAttr<std::vector<int>>(
+        "output_sizes",
+        "vector<int>, the output sizes of the convolution operator.");
+    AddAttr<std::vector<int>>(
+        "kernel_sizes",
+        "vector<int>, the kernel sizes of the convolution operator.");
+    AddAttr<std::vector<int>>(
+        "strides", "vector<int>, the strides of the convolution operator.");
+    AddAttr<std::vector<int>>(
+        "paddings",
+        "vector<int>, the paddings applied to pad the feature map.");
+    AddAttr<std::vector<int>>(
+        "dilations",
+        "vector<int>, the dilations of the convolution operator.");
+    AddComment(R"DOC(
+**Fold Operator**
+
+This operator combines an array of sliding local blocks into a large containing
+tensor, which is also known as col2im when operated on a batched 2D image
+tensor. Fold calculates each combined value in the resulting large tensor by
+summing all values from all containing blocks. Unfold extracts the values in
+the local blocks by copying from the large tensor, so if the blocks overlap,
+Fold and Unfold are not inverses of each other.
+    )DOC");
+  }
+};
+
+class FoldGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE_EQ(
+        ctx->HasInput(framework::GradVarName("Y")), true,
+        platform::errors::NotFound("The gradient of Y should not be null"));
+    PADDLE_ENFORCE_EQ(
+        ctx->HasInput("X"), true,
+        platform::errors::NotFound("The input X should not be null"));
+    PADDLE_ENFORCE_EQ(
+        ctx->HasOutput(framework::GradVarName("X")), true,
+        platform::errors::NotFound("The gradient of X should not be null"));
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
+                                       ctx, framework::GradVarName("Y")),
+                                   ctx.device_context());
+  }
+};
+
+template <typename T>
+class FoldGradMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetType("fold_grad");
+    op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y"));
+    op->SetInput("X", this->Input("X"));
+    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
+    op->SetAttrMap(this->Attrs());
+  }
+};
+
+DECLARE_NO_NEED_BUFFER_VARS_INFERER(FoldGradOpNoNeedBufferVarsInferer, "X");
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(fold, ops::FoldOp, ops::FoldOpMaker,
+                  ops::FoldGradMaker<paddle::framework::OpDesc>,
+                  ops::FoldGradMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(fold_grad, ops::FoldGradOp,
+                  ops::FoldGradOpNoNeedBufferVarsInferer);
+
+REGISTER_OP_CPU_KERNEL(
+    fold, ops::FoldOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::FoldOpKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    fold_grad,
+    ops::FoldGradOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::FoldGradOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/fold_op.cu b/paddle/fluid/operators/fold_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b2aa0728c6251e104e30a7c554a8aee05a588dcc
--- /dev/null
+++ b/paddle/fluid/operators/fold_op.cu
@@ -0,0 +1,26 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/fold_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    fold, ops::FoldOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::FoldOpKernel<paddle::platform::CUDADeviceContext, double>);
+
+REGISTER_OP_CUDA_KERNEL(
+    fold_grad,
+    ops::FoldGradOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::FoldGradOpKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/fold_op.h b/paddle/fluid/operators/fold_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..d37edbfe803753b782a233052a946f30152cc524
--- /dev/null
+++ b/paddle/fluid/operators/fold_op.h
@@ -0,0 +1,131 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/im2col.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename DeviceContext, typename T>
+class FoldOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const Tensor* input = ctx.Input<Tensor>("X");
+    const int batch_size = static_cast<int>(input->dims()[0]);
+    Tensor* output = ctx.Output<Tensor>("Y");
+    output->mutable_data<T>(ctx.GetPlace());
+
+    std::vector<int> output_sizes = ctx.Attr<std::vector<int>>("output_sizes");
+    std::vector<int> kernel_sizes = ctx.Attr<std::vector<int>>("kernel_sizes");
+    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
+    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
+
+    math::Col2ImFunctor<math::ColFormat::kCFO, DeviceContext, T> col2im;
+
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+
+    auto input_dims = input->dims();
+
+    // number of sliding blocks along each spatial axis (col height/width)
+    int output_height = (output_sizes[0] + 2 * paddings[0] -
+                         (dilations[0] * (kernel_sizes[0] - 1) + 1)) /
+                            strides[0] +
+                        1;
+    int output_width = (output_sizes[1] + 2 * paddings[1] -
+                        (dilations[1] * (kernel_sizes[1] - 1) + 1)) /
+                           strides[1] +
+                       1;
+
+    int n_input_plane = input_dims[1];
+    int n_output_plane = n_input_plane / (kernel_sizes[0] * kernel_sizes[1]);
+
+    framework::DDim output_shape(
+        {n_output_plane, output_sizes[0], output_sizes[1]});
+
+    // col buffer per sample: [C_out, k_h, k_w, col_h, col_w]; the leading
+    // dimension must be the output plane count, not the batch size, for the
+    // reshape of one sample of X (numel C_in * L) to be valid.
+    framework::DDim input_matrix_shape({n_output_plane, kernel_sizes[0],
+                                        kernel_sizes[1], output_height,
+                                        output_width});
+    math::SetConstant<DeviceContext, T> set_zero;
+    set_zero(dev_ctx, output, static_cast<T>(0));
+
+    for (int i = 0; i < batch_size; i++) {
+      Tensor out_batch =
+          output->Slice(i, i + 1).Resize(output_shape);  // im: [C_out, H, W]
+      Tensor in_batch =
+          input->Slice(i, i + 1).Resize(input_matrix_shape);  // 5-D col
+      col2im(dev_ctx, in_batch, dilations, strides, paddings, &out_batch);
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class FoldGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const Tensor* output_grad = ctx.Input<Tensor>(framework::GradVarName("Y"));
+    Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+
+    // check for null before touching either tensor
+    if ((!output_grad) || (!input_grad)) return;
+
+    input_grad->mutable_data<T>(ctx.GetPlace());
+
+    std::vector<int> output_sizes = ctx.Attr<std::vector<int>>("output_sizes");
+    std::vector<int> kernel_sizes = ctx.Attr<std::vector<int>>("kernel_sizes");
+    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
+    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
+
+    const int batch_size = static_cast<int>(input_grad->dims()[0]);
+
+    auto input_dims = input_grad->dims();
+
+    // number of sliding blocks along each spatial axis (col height/width)
+    int output_height = (output_sizes[0] + 2 * paddings[0] -
+                         (dilations[0] * (kernel_sizes[0] - 1) + 1)) /
+                            strides[0] +
+                        1;
+    int output_width = (output_sizes[1] + 2 * paddings[1] -
+                        (dilations[1] * (kernel_sizes[1] - 1) + 1)) /
+                           strides[1] +
+                       1;
+
+    int n_input_plane = input_dims[1];
+    int n_output_plane = n_input_plane / (kernel_sizes[0] * kernel_sizes[1]);
+
+    framework::DDim output_shape(
+        {n_output_plane, output_sizes[0], output_sizes[1]});
+    framework::DDim input_matrix_shape({n_output_plane, kernel_sizes[0],
+                                        kernel_sizes[1], output_height,
+                                        output_width});
+
+    math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col;
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+
+    for (int i = 0; i < batch_size; i++) {
+      Tensor out_grad_batch = output_grad->Slice(i, i + 1).Resize(output_shape);
+      Tensor in_grad_batch =
+          input_grad->Slice(i, i + 1).Resize(input_matrix_shape);
+      im2col(dev_ctx, out_grad_batch, dilations, strides, paddings,
+             &in_grad_batch);
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
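Reviewer note: for readers unfamiliar with math::Col2ImFunctor, the per-sample accumulation it performs is equivalent to the following NumPy sketch (illustrative only, not the actual functor; it assumes symmetric padding and mirrors the reference loop in the unit test below).

    import numpy as np

    # Illustrative NumPy equivalent of one col2im call above (hypothetical).
    def col2im(col, output_size, kernel, stride, padding, dilation):
        # col: [C_out * k_h * k_w, blocks_h, blocks_w], one sample of X reshaped
        k_h, k_w = kernel
        out_h, out_w = output_size
        c_out = col.shape[0] // (k_h * k_w)
        im = np.zeros((c_out, out_h, out_w), dtype=col.dtype)
        for c in range(col.shape[0]):
            w_off = c % k_w
            h_off = (c // k_w) % k_h
            c_im = c // (k_w * k_h)
            for h in range(col.shape[1]):
                h_im = h * stride[0] - padding[0] + h_off * dilation[0]
                for w in range(col.shape[2]):
                    w_im = w * stride[1] - padding[1] + w_off * dilation[1]
                    # overlapping block entries are summed into the image
                    if 0 <= h_im < out_h and 0 <= w_im < out_w:
                        im[c_im, h_im, w_im] += col[c, h, w]
        return im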
diff --git a/python/paddle/fluid/tests/unittests/test_fold_op.py b/python/paddle/fluid/tests/unittests/test_fold_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..14a59b413383f81959e9854d5735a87f7ff728cc
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fold_op.py
@@ -0,0 +1,204 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import math
+import numpy as np
+import unittest
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import core
+
+paddle.enable_static()
+
+
+class TestFoldOp(OpTest):
+    """
+    Tests for the fold op.
+    """
+
+    def init_data(self):
+        self.batch_size = 3
+        self.input_channels = 3 * 2 * 2
+        self.length = 12
+        self.kernel_sizes = [2, 2]
+        self.strides = [1, 1]
+        self.paddings = [0, 0, 0, 0]
+        self.dilations = [1, 1]
+        self.output_sizes = [4, 5]
+        input_shape = [self.batch_size, self.input_channels, self.length]
+        self.x = np.random.rand(*input_shape).astype(np.float64)
+
+    def calc_fold(self):
+        output_shape = [0] * 4
+        output_shape[0] = self.batch_size
+        output_shape[1] = int(self.input_channels /
+                              (self.kernel_sizes[0] * self.kernel_sizes[1]))
+        output_shape[2] = self.output_sizes[0]
+        output_shape[3] = self.output_sizes[1]
+        dkernel_h = self.dilations[0] * (self.kernel_sizes[0] - 1) + 1
+        dkernel_w = self.dilations[1] * (self.kernel_sizes[1] - 1) + 1
+        col_height = int((self.output_sizes[0] + self.paddings[0] +
+                          self.paddings[2] - dkernel_h) / self.strides[0]) + 1
+        col_width = int((self.output_sizes[1] + self.paddings[1] +
+                         self.paddings[3] - dkernel_w) / self.strides[1]) + 1
+        output = np.zeros(output_shape).astype(np.float64)
+        ############ calculate output ##############
+        for b in range(output_shape[0]):
+            for c in range(self.input_channels):
+                w_offset = int(c % self.kernel_sizes[1])
+                h_offset = int(
+                    (c / self.kernel_sizes[1]) % self.kernel_sizes[0])
+                c_out = int(c / self.kernel_sizes[0] / self.kernel_sizes[1])
+                for h in range(col_height):
+                    h_out = int(h * self.strides[0] - self.paddings[0] +
+                                h_offset * self.dilations[0])
+                    for w in range(col_width):
+                        w_out = int(w * self.strides[1] - self.paddings[1] +
+                                    w_offset * self.dilations[1])
+                        if (h_out >= 0 and h_out < self.output_sizes[0]) and (
+                                w_out >= 0 and w_out < self.output_sizes[1]):
+                            output[b, c_out, h_out, w_out] += self.x[
+                                b, c, w + col_width * h]
+
+        self.outputs = output
+
+    def set_data(self):
+        self.init_data()
+        self.calc_fold()
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(self.x)}
+        self.attrs = {
+            'kernel_sizes': self.kernel_sizes,
+            'paddings': self.paddings,
+            'dilations': self.dilations,
+            'strides': self.strides,
+            'output_sizes': self.output_sizes
+        }
+        self.outputs = {'Y': self.outputs}
+
+    def setUp(self):
+        self.op_type = 'fold'
+        self.set_data()
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Y')
+
+
+class TestFoldAPI(TestFoldOp):
+    """
+    Tests for paddle.nn.Fold.
+    """
+
+    def setUp(self):
+        self.op_type = 'fold'
+        self.set_data()
+        self.places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(fluid.CUDAPlace(0))
+
+    def test_api(self):
+        for place in self.places:
+            with fluid.dygraph.guard(place):
+                input = paddle.to_tensor(self.x)
+                m = paddle.nn.Fold(**self.attrs)
+                m.eval()
+                result = m(input)
+                self.assertTrue(
+                    np.allclose(result.numpy(), self.outputs['Y']))
+
+    def test_info(self):
+        str(paddle.nn.Fold(**self.attrs))
+
+
+class TestFoldOpError(unittest.TestCase):
+    def test_errors(self):
+        from paddle.nn.functional import fold
+        from paddle.fluid.framework import Program, program_guard
+        with program_guard(Program(), Program()):
+
+            def test_input_shape():
+                # input must be 3-D
+                x = paddle.randn(shape=[2, 3, 6, 7], dtype="float32")
+                out = fold(x, output_sizes=[2, 3], kernel_sizes=[2, 2])
+
+            def test_kernel_shape():
+                # len(kernel_sizes) must be 2
+                x = paddle.randn(shape=[2, 6, 6], dtype="float32")
+                out = fold(x, output_sizes=[2, 3], kernel_sizes=[2, 2, 3])
+
+            def test_padding_shape():
+                # len(paddings) must be 2 or 4
+                x = paddle.randn(shape=[2, 6, 6], dtype="float32")
+                out = fold(
+                    x,
+                    output_sizes=[2, 3],
+                    kernel_sizes=[2, 2],
+                    paddings=[2, 2, 3])
+
+            def test_dilations_shape():
+                # len(dilations) must be 2
+                x = paddle.randn(shape=[2, 6, 6], dtype="float32")
+                out = fold(
+                    x,
+                    output_sizes=[2, 3],
+                    kernel_sizes=[2, 2],
+                    dilations=[2, 2, 3])
+
+            def test_strides_shape():
+                # len(strides) must be 2
+                x = paddle.randn(shape=[2, 6, 6], dtype="float32")
+                out = fold(
+                    x,
+                    output_sizes=[2, 3],
+                    kernel_sizes=[2, 2],
+                    strides=[2, 2, 3])
+
+            def test_output_size():
+                # the number of sliding blocks must equal L
+                x = paddle.randn(shape=[2, 6, 6], dtype="float32")
+                out = fold(
+                    x, output_sizes=[6, 6], kernel_sizes=[2, 2],
+                    strides=[1, 1])
+
+            def test_block_h_w():
+                # blocks_height and blocks_width must be greater than 0
+                x = paddle.randn(shape=[2, 1, 1], dtype="float32")
+                out = fold(
+                    x, output_sizes=[1, 1], kernel_sizes=[2, 2], strides=1)
+
+            def test_GT_0():
+                x = paddle.randn(shape=[2, 1, 1], dtype="float32")
+                out = fold(
+                    x,
+                    output_sizes=[0, 0],
+                    kernel_sizes=[0, 0],
+                    dilations=0,
+                    paddings=[0, 0],
+                    strides=0)
+
+            self.assertRaises(AssertionError, test_input_shape)
+            self.assertRaises(AssertionError, test_kernel_shape)
+            self.assertRaises(ValueError, test_padding_shape)
+            self.assertRaises(AssertionError, test_dilations_shape)
+            self.assertRaises(AssertionError, test_strides_shape)
+            self.assertRaises(ValueError, test_output_size)
+            self.assertRaises(ValueError, test_block_h_w)
+            self.assertRaises(ValueError, test_GT_0)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py
index e1c40e8d0d3d7ecd480cd9ec340c365c5a2eab95..37df0d44467677bab4be06f8ff3ddb61b8acc57c 100644
--- a/python/paddle/nn/__init__.py
+++ b/python/paddle/nn/__init__.py
@@ -68,6 +68,7 @@ from .layer.common import Dropout2D  # noqa: F401
 from .layer.common import Dropout3D  # noqa: F401
 from .layer.common import AlphaDropout  # noqa: F401
 from .layer.common import Unfold  # noqa: F401
+from .layer.common import Fold  # noqa: F401
 
 from .layer.pooling import AvgPool1D  # noqa: F401
 from .layer.pooling import AvgPool2D  # noqa: F401
@@ -215,6 +216,7 @@ __all__ = [  #noqa
     'Bilinear',
     'AlphaDropout',
     'Unfold',
+    'Fold',
     'RNNCellBase',
     'SimpleRNNCell',
     'LSTMCell',
diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py
index a504c1ee6a4febdfcc22fa561915ef9b4ef12f02..676d7259f284375030625413f270cc74b69aacf0 100644
--- a/python/paddle/nn/functional/__init__.py
+++ b/python/paddle/nn/functional/__init__.py
@@ -55,6 +55,7 @@ from .common import pad  # noqa: F401
 from .common import zeropad2d  # noqa: F401
 from .common import cosine_similarity  # noqa: F401
 from .common import unfold  # noqa: F401
+from .common import fold  # noqa: F401
 from .common import interpolate  # noqa: F401
 from .common import upsample  # noqa: F401
 from .common import bilinear  # noqa: F401
@@ -216,4 +217,5 @@ __all__ = [  #noqa
     'instance_norm',
     'class_center_sample',
     'sparse_attention',
+    'fold',
 ]
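Reviewer note: the dygraph path exercised by TestFoldAPI above boils down to the following usage sketch, written here with the test's own configuration (batch 3, C_in = 3 * 2 * 2, L = 12, output_sizes [4, 5]); the output shape follows the formula in FoldOp::InferShape.

    import paddle

    x = paddle.randn([3, 12, 12])  # [N, C_in, L]: 12 = 3 * 2 * 2 channels, L = 3 * 4 blocks
    fold = paddle.nn.Fold(output_sizes=[4, 5], kernel_sizes=[2, 2])
    y = fold(x)
    print(y.shape)  # [3, 3, 4, 5]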
diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py
index 4365c52c926ea98dc23d0c10b7dfb1cc1696bc48..3dba9505e92c79b8e881c3c4beed5a910becb4d4 100644
--- a/python/paddle/nn/functional/common.py
+++ b/python/paddle/nn/functional/common.py
@@ -1794,3 +1794,130 @@ def class_center_sample(label, num_classes, num_samples, group=None):
             'seed': seed if seed is not None else 0
         })
     return remapped_label, sampled_class_center
+
+
+def fold(x,
+         output_sizes,
+         kernel_sizes,
+         strides=1,
+         paddings=0,
+         dilations=1,
+         name=None):
+    r"""
+
+    Combines an array of sliding local blocks into a large containing tensor,
+    also known as col2im when operated on a batched 2D image tensor. Fold
+    calculates each combined value in the resulting large tensor by summing
+    all values from all containing blocks.
+
+    For each input :math:`x` with shape [N, C_in, L], the output shape
+    [N, C_out, H_out, W_out] can be calculated as following.
+
+    .. math::
+
+        H_{out} &= output\_sizes[0] \\
+        W_{out} &= output\_sizes[1] \\
+        C_{out} &= \frac{C_{in}}{kernel\_sizes[0] \times kernel\_sizes[1]}
+
+    Parameters:
+        x(Tensor): 3-D Tensor, input tensor of format [N, C, L],
+            data type can be float32 or float64.
+        output_sizes(int|list): The size of the output, should be
+            [output_size_h, output_size_w] or an integer o treated as [o, o].
+        kernel_sizes(int|list): The size of the convolution kernel, should be
+            [k_h, k_w] or an integer k treated as [k, k].
+        strides(int|list): The strides, should be [stride_h, stride_w]
+            or an integer stride treated as [stride, stride].
+            By default, strides will be [1, 1].
+        paddings(int|list): The paddings of each dimension, should be
+            [padding_top, padding_left, padding_bottom, padding_right]
+            or [padding_h, padding_w] or an integer padding.
+            If [padding_h, padding_w] was given, it will be expanded to
+            [padding_h, padding_w, padding_h, padding_w]. If an integer
+            padding was given, [padding, padding, padding, padding] will
+            be used. By default, paddings will be [0, 0, 0, 0].
+        dilations(int|list): The dilations of the convolution kernel, should be
+            [dilation_h, dilation_w], or an integer dilation treated as
+            [dilation, dilation]. By default, it will be [1, 1].
+        name(str, optional): The default value is None.
+            Normally there is no need for user to set this property.
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        The tensor formed by combining a group of sliding local blocks.
+        The output shape is [N, C_out, H_out, W_out] as described above.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn.functional as F
+
+            x = paddle.randn([2, 12, 9])
+            y = F.fold(x, output_sizes=[4, 4], kernel_sizes=2)
+            # y.shape = [2, 3, 4, 4]
+
+    """
+
+    helper = LayerHelper("fold", **locals())
+
+    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'fold')
+
+    assert len(x.shape) == 3, \
+        "input should be the format of [N, C, L]"
+
+    if isinstance(output_sizes, int):
+        output_sizes = [output_sizes, output_sizes]
+    else:
+        assert isinstance(output_sizes, list) and (len(output_sizes) == 2), \
+            "output_sizes should either be an integer or a list of two integers"
+
+    if isinstance(kernel_sizes, int):
+        kernel_sizes = [kernel_sizes, kernel_sizes]
+    else:
+        assert isinstance(kernel_sizes, list) and (len(kernel_sizes) == 2), \
+            "kernel_sizes should either be an integer or a list of two integers"
+
+    if isinstance(strides, int):
+        strides = [strides, strides]
+    else:
+        assert isinstance(strides, list) and (len(strides) == 2), \
+            "strides should either be an integer or a list of two integers"
+
+    if isinstance(dilations, int):
+        dilations = [dilations, dilations]
+    else:
+        assert isinstance(dilations, list) and (len(dilations) == 2), \
+            "dilations should either be an integer or a list of two integers"
+
+    if isinstance(paddings, int):
+        paddings = [paddings] * 4
+    elif isinstance(paddings, list):
+        if len(paddings) == 2:
+            paddings = paddings * 2
+        elif len(paddings) == 4:
+            pass
+        else:
+            raise ValueError(
+                "paddings should either be an integer or a list of 2 or 4 integers"
+            )
+    else:
+        raise ValueError(
+            "Unexpected type of paddings, it should be either an integer or "
+            "a list of 2 or 4 integers")
+
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+    helper.append_op(
+        type="fold",
+        inputs={"X": x},
+        outputs={"Y": out},
+        attrs={
+            "output_sizes": output_sizes,
+            "kernel_sizes": kernel_sizes,
+            "strides": strides,
+            "paddings": paddings,
+            "dilations": dilations
+        })
+    return out
diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py
index a78269a4cd4d7e0c6539cf2517bda3aa419a1e83..f536c3d5ff379be59ec4692c681f3d7fe056fa38 100644
--- a/python/paddle/nn/layer/__init__.py
+++ b/python/paddle/nn/layer/__init__.py
@@ -44,6 +44,7 @@ from .common import AlphaDropout  # noqa: F401
 from .common import Upsample  # noqa: F401
 from .common import UpsamplingBilinear2D  # noqa: F401
 from .common import UpsamplingNearest2D  # noqa: F401
+from .common import Fold  # noqa: F401
 from .pooling import AvgPool1D  # noqa: F401
 from .pooling import AvgPool2D  # noqa: F401
 from .pooling import AvgPool3D  # noqa: F401
diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py
index 1069a24be21f883ef9a232593479c123a321e4b4..22f7f798374d8ad2c597f775d3f401dd8debe4dc 100644
--- a/python/paddle/nn/layer/common.py
+++ b/python/paddle/nn/layer/common.py
@@ -1521,7 +1521,7 @@ class Unfold(Layer):
             unfold = nn.Unfold(kernel_sizes=[3, 3])
             result = unfold(x)
             print(result)
-    """
+    """
 
     def __init__(self,
                  kernel_sizes,
@@ -1550,3 +1550,92 @@ class Unfold(Layer):
         name_str = ', name={}'.format(self.name) if self.name else ''
         return 'kernel_size={}, dilation={}, padding={}, stride={}{}'.\
             format(self.kernel_sizes, self.dilations, self.paddings, self.strides, name_str)
+
+
+class Fold(Layer):
+    r"""
+
+    Combines an array of sliding local blocks into a large containing tensor,
+    also known as col2im when operated on a batched 2D image tensor. Fold
+    calculates each combined value in the resulting large tensor by summing
+    all values from all containing blocks.
+
+    For each input :math:`x` with shape [N, C_in, L], the output shape
+    [N, C_out, H_out, W_out] can be calculated as following.
+
+    .. math::
+
+        H_{out} &= output\_sizes[0] \\
+        W_{out} &= output\_sizes[1] \\
+        C_{out} &= \frac{C_{in}}{kernel\_sizes[0] \times kernel\_sizes[1]}
+
+    Parameters:
+        output_sizes(int|list): The size of the output, should be
+            [output_size_h, output_size_w] or an integer o treated as [o, o].
+        kernel_sizes(int|list): The size of the convolution kernel, should be
+            [k_h, k_w] or an integer k treated as [k, k].
+        strides(int|list): The strides, should be [stride_h, stride_w]
+            or an integer stride treated as [stride, stride].
+            By default, strides will be [1, 1].
+        paddings(int|list): The paddings of each dimension, should be
+            [padding_top, padding_left, padding_bottom, padding_right]
+            or [padding_h, padding_w] or an integer padding.
+            If [padding_h, padding_w] was given, it will be expanded to
+            [padding_h, padding_w, padding_h, padding_w]. If an integer
+            padding was given, [padding, padding, padding, padding] will
+            be used. By default, paddings will be [0, 0, 0, 0].
+        dilations(int|list): The dilations of the convolution kernel, should be
+            [dilation_h, dilation_w], or an integer dilation treated as
+            [dilation, dilation]. By default, it will be [1, 1].
+        name(str, optional): The default value is None.
+            Normally there is no need for user to set this property.
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        The tensor formed by combining a group of sliding local blocks.
+        The output shape is [N, C_out, H_out, W_out] as described above.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn as nn
+
+            x = paddle.randn([2, 12, 9])
+            fold = nn.Fold(output_sizes=[4, 4], kernel_sizes=2)
+            y = fold(x)
+            # y.shape = [2, 3, 4, 4]
+    """
+
+    def __init__(self,
+                 output_sizes,
+                 kernel_sizes,
+                 dilations=1,
+                 paddings=0,
+                 strides=1,
+                 name=None):
+        super(Fold, self).__init__()
+
+        self.output_sizes = output_sizes
+        self.kernel_sizes = kernel_sizes
+        self.dilations = dilations
+        self.paddings = paddings
+        self.strides = strides
+        self.name = name
+
+    def forward(self, input):
+        return F.fold(
+            input,
+            output_sizes=self.output_sizes,
+            kernel_sizes=self.kernel_sizes,
+            strides=self.strides,
+            paddings=self.paddings,
+            dilations=self.dilations,
+            name=self.name)
+
+    def extra_repr(self):
+        name_str = ', name={}'.format(self.name) if self.name else ''
+        return 'kernel_size={}, dilation={}, padding={}, stride={}{}'.\
+            format(self.kernel_sizes, self.dilations, self.paddings, self.strides, name_str)
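Reviewer note: the relationship to unfold stated in the op comment can be checked directly. A sketch, assuming the kernels above are registered (shapes follow the InferShape formula; F.unfold is the existing unfold API):

    import paddle
    import paddle.nn.functional as F

    x = paddle.randn([1, 3, 4, 4])
    cols = F.unfold(x, kernel_sizes=[2, 2], strides=2)  # non-overlapping 2x2 blocks
    y = F.fold(cols, output_sizes=[4, 4], kernel_sizes=[2, 2], strides=2)
    # With non-overlapping blocks, fold inverts unfold and y reconstructs x.
    # With strides=1 the blocks overlap and the overlapped positions are
    # summed, so fold(unfold(x)) != x in general.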