diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc
index 05afdf53240484212901febee431cef2b35bb75c..6dac9041b6117df8e7b93378da42c2b0e224d558 100644
--- a/paddle/fluid/operators/mul_op.cc
+++ b/paddle/fluid/operators/mul_op.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/mul_op.h"
+#include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>
 
 namespace paddle {
@@ -178,16 +180,66 @@ class MulOpGradMaker : public framework::SingleGradOpDescMaker {
   }
 };
 
+class MulDoubleGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("DOut"), "Input(DOut) should not be null");
+
+    if (ctx->HasOutput("DX")) {
+      ctx->ShareDim("X", "DX");
+    }
+    if (ctx->HasOutput("DY")) {
+      ctx->ShareDim("Y", "DY");
+    }
+    if (ctx->HasOutput("DDOut")) {
+      ctx->ShareDim("DOut", "DDOut");
+    }
+  }
+};
+
+class MulDoubleGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> retv(new framework::OpDesc());
+    retv->SetType("mul_grad_grad");
+
+    retv->SetInput("X", Input("X"));
+    retv->SetInput("Y", Input("Y"));
+    retv->SetInput("DOut", Input(framework::GradVarName("Out")));
+    retv->SetInput("DDX", OutputGrad(framework::GradVarName("X")));
+    retv->SetInput("DDY", OutputGrad(framework::GradVarName("Y")));
+
+    retv->SetOutput("DDOut", InputGrad(framework::GradVarName("Out")));
+    retv->SetOutput("DX", InputGrad("X"));
+    retv->SetOutput("DY", InputGrad("Y"));
+
+    retv->SetAttrMap(Attrs());
+    return retv;
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker, ops::MulOpInferVarType,
                   ops::MulOpGradMaker);
-REGISTER_OPERATOR(mul_grad, ops::MulGradOp);
+REGISTER_OPERATOR(mul_grad, ops::MulGradOp, ops::MulDoubleGradMaker);
+REGISTER_OPERATOR(mul_grad_grad, ops::MulDoubleGradOp);
 REGISTER_OP_CPU_KERNEL(
     mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>,
     ops::MulKernel<paddle::platform::CPUDeviceContext, double>);
 REGISTER_OP_CPU_KERNEL(
     mul_grad, ops::MulGradKernel<paddle::platform::CPUDeviceContext, float>,
     ops::MulGradKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    mul_grad_grad,
+    ops::MulDoubleGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::MulDoubleGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/mul_op.cu.cc b/paddle/fluid/operators/mul_op.cu.cc
index 6c5a83c6a50c463502171f09bbf18e17e43917b5..6e841712b9bffc06ca56afddcb866af8b3f9b0d8 100644
--- a/paddle/fluid/operators/mul_op.cu.cc
+++ b/paddle/fluid/operators/mul_op.cu.cc
@@ -24,3 +24,7 @@ REGISTER_OP_CUDA_KERNEL(
     mul_grad, ops::MulGradKernel<plat::CUDADeviceContext, float>,
     ops::MulGradKernel<plat::CUDADeviceContext, double>,
     ops::MulGradKernel<plat::CUDADeviceContext, plat::float16>);
+REGISTER_OP_CUDA_KERNEL(
+    mul_grad_grad,
+    ops::MulDoubleGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::MulDoubleGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/mul_op.h b/paddle/fluid/operators/mul_op.h
index f72824806ed6ee3a4490938403d441326f8a3d4a..c77eb5c4ccbff1900e705bf51f5c5fc9096aa4eb 100644
--- a/paddle/fluid/operators/mul_op.h
+++ b/paddle/fluid/operators/mul_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
@@ -109,5 +110,95 @@ class MulGradKernel : public framework::OpKernel<T> {
   }
 };
 
+template <typename DeviceContext, typename T>
+class MulDoubleGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    int x_num_col_dims = ctx.template Attr<int>("x_num_col_dims");
+    int y_num_col_dims = ctx.template Attr<int>("y_num_col_dims");
+    auto* x = ctx.Input<framework::LoDTensor>("X");
+    auto* y = ctx.Input<framework::LoDTensor>("Y");
+    auto x_mat = x->dims().size() > 2
+                     ? framework::ReshapeToMatrix(*x, x_num_col_dims)
+                     : static_cast<const Tensor&>(*x);
+    auto y_mat = y->dims().size() > 2
+                     ? framework::ReshapeToMatrix(*y, y_num_col_dims)
+                     : static_cast<const Tensor&>(*y);
+
+    const int m = framework::flatten_to_2d(x->dims(), x_num_col_dims)[0];
+    const int n = framework::flatten_to_2d(y->dims(), y_num_col_dims)[1];
+
+    auto* dout = ctx.Input<framework::LoDTensor>("DOut");
+    Tensor dout_mat;
+    dout_mat.ShareDataWith(*dout);
+    dout_mat.Resize({m, n});
+
+    auto* ddx = ctx.Input<framework::LoDTensor>("DDX");
+    auto* ddy = ctx.Input<framework::LoDTensor>("DDY");
+
+    auto* dx = ctx.Output<framework::LoDTensor>("DX");
+    auto* dy = ctx.Output<framework::LoDTensor>("DY");
+    auto* ddout = ctx.Output<framework::LoDTensor>("DDOut");
+
+    Tensor ddout_mat;
+    if (ddout) {
+      ddout->set_lod(dout->lod());
+      // allocate and reshape ddout
+      ddout->mutable_data<T>(ctx.GetPlace());
+      ddout_mat.ShareDataWith(*ddout);
+      ddout_mat.Resize({m, n});
+    }
+
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
+    // a flag to specify whether ddout value has been set, if flag
+    // is false, MatMul beta should be 0 to set ddout, if flag is
+    // true, MatMul beta should be 1 to add result to ddout.
+    bool ddout_flag = false;
+    if (ddx) {
+      auto ddx_mat = ddx->dims().size() > 2
+                         ? framework::ReshapeToMatrix(*ddx, x_num_col_dims)
+                         : static_cast<const Tensor&>(*ddx);
+
+      // dy = ddx' * dout. dy : K x M, ddx' : K x M, dout : M x N
+      if (dy) {
+        dy->set_lod(y->lod());
+        // allocate and reshape dy
+        dy->mutable_data<T>(ctx.GetPlace());
+        Tensor dy_mat = dy->dims().size() > 2
+                            ? framework::ReshapeToMatrix(*dy, y_num_col_dims)
+                            : *dy;
+        blas.MatMul(ddx_mat, true, dout_mat, false, &dy_mat);
+      }
+      // ddout1 = ddx * y. ddx : M x K, y : K x N, ddout1 : M x N
+      if (ddout) {
+        blas.MatMul(ddx_mat, false, y_mat, false, static_cast<T>(1.0),
+                    &ddout_mat, static_cast<T>(ddout_flag));
+        ddout_flag = true;
+      }
+    }
+    if (ddy) {
+      auto ddy_mat = ddy->dims().size() > 2
+                         ? framework::ReshapeToMatrix(*ddy, y_num_col_dims)
+                         : static_cast<const Tensor&>(*ddy);
+      // dx = dout * ddy'. dout : M x N, ddy' : N x K, dx : M x K
+      if (dx) {
+        dx->set_lod(x->lod());
+        // allocate and reshape dx
+        dx->mutable_data<T>(ctx.GetPlace());
+        Tensor dx_mat = dx->dims().size() > 2
+                            ? framework::ReshapeToMatrix(*dx, x_num_col_dims)
+                            : *dx;
+        blas.MatMul(dout_mat, false, ddy_mat, true, &dx_mat);
+      }
+      // ddout2 = x * ddy. x : M x K, ddy : K x N, ddout2 : M x N
+      if (ddout) {
+        blas.MatMul(x_mat, false, ddy_mat, false, static_cast<T>(1.0),
+                    &ddout_mat, static_cast<T>(ddout_flag));
+      }
+    }
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/python/paddle/fluid/tests/unittests/test_nn_grad.py b/python/paddle/fluid/tests/unittests/test_nn_grad.py
index dbfdca0e4436720d67a8fc2efbe3e0ea34444463..2ef722c9133e1c8306074705384488a1c1f28a10 100644
--- a/python/paddle/fluid/tests/unittests/test_nn_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_nn_grad.py
@@ -193,5 +193,33 @@ class TestElementwiseMulBroadcastDoubleGradCheck(unittest.TestCase):
             self.func(p)
 
 
+class TestMulDoubleGradCheck(unittest.TestCase):
+    @prog_scope()
+    def func(self, place):
+        # the shape of input variable shoule be clearly specified, not inlcude -1.
+        x_shape = [7, 11]
+        y_shape = [11, 9]
+        eps = 0.005
+        dtype = np.float64
+
+        x = layers.data('x', x_shape, False, dtype)
+        x.persistable = True
+        y = layers.data('y', y_shape, False, dtype)
+        y.persistable = True
+        out = layers.mul(x, y)
+        x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype)
+        y_arr = np.random.uniform(-1, 1, y_shape).astype(dtype)
+
+        gradient_checker.double_grad_check(
+            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+
+    def test_grad(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            self.func(p)
+
+
 if __name__ == "__main__":
     unittest.main()