Unverified · Commit 13c4fd59 · authored by Charles-hit, committed by GitHub

fix matmul double and triple grad (#48779)

* fix matmul double and triple grad

* remove some comment

* add matmul_double_grad unit test

* fix matmul triple grad

* fix dot triple grad and add unit test

* modify codestyle

* fix dot_grad

* refactor dot triple grad

* disable some unit test

* fix unit test

* fix unit test in double grad
Parent a1319074
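For orientation (not part of the patch itself), the second-order relations that MatmulDoubleGradKernel implements can be sketched in NumPy for the 2-D, no-transpose, real-dtype case. This is a hedged reference only, with an illustrative function name; the real kernels additionally apply conjugates for complex dtypes and reduce-sum over broadcast dimensions. Missing ddx/ddy behave as zeros, which is exactly the fallback behaviour this PR adjusts.

import numpy as np

def matmul_double_grad_ref(x, y, dout, ddx=None, ddy=None):
    # forward:     out = x @ y
    # first grad:  dx = dout @ y.T,  dy = x.T @ dout
    # double grad: ddx/ddy are the gradients flowing into dx/dy
    ddx = np.zeros_like(x) if ddx is None else ddx
    ddy = np.zeros_like(y) if ddy is None else ddy
    new_dx = dout @ ddy.T          # zero when ddy is missing
    new_dy = ddx.T @ dout          # zero when ddx is missing
    ddout = ddx @ y + x @ ddy      # zero when both are missing
    return new_dx, new_dy, ddout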
......@@ -30,9 +30,9 @@ template <typename T, typename Context>
void DotDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& ddx,
const DenseTensor& ddy,
const DenseTensor& dout,
const paddle::optional<DenseTensor>& ddx_opt,
const paddle::optional<DenseTensor>& ddy_opt,
DenseTensor* dx,
DenseTensor* dy,
DenseTensor* ddout);
......@@ -41,12 +41,12 @@ template <typename T, typename Context>
void DotTripleGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& ddx,
const DenseTensor& ddy,
const DenseTensor& d_dx,
const DenseTensor& d_dy,
const DenseTensor& dout,
const DenseTensor& d_ddout,
const paddle::optional<DenseTensor>& ddx,
const paddle::optional<DenseTensor>& ddy,
const paddle::optional<DenseTensor>& d_dx,
const paddle::optional<DenseTensor>& d_dy,
const paddle::optional<DenseTensor>& d_ddout,
DenseTensor* d_x,
DenseTensor* d_y,
DenseTensor* d_ddx,
......
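The signature changes above make ddx, ddy, d_dx, d_dy, and d_ddout optional inputs of the dot grad kernels. As a hedged reference (1-D, real dtype; the complex path adds conjugates, as the new unit tests exercise), the double-grad relations can be written in NumPy as follows; the function name is illustrative only.

import numpy as np

def dot_double_grad_ref(x, y, dout, ddx=None, ddy=None):
    # forward:     out = sum(x * y)
    # first grad:  dx = dout * y,  dy = dout * x
    # double grad: ddx/ddy are the gradients flowing into dx/dy
    ddx = np.zeros_like(x) if ddx is None else ddx
    ddy = np.zeros_like(y) if ddy is None else ddy
    new_dx = dout * ddy                                  # zero when ddy is missing
    new_dy = dout * ddx                                  # zero when ddx is missing
    ddout = np.array([np.dot(ddx, y) + np.dot(x, ddy)])  # scalar, shape [1]
    return new_dx, new_dy, ddout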
......@@ -473,27 +473,13 @@ void MatmulDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& dout,
const paddle::optional<DenseTensor>& ddx_opt,
const paddle::optional<DenseTensor>& ddy_opt,
const paddle::optional<DenseTensor>& ddx,
const paddle::optional<DenseTensor>& ddy,
bool transpose_x,
bool transpose_y,
DenseTensor* dx,
DenseTensor* dy,
DenseTensor* ddout) {
paddle::optional<DenseTensor> ddx;
paddle::optional<DenseTensor> ddy;
if (!ddx_opt && (dy || ddout)) {
DenseTensor ddx_tmp = phi::FullLike<T, Context>(dev_ctx, x, Scalar(0.0));
ddx = paddle::make_optional<DenseTensor>(ddx_tmp);
} else {
ddx = ddx_opt;
}
if (!ddy_opt && (dx || ddout)) {
DenseTensor ddy_tmp = phi::FullLike<T, Context>(dev_ctx, y, Scalar(0.0));
ddy = paddle::make_optional<DenseTensor>(ddy_tmp);
} else {
ddy = ddy_opt;
}
// Get dims from the input x, y, output_grad
std::vector<std::int64_t> x_dims = vectorize(x.dims());
std::vector<std::int64_t> y_dims = vectorize(y.dims());
......@@ -506,7 +492,7 @@ void MatmulDoubleGradKernel(const Context& dev_ctx,
// Case1 : x's or y's dim = 1
if (x_ndim == 1 && y_ndim == 1) {
DotDoubleGradFunction<Context, T>()(
dev_ctx, &x, &y, &dout, ddx.get_ptr(), ddy.get_ptr(), dx, dy, ddout);
dev_ctx, &x, &y, &dout, &ddx, &ddy, dx, dy, ddout);
return;
}
......@@ -608,6 +594,8 @@ void MatmulDoubleGradKernel(const Context& dev_ctx,
ddout_flag);
ddout_flag = true;
}
} else if (!ddx && dy) {
FullLikeKernel<T, Context>(dev_ctx, y, Scalar(0.0), y.dtype(), dy);
}
if (ddy) {
auto ddy_mat = ddy.get();
......@@ -666,6 +654,12 @@ void MatmulDoubleGradKernel(const Context& dev_ctx,
ddout,
ddout_flag);
}
} else if (!ddy && dx) {
FullLikeKernel<T, Context>(dev_ctx, x, Scalar(0.0), x.dtype(), dx);
}
if (ddout && !ddx && !ddy) {
FullLikeKernel<T, Context>(
dev_ctx, dout, Scalar(0.0), dout.dtype(), ddout);
}
if (dx) {
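The hunks above change how MatmulDoubleGradKernel treats missing second-order inputs: instead of allocating zero-valued ddx/ddy tensors up front, the kernel now writes zeros directly into whichever requested outputs depend on the missing input. A minimal Python sketch of that dispatch (names are illustrative, not the C++ identifiers):

def double_grad_zero_fill(has_ddx, has_ddy, wants_dx, wants_dy, wants_ddout):
    # Returns the requested outputs that can be zero-filled without any
    # computation, mirroring the new FullLikeKernel fallbacks above.
    zero_filled = set()
    if not has_ddx and wants_dy:
        zero_filled.add("dy")      # dy = ddx^T @ dout
    if not has_ddy and wants_dx:
        zero_filled.add("dx")      # dx = dout @ ddy^T
    if not has_ddx and not has_ddy and wants_ddout:
        zero_filled.add("ddout")   # ddout = ddx @ y + x @ ddy
    return zero_filled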
......@@ -821,7 +815,7 @@ void MatmulDoubleGradKernel(const Context& dev_ctx,
}
}
// Reduce sum to get grad by ReduceSum
if (dx) {
if (dx && dx_help.initialized()) {
if (dx_reduce_dims.empty()) {
*dx = std::move(dx_help);
} else {
......@@ -829,8 +823,10 @@ void MatmulDoubleGradKernel(const Context& dev_ctx,
dev_ctx, dx_help, dx, dx_reduce_dims);
}
dx->Resize(x.dims());
} else if (dx && !dx_help.initialized()) {
FullLikeKernel<T, Context>(dev_ctx, x, Scalar(0.0), x.dtype(), dx);
}
if (dy) {
if (dy && dy_help.initialized()) {
if (dy_reduce_dims.empty()) {
*dy = std::move(dy_help);
} else {
......@@ -838,6 +834,8 @@ void MatmulDoubleGradKernel(const Context& dev_ctx,
dev_ctx, dy_help, dy, dy_reduce_dims);
}
dy->Resize(y.dims());
} else if (dy && !dy_help.initialized()) {
FullLikeKernel<T, Context>(dev_ctx, y, Scalar(0.0), y.dtype(), dy);
}
if (ddout) {
......@@ -873,11 +871,11 @@ void MatmulTripleGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& dout,
const paddle::optional<DenseTensor>& ddx_opt,
const paddle::optional<DenseTensor>& ddy_opt,
const paddle::optional<DenseTensor>& d_dx_opt,
const paddle::optional<DenseTensor>& d_dy_opt,
const paddle::optional<DenseTensor>& d_ddout_opt,
const paddle::optional<DenseTensor>& ddx,
const paddle::optional<DenseTensor>& ddy,
const paddle::optional<DenseTensor>& d_dx,
const paddle::optional<DenseTensor>& d_dy,
const paddle::optional<DenseTensor>& d_ddout,
bool transpose_x,
bool transpose_y,
DenseTensor* out_d_x,
......@@ -885,50 +883,6 @@ void MatmulTripleGradKernel(const Context& dev_ctx,
DenseTensor* out_d_dout,
DenseTensor* out_d_ddx,
DenseTensor* out_d_ddy) {
paddle::optional<DenseTensor> ddx;
paddle::optional<DenseTensor> ddy;
paddle::optional<DenseTensor> d_dx;
paddle::optional<DenseTensor> d_dy;
paddle::optional<DenseTensor> d_ddout;
if (!ddx_opt && (out_d_y || out_d_dout)) {
DenseTensor ddx_tmp =
phi::FullLike<T, Context>(dev_ctx, x, static_cast<T>(0.0));
ddx = paddle::make_optional<DenseTensor>(ddx_tmp);
} else {
ddx = ddx_opt;
}
if (!ddy_opt && (out_d_x || out_d_dout)) {
DenseTensor ddy_tmp =
phi::FullLike<T, Context>(dev_ctx, y, static_cast<T>(0.0));
ddy = paddle::make_optional<DenseTensor>(ddy_tmp);
} else {
ddy = ddy_opt;
}
if (!d_ddout_opt && (out_d_y || out_d_x || out_d_ddy || out_d_ddx)) {
DenseTensor d_ddout_tmp =
phi::FullLike<T, Context>(dev_ctx, dout, static_cast<T>(0.0));
d_ddout = paddle::make_optional<DenseTensor>(d_ddout_tmp);
} else {
d_ddout = d_ddout_opt;
}
if (!d_dx_opt && (out_d_ddy || out_d_dout)) {
DenseTensor d_dx_tmp =
phi::FullLike<T, Context>(dev_ctx, x, static_cast<T>(0.0));
d_dx = paddle::make_optional<DenseTensor>(d_dx_tmp);
} else {
d_dx = d_dx_opt;
}
if (!d_dy_opt && (out_d_ddx || out_d_dout)) {
DenseTensor d_dy_tmp =
phi::FullLike<T, Context>(dev_ctx, y, static_cast<T>(0.0));
d_dy = paddle::make_optional<DenseTensor>(d_dy_tmp);
} else {
d_dy = d_dy_opt;
}
// Get dims from the input x, y, output_grad
std::vector<std::int64_t> x_dims = vectorize(x.dims());
std::vector<std::int64_t> y_dims = vectorize(y.dims());
......@@ -944,12 +898,12 @@ void MatmulTripleGradKernel(const Context& dev_ctx,
DotTripleGradFunction<Context, T>()(dev_ctx,
&x,
&y,
ddx.get_ptr(),
ddy.get_ptr(),
d_dx.get_ptr(),
d_dy.get_ptr(),
&dout,
d_ddout.get_ptr(),
&ddx,
&ddy,
&d_dx,
&d_dy,
&d_ddout,
out_d_x,
out_d_y,
out_d_dout,
......@@ -1047,7 +1001,7 @@ void MatmulTripleGradKernel(const Context& dev_ctx,
if (out_d_ddy_dims != y_help.dims()) {
out_d_ddy->Resize(y_help.dims());
}
if (dout_conj.IsInitialized()) {
if (!dout_conj.IsInitialized()) {
dout_conj = Conj<T>(dev_ctx, dout_help);
}
x_conj = Conj<T>(dev_ctx, x_help);
......@@ -1108,6 +1062,8 @@ void MatmulTripleGradKernel(const Context& dev_ctx,
out_d_y,
false);
}
} else if (out_d_y) {
FullLikeKernel<T, Context>(dev_ctx, y, Scalar(0.0), y.dtype(), out_d_y);
}
if (out_d_x && ddy) {
if (transpose_x && transpose_y) {
......@@ -1155,6 +1111,8 @@ void MatmulTripleGradKernel(const Context& dev_ctx,
out_d_x,
false);
}
} else if (out_d_x) {
FullLikeKernel<T, Context>(dev_ctx, x, Scalar(0.0), x.dtype(), out_d_x);
}
// equations:
......@@ -1269,6 +1227,15 @@ void MatmulTripleGradKernel(const Context& dev_ctx,
}
d_ddy_flag = true;
}
} else {
// d_ddout is none
if (out_d_x) {
FullLikeKernel<T, Context>(dev_ctx, x, Scalar(0.0), x.dtype(), out_d_x);
}
if (out_d_y) {
FullLikeKernel<T, Context>(dev_ctx, y, Scalar(0.0), y.dtype(), out_d_y);
}
}
if (d_dy) {
......@@ -1439,6 +1406,19 @@ void MatmulTripleGradKernel(const Context& dev_ctx,
out_d_ddy->Resize(out_d_ddy_dims);
}
}
if (out_d_dout && !out_d_dout->IsInitialized()) {
FullLikeKernel<T, Context>(
dev_ctx, dout, Scalar(0.0), dout.dtype(), out_d_dout);
}
if (out_d_ddx && !out_d_ddx->IsInitialized()) {
FullLikeKernel<T, Context>(dev_ctx, x, Scalar(0.0), x.dtype(), out_d_ddx);
}
if (out_d_ddy && !out_d_ddy->IsInitialized()) {
FullLikeKernel<T, Context>(dev_ctx, y, Scalar(0.0), y.dtype(), out_d_ddy);
}
} else {
// Case3: broadcast. Reduce-summing over the broadcast dims costs much
// time and wastes memory.
......@@ -1585,7 +1565,7 @@ void MatmulTripleGradKernel(const Context& dev_ctx,
}
// Reduce sum to get grad by ReduceSum
if (out_d_x) {
if (out_d_x && out_dx_help.initialized()) {
if (dx_reduce_dims.empty()) {
*out_d_x = std::move(out_dx_help);
} else {
......@@ -1593,9 +1573,11 @@ void MatmulTripleGradKernel(const Context& dev_ctx,
dev_ctx, out_dx_help, out_d_x, dx_reduce_dims);
}
out_d_x->Resize(x.dims());
} else if (out_d_x) {
FullLikeKernel<T, Context>(dev_ctx, x, Scalar(0.0), x.dtype(), out_d_x);
}
if (out_d_y) {
if (out_d_y && out_dy_help.initialized()) {
if (dy_reduce_dims.empty()) {
*out_d_y = std::move(out_dy_help);
} else {
......@@ -1603,6 +1585,8 @@ void MatmulTripleGradKernel(const Context& dev_ctx,
dev_ctx, out_dy_help, out_d_y, dy_reduce_dims);
}
out_d_y->Resize(y.dims());
} else if (out_d_y) {
FullLikeKernel<T, Context>(dev_ctx, y, Scalar(0.0), y.dtype(), out_d_y);
}
// compute d_dout
......@@ -1628,6 +1612,11 @@ void MatmulTripleGradKernel(const Context& dev_ctx,
transpose_y,
true);
}
if (!out_d_dout->initialized()) {
FullLikeKernel<T, Context>(
dev_ctx, dout, Scalar(0.0), dout.dtype(), out_d_dout);
}
}
// compute d_ddx
......@@ -1735,13 +1724,18 @@ void MatmulTripleGradKernel(const Context& dev_ctx,
true);
}
}
if (dx_reduce_dims.empty()) {
*out_d_ddx = std::move(out_d_ddx_help);
if (out_d_ddx_help.initialized()) {
if (dx_reduce_dims.empty()) {
*out_d_ddx = std::move(out_d_ddx_help);
} else {
ReduceSumForMatmulGrad<Context, T>()(
dev_ctx, out_d_ddx_help, out_d_ddx, dx_reduce_dims);
}
} else {
ReduceSumForMatmulGrad<Context, T>()(
dev_ctx, out_d_ddx_help, out_d_ddx, dx_reduce_dims);
FullLikeKernel<T, Context>(
dev_ctx, x, Scalar(0.0), x.dtype(), out_d_ddx);
}
out_d_ddx->Resize(x.dims());
}
......@@ -1852,12 +1846,18 @@ void MatmulTripleGradKernel(const Context& dev_ctx,
}
}
if (dy_reduce_dims.empty()) {
*out_d_ddy = std::move(out_d_ddy_help);
if (out_d_ddy_help.initialized()) {
if (dy_reduce_dims.empty()) {
*out_d_ddy = std::move(out_d_ddy_help);
} else {
ReduceSumForMatmulGrad<Context, T>()(
dev_ctx, out_d_ddy_help, out_d_ddy, dy_reduce_dims);
}
} else {
ReduceSumForMatmulGrad<Context, T>()(
dev_ctx, out_d_ddy_help, out_d_ddy, dy_reduce_dims);
FullLikeKernel<T, Context>(
dev_ctx, y, Scalar(0.0), y.dtype(), out_d_ddy);
}
out_d_ddy->Resize(y.dims());
}
}
......
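For the triple-grad changes above, here is a hedged NumPy reference for the 2-D, no-transpose, real-dtype case, derived by back-propagating through the double-grad relations; the function name is illustrative, and the real kernel additionally applies conjugates for complex dtypes and reduce-sum for broadcasting. Missing optional inputs again behave as zeros, which the new FullLikeKernel fallbacks materialize only for the outputs that need them.

import numpy as np

def matmul_triple_grad_ref(x, y, dout, ddx=None, ddy=None,
                           d_dx=None, d_dy=None, d_ddout=None):
    # Double grad produced: dx = dout @ ddy.T, dy = ddx.T @ dout,
    #                       ddout = ddx @ y + x @ ddy
    # Triple grad back-propagates d_dx, d_dy, d_ddout through those relations.
    ddx = np.zeros_like(x) if ddx is None else ddx
    ddy = np.zeros_like(y) if ddy is None else ddy
    d_dx = np.zeros_like(x) if d_dx is None else d_dx
    d_dy = np.zeros_like(y) if d_dy is None else d_dy
    d_ddout = np.zeros_like(dout) if d_ddout is None else d_ddout
    d_x = d_ddout @ ddy.T                  # via ddout = ... + x @ ddy
    d_y = ddx.T @ d_ddout                  # via ddout = ddx @ y + ...
    d_dout = d_dx @ ddy + ddx @ d_dy       # via dx and dy
    d_ddx = dout @ d_dy.T + d_ddout @ y.T  # via dy and ddout
    d_ddy = d_dx.T @ dout + x.T @ d_ddout  # via dx and ddout
    return d_x, d_y, d_dout, d_ddx, d_ddy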
......@@ -688,5 +688,489 @@ class TestDoubleGradBasics(TestCase):
np.testing.assert_array_equal(grad_out.grad.numpy(), grad_out_grad_ref)
class TestDygraphDoubleGradMatmul(TestCase):
# case1: ddy is none, no broadcast, dims != 1
def test_matmul_double_grad_case1(self):
input_numpy_x = np.random.random([3, 3]).astype('float32')
input_numpy_y = np.random.random([3, 3]).astype('float32')
def actual():
x = paddle.to_tensor(
input_numpy_x, stop_gradient=False, dtype='float32'
)
y = paddle.to_tensor(
input_numpy_y, stop_gradient=False, dtype='float32'
)
out = paddle.matmul(x, y, False, False)
dout = paddle.to_tensor(
np.ones([3, 3]), stop_gradient=False, dtype='float32'
)
(dx,) = paddle.grad(
[out], [x], [dout], retain_graph=True, create_graph=True
)
ddx = paddle.to_tensor(
np.ones([3, 3]), stop_gradient=False, dtype='float32'
)
dx_double_grad, dy_double_grad, ddout = paddle.grad(
[dx],
[x, y, dout],
[ddx],
retain_graph=True,
create_graph=True,
)
return dx_double_grad, dy_double_grad, ddout
def expected():
dx_double_grad_expected = np.zeros([3, 3], dtype="float32")
dy_double_grad_expected = np.matmul(
np.ones([3, 3], dtype="float32"),
np.ones([3, 3], dtype="float32"),
)
ddout_expected = np.matmul(
np.ones([3, 3], dtype="float32"), input_numpy_y
)
return (
dx_double_grad_expected,
dy_double_grad_expected,
ddout_expected,
)
expected_results = expected()
places = ["cpu"]
if paddle.is_compiled_with_cuda():
places.append("gpu")
for place in places:
paddle.device.set_device(place)
actual_results = actual()
for expected_result, actual_result in zip(
expected_results, actual_results
):
np.testing.assert_allclose(
expected_result, actual_result, rtol=1e-6
)
# case2: ddx is none, no broadcast, dims != 1
def test_matmul_double_grad_case2(self):
input_numpy_x = np.random.random([3, 3]).astype('float32')
input_numpy_y = np.random.random([3, 3]).astype('float32')
def actual():
x = paddle.to_tensor(
input_numpy_x, stop_gradient=False, dtype='float32'
)
y = paddle.to_tensor(
input_numpy_y, stop_gradient=False, dtype='float32'
)
out = paddle.matmul(x, y, False, False)
dout = paddle.to_tensor(
np.ones([3, 3]), stop_gradient=False, dtype='float32'
)
(dy,) = paddle.grad(
[out], [y], [dout], retain_graph=True, create_graph=True
)
ddy = paddle.to_tensor(
np.ones([3, 3]), stop_gradient=False, dtype='float32'
)
dx_double_grad, dy_double_grad, ddout = paddle.grad(
[dy],
[x, y, dout],
[ddy],
retain_graph=True,
create_graph=True,
)
return dx_double_grad, dy_double_grad, ddout
def expected():
dx_double_grad_expected = np.matmul(
np.ones([3, 3], dtype="float32"),
np.ones([3, 3], dtype="float32"),
)
dy_double_grad_expected = np.zeros([3, 3], dtype="float32")
ddout_expected = np.matmul(
input_numpy_x, np.ones([3, 3], dtype="float32")
)
return (
dx_double_grad_expected,
dy_double_grad_expected,
ddout_expected,
)
expected_results = expected()
places = ["cpu"]
if paddle.is_compiled_with_cuda():
places.append("gpu")
for place in places:
paddle.device.set_device(place)
actual_results = actual()
for expected_result, actual_result in zip(
expected_results, actual_results
):
np.testing.assert_allclose(
expected_result, actual_result, rtol=1e-6
)
# case3: ddx is none, dims = 1
def test_matmul_double_grad_case3(self):
input_numpy_x = np.random.random([3]).astype('float32')
input_numpy_y = np.random.random([3]).astype('float32')
def actual():
x = paddle.to_tensor(
input_numpy_x, stop_gradient=False, dtype='float32'
)
y = paddle.to_tensor(
input_numpy_y, stop_gradient=False, dtype='float32'
)
out = paddle.matmul(x, y, False, False)
dout = paddle.to_tensor(
np.ones([1]), stop_gradient=False, dtype='float32'
)
(dy,) = paddle.grad(
[out], [y], [dout], retain_graph=True, create_graph=True
)
ddy = paddle.to_tensor(
np.ones([3]), stop_gradient=False, dtype='float32'
)
dx_double_grad, dy_double_grad, ddout = paddle.grad(
[dy],
[x, y, dout],
[ddy],
retain_graph=True,
create_graph=True,
)
return dx_double_grad, dy_double_grad, ddout
def expected():
dx_double_grad_expected = np.ones([3], dtype="float32")
dy_double_grad_expected = np.zeros([3], dtype="float32")
ddout_expected = np.matmul(
input_numpy_x, np.ones([3], dtype="float32")
)
return (
dx_double_grad_expected,
dy_double_grad_expected,
ddout_expected,
)
expected_results = expected()
places = ["cpu"]
if paddle.is_compiled_with_cuda():
places.append("gpu")
for place in places:
paddle.device.set_device(place)
actual_results = actual()
for expected_result, actual_result in zip(
expected_results, actual_results
):
np.testing.assert_allclose(
expected_result, actual_result, rtol=1e-6
)
# case4: ddy is none, dims = 1
def test_matmul_double_grad_case4(self):
input_numpy_x = np.random.random([3]).astype('float32')
input_numpy_y = np.random.random([3]).astype('float32')
def actual():
x = paddle.to_tensor(
input_numpy_x, stop_gradient=False, dtype='float32'
)
y = paddle.to_tensor(
input_numpy_y, stop_gradient=False, dtype='float32'
)
out = paddle.matmul(x, y, False, False)
dout = paddle.to_tensor(
np.ones([1]), stop_gradient=False, dtype='float32'
)
(dx,) = paddle.grad(
[out], [x], [dout], retain_graph=True, create_graph=True
)
ddx = paddle.to_tensor(
np.ones([3]), stop_gradient=False, dtype='float32'
)
dx_double_grad, dy_double_grad, ddout = paddle.grad(
[dx],
[x, y, dout],
[ddx],
retain_graph=True,
create_graph=True,
)
return dx_double_grad, dy_double_grad, ddout
def expected():
dx_double_grad_expected = np.zeros([3], dtype="float32")
dy_double_grad_expected = np.ones([3], dtype="float32")
ddout_expected = np.matmul(
input_numpy_y, np.ones([3], dtype="float32")
)
return (
dx_double_grad_expected,
dy_double_grad_expected,
ddout_expected,
)
expected_results = expected()
places = ["cpu"]
if paddle.is_compiled_with_cuda():
places.append("gpu")
for place in places:
paddle.device.set_device(place)
actual_results = actual()
for expected_result, actual_result in zip(
expected_results, actual_results
):
np.testing.assert_allclose(
expected_result, actual_result, rtol=1e-6
)
# case5: ddx is none, broadcast, dims != 1
def test_matmul_double_grad_case5(self):
input_numpy_x = np.random.random([2, 1]).astype('float32')
input_numpy_y = np.random.random([1]).astype('float32')
def actual():
x = paddle.to_tensor(
input_numpy_x, stop_gradient=False, dtype='float32'
)
y = paddle.to_tensor(
input_numpy_y, stop_gradient=False, dtype='float32'
)
out = paddle.matmul(x, y, False, False)
dout = paddle.to_tensor(
np.ones([2]), stop_gradient=False, dtype='float32'
)
(dy,) = paddle.grad(
[out], [y], [dout], retain_graph=True, create_graph=True
)
ddy = paddle.to_tensor(
np.ones([1]), stop_gradient=False, dtype='float32'
)
dx_double_grad, dy_double_grad, ddout = paddle.grad(
[dy],
[x, y, dout],
[ddy],
retain_graph=True,
create_graph=True,
)
return dx_double_grad, dy_double_grad, ddout
def expected():
dx_double_grad_expected = np.ones([2, 1], dtype="float32")
dy_double_grad_expected = np.zeros([1], dtype="float32")
ddout_expected = np.matmul(
input_numpy_x, np.ones([1], dtype="float32")
)
return (
dx_double_grad_expected,
dy_double_grad_expected,
ddout_expected,
)
expected_results = expected()
places = ["cpu"]
if paddle.is_compiled_with_cuda():
places.append("gpu")
for place in places:
paddle.device.set_device(place)
actual_results = actual()
for expected_result, actual_result in zip(
expected_results, actual_results
):
np.testing.assert_allclose(
expected_result, actual_result, rtol=1e-6
)
# case6: ddy is none, broadcast, dims != 1
def test_matmul_double_grad_case6(self):
input_numpy_x = np.random.random([2, 1]).astype('float32')
input_numpy_y = np.random.random([1]).astype('float32')
def actual():
x = paddle.to_tensor(
input_numpy_x, stop_gradient=False, dtype='float32'
)
y = paddle.to_tensor(
input_numpy_y, stop_gradient=False, dtype='float32'
)
out = paddle.matmul(x, y, False, False)
dout = paddle.to_tensor(
np.ones([2]), stop_gradient=False, dtype='float32'
)
(dx,) = paddle.grad(
[out], [x], [dout], retain_graph=True, create_graph=True
)
ddx = paddle.to_tensor(
np.ones([2, 1]), stop_gradient=False, dtype='float32'
)
dx_double_grad, dy_double_grad, ddout = paddle.grad(
[dx],
[x, y, dout],
[ddx],
retain_graph=True,
create_graph=True,
)
return dx_double_grad, dy_double_grad, ddout
def expected():
dx_double_grad_expected = np.zeros([2, 1], dtype="float32")
dy_double_grad_expected = np.ones([1], dtype="float32") * 2
ddout_expected = np.ones([2], dtype="float32") * input_numpy_y[0]
return (
dx_double_grad_expected,
dy_double_grad_expected,
ddout_expected,
)
expected_results = expected()
places = ["cpu"]
if paddle.is_compiled_with_cuda():
places.append("gpu")
for place in places:
paddle.device.set_device(place)
actual_results = actual()
for expected_result, actual_result in zip(
expected_results, actual_results
):
np.testing.assert_allclose(
expected_result, actual_result, rtol=1e-6
)
# case7: ddy is none, dims = 1, complex dtype
def test_matmul_double_grad_case7(self):
input_numpy_x = np.random.random([3]).astype(
'float32'
) + 1j * np.random.random([3]).astype('float32')
input_numpy_y = np.random.random([3]).astype(
'float32'
) + 1j * np.random.random([3]).astype('float32')
input_numpy_y_conj = np.conjugate(input_numpy_y)
def actual():
x = paddle.to_tensor(
input_numpy_x, stop_gradient=False, dtype='complex64'
)
y = paddle.to_tensor(
input_numpy_y, stop_gradient=False, dtype='complex64'
)
out = paddle.matmul(x, y, False, False)
dout = paddle.to_tensor(
np.ones([1]), stop_gradient=False, dtype='complex64'
)
(dx,) = paddle.grad(
[out], [x], [dout], retain_graph=True, create_graph=True
)
ddx = paddle.to_tensor(
np.ones([3]), stop_gradient=False, dtype='complex64'
)
dx_double_grad, dy_double_grad, ddout = paddle.grad(
[dx],
[x, y, dout],
[ddx],
retain_graph=True,
create_graph=True,
)
return dx_double_grad, dy_double_grad, ddout
def expected():
dx_double_grad_expected = np.zeros(
[3], dtype="float32"
) + 0j * np.zeros([3], dtype="float32")
dy_double_grad_expected = np.ones(
[3], dtype="float32"
) + 0j * np.ones([3], dtype="float32")
ddout_expected = np.matmul(
input_numpy_y_conj, np.ones([3], dtype="float32")
)
return (
dx_double_grad_expected,
dy_double_grad_expected,
ddout_expected,
)
expected_results = expected()
places = ["cpu"]
if paddle.is_compiled_with_cuda():
places.append("gpu")
for place in places:
paddle.device.set_device(place)
actual_results = actual()
for expected_result, actual_result in zip(
expected_results, actual_results
):
np.testing.assert_allclose(
expected_result, actual_result, rtol=1e-6
)
# case8: ddx is none, dims = 1, complex dtype
def test_matmul_double_grad_case8(self):
input_numpy_x = np.random.random([3]).astype(
'float32'
) + 1j * np.random.random([3]).astype('float32')
input_numpy_y = np.random.random([3]).astype(
'float32'
) + 1j * np.random.random([3]).astype('float32')
input_numpy_x_conj = np.conjugate(input_numpy_x)
def actual():
x = paddle.to_tensor(
input_numpy_x, stop_gradient=False, dtype='complex64'
)
y = paddle.to_tensor(
input_numpy_y, stop_gradient=False, dtype='complex64'
)
out = paddle.matmul(x, y, False, False)
dout = paddle.to_tensor(
np.ones([1]), stop_gradient=False, dtype='complex64'
)
(dy,) = paddle.grad(
[out], [y], [dout], retain_graph=True, create_graph=True
)
ddy = paddle.to_tensor(
np.ones([3]), stop_gradient=False, dtype='complex64'
)
dx_double_grad, dy_double_grad, ddout = paddle.grad(
[dy],
[x, y, dout],
[ddy],
retain_graph=True,
create_graph=True,
)
return dx_double_grad, dy_double_grad, ddout
def expected():
dx_double_grad_expected = np.ones([3], dtype="float32")
dy_double_grad_expected = np.zeros([3], dtype="float32")
ddout_expected = np.matmul(
input_numpy_x_conj, np.ones([3], dtype="float32")
)
return (
dx_double_grad_expected,
dy_double_grad_expected,
ddout_expected,
)
expected_results = expected()
places = ["cpu"]
if paddle.is_compiled_with_cuda():
places.append("gpu")
for place in places:
paddle.device.set_device(place)
actual_results = actual()
for expected_result, actual_result in zip(
expected_results, actual_results
):
np.testing.assert_allclose(
expected_result, actual_result, rtol=1e-6
)
if __name__ == '__main__':
unittest.main()
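As a quick hand check of the broadcast expectations above (not an additional test), take test_matmul_double_grad_case6 with x of shape [2, 1], y of shape [1], dout = ones([2]), and ddx = ones([2, 1]); out[i] = x[i, 0] * y[0], and ddy is missing, so:

import numpy as np

y = np.random.random([1]).astype("float32")
dout = np.ones([2], dtype="float32")
ddx = np.ones([2, 1], dtype="float32")

dx_double_grad = np.zeros([2, 1], dtype="float32")     # depends only on the missing ddy
dy_double_grad = np.array([np.sum(ddx[:, 0] * dout)])  # sum over the broadcast dim -> [2.0]
ddout = ddx[:, 0] * y[0]                               # -> y[0] * ones(2)

assert np.allclose(dy_double_grad, np.ones([1]) * 2)
assert np.allclose(ddout, np.ones([2]) * y[0])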
......@@ -179,7 +179,9 @@ disable_win_inference_test="^trt_quant_int8_yolov3_r50_test$|\
^test_parallel_executor_seresnext_with_reduce_gpu$|\
^test_api_impl$|\
^test_tensordot$|\
^disable_win_inference_test$"
^disable_win_inference_test$|\
^test_imperative_double_grad$|\
^test_imperative_triple_grad$"
# /*==========Fixed Disabled Windows CPU OPENBLAS((PR-CI-Windows-OPENBLAS)) unittests==============================*/
......