From 13c4fd59091f7f404eaae0a59f19fce3f50621f0 Mon Sep 17 00:00:00 2001
From: Charles-hit <56987902+Charles-hit@users.noreply.github.com>
Date: Fri, 23 Dec 2022 17:00:41 +0800
Subject: [PATCH] fix matmul double and triple grad (#48779)

* fix matmul double and triple grad
* remove some comment
* add matmul_double_grad unit test
* fix matmul triple grad
* fix dot triple grad and add unit test
* modify codestyle
* fix dot_grad
* refactor dot triple grad
* disable some unit test
* fix unit test
* fix unit test in double grad
---
 paddle/phi/kernels/dot_grad_kernel.h          |   14 +-
 .../phi/kernels/impl/dot_grad_kernel_impl.h   | 1187 ++++++++++++-----
 .../kernels/impl/matmul_grad_kernel_impl.h    |  170 +--
 .../unittests/test_imperative_double_grad.py  |  484 +++++++
 .../unittests/test_imperative_triple_grad.py  | 1015 ++++++++++++++
 tools/windows/run_unittests.sh                |    4 +-
 6 files changed, 2427 insertions(+), 447 deletions(-)

diff --git a/paddle/phi/kernels/dot_grad_kernel.h b/paddle/phi/kernels/dot_grad_kernel.h
index 3e7f478878..6727476341 100644
--- a/paddle/phi/kernels/dot_grad_kernel.h
+++ b/paddle/phi/kernels/dot_grad_kernel.h
@@ -30,9 +30,9 @@ template <typename T, typename Context>
 void DotDoubleGradKernel(const Context& dev_ctx,
                          const DenseTensor& x,
                          const DenseTensor& y,
-                         const DenseTensor& ddx,
-                         const DenseTensor& ddy,
                          const DenseTensor& dout,
+                         const paddle::optional<DenseTensor>& ddx_opt,
+                         const paddle::optional<DenseTensor>& ddy_opt,
                          DenseTensor* dx,
                          DenseTensor* dy,
                          DenseTensor* ddout);
@@ -41,12 +41,12 @@ template <typename T, typename Context>
 void DotTripleGradKernel(const Context& dev_ctx,
                          const DenseTensor& x,
                          const DenseTensor& y,
-                         const DenseTensor& ddx,
-                         const DenseTensor& ddy,
-                         const DenseTensor& d_dx,
-                         const DenseTensor& d_dy,
                          const DenseTensor& dout,
-                         const DenseTensor& d_ddout,
+                         const paddle::optional<DenseTensor>& ddx,
+                         const paddle::optional<DenseTensor>& ddy,
+                         const paddle::optional<DenseTensor>& d_dx,
+                         const paddle::optional<DenseTensor>& d_dy,
+                         const paddle::optional<DenseTensor>& d_ddout,
                          DenseTensor* d_x,
                          DenseTensor* d_y,
                          DenseTensor* d_ddx,
diff --git a/paddle/phi/kernels/impl/dot_grad_kernel_impl.h b/paddle/phi/kernels/impl/dot_grad_kernel_impl.h
index 8987d22152..a9b6b27d0e 100644
--- a/paddle/phi/kernels/impl/dot_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/dot_grad_kernel_impl.h
@@ -14,8 +14,10 @@ limitations under the License.
  */
 #pragma once
 
+#include "paddle/phi/common/complex.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/complex_kernel.h"
+#include "paddle/phi/kernels/full_kernel.h"
 #include "paddle/phi/kernels/funcs/complex_functors.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
@@ -207,8 +209,8 @@ struct DotDoubleGradFunction {
                   const DenseTensor* tensor_x,
                   const DenseTensor* tensor_y,
                   const DenseTensor* tensor_dout,
-                  const DenseTensor* tensor_ddx,
-                  const DenseTensor* tensor_ddy,
+                  const paddle::optional<DenseTensor>* tensor_ddx_opt,
+                  const paddle::optional<DenseTensor>* tensor_ddy_opt,
                   DenseTensor* tensor_dx,
                   DenseTensor* tensor_dy,
                   DenseTensor* tensor_ddout);
@@ -220,11 +222,13 @@ struct DotDoubleGradFunction<DeviceContext, T, phi::funcs::EnableComplex<T>> {
                   const DenseTensor* tensor_x,
                   const DenseTensor* tensor_y,
                   const DenseTensor* tensor_dout,
-                  const DenseTensor* tensor_ddx,
-                  const DenseTensor* tensor_ddy,
+                  const paddle::optional<DenseTensor>* tensor_ddx_opt,
+                  const paddle::optional<DenseTensor>* tensor_ddy_opt,
                   DenseTensor* tensor_dx,
                   DenseTensor* tensor_dy,
                   DenseTensor* tensor_ddout) {
+    const DenseTensor* tensor_ddx = tensor_ddx_opt->get_ptr();
+    const DenseTensor* tensor_ddy = tensor_ddy_opt->get_ptr();
 #if defined(__NVCC__) || defined(__HIPCC__)
     if (1 == tensor_dout->dims().size()) {
       DenseTensor tensor_dout_help;
@@ -232,23 +236,32 @@ struct DotDoubleGradFunction<DeviceContext, T, phi::funcs::EnableComplex<T>> {
       if (tensor_dx || tensor_dy) {
         tensor_dout_help = Conj<T, DeviceContext>(ctx, *tensor_dout);
       }
 
-      if (tensor_dx) {
+      if (tensor_dx && tensor_ddy) {
+        ctx.template Alloc<T>(tensor_dx);
         auto ddy = EigenVector<T>::Flatten(*tensor_ddy);
         Eigen::DSizes<int, 1> size(tensor_ddy->numel());
         auto dx = EigenVector<T>::Flatten(*tensor_dx);
         auto dout = EigenVector<T>::Flatten(tensor_dout_help);
         dx.device(dev) = ddy * dout.broadcast(size);
+      } else if (tensor_dx && !tensor_ddy) {
+        FullLikeKernel<T, DeviceContext>(
+            ctx, *tensor_x, Scalar(T(0.0, 0.0)), tensor_x->dtype(), tensor_dx);
       }
 
-      if (tensor_dy) {
+      if (tensor_dy && tensor_ddx) {
+        ctx.template Alloc<T>(tensor_dy);
         auto ddx = EigenVector<T>::Flatten(*tensor_ddx);
         Eigen::DSizes<int, 1> size(tensor_ddx->numel());
         auto dy = EigenVector<T>::Flatten(*tensor_dy);
         auto dout = EigenVector<T>::Flatten(tensor_dout_help);
         dy.device(dev) = ddx * dout.broadcast(size);
+      } else if (tensor_dy && !tensor_ddx) {
+        FullLikeKernel<T, DeviceContext>(
+            ctx, *tensor_y, Scalar(T(0.0, 0.0)), tensor_y->dtype(), tensor_dy);
       }
 
-      if (tensor_ddout) {
+      if (tensor_ddout && tensor_ddx && tensor_ddy) {
+        ctx.template Alloc<T>(tensor_ddout);
         DenseTensor tensor_x_help = Conj<T, DeviceContext>(ctx, *tensor_x);
         DenseTensor tensor_y_help = Conj<T, DeviceContext>(ctx, *tensor_y);
@@ -258,12 +271,28 @@ struct DotDoubleGradFunction<DeviceContext, T, phi::funcs::EnableComplex<T>> {
         auto ddy = EigenVector<T>::Flatten(*tensor_ddy);
         auto ddout = EigenVector<T>::Flatten(*tensor_ddout);
         ddout.device(dev) = (x * ddy + y * ddx).sum();
+      } else if (tensor_ddout && tensor_ddx && !tensor_ddy) {
+        ctx.template Alloc<T>(tensor_ddout);
+        DenseTensor tensor_y_help = Conj<T, DeviceContext>(ctx, *tensor_y);
+
+        auto y = EigenVector<T>::Flatten(tensor_y_help);
+        auto ddx = EigenVector<T>::Flatten(*tensor_ddx);
+        auto ddout = EigenVector<T>::Flatten(*tensor_ddout);
+        ddout.device(dev) = (y * ddx).sum();
+      } else if (tensor_ddout && !tensor_ddx && tensor_ddy) {
+        ctx.template Alloc<T>(tensor_ddout);
+        DenseTensor tensor_x_help = Conj<T, DeviceContext>(ctx, *tensor_x);
+
+        auto x = EigenVector<T>::Flatten(tensor_x_help);
+        auto ddy = EigenVector<T>::Flatten(*tensor_ddy);
+        auto ddout = EigenVector<T>::Flatten(*tensor_ddout);
+        ddout.device(dev) = (x * ddy).sum();
       }
     }
 #else
     const auto* data_dout = tensor_dout->data<T>();
 
-    if (tensor_dx) {
+    if (tensor_dx && tensor_ddy) {
       auto* data_dx = ctx.template Alloc<T>(tensor_dx);
       const auto* data_ddy = tensor_ddy->data<T>();
       const DDim& dim = tensor_dx->dims();
@@ -276,9 +305,12 @@ struct DotDoubleGradFunction<DeviceContext, T, phi::funcs::EnableComplex<T>> {
         if (0 == i % step) ++s;
         data_dx[i] = T(data_dout[s].real, -data_dout[s].imag) * data_ddy[i];
       }
+    } else if (tensor_dx && !tensor_ddy) {
+      FullLikeKernel<T, DeviceContext>(
+          ctx, *tensor_x, Scalar(T(0.0, 0.0)), tensor_x->dtype(), tensor_dx);
     }
 
-    if (tensor_dy) {
+    if (tensor_dy && tensor_ddx) {
       auto* data_dy = ctx.template Alloc<T>(tensor_dy);
       const auto* data_ddx = tensor_ddx->data<T>();
       const DDim& dim = tensor_dy->dims();
@@ -291,9 +323,12 @@ struct DotDoubleGradFunction<DeviceContext, T, phi::funcs::EnableComplex<T>> {
         if (0 == i % step) ++s;
         data_dy[i] = T(data_dout[s].real, -data_dout[s].imag) * data_ddx[i];
       }
+    } else if (tensor_dy && !tensor_ddx) {
+      FullLikeKernel<T, DeviceContext>(
+          ctx, *tensor_y, Scalar(T(0.0, 0.0)), tensor_y->dtype(), tensor_dy);
     }
 
-    if (tensor_ddout) {
+    if (tensor_ddout && tensor_ddx && tensor_ddy) {
       auto* data_ddout = ctx.template Alloc<T>(tensor_ddout);
       auto* data_x = tensor_x->data<T>();
       auto* data_y = tensor_y->data<T>();
@@ -320,6 +355,52 @@ struct DotDoubleGradFunction<DeviceContext, T, phi::funcs::EnableComplex<T>> {
         }
         new_s = false;
       }
+    } else if (tensor_ddout && tensor_ddx && !tensor_ddy) {
+      auto* data_ddout = ctx.template Alloc<T>(tensor_ddout);
+      auto* data_y = tensor_y->data<T>();
+      auto* data_ddx = tensor_ddx->data<T>();
+
+      const DDim& dim = tensor_dy->dims();
+      size_t N = static_cast<size_t>(product(dim));
+      auto step = dim[dim.size() - 1];
+      int s = -1;
+      bool new_s = false;
+
+      for (size_t i = 0; i < N; ++i) {
+        if (0 == i % step) {
+          ++s;
+          new_s = true;
+        }
+        if (new_s) {
+          data_ddout[s] = T(data_y[i].real, -data_y[i].imag) * data_ddx[i];
+        } else {
+          data_ddout[s] += T(data_y[i].real, -data_y[i].imag) * data_ddx[i];
+        }
+        new_s = false;
+      }
+    } else if (tensor_ddout && !tensor_ddx && tensor_ddy) {
+      auto* data_ddout = ctx.template Alloc<T>(tensor_ddout);
+      auto* data_x = tensor_x->data<T>();
+      auto* data_ddy = tensor_ddy->data<T>();
+
+      const DDim& dim = tensor_dx->dims();
+      size_t N = static_cast<size_t>(product(dim));
+      auto step = dim[dim.size() - 1];
+      int s = -1;
+      bool new_s = false;
+
+      for (size_t i = 0; i < N; ++i) {
+        if (0 == i % step) {
+          ++s;
+          new_s = true;
+        }
+        if (new_s) {
+          data_ddout[s] = T(data_x[i].real, -data_x[i].imag) * data_ddy[i];
+        } else {
+          data_ddout[s] += T(data_x[i].real, -data_x[i].imag) * data_ddy[i];
+        }
+        new_s = false;
+      }
     }
 #endif
   }
@@ -331,88 +412,102 @@ struct DotDoubleGradFunction<DeviceContext, T, phi::funcs::DisableComplex<T>> {
                   const DenseTensor* tensor_x,
                   const DenseTensor* tensor_y,
                   const DenseTensor* tensor_dout,
-                  const DenseTensor* tensor_ddx,
-                  const DenseTensor* tensor_ddy,
+                  const paddle::optional<DenseTensor>* tensor_ddx_opt,
+                  const paddle::optional<DenseTensor>* tensor_ddy_opt,
                   DenseTensor* tensor_dx,
                   DenseTensor* tensor_dy,
                   DenseTensor* tensor_ddout) {
+    const DenseTensor* tensor_ddx = tensor_ddx_opt->get_ptr();
+    const DenseTensor* tensor_ddy = tensor_ddy_opt->get_ptr();
 #if defined(__NVCC__) || defined(__HIPCC__)
     if (1 == tensor_dout->dims().size()) {
       auto& dev = *ctx.eigen_device();
+      auto x = EigenVector<T>::Flatten(*tensor_x);
+      auto y = EigenVector<T>::Flatten(*tensor_y);
       auto dout = EigenVector<T>::Flatten(*tensor_dout);
-      if (tensor_dx) {
+      if (tensor_dx && tensor_ddy) {
         ctx.template Alloc<T>(tensor_dx);
         auto ddy = EigenVector<T>::Flatten(*tensor_ddy);
         Eigen::DSizes<int, 1> size(tensor_ddy->numel());
         auto dx = EigenVector<T>::Flatten(*tensor_dx);
         dx.device(dev) = ddy * dout.broadcast(size);
+      } else if (tensor_dx && !tensor_ddy) {
+        FullLikeKernel<T, DeviceContext>(
+            ctx, *tensor_x, Scalar(0.0), tensor_x->dtype(), tensor_dx);
       }
-      if (tensor_dy) {
+      if (tensor_dy && tensor_ddx) {
        ctx.template Alloc<T>(tensor_dy);
         auto ddx = EigenVector<T>::Flatten(*tensor_ddx);
         Eigen::DSizes<int, 1> size(tensor_ddx->numel());
-
         auto dy = EigenVector<T>::Flatten(*tensor_dy);
         dy.device(dev) = ddx * dout.broadcast(size);
+      } else if (tensor_dy && !tensor_ddx) {
+        FullLikeKernel<T, DeviceContext>(
+            ctx, *tensor_y, Scalar(0.0), tensor_y->dtype(), tensor_dy);
       }
-      if (tensor_ddout) {
+      if (tensor_ddout && tensor_ddx && tensor_ddy) {
         ctx.template Alloc<T>(tensor_ddout);
-        auto x = EigenVector<T>::Flatten(*tensor_x);
-        auto y = EigenVector<T>::Flatten(*tensor_y);
         auto ddx = EigenVector<T>::Flatten(*tensor_ddx);
         auto ddy = EigenVector<T>::Flatten(*tensor_ddy);
         auto ddout = EigenVector<T>::Flatten(*tensor_ddout);
         ddout.device(dev) = (x * ddy + y * ddx).sum();
+      } else if (tensor_ddout && tensor_ddx && !tensor_ddy) {
+        ctx.template Alloc<T>(tensor_ddout);
+        auto ddx = EigenVector<T>::Flatten(*tensor_ddx);
+        auto ddout = EigenVector<T>::Flatten(*tensor_ddout);
+        ddout.device(dev) = (y * ddx).sum();
+      } else if (tensor_ddout && !tensor_ddx && tensor_ddy) {
+        ctx.template Alloc<T>(tensor_ddout);
+        auto ddy = EigenVector<T>::Flatten(*tensor_ddy);
+        auto ddout = EigenVector<T>::Flatten(*tensor_ddout);
+        ddout.device(dev) = (x * ddy).sum();
       }
     }
 #else
-    const auto* data_dout = tensor_dout->data<T>();
-
-    if (tensor_dx) {
+    const T* data_x = tensor_x->data<T>();
+    const T* data_y = tensor_y->data<T>();
+    const T* data_dout = tensor_dout->data<T>();
+    const T* data_ddx = tensor_ddx ? tensor_ddx->data<T>() : nullptr;
+    const T* data_ddy = tensor_ddy ? tensor_ddy->data<T>() : nullptr;
+    if (tensor_dx && tensor_ddy) {
       auto* data_dx = ctx.template Alloc<T>(tensor_dx);
-      const auto* data_ddy = tensor_ddy->data<T>();
       const DDim& dim = tensor_dx->dims();
       size_t N = static_cast<size_t>(product(dim));
-
       auto step = dim[dim.size() - 1];
-
       int s = -1;
       for (size_t i = 0; i < N; ++i) {
         if (0 == i % step) ++s;
         data_dx[i] = data_dout[s] * data_ddy[i];
       }
+    } else if (tensor_dx && !tensor_ddy) {
+      FullLikeKernel<T, DeviceContext>(
+          ctx, *tensor_x, Scalar(0.0), tensor_x->dtype(), tensor_dx);
     }
 
-    if (tensor_dy) {
+    if (tensor_dy && tensor_ddx) {
       auto* data_dy = ctx.template Alloc<T>(tensor_dy);
-      const auto* data_ddx = tensor_ddx->data<T>();
       const DDim& dim = tensor_dy->dims();
       size_t N = static_cast<size_t>(product(dim));
-
       auto step = dim[dim.size() - 1];
-
       int s = -1;
       for (size_t i = 0; i < N; ++i) {
         if (0 == i % step) ++s;
         data_dy[i] = data_dout[s] * data_ddx[i];
       }
+    } else if (tensor_dy) {
+      FullLikeKernel<T, DeviceContext>(
+          ctx, *tensor_y, Scalar(0.0), tensor_y->dtype(), tensor_dy);
     }
 
-    if (tensor_ddout) {
+    if (tensor_ddout && tensor_ddx && tensor_ddy) {
       auto* data_ddout = ctx.template Alloc<T>(tensor_ddout);
-      auto* data_x = tensor_x->data<T>();
-      auto* data_y = tensor_y->data<T>();
-      auto* data_ddx = tensor_ddx->data<T>();
-      auto* data_ddy = tensor_ddy->data<T>();
-
-      const DDim& dim = tensor_dy->dims();
+      const DDim& dim = tensor_dy->dims();
       size_t N = static_cast<size_t>(product(dim));
       auto step = dim[dim.size() - 1];
       int s = -1;
       bool new_s = false;
-
       for (size_t i = 0; i < N; ++i) {
         if (0 == i % step) {
           ++s;
@@ -425,6 +520,44 @@ struct DotDoubleGradFunction<DeviceContext, T, phi::funcs::DisableComplex<T>> {
         }
         new_s = false;
       }
+    } else if (tensor_ddout && tensor_ddx && !tensor_ddy) {
+      auto* data_ddout = ctx.template Alloc<T>(tensor_ddout);
+      const DDim& dim = tensor_dy->dims();
+      size_t N = static_cast<size_t>(product(dim));
+      auto step = dim[dim.size() - 1];
+      int s = -1;
+      bool new_s = false;
+      for (size_t i = 0; i < N; ++i) {
+        if (0 == i % step) {
+          ++s;
+          new_s = true;
+        }
+        if (new_s) {
+          data_ddout[s] = data_y[i] * data_ddx[i];
+        } else {
+          data_ddout[s] += data_y[i] * data_ddx[i];
+        }
+        new_s = false;
+      }
+    } else if (tensor_ddout && !tensor_ddx && tensor_ddy) {
+      auto* data_ddout = ctx.template Alloc<T>(tensor_ddout);
+      const DDim& dim = tensor_dx->dims();
+      size_t N = static_cast<size_t>(product(dim));
+      auto step = dim[dim.size() - 1];
+      int s = -1;
+      bool new_s = false;
+      for (size_t i = 0; i < N; ++i) {
+        if (0 == i % step) {
+          ++s;
+          new_s = true;
+        }
+        if (new_s) {
+          data_ddout[s] = data_x[i] * data_ddy[i];
+        } else {
+          data_ddout[s] += data_x[i] * data_ddy[i];
+        }
+        new_s = false;
+      }
     }
 #endif
   }
@@ -435,12 +568,12 @@ struct DotTripleGradFunction {
   void operator()(const DeviceContext& ctx,
                   const DenseTensor* in_tensor_x,
                   const DenseTensor* in_tensor_y,
-                  const DenseTensor* in_tensor_ddx,
-                  const DenseTensor* in_tensor_ddy,
-                  const DenseTensor* in_tensor_d_dx,
-                  const DenseTensor* in_tensor_d_dy,
                   const DenseTensor* in_tensor_dout,
-                  const DenseTensor* in_tensor_d_ddout,
+                  const paddle::optional<DenseTensor>* in_tensor_ddx_opt,
+                  const paddle::optional<DenseTensor>* in_tensor_ddy_opt,
+                  const paddle::optional<DenseTensor>* in_tensor_d_dx_opt,
+                  const paddle::optional<DenseTensor>* in_tensor_d_dy_opt,
+                  const paddle::optional<DenseTensor>* in_tensor_d_ddout_opt,
                   DenseTensor* out_tensor_d_x,
                   DenseTensor* out_tensor_d_y,
                   DenseTensor* out_tensor_d_dout,
                   DenseTensor* out_tensor_d_ddx,
@@ -455,190 +588,383 @@ struct DotTripleGradFunction<DeviceContext, T, phi::funcs::EnableComplex<T>> {
   void operator()(const DeviceContext& ctx,
                   const DenseTensor* in_tensor_x,
                   const DenseTensor* in_tensor_y,
-                  const DenseTensor* in_tensor_ddx,
-                  const DenseTensor* in_tensor_ddy,
-                  const DenseTensor* in_tensor_d_dx,
-                  const DenseTensor* in_tensor_d_dy,
                   const DenseTensor* in_tensor_dout,
-                  const DenseTensor* in_tensor_d_ddout,
+                  const paddle::optional<DenseTensor>* in_tensor_ddx_opt,
+                  const paddle::optional<DenseTensor>* in_tensor_ddy_opt,
+                  const paddle::optional<DenseTensor>* in_tensor_d_dx_opt,
+                  const paddle::optional<DenseTensor>* in_tensor_d_dy_opt,
+                  const paddle::optional<DenseTensor>* in_tensor_d_ddout_opt,
                   DenseTensor* out_tensor_d_x,
                   DenseTensor* out_tensor_d_y,
                   DenseTensor* out_tensor_d_dout,
                   DenseTensor* out_tensor_d_ddx,
                   DenseTensor* out_tensor_d_ddy) {
+    const DenseTensor* in_tensor_ddx = in_tensor_ddx_opt->get_ptr();
+    const DenseTensor* in_tensor_ddy = in_tensor_ddy_opt->get_ptr();
+    const DenseTensor* in_tensor_d_dx = in_tensor_d_dx_opt->get_ptr();
+    const DenseTensor* in_tensor_d_dy = in_tensor_d_dy_opt->get_ptr();
+    const DenseTensor* in_tensor_d_ddout = in_tensor_d_ddout_opt->get_ptr();
 #if defined(__NVCC__) || defined(__HIPCC__)
-    if (1 == in_tensor_d_ddout->dims().size()) {
-      DenseTensor in_tensor_d_ddout_help;
+    if (1 == in_tensor_dout->dims().size()) {
       auto& dev = *ctx.eigen_device();
-      if (out_tensor_d_x || out_tensor_d_y) {
-        in_tensor_d_ddout_help =
-            Conj<T, DeviceContext>(ctx, *in_tensor_d_ddout);
+      DenseTensor in_tensor_x_help = Conj<T, DeviceContext>(ctx, *in_tensor_x);
+      DenseTensor in_tensor_y_help = Conj<T, DeviceContext>(ctx, *in_tensor_y);
+      DenseTensor in_tensor_dout_help =
+          Conj<T, DeviceContext>(ctx, *in_tensor_dout);
+      DenseTensor in_tensor_ddx_help;
+      DenseTensor in_tensor_ddy_help;
+      if (in_tensor_ddx) {
+        in_tensor_ddx_help = Conj<T, DeviceContext>(ctx, *in_tensor_ddx);
       }
-      if (out_tensor_d_x) {
-        auto ddy = EigenVector<T>::Flatten(*in_tensor_ddy);
-        Eigen::DSizes<int, 1> size(in_tensor_ddy->numel());
-        auto d_x = EigenVector<T>::Flatten(*out_tensor_d_x);
-        auto d_ddout = EigenVector<T>::Flatten(in_tensor_d_ddout_help);
-        d_x.device(dev) = ddy * d_ddout.broadcast(size);
+      if (in_tensor_ddy) {
+        in_tensor_ddy_help = Conj<T, DeviceContext>(ctx, *in_tensor_ddy);
       }
-      if (out_tensor_d_y) {
-        auto ddx = EigenVector<T>::Flatten(*in_tensor_ddx);
-        Eigen::DSizes<int, 1> size(in_tensor_ddx->numel());
-        auto d_y = EigenVector<T>::Flatten(*out_tensor_d_y);
-        auto d_ddout = EigenVector<T>::Flatten(in_tensor_d_ddout_help);
-        d_y.device(dev) = ddx * d_ddout.broadcast(size);
+      bool d_dout_flag = false;
+      bool
d_ddx_flag = false; + bool d_ddy_flag = false; + + if (in_tensor_ddx) { + if (out_tensor_d_y && in_tensor_d_ddout) { + ctx.template Alloc(out_tensor_d_y); + auto ddx = EigenVector::Flatten(in_tensor_ddx_help); + Eigen::DSizes size(in_tensor_ddx->numel()); + auto d_y = EigenVector::Flatten(*out_tensor_d_y); + auto d_ddout = EigenVector::Flatten(*in_tensor_d_ddout); + d_y.device(dev) = ddx * d_ddout.broadcast(size); + } + if (out_tensor_d_dout && in_tensor_d_dy) { + ctx.template Alloc(out_tensor_d_dout); + auto ddx = EigenVector::Flatten(in_tensor_ddx_help); + auto d_dy = EigenVector::Flatten(*in_tensor_d_dy); + auto d_dout = EigenVector::Flatten(*out_tensor_d_dout); + d_dout.device(dev) = (ddx * d_dy).sum(); + d_dout_flag = true; + } } - if (out_tensor_d_dout) { - DenseTensor in_tensor_ddx_help = - Conj(ctx, *in_tensor_ddx); - DenseTensor in_tensor_ddy_help = - Conj(ctx, *in_tensor_ddy); - - auto ddx = EigenVector::Flatten(in_tensor_ddx_help); - auto ddy = EigenVector::Flatten(in_tensor_ddy_help); - auto d_dx = EigenVector::Flatten(*in_tensor_d_dx); - auto d_dy = EigenVector::Flatten(*in_tensor_d_dy); - auto d_dout = EigenVector::Flatten(*out_tensor_d_dout); - d_dout.device(dev) = (ddx * d_dy + ddy * d_dx).sum(); + if (in_tensor_ddy) { + if (out_tensor_d_x && in_tensor_d_ddout) { + ctx.template Alloc(out_tensor_d_x); + auto ddy = EigenVector::Flatten(in_tensor_ddy_help); + Eigen::DSizes size(in_tensor_ddy->numel()); + auto d_x = EigenVector::Flatten(*out_tensor_d_x); + auto d_ddout = EigenVector::Flatten(*in_tensor_d_ddout); + d_x.device(dev) = ddy * d_ddout.broadcast(size); + } + if (out_tensor_d_dout && in_tensor_d_dx) { + ctx.template Alloc(out_tensor_d_dout); + auto ddy = EigenVector::Flatten(in_tensor_ddy_help); + auto d_dx = EigenVector::Flatten(*in_tensor_d_dx); + auto d_dout = EigenVector::Flatten(*out_tensor_d_dout); + if (d_dout_flag) { + d_dout.device(dev) += (ddy * d_dx).sum(); + } else { + d_dout.device(dev) = (ddy * d_dx).sum(); + } + } } - if (out_tensor_d_ddx) { - DenseTensor in_tensor_dout_help = - Conj(ctx, *in_tensor_dout); - DenseTensor in_tensor_y_help = - Conj(ctx, *in_tensor_y); - - auto dout = EigenVector::Flatten(in_tensor_dout_help); - auto y = EigenVector::Flatten(in_tensor_y_help); - auto d_ddout = EigenVector::Flatten(*in_tensor_d_ddout); - auto d_dy = EigenVector::Flatten(*in_tensor_d_dy); - auto d_ddx = EigenVector::Flatten(*out_tensor_d_ddx); - Eigen::DSizes size(in_tensor_y->numel()); - d_ddx.device(dev) = - (dout.broadcast(size) * d_dy + y * d_ddout.broadcast(size)); + if (in_tensor_d_dx) { + if (out_tensor_d_ddy) { + ctx.template Alloc(out_tensor_d_ddy); + auto dout = EigenVector::Flatten(in_tensor_dout_help); + auto d_dx = EigenVector::Flatten(*in_tensor_d_dx); + auto d_ddy = EigenVector::Flatten(*out_tensor_d_ddy); + Eigen::DSizes size(in_tensor_x->numel()); + d_ddy.device(dev) = (dout.broadcast(size) * d_dx); + d_ddy_flag = true; + } } - if (out_tensor_d_ddy) { - DenseTensor in_tensor_dout_help = - Conj(ctx, *in_tensor_dout); - DenseTensor in_tensor_x_help = - Conj(ctx, *in_tensor_x); + if (in_tensor_d_dy) { + if (out_tensor_d_ddx) { + ctx.template Alloc(out_tensor_d_ddx); + auto dout = EigenVector::Flatten(in_tensor_dout_help); + auto d_dy = EigenVector::Flatten(*in_tensor_d_dy); + auto d_ddx = EigenVector::Flatten(*out_tensor_d_ddx); + Eigen::DSizes size(in_tensor_y->numel()); + d_ddx.device(dev) = (dout.broadcast(size) * d_dy); + d_ddx_flag = true; + } + } - auto dout = EigenVector::Flatten(in_tensor_dout_help); - auto x = 
EigenVector::Flatten(in_tensor_x_help); - auto d_ddout = EigenVector::Flatten(*in_tensor_d_ddout); - auto d_dx = EigenVector::Flatten(*in_tensor_d_dx); - auto d_ddy = EigenVector::Flatten(*out_tensor_d_ddy); - Eigen::DSizes size(in_tensor_x->numel()); - d_ddy.device(dev) = - (dout.broadcast(size) * d_dx + x * d_ddout.broadcast(size)); + if (in_tensor_d_ddout) { + if (out_tensor_d_ddx) { + ctx.template Alloc(out_tensor_d_ddx); + auto y = EigenVector::Flatten(in_tensor_y_help); + auto d_ddout = EigenVector::Flatten(*in_tensor_d_ddout); + Eigen::DSizes size(in_tensor_y->numel()); + auto d_ddx = EigenVector::Flatten(*out_tensor_d_ddx); + if (d_ddx_flag) { + d_ddx.device(dev) += (y * d_ddout.broadcast(size)); + } else { + d_ddx.device(dev) = (y * d_ddout.broadcast(size)); + } + } + if (out_tensor_d_ddy) { + ctx.template Alloc(out_tensor_d_ddy); + auto x = EigenVector::Flatten(in_tensor_x_help); + auto d_ddout = EigenVector::Flatten(*in_tensor_d_ddout); + Eigen::DSizes size(in_tensor_x->numel()); + auto d_ddy = EigenVector::Flatten(*out_tensor_d_ddy); + if (d_ddy_flag) { + d_ddy.device(dev) += (x * d_ddout.broadcast(size)); + } else { + d_ddy.device(dev) = (x * d_ddout.broadcast(size)); + } + } + } + if (out_tensor_d_x && !out_tensor_d_x->IsInitialized()) { + FullLikeKernel(ctx, + *in_tensor_x, + Scalar(T(0.0, 0.0)), + in_tensor_x->dtype(), + out_tensor_d_x); + } + if (out_tensor_d_y && !out_tensor_d_y->IsInitialized()) { + FullLikeKernel(ctx, + *in_tensor_y, + Scalar(T(0.0, 0.0)), + in_tensor_y->dtype(), + out_tensor_d_y); + } + if (out_tensor_d_dout && !out_tensor_d_dout->IsInitialized()) { + FullLikeKernel(ctx, + *in_tensor_dout, + Scalar(T(0.0, 0.0)), + in_tensor_dout->dtype(), + out_tensor_d_dout); + } + if (out_tensor_d_ddx && !out_tensor_d_ddx->IsInitialized()) { + FullLikeKernel(ctx, + *in_tensor_x, + Scalar(T(0.0, 0.0)), + in_tensor_x->dtype(), + out_tensor_d_ddx); + } + if (out_tensor_d_ddy && !out_tensor_d_ddy->IsInitialized()) { + FullLikeKernel(ctx, + *in_tensor_y, + Scalar(T(0.0, 0.0)), + in_tensor_y->dtype(), + out_tensor_d_ddy); } } #else - const auto* data_d_ddout = in_tensor_d_ddout->data(); - - if (out_tensor_d_x) { - auto* data_d_x = ctx.template Alloc(out_tensor_d_x); - const auto* data_ddy = in_tensor_ddy->data(); - - const DDim& dim = out_tensor_d_x->dims(); - size_t N = static_cast(product(dim)); - auto step = dim[dim.size() - 1]; - int s = -1; - - for (size_t i = 0; i < N; ++i) { - if (0 == i % step) ++s; - data_d_x[i] = T(data_ddy[i].real, -data_ddy[i].imag) * data_d_ddout[s]; + const T* data_x = in_tensor_x->data(); + const T* data_y = in_tensor_y->data(); + const T* data_dout = in_tensor_dout->data(); + const T* data_ddx = in_tensor_ddx ? in_tensor_ddx->data() : nullptr; + const T* data_ddy = in_tensor_ddy ? in_tensor_ddy->data() : nullptr; + const T* data_d_dx = in_tensor_d_dx ? in_tensor_d_dx->data() : nullptr; + const T* data_d_dy = in_tensor_d_dy ? in_tensor_d_dy->data() : nullptr; + const T* data_d_ddout = + in_tensor_d_ddout ? 
in_tensor_d_ddout->data() : nullptr; + + bool d_dout_flag = false; + bool d_ddx_flag = false; + bool d_ddy_flag = false; + + if (data_ddx) { + if (out_tensor_d_y && data_d_ddout) { + auto* data_d_y = ctx.template Alloc(out_tensor_d_y); + const DDim& dim = out_tensor_d_y->dims(); + size_t N = static_cast(product(dim)); + auto step = dim[dim.size() - 1]; + int s = -1; + + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) ++s; + data_d_y[i] = + T(data_ddx[i].real, -data_ddx[i].imag) * data_d_ddout[s]; + } } - } - - if (out_tensor_d_y) { - auto* data_d_y = ctx.template Alloc(out_tensor_d_y); - const auto* data_ddx = in_tensor_ddx->data(); - const DDim& dim = out_tensor_d_y->dims(); - size_t N = static_cast(product(dim)); - auto step = dim[dim.size() - 1]; - int s = -1; - - for (size_t i = 0; i < N; ++i) { - if (0 == i % step) ++s; - data_d_y[i] = T(data_ddx[i].real, -data_ddx[i].imag) * data_d_ddout[s]; + if (out_tensor_d_dout && data_d_dy) { + auto* data_d_dout = ctx.template Alloc(out_tensor_d_dout); + const DDim& dim = in_tensor_x->dims(); + size_t N = static_cast(product(dim)); + auto step = dim[dim.size() - 1]; + int s = -1; + bool new_s = false; + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) { + ++s; + new_s = true; + } + if (new_s) { + data_d_dout[s] = + T(data_ddx[i].real, -data_ddx[i].imag) * data_d_dy[i]; + } else { + data_d_dout[s] += + T(data_ddx[i].real, -data_ddx[i].imag) * data_d_dy[i]; + } + new_s = false; + } + d_dout_flag = true; } } - if (out_tensor_d_dout) { - auto* data_d_dout = ctx.template Alloc(out_tensor_d_dout); - auto* data_ddx = in_tensor_ddx->data(); - auto* data_ddy = in_tensor_ddy->data(); - auto* data_d_dx = in_tensor_d_dx->data(); - auto* data_d_dy = in_tensor_d_dy->data(); - - const DDim& dim = out_tensor_d_dout->dims(); - size_t N = static_cast(product(dim)); - auto step = dim[dim.size() - 1]; - int s = -1; - bool new_s = false; - - for (size_t i = 0; i < N; ++i) { - if (0 == i % step) { - ++s; - new_s = true; + if (data_ddy) { + if (out_tensor_d_x && data_d_ddout) { + auto* data_d_x = ctx.template Alloc(out_tensor_d_x); + const DDim& dim = out_tensor_d_x->dims(); + size_t N = static_cast(product(dim)); + auto step = dim[dim.size() - 1]; + int s = -1; + + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) ++s; + data_d_x[i] = + T(data_ddy[i].real, -data_ddy[i].imag) * data_d_ddout[s]; } - if (new_s) { - data_d_dout[s] = - T(data_ddy[i].real, -data_ddy[i].imag) * data_d_dx[i] + - T(data_ddx[i].real, -data_ddx[i].imag) * data_d_dy[i]; + } + if (out_tensor_d_dout && data_d_dx) { + auto* data_d_dout = ctx.template Alloc(out_tensor_d_dout); + const DDim& dim = in_tensor_x->dims(); + size_t N = static_cast(product(dim)); + auto step = dim[dim.size() - 1]; + int s = -1; + bool new_s = false; + if (d_dout_flag) { + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) { + ++s; + } + data_d_dout[s] += + T(data_ddy[i].real, -data_ddy[i].imag) * data_d_dx[i]; + } } else { - data_d_dout[s] += - T(data_ddy[i].real, -data_ddy[i].imag) * data_d_dx[i] + - T(data_ddx[i].real, -data_ddx[i].imag) * data_d_dy[i]; + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) { + ++s; + new_s = true; + } + if (new_s) { + data_d_dout[s] = + T(data_ddy[i].real, -data_ddy[i].imag) * data_d_dx[i]; + } else { + data_d_dout[s] += + T(data_ddy[i].real, -data_ddy[i].imag) * data_d_dx[i]; + } + new_s = false; + } } - new_s = false; } } - if (out_tensor_d_ddx) { - auto* data_d_ddx = ctx.template Alloc(out_tensor_d_ddx); - auto* data_dout = in_tensor_dout->data(); - auto* data_d_dy 
= in_tensor_d_dy->data(); - auto* data_y = in_tensor_y->data(); - auto* data_d_ddout = in_tensor_d_ddout->data(); - - const DDim& dim = out_tensor_d_ddx->dims(); - size_t N = static_cast(product(dim)); - auto step = dim[dim.size() - 1]; - int s = -1; - - for (size_t i = 0; i < N; ++i) { - if (0 == i % step) ++s; - data_d_ddx[i] = - T(data_dout[s].real, -data_dout[s].imag) * data_d_dy[i] + - T(data_y[i].real, -data_y[i].imag) * data_d_ddout[s]; + if (data_d_dx) { + if (out_tensor_d_ddy) { + auto* data_d_ddy = ctx.template Alloc(out_tensor_d_ddy); + const DDim& dim = out_tensor_d_ddy->dims(); + size_t N = static_cast(product(dim)); + auto step = dim[dim.size() - 1]; + int s = -1; + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) ++s; + data_d_ddy[i] = + T(data_dout[s].real, -data_dout[s].imag) * data_d_dx[i]; + } + d_ddy_flag = true; } } - if (out_tensor_d_ddy) { - auto* data_d_ddy = ctx.template Alloc(out_tensor_d_ddy); - auto* data_dout = in_tensor_dout->data(); - auto* data_d_dx = in_tensor_d_dx->data(); - auto* data_x = in_tensor_x->data(); - auto* data_d_ddout = in_tensor_d_ddout->data(); - - const DDim& dim = out_tensor_d_ddy->dims(); - size_t N = static_cast(product(dim)); - auto step = dim[dim.size() - 1]; - int s = -1; + if (data_d_dy) { + if (out_tensor_d_ddx) { + auto* data_d_ddx = ctx.template Alloc(out_tensor_d_ddx); + const DDim& dim = out_tensor_d_ddx->dims(); + size_t N = static_cast(product(dim)); + auto step = dim[dim.size() - 1]; + int s = -1; + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) ++s; + data_d_ddx[i] = + T(data_dout[s].real, -data_dout[s].imag) * data_d_dy[i]; + } + } + d_ddx_flag = true; + } - for (size_t i = 0; i < N; ++i) { - if (0 == i % step) ++s; - data_d_ddy[i] = - T(data_dout[s].real, -data_dout[s].imag) * data_d_dx[i] + - T(data_x[i].real, -data_x[i].imag) * data_d_ddout[s]; + if (data_d_ddout) { + if (out_tensor_d_ddx) { + auto* data_d_ddx = ctx.template Alloc(out_tensor_d_ddx); + const DDim& dim = out_tensor_d_ddx->dims(); + size_t N = static_cast(product(dim)); + auto step = dim[dim.size() - 1]; + int s = -1; + if (d_ddx_flag) { + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) ++s; + data_d_ddx[i] += + T(data_y[i].real, -data_y[i].imag) * data_d_ddout[s]; + } + } else { + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) ++s; + data_d_ddx[i] = + T(data_y[i].real, -data_y[i].imag) * data_d_ddout[s]; + } + } + } + if (out_tensor_d_ddy) { + auto* data_d_ddy = ctx.template Alloc(out_tensor_d_ddy); + const DDim& dim = out_tensor_d_ddy->dims(); + size_t N = static_cast(product(dim)); + auto step = dim[dim.size() - 1]; + int s = -1; + if (d_ddy_flag) { + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) ++s; + data_d_ddy[i] += + T(data_x[i].real, -data_x[i].imag) * data_d_ddout[s]; + } + } else { + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) ++s; + data_d_ddy[i] = + T(data_x[i].real, -data_x[i].imag) * data_d_ddout[s]; + } + } } } + + if (out_tensor_d_x && !out_tensor_d_x->IsInitialized()) { + FullLikeKernel(ctx, + *in_tensor_x, + Scalar(T(0.0, 0.0)), + in_tensor_x->dtype(), + out_tensor_d_x); + } + if (out_tensor_d_y && !out_tensor_d_y->IsInitialized()) { + FullLikeKernel(ctx, + *in_tensor_y, + Scalar(T(0.0, 0.0)), + in_tensor_y->dtype(), + out_tensor_d_y); + } + if (out_tensor_d_dout && !out_tensor_d_dout->IsInitialized()) { + FullLikeKernel(ctx, + *in_tensor_dout, + Scalar(T(0.0, 0.0)), + in_tensor_dout->dtype(), + out_tensor_d_dout); + } + if (out_tensor_d_ddx && !out_tensor_d_ddx->IsInitialized()) { + 
FullLikeKernel(ctx, + *in_tensor_x, + Scalar(T(0.0, 0.0)), + in_tensor_x->dtype(), + out_tensor_d_ddx); + } + if (out_tensor_d_ddy && !out_tensor_d_ddy->IsInitialized()) { + FullLikeKernel(ctx, + *in_tensor_y, + Scalar(T(0.0, 0.0)), + in_tensor_y->dtype(), + out_tensor_d_ddy); + } + #endif } }; @@ -648,170 +974,348 @@ struct DotTripleGradFunction> { void operator()(const DeviceContext& ctx, const DenseTensor* in_tensor_x, const DenseTensor* in_tensor_y, - const DenseTensor* in_tensor_ddx, - const DenseTensor* in_tensor_ddy, - const DenseTensor* in_tensor_d_dx, - const DenseTensor* in_tensor_d_dy, const DenseTensor* in_tensor_dout, - const DenseTensor* in_tensor_d_ddout, + const paddle::optional* in_tensor_ddx_opt, + const paddle::optional* in_tensor_ddy_opt, + const paddle::optional* in_tensor_d_dx_opt, + const paddle::optional* in_tensor_d_dy_opt, + const paddle::optional* in_tensor_d_ddout_opt, DenseTensor* out_tensor_d_x, DenseTensor* out_tensor_d_y, DenseTensor* out_tensor_d_dout, DenseTensor* out_tensor_d_ddx, DenseTensor* out_tensor_d_ddy) { + const DenseTensor* in_tensor_ddx = in_tensor_ddx_opt->get_ptr(); + const DenseTensor* in_tensor_ddy = in_tensor_ddy_opt->get_ptr(); + const DenseTensor* in_tensor_d_dx = in_tensor_d_dx_opt->get_ptr(); + const DenseTensor* in_tensor_d_dy = in_tensor_d_dy_opt->get_ptr(); + const DenseTensor* in_tensor_d_ddout = in_tensor_d_ddout_opt->get_ptr(); #if defined(__NVCC__) || defined(__HIPCC__) - if (1 == in_tensor_d_ddout->dims().size()) { + if (1 == in_tensor_dout->dims().size()) { auto& dev = *ctx.eigen_device(); - auto d_ddout = EigenVector::Flatten(*in_tensor_d_ddout); - if (out_tensor_d_x) { - ctx.template Alloc(out_tensor_d_x); - auto ddy = EigenVector::Flatten(*in_tensor_ddy); - Eigen::DSizes size(in_tensor_ddy->numel()); - auto d_x = EigenVector::Flatten(*out_tensor_d_x); - d_x.device(dev) = ddy * d_ddout.broadcast(size); + bool d_dout_flag = false; + bool d_ddx_flag = false; + bool d_ddy_flag = false; + + if (in_tensor_ddx) { + if (out_tensor_d_y && in_tensor_d_ddout) { + ctx.template Alloc(out_tensor_d_y); + auto ddx = EigenVector::Flatten(*in_tensor_ddx); + Eigen::DSizes size(in_tensor_ddx->numel()); + auto d_y = EigenVector::Flatten(*out_tensor_d_y); + auto d_ddout = EigenVector::Flatten(*in_tensor_d_ddout); + d_y.device(dev) = ddx * d_ddout.broadcast(size); + } + if (out_tensor_d_dout && in_tensor_d_dy) { + ctx.template Alloc(out_tensor_d_dout); + auto ddx = EigenVector::Flatten(*in_tensor_ddx); + auto d_dy = EigenVector::Flatten(*in_tensor_d_dy); + auto d_dout = EigenVector::Flatten(*out_tensor_d_dout); + d_dout.device(dev) = (ddx * d_dy).sum(); + d_dout_flag = true; + } } - if (out_tensor_d_y) { - ctx.template Alloc(out_tensor_d_y); - auto ddx = EigenVector::Flatten(*in_tensor_ddx); - Eigen::DSizes size(in_tensor_ddx->numel()); - - auto d_y = EigenVector::Flatten(*out_tensor_d_y); - d_y.device(dev) = ddx * d_ddout.broadcast(size); + if (in_tensor_ddy) { + if (out_tensor_d_x && in_tensor_d_ddout) { + ctx.template Alloc(out_tensor_d_x); + auto ddy = EigenVector::Flatten(*in_tensor_ddy); + Eigen::DSizes size(in_tensor_ddy->numel()); + auto d_x = EigenVector::Flatten(*out_tensor_d_x); + auto d_ddout = EigenVector::Flatten(*in_tensor_d_ddout); + d_x.device(dev) = ddy * d_ddout.broadcast(size); + } + if (out_tensor_d_dout && in_tensor_d_dx) { + ctx.template Alloc(out_tensor_d_dout); + auto ddy = EigenVector::Flatten(*in_tensor_ddy); + auto d_dx = EigenVector::Flatten(*in_tensor_d_dx); + auto d_dout = EigenVector::Flatten(*out_tensor_d_dout); + 
if (d_dout_flag) { + d_dout.device(dev) += (ddy * d_dx).sum(); + } else { + d_dout.device(dev) = (ddy * d_dx).sum(); + } + } } - if (out_tensor_d_dout) { - ctx.template Alloc(out_tensor_d_dout); - auto ddx = EigenVector::Flatten(*in_tensor_ddx); - auto ddy = EigenVector::Flatten(*in_tensor_ddy); - auto d_dx = EigenVector::Flatten(*in_tensor_d_dx); - auto d_dy = EigenVector::Flatten(*in_tensor_d_dy); - auto d_dout = EigenVector::Flatten(*out_tensor_d_dout); - d_dout.device(dev) = (ddx * d_dy + ddy * d_dx).sum(); + if (in_tensor_d_dx) { + if (out_tensor_d_ddy) { + ctx.template Alloc(out_tensor_d_ddy); + auto dout = EigenVector::Flatten(*in_tensor_dout); + auto d_dx = EigenVector::Flatten(*in_tensor_d_dx); + auto d_ddy = EigenVector::Flatten(*out_tensor_d_ddy); + Eigen::DSizes size(in_tensor_x->numel()); + d_ddy.device(dev) = (dout.broadcast(size) * d_dx); + d_ddy_flag = true; + } } - if (out_tensor_d_ddx) { - ctx.template Alloc(out_tensor_d_ddx); - auto dout = EigenVector::Flatten(*in_tensor_dout); - auto y = EigenVector::Flatten(*in_tensor_y); - auto d_ddout = EigenVector::Flatten(*in_tensor_d_ddout); - auto d_dy = EigenVector::Flatten(*in_tensor_d_dy); - auto d_ddx = EigenVector::Flatten(*out_tensor_d_ddx); - Eigen::DSizes size(in_tensor_y->numel()); - d_ddx.device(dev) = - (dout.broadcast(size) * d_dy + y * d_ddout.broadcast(size)); + if (in_tensor_d_dy) { + if (out_tensor_d_ddx) { + ctx.template Alloc(out_tensor_d_ddx); + auto dout = EigenVector::Flatten(*in_tensor_dout); + auto d_dy = EigenVector::Flatten(*in_tensor_d_dy); + auto d_ddx = EigenVector::Flatten(*out_tensor_d_ddx); + Eigen::DSizes size(in_tensor_y->numel()); + d_ddx.device(dev) = (dout.broadcast(size) * d_dy); + d_ddx_flag = true; + } } - if (out_tensor_d_ddy) { - ctx.template Alloc(out_tensor_d_ddy); - auto dout = EigenVector::Flatten(*in_tensor_dout); - auto x = EigenVector::Flatten(*in_tensor_x); - auto d_ddout = EigenVector::Flatten(*in_tensor_d_ddout); - auto d_dx = EigenVector::Flatten(*in_tensor_d_dx); - auto d_ddy = EigenVector::Flatten(*out_tensor_d_ddy); - Eigen::DSizes size(in_tensor_x->numel()); - d_ddy.device(dev) = - (dout.broadcast(size) * d_dx + x * d_ddout.broadcast(size)); + if (in_tensor_d_ddout) { + if (out_tensor_d_ddx) { + ctx.template Alloc(out_tensor_d_ddx); + auto y = EigenVector::Flatten(*in_tensor_y); + auto d_ddout = EigenVector::Flatten(*in_tensor_d_ddout); + Eigen::DSizes size(in_tensor_y->numel()); + auto d_ddx = EigenVector::Flatten(*out_tensor_d_ddx); + if (d_ddx_flag) { + d_ddx.device(dev) += (y * d_ddout.broadcast(size)); + } else { + d_ddx.device(dev) = (y * d_ddout.broadcast(size)); + } + } + if (out_tensor_d_ddy) { + ctx.template Alloc(out_tensor_d_ddy); + auto x = EigenVector::Flatten(*in_tensor_x); + auto d_ddout = EigenVector::Flatten(*in_tensor_d_ddout); + Eigen::DSizes size(in_tensor_x->numel()); + auto d_ddy = EigenVector::Flatten(*out_tensor_d_ddy); + if (d_ddy_flag) { + d_ddy.device(dev) += (x * d_ddout.broadcast(size)); + } else { + d_ddy.device(dev) = (x * d_ddout.broadcast(size)); + } + } + } + if (out_tensor_d_x && !out_tensor_d_x->IsInitialized()) { + FullLikeKernel(ctx, + *in_tensor_x, + Scalar(0.0), + in_tensor_x->dtype(), + out_tensor_d_x); + } + if (out_tensor_d_y && !out_tensor_d_y->IsInitialized()) { + FullLikeKernel(ctx, + *in_tensor_y, + Scalar(0.0), + in_tensor_y->dtype(), + out_tensor_d_y); + } + if (out_tensor_d_dout && !out_tensor_d_dout->IsInitialized()) { + FullLikeKernel(ctx, + *in_tensor_dout, + Scalar(0.0), + in_tensor_dout->dtype(), + out_tensor_d_dout); + 
} + if (out_tensor_d_ddx && !out_tensor_d_ddx->IsInitialized()) { + FullLikeKernel(ctx, + *in_tensor_x, + Scalar(0.0), + in_tensor_x->dtype(), + out_tensor_d_ddx); + } + if (out_tensor_d_ddy && !out_tensor_d_ddy->IsInitialized()) { + FullLikeKernel(ctx, + *in_tensor_y, + Scalar(0.0), + in_tensor_y->dtype(), + out_tensor_d_ddy); } } #else - const auto* data_d_ddout = in_tensor_d_ddout->data(); - - if (out_tensor_d_x) { - auto* data_d_x = ctx.template Alloc(out_tensor_d_x); - const auto* data_ddy = in_tensor_ddy->data(); - - const DDim& dim = out_tensor_d_x->dims(); - size_t N = static_cast(product(dim)); - auto step = dim[dim.size() - 1]; - int s = -1; - - for (size_t i = 0; i < N; ++i) { - if (0 == i % step) ++s; - data_d_x[i] = data_ddy[i] * data_d_ddout[s]; + const T* data_x = in_tensor_x->data(); + const T* data_y = in_tensor_y->data(); + const T* data_dout = in_tensor_dout->data(); + const T* data_ddx = in_tensor_ddx ? in_tensor_ddx->data() : nullptr; + const T* data_ddy = in_tensor_ddy ? in_tensor_ddy->data() : nullptr; + const T* data_d_dx = in_tensor_d_dx ? in_tensor_d_dx->data() : nullptr; + const T* data_d_dy = in_tensor_d_dy ? in_tensor_d_dy->data() : nullptr; + const T* data_d_ddout = + in_tensor_d_ddout ? in_tensor_d_ddout->data() : nullptr; + + bool d_dout_flag = false; + bool d_ddx_flag = false; + bool d_ddy_flag = false; + + if (data_ddx) { + if (out_tensor_d_y && data_d_ddout) { + auto* data_d_y = ctx.template Alloc(out_tensor_d_y); + const DDim& dim = out_tensor_d_y->dims(); + size_t N = static_cast(product(dim)); + auto step = dim[dim.size() - 1]; + int s = -1; + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) ++s; + data_d_y[i] = data_ddx[i] * data_d_ddout[s]; + } } - } - - if (out_tensor_d_y) { - auto* data_d_y = ctx.template Alloc(out_tensor_d_y); - const auto* data_ddx = in_tensor_ddx->data(); - - const DDim& dim = out_tensor_d_y->dims(); - size_t N = static_cast(product(dim)); - auto step = dim[dim.size() - 1]; - int s = -1; - - for (size_t i = 0; i < N; ++i) { - if (0 == i % step) ++s; - data_d_y[i] = data_ddx[i] * data_d_ddout[s]; + if (out_tensor_d_dout && data_d_dy) { + auto* data_d_dout = ctx.template Alloc(out_tensor_d_dout); + const DDim& dim = in_tensor_x->dims(); + size_t N = static_cast(product(dim)); + auto step = dim[dim.size() - 1]; + int s = -1; + bool new_s = false; + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) { + ++s; + new_s = true; + } + if (new_s) { + data_d_dout[s] = data_ddx[i] * data_d_dy[i]; + } else { + data_d_dout[s] += data_ddx[i] * data_d_dy[i]; + } + new_s = false; + } + d_dout_flag = true; } } - if (out_tensor_d_dout) { - auto* data_d_dout = ctx.template Alloc(out_tensor_d_dout); - auto* data_ddx = in_tensor_ddx->data(); - auto* data_ddy = in_tensor_ddy->data(); - auto* data_d_dx = in_tensor_d_dx->data(); - auto* data_d_dy = in_tensor_d_dy->data(); - - const DDim& dim = in_tensor_ddx->dims(); - size_t N = static_cast(product(dim)); - auto step = dim[dim.size() - 1]; - int s = -1; - bool new_s = false; - for (size_t i = 0; i < N; ++i) { - if (0 == i % step) { - ++s; - new_s = true; + if (data_ddy) { + if (out_tensor_d_x && data_d_ddout) { + auto* data_d_x = ctx.template Alloc(out_tensor_d_x); + const DDim& dim = out_tensor_d_x->dims(); + size_t N = static_cast(product(dim)); + auto step = dim[dim.size() - 1]; + int s = -1; + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) ++s; + data_d_x[i] = data_ddy[i] * data_d_ddout[s]; } - if (new_s) { - data_d_dout[s] = - data_ddy[i] * data_d_dx[i] + data_ddx[i] * data_d_dy[i]; 
+ } + if (out_tensor_d_dout && data_d_dx) { + auto* data_d_dout = ctx.template Alloc(out_tensor_d_dout); + const DDim& dim = in_tensor_x->dims(); + size_t N = static_cast(product(dim)); + auto step = dim[dim.size() - 1]; + int s = -1; + bool new_s = false; + if (d_dout_flag) { + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) { + ++s; + } + data_d_dout[s] += data_ddy[i] * data_d_dx[i]; + } } else { - data_d_dout[s] += - data_ddy[i] * data_d_dx[i] + data_ddx[i] * data_d_dy[i]; + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) { + ++s; + new_s = true; + } + if (new_s) { + data_d_dout[s] = data_ddy[i] * data_d_dx[i]; + } else { + data_d_dout[s] += data_ddy[i] * data_d_dx[i]; + } + new_s = false; + } } - new_s = false; } } - if (out_tensor_d_ddx) { - auto* data_d_ddx = ctx.template Alloc(out_tensor_d_ddx); - auto* data_dout = in_tensor_dout->data(); - auto* data_d_dy = in_tensor_d_dy->data(); - auto* data_y = in_tensor_y->data(); - auto* data_d_ddout = in_tensor_d_ddout->data(); - - const DDim& dim = out_tensor_d_ddx->dims(); - size_t N = static_cast(product(dim)); - auto step = dim[dim.size() - 1]; - int s = -1; - - for (size_t i = 0; i < N; ++i) { - if (0 == i % step) ++s; - data_d_ddx[i] = - data_dout[s] * data_d_dy[i] + data_y[i] * data_d_ddout[s]; + if (data_d_dx) { + if (out_tensor_d_ddy) { + auto* data_d_ddy = ctx.template Alloc(out_tensor_d_ddy); + const DDim& dim = out_tensor_d_ddy->dims(); + size_t N = static_cast(product(dim)); + auto step = dim[dim.size() - 1]; + int s = -1; + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) ++s; + data_d_ddy[i] = data_dout[s] * data_d_dx[i]; + } + d_ddy_flag = true; } } - if (out_tensor_d_ddy) { - auto* data_d_ddy = ctx.template Alloc(out_tensor_d_ddy); - auto* data_dout = in_tensor_dout->data(); - auto* data_d_dx = in_tensor_d_dx->data(); - auto* data_x = in_tensor_x->data(); - auto* data_d_ddout = in_tensor_d_ddout->data(); - - const DDim& dim = out_tensor_d_ddy->dims(); - size_t N = static_cast(product(dim)); - auto step = dim[dim.size() - 1]; - int s = -1; + if (data_d_dy) { + if (out_tensor_d_ddx) { + auto* data_d_ddx = ctx.template Alloc(out_tensor_d_ddx); + const DDim& dim = out_tensor_d_ddx->dims(); + size_t N = static_cast(product(dim)); + auto step = dim[dim.size() - 1]; + int s = -1; + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) ++s; + data_d_ddx[i] = data_dout[s] * data_d_dy[i]; + } + } + d_ddx_flag = true; + } - for (size_t i = 0; i < N; ++i) { - if (0 == i % step) ++s; - data_d_ddy[i] = - data_dout[s] * data_d_dx[i] + data_x[i] * data_d_ddout[s]; + if (data_d_ddout) { + if (out_tensor_d_ddx) { + auto* data_d_ddx = ctx.template Alloc(out_tensor_d_ddx); + const DDim& dim = out_tensor_d_ddx->dims(); + size_t N = static_cast(product(dim)); + auto step = dim[dim.size() - 1]; + int s = -1; + if (d_ddx_flag) { + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) ++s; + data_d_ddx[i] += data_y[i] * data_d_ddout[s]; + } + } else { + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) ++s; + data_d_ddx[i] = data_y[i] * data_d_ddout[s]; + } + } } + if (out_tensor_d_ddy) { + auto* data_d_ddy = ctx.template Alloc(out_tensor_d_ddy); + const DDim& dim = out_tensor_d_ddy->dims(); + size_t N = static_cast(product(dim)); + auto step = dim[dim.size() - 1]; + int s = -1; + if (d_ddy_flag) { + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) ++s; + data_d_ddy[i] += data_x[i] * data_d_ddout[s]; + } + } else { + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) ++s; + data_d_ddy[i] = data_x[i] * data_d_ddout[s]; + } 
+ } + } + } + + if (out_tensor_d_x && !out_tensor_d_x->IsInitialized()) { + FullLikeKernel( + ctx, *in_tensor_x, Scalar(0.0), in_tensor_x->dtype(), out_tensor_d_x); } + if (out_tensor_d_y && !out_tensor_d_y->IsInitialized()) { + FullLikeKernel( + ctx, *in_tensor_y, Scalar(0.0), in_tensor_y->dtype(), out_tensor_d_y); + } + if (out_tensor_d_dout && !out_tensor_d_dout->IsInitialized()) { + FullLikeKernel(ctx, + *in_tensor_dout, + Scalar(0.0), + in_tensor_dout->dtype(), + out_tensor_d_dout); + } + if (out_tensor_d_ddx && !out_tensor_d_ddx->IsInitialized()) { + FullLikeKernel(ctx, + *in_tensor_x, + Scalar(0.0), + in_tensor_x->dtype(), + out_tensor_d_ddx); + } + if (out_tensor_d_ddy && !out_tensor_d_ddy->IsInitialized()) { + FullLikeKernel(ctx, + *in_tensor_y, + Scalar(0.0), + in_tensor_y->dtype(), + out_tensor_d_ddy); + } + #endif } }; @@ -836,65 +1340,40 @@ template void DotDoubleGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, - const DenseTensor& ddx, - const DenseTensor& ddy, const DenseTensor& dout, + const paddle::optional& ddx, + const paddle::optional& ddy, DenseTensor* dx, DenseTensor* dy, DenseTensor* ddout) { - if (dx) { - dev_ctx.template Alloc(dx); - } - if (dy) { - dev_ctx.template Alloc(dy); - } - if (ddout) { - dev_ctx.template Alloc(ddout); - } DotDoubleGradFunction()( - dev_ctx, &x, &y, &dout, ddx, ddy, dx, dy, ddout); + dev_ctx, &x, &y, &dout, ddx.get_ptr(), ddy.get_ptr(), dx, dy, ddout); } template void DotTripleGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, - const DenseTensor& ddx, - const DenseTensor& ddy, - const DenseTensor& d_dx, - const DenseTensor& d_dy, const DenseTensor& dout, - const DenseTensor& d_ddout, + const paddle::optional& ddx, + const paddle::optional& ddy, + const paddle::optional& d_dx, + const paddle::optional& d_dy, + const paddle::optional& d_ddout, DenseTensor* d_x, DenseTensor* d_y, DenseTensor* d_ddx, DenseTensor* d_ddy, DenseTensor* d_dout) { - if (d_x) { - dev_ctx.template Alloc(d_x); - } - if (d_y) { - dev_ctx.template Alloc(d_y); - } - if (d_ddx) { - dev_ctx.template Alloc(d_ddx); - } - if (d_ddy) { - dev_ctx.template Alloc(d_ddy); - } - if (d_dout) { - dev_ctx.template Alloc(d_dout); - } - DotTripleGradFunction()(dev_ctx, &x, &y, - ddx, - ddy, - d_dx, - d_dy, - dout, - d_ddout, + &dout, + ddx.get_ptr(), + ddy.get_ptr(), + d_dx.get_ptr(), + d_dy.get_ptr(), + d_ddout.get_ptr(), d_x, d_y, d_dout, diff --git a/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h b/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h index 1bc29a34d4..a9dac3e493 100644 --- a/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h @@ -473,27 +473,13 @@ void MatmulDoubleGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, const DenseTensor& dout, - const paddle::optional& ddx_opt, - const paddle::optional& ddy_opt, + const paddle::optional& ddx, + const paddle::optional& ddy, bool transpose_x, bool transpose_y, DenseTensor* dx, DenseTensor* dy, DenseTensor* ddout) { - paddle::optional ddx; - paddle::optional ddy; - if (!ddx_opt && (dy || ddout)) { - DenseTensor ddx_tmp = phi::FullLike(dev_ctx, x, Scalar(0.0)); - ddx = paddle::make_optional(ddx_tmp); - } else { - ddx = ddx_opt; - } - if (!ddy_opt && (dx || ddout)) { - DenseTensor ddy_tmp = phi::FullLike(dev_ctx, y, Scalar(0.0)); - ddy = paddle::make_optional(ddy_tmp); - } else { - ddy = ddy_opt; - } // Get dims from the input x, y, output_grad std::vector x_dims = vectorize(x.dims()); 
std::vector y_dims = vectorize(y.dims()); @@ -506,7 +492,7 @@ void MatmulDoubleGradKernel(const Context& dev_ctx, // Case1 : x's or y's dim = 1 if (x_ndim == 1 && y_ndim == 1) { DotDoubleGradFunction()( - dev_ctx, &x, &y, &dout, ddx.get_ptr(), ddy.get_ptr(), dx, dy, ddout); + dev_ctx, &x, &y, &dout, &ddx, &ddy, dx, dy, ddout); return; } @@ -608,6 +594,8 @@ void MatmulDoubleGradKernel(const Context& dev_ctx, ddout_flag); ddout_flag = true; } + } else if (!ddx && dy) { + FullLikeKernel(dev_ctx, y, Scalar(0.0), y.dtype(), dy); } if (ddy) { auto ddy_mat = ddy.get(); @@ -666,6 +654,12 @@ void MatmulDoubleGradKernel(const Context& dev_ctx, ddout, ddout_flag); } + } else if (!ddy && dx) { + FullLikeKernel(dev_ctx, x, Scalar(0.0), x.dtype(), dx); + } + if (ddout && !ddx && !ddy) { + FullLikeKernel( + dev_ctx, dout, Scalar(0.0), dout.dtype(), ddout); } if (dx) { @@ -821,7 +815,7 @@ void MatmulDoubleGradKernel(const Context& dev_ctx, } } // Reduce sum to get grad by ReduceSum - if (dx) { + if (dx && dx_help.initialized()) { if (dx_reduce_dims.empty()) { *dx = std::move(dx_help); } else { @@ -829,8 +823,10 @@ void MatmulDoubleGradKernel(const Context& dev_ctx, dev_ctx, dx_help, dx, dx_reduce_dims); } dx->Resize(x.dims()); + } else if (dx && !dx_help.initialized()) { + FullLikeKernel(dev_ctx, x, Scalar(0.0), x.dtype(), dx); } - if (dy) { + if (dy && dy_help.initialized()) { if (dy_reduce_dims.empty()) { *dy = std::move(dy_help); } else { @@ -838,6 +834,8 @@ void MatmulDoubleGradKernel(const Context& dev_ctx, dev_ctx, dy_help, dy, dy_reduce_dims); } dy->Resize(y.dims()); + } else if (dy && !dy_help.initialized()) { + FullLikeKernel(dev_ctx, y, Scalar(0.0), y.dtype(), dy); } if (ddout) { @@ -873,11 +871,11 @@ void MatmulTripleGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, const DenseTensor& dout, - const paddle::optional& ddx_opt, - const paddle::optional& ddy_opt, - const paddle::optional& d_dx_opt, - const paddle::optional& d_dy_opt, - const paddle::optional& d_ddout_opt, + const paddle::optional& ddx, + const paddle::optional& ddy, + const paddle::optional& d_dx, + const paddle::optional& d_dy, + const paddle::optional& d_ddout, bool transpose_x, bool transpose_y, DenseTensor* out_d_x, @@ -885,50 +883,6 @@ void MatmulTripleGradKernel(const Context& dev_ctx, DenseTensor* out_d_dout, DenseTensor* out_d_ddx, DenseTensor* out_d_ddy) { - paddle::optional ddx; - paddle::optional ddy; - paddle::optional d_dx; - paddle::optional d_dy; - paddle::optional d_ddout; - - if (!ddx_opt && (out_d_y || out_d_dout)) { - DenseTensor ddx_tmp = - phi::FullLike(dev_ctx, x, static_cast(0.0)); - ddx = paddle::make_optional(ddx_tmp); - } else { - ddx = ddx_opt; - } - if (!ddy_opt && (out_d_x || out_d_dout)) { - DenseTensor ddy_tmp = - phi::FullLike(dev_ctx, y, static_cast(0.0)); - ddy = paddle::make_optional(ddy_tmp); - } else { - ddy = ddy_opt; - } - - if (!d_ddout_opt && (out_d_y || out_d_x || out_d_ddy || out_d_ddx)) { - DenseTensor d_ddout_tmp = - phi::FullLike(dev_ctx, dout, static_cast(0.0)); - d_ddout = paddle::make_optional(d_ddout_tmp); - } else { - d_ddout = d_ddout_opt; - } - - if (!d_dx_opt && (out_d_ddy || out_d_dout)) { - DenseTensor d_dx_tmp = - phi::FullLike(dev_ctx, x, static_cast(0.0)); - d_dx = paddle::make_optional(d_dx_tmp); - } else { - d_dx = d_dx_opt; - } - - if (!d_dy_opt && (out_d_ddx || out_d_dout)) { - DenseTensor d_dy_tmp = - phi::FullLike(dev_ctx, y, static_cast(0.0)); - d_dy = paddle::make_optional(d_dy_tmp); - } else { - d_dy = d_dy_opt; - } // Get dims from the 
input x, y, output_grad std::vector x_dims = vectorize(x.dims()); std::vector y_dims = vectorize(y.dims()); @@ -944,12 +898,12 @@ void MatmulTripleGradKernel(const Context& dev_ctx, DotTripleGradFunction()(dev_ctx, &x, &y, - ddx.get_ptr(), - ddy.get_ptr(), - d_dx.get_ptr(), - d_dy.get_ptr(), &dout, - d_ddout.get_ptr(), + &ddx, + &ddy, + &d_dx, + &d_dy, + &d_ddout, out_d_x, out_d_y, out_d_dout, @@ -1047,7 +1001,7 @@ void MatmulTripleGradKernel(const Context& dev_ctx, if (out_d_ddy_dims != y_help.dims()) { out_d_ddy->Resize(y_help.dims()); } - if (dout_conj.IsInitialized()) { + if (!dout_conj.IsInitialized()) { dout_conj = Conj(dev_ctx, dout_help); } x_conj = Conj(dev_ctx, x_help); @@ -1108,6 +1062,8 @@ void MatmulTripleGradKernel(const Context& dev_ctx, out_d_y, false); } + } else if (out_d_y) { + FullLikeKernel(dev_ctx, y, Scalar(0.0), y.dtype(), out_d_y); } if (out_d_x && ddy) { if (transpose_x && transpose_y) { @@ -1155,6 +1111,8 @@ void MatmulTripleGradKernel(const Context& dev_ctx, out_d_x, false); } + } else if (out_d_x) { + FullLikeKernel(dev_ctx, x, Scalar(0.0), x.dtype(), out_d_x); } // equations: @@ -1269,6 +1227,15 @@ void MatmulTripleGradKernel(const Context& dev_ctx, } d_ddy_flag = true; } + } else { + // d_ddout is none + if (out_d_x) { + FullLikeKernel(dev_ctx, x, Scalar(0.0), x.dtype(), out_d_x); + } + + if (out_d_y) { + FullLikeKernel(dev_ctx, y, Scalar(0.0), y.dtype(), out_d_y); + } } if (d_dy) { @@ -1439,6 +1406,19 @@ void MatmulTripleGradKernel(const Context& dev_ctx, out_d_ddy->Resize(out_d_ddy_dims); } } + + if (out_d_dout && !out_d_dout->IsInitialized()) { + FullLikeKernel( + dev_ctx, dout, Scalar(0.0), dout.dtype(), out_d_dout); + } + + if (out_d_ddx && !out_d_ddx->IsInitialized()) { + FullLikeKernel(dev_ctx, x, Scalar(0.0), x.dtype(), out_d_ddx); + } + + if (out_d_ddy && !out_d_ddy->IsInitialized()) { + FullLikeKernel(dev_ctx, y, Scalar(0.0), y.dtype(), out_d_ddy); + } } else { // Case3: broadcast. It need cost much time to reduce sum for the // broadcast and wastes the memory. 
@@ -1585,7 +1565,7 @@ void MatmulTripleGradKernel(const Context& dev_ctx, } // Reduce sum to get grad by ReduceSum - if (out_d_x) { + if (out_d_x && out_dx_help.initialized()) { if (dx_reduce_dims.empty()) { *out_d_x = std::move(out_dx_help); } else { @@ -1593,9 +1573,11 @@ void MatmulTripleGradKernel(const Context& dev_ctx, dev_ctx, out_dx_help, out_d_x, dx_reduce_dims); } out_d_x->Resize(x.dims()); + } else if (out_d_x) { + FullLikeKernel(dev_ctx, x, Scalar(0.0), x.dtype(), out_d_x); } - if (out_d_y) { + if (out_d_y && out_dy_help.initialized()) { if (dy_reduce_dims.empty()) { *out_d_y = std::move(out_dy_help); } else { @@ -1603,6 +1585,8 @@ void MatmulTripleGradKernel(const Context& dev_ctx, dev_ctx, out_dy_help, out_d_y, dy_reduce_dims); } out_d_y->Resize(y.dims()); + } else if (out_d_y) { + FullLikeKernel(dev_ctx, y, Scalar(0.0), y.dtype(), out_d_y); } // compute d_dout @@ -1628,6 +1612,11 @@ void MatmulTripleGradKernel(const Context& dev_ctx, transpose_y, true); } + + if (!out_d_dout->initialized()) { + FullLikeKernel( + dev_ctx, dout, Scalar(0.0), dout.dtype(), out_d_dout); + } } // compute d_ddx @@ -1735,13 +1724,18 @@ void MatmulTripleGradKernel(const Context& dev_ctx, true); } } - - if (dx_reduce_dims.empty()) { - *out_d_ddx = std::move(out_d_ddx_help); + if (out_d_ddx_help.initialized()) { + if (dx_reduce_dims.empty()) { + *out_d_ddx = std::move(out_d_ddx_help); + } else { + ReduceSumForMatmulGrad()( + dev_ctx, out_d_ddx_help, out_d_ddx, dx_reduce_dims); + } } else { - ReduceSumForMatmulGrad()( - dev_ctx, out_d_ddx_help, out_d_ddx, dx_reduce_dims); + FullLikeKernel( + dev_ctx, x, Scalar(0.0), x.dtype(), out_d_ddx); } + out_d_ddx->Resize(x.dims()); } @@ -1852,12 +1846,18 @@ void MatmulTripleGradKernel(const Context& dev_ctx, } } - if (dy_reduce_dims.empty()) { - *out_d_ddy = std::move(out_d_ddy_help); + if (out_d_ddy_help.initialized()) { + if (dy_reduce_dims.empty()) { + *out_d_ddy = std::move(out_d_ddy_help); + } else { + ReduceSumForMatmulGrad()( + dev_ctx, out_d_ddy_help, out_d_ddy, dy_reduce_dims); + } } else { - ReduceSumForMatmulGrad()( - dev_ctx, out_d_ddy_help, out_d_ddy, dy_reduce_dims); + FullLikeKernel( + dev_ctx, y, Scalar(0.0), y.dtype(), out_d_ddy); } + out_d_ddy->Resize(y.dims()); } } diff --git a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py index c6fdd7dd91..0db2bc0115 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py @@ -688,5 +688,489 @@ class TestDoubleGradBasics(TestCase): np.testing.assert_array_equal(grad_out.grad.numpy(), grad_out_grad_ref) +class TestDygraphDoubleGradMatmul(TestCase): + # case1: ddy is none, no broadcast,dims != 1 + def test_matmul_double_grad_case1(self): + input_numpy_x = np.random.random([3, 3]).astype('float32') + input_numpy_y = np.random.random([3, 3]).astype('float32') + + def actual(): + x = paddle.to_tensor( + input_numpy_x, stop_gradient=False, dtype='float32' + ) + y = paddle.to_tensor( + input_numpy_y, stop_gradient=False, dtype='float32' + ) + out = paddle.matmul(x, y, False, False) + + dout = paddle.to_tensor( + np.ones([3, 3]), stop_gradient=False, dtype='float32' + ) + (dx,) = paddle.grad( + [out], [x], [dout], retain_graph=True, create_graph=True + ) + ddx = paddle.to_tensor( + np.ones([3, 3]), stop_gradient=False, dtype='float32' + ) + dx_double_grad, dy_double_grad, ddout = paddle.grad( + [dx], + [x, y, dout], + [ddx], + 
retain_graph=True, + create_graph=True, + ) + return dx_double_grad, dy_double_grad, ddout + + def expected(): + dx_double_grad_expected = np.zeros([3, 3], dtype="float32") + dy_double_grad_expected = np.matmul( + np.ones([3, 3], dtype="float32"), + np.ones([3, 3], dtype="float32"), + ) + ddout_expected = np.matmul( + np.ones([3, 3], dtype="float32"), input_numpy_y + ) + return ( + dx_double_grad_expected, + dy_double_grad_expected, + ddout_expected, + ) + + expected_results = expected() + places = ["cpu"] + if paddle.is_compiled_with_cuda(): + places.append("gpu") + for place in places: + paddle.device.set_device(place) + actual_results = actual() + for expected_result, actual_result in zip( + expected_results, actual_results + ): + np.testing.assert_allclose( + expected_result, actual_result, rtol=1e-6 + ) + + # case2: ddx is none,no broadcast, dims != 1 + def test_matmul_double_grad_case2(self): + input_numpy_x = np.random.random([3, 3]).astype('float32') + input_numpy_y = np.random.random([3, 3]).astype('float32') + + def actual(): + x = paddle.to_tensor( + input_numpy_x, stop_gradient=False, dtype='float32' + ) + y = paddle.to_tensor( + input_numpy_y, stop_gradient=False, dtype='float32' + ) + out = paddle.matmul(x, y, False, False) + + dout = paddle.to_tensor( + np.ones([3, 3]), stop_gradient=False, dtype='float32' + ) + (dy,) = paddle.grad( + [out], [y], [dout], retain_graph=True, create_graph=True + ) + ddy = paddle.to_tensor( + np.ones([3, 3]), stop_gradient=False, dtype='float32' + ) + dx_double_grad, dy_double_grad, ddout = paddle.grad( + [dy], + [x, y, dout], + [ddy], + retain_graph=True, + create_graph=True, + ) + return dx_double_grad, dy_double_grad, ddout + + def expected(): + dx_double_grad_expected = np.matmul( + np.ones([3, 3], dtype="float32"), + np.ones([3, 3], dtype="float32"), + ) + dy_double_grad_expected = np.zeros([3, 3], dtype="float32") + ddout_expected = np.matmul( + input_numpy_x, np.ones([3, 3], dtype="float32") + ) + return ( + dx_double_grad_expected, + dy_double_grad_expected, + ddout_expected, + ) + + expected_results = expected() + places = ["cpu"] + if paddle.is_compiled_with_cuda(): + places.append("gpu") + for place in places: + paddle.device.set_device(place) + actual_results = actual() + for expected_result, actual_result in zip( + expected_results, actual_results + ): + np.testing.assert_allclose( + expected_result, actual_result, rtol=1e-6 + ) + + # case3: ddx is none, dims = 1 + def test_matmul_double_grad_case3(self): + input_numpy_x = np.random.random([3]).astype('float32') + input_numpy_y = np.random.random([3]).astype('float32') + + def actual(): + x = paddle.to_tensor( + input_numpy_x, stop_gradient=False, dtype='float32' + ) + y = paddle.to_tensor( + input_numpy_y, stop_gradient=False, dtype='float32' + ) + out = paddle.matmul(x, y, False, False) + + dout = paddle.to_tensor( + np.ones([1]), stop_gradient=False, dtype='float32' + ) + (dy,) = paddle.grad( + [out], [y], [dout], retain_graph=True, create_graph=True + ) + ddy = paddle.to_tensor( + np.ones([3]), stop_gradient=False, dtype='float32' + ) + dx_double_grad, dy_double_grad, ddout = paddle.grad( + [dy], + [x, y, dout], + [ddy], + retain_graph=True, + create_graph=True, + ) + return dx_double_grad, dy_double_grad, ddout + + def expected(): + dx_double_grad_expected = np.ones([3], dtype="float32") + dy_double_grad_expected = np.zeros([3], dtype="float32") + ddout_expected = np.matmul( + input_numpy_x, np.ones([3], dtype="float32") + ) + return ( + dx_double_grad_expected, + 
dy_double_grad_expected, + ddout_expected, + ) + + expected_results = expected() + places = ["cpu"] + if paddle.is_compiled_with_cuda(): + places.append("gpu") + for place in places: + paddle.device.set_device(place) + actual_results = actual() + for expected_result, actual_result in zip( + expected_results, actual_results + ): + np.testing.assert_allclose( + expected_result, actual_result, rtol=1e-6 + ) + + # case4: ddy is none, dims = 1 + def test_matmul_double_grad_case4(self): + input_numpy_x = np.random.random([3]).astype('float32') + input_numpy_y = np.random.random([3]).astype('float32') + + def actual(): + x = paddle.to_tensor( + input_numpy_x, stop_gradient=False, dtype='float32' + ) + y = paddle.to_tensor( + input_numpy_y, stop_gradient=False, dtype='float32' + ) + out = paddle.matmul(x, y, False, False) + + dout = paddle.to_tensor( + np.ones([1]), stop_gradient=False, dtype='float32' + ) + (dx,) = paddle.grad( + [out], [x], [dout], retain_graph=True, create_graph=True + ) + ddx = paddle.to_tensor( + np.ones([3]), stop_gradient=False, dtype='float32' + ) + dx_double_grad, dy_double_grad, ddout = paddle.grad( + [dx], + [x, y, dout], + [ddx], + retain_graph=True, + create_graph=True, + ) + return dx_double_grad, dy_double_grad, ddout + + def expected(): + dx_double_grad_expected = np.zeros([3], dtype="float32") + dy_double_grad_expected = np.ones([3], dtype="float32") + ddout_expected = np.matmul( + input_numpy_y, np.ones([3], dtype="float32") + ) + return ( + dx_double_grad_expected, + dy_double_grad_expected, + ddout_expected, + ) + + expected_results = expected() + places = ["cpu"] + if paddle.is_compiled_with_cuda(): + places.append("gpu") + for place in places: + paddle.device.set_device(place) + actual_results = actual() + for expected_result, actual_result in zip( + expected_results, actual_results + ): + np.testing.assert_allclose( + expected_result, actual_result, rtol=1e-6 + ) + + # case5: ddx is none, broadcast, dims != 1 + def test_matmul_double_grad_case5(self): + input_numpy_x = np.random.random([2, 1]).astype('float32') + input_numpy_y = np.random.random([1]).astype('float32') + + def actual(): + x = paddle.to_tensor( + input_numpy_x, stop_gradient=False, dtype='float32' + ) + y = paddle.to_tensor( + input_numpy_y, stop_gradient=False, dtype='float32' + ) + out = paddle.matmul(x, y, False, False) + + dout = paddle.to_tensor( + np.ones([2]), stop_gradient=False, dtype='float32' + ) + (dy,) = paddle.grad( + [out], [y], [dout], retain_graph=True, create_graph=True + ) + ddy = paddle.to_tensor( + np.ones([1]), stop_gradient=False, dtype='float32' + ) + dx_double_grad, dy_double_grad, ddout = paddle.grad( + [dy], + [x, y, dout], + [ddy], + retain_graph=True, + create_graph=True, + ) + return dx_double_grad, dy_double_grad, ddout + + def expected(): + dx_double_grad_expected = np.ones([2, 1], dtype="float32") + dy_double_grad_expected = np.zeros([1], dtype="float32") + ddout_expected = np.matmul( + input_numpy_x, np.ones([1], dtype="float32") + ) + return ( + dx_double_grad_expected, + dy_double_grad_expected, + ddout_expected, + ) + + expected_results = expected() + places = ["cpu"] + if paddle.is_compiled_with_cuda(): + places.append("gpu") + for place in places: + paddle.device.set_device(place) + actual_results = actual() + for expected_result, actual_result in zip( + expected_results, actual_results + ): + np.testing.assert_allclose( + expected_result, actual_result, rtol=1e-6 + ) + + # case6: ddy is none, broadcast, dims != 1 + def 
test_matmul_double_grad_case6(self): + input_numpy_x = np.random.random([2, 1]).astype('float32') + input_numpy_y = np.random.random([1]).astype('float32') + + def actual(): + x = paddle.to_tensor( + input_numpy_x, stop_gradient=False, dtype='float32' + ) + y = paddle.to_tensor( + input_numpy_y, stop_gradient=False, dtype='float32' + ) + out = paddle.matmul(x, y, False, False) + + dout = paddle.to_tensor( + np.ones([2]), stop_gradient=False, dtype='float32' + ) + (dx,) = paddle.grad( + [out], [x], [dout], retain_graph=True, create_graph=True + ) + ddx = paddle.to_tensor( + np.ones([2, 1]), stop_gradient=False, dtype='float32' + ) + dx_double_grad, dy_double_grad, ddout = paddle.grad( + [dx], + [x, y, dout], + [ddx], + retain_graph=True, + create_graph=True, + ) + return dx_double_grad, dy_double_grad, ddout + + def expected(): + dx_double_grad_expected = np.zeros([2, 1], dtype="float32") + dy_double_grad_expected = np.ones([1], dtype="float32") * 2 + ddout_expected = np.ones([2], dtype="float32") * input_numpy_y[0] + return ( + dx_double_grad_expected, + dy_double_grad_expected, + ddout_expected, + ) + + expected_results = expected() + places = ["cpu"] + if paddle.is_compiled_with_cuda(): + places.append("gpu") + for place in places: + paddle.device.set_device(place) + actual_results = actual() + for expected_result, actual_result in zip( + expected_results, actual_results + ): + np.testing.assert_allclose( + expected_result, actual_result, rtol=1e-6 + ) + + # case7: ddx is none, dims = 1, complex dtype + def test_matmul_double_grad_case7(self): + input_numpy_x = np.random.random([3]).astype( + 'float32' + ) + 1j * np.random.random([3]).astype('float32') + input_numpy_y = np.random.random([3]).astype( + 'float32' + ) + 1j * np.random.random([3]).astype('float32') + input_numpy_y_conj = np.conjugate(input_numpy_y) + + def actual(): + x = paddle.to_tensor( + input_numpy_x, stop_gradient=False, dtype='complex64' + ) + y = paddle.to_tensor( + input_numpy_y, stop_gradient=False, dtype='complex64' + ) + out = paddle.matmul(x, y, False, False) + + dout = paddle.to_tensor( + np.ones([1]), stop_gradient=False, dtype='complex64' + ) + (dx,) = paddle.grad( + [out], [x], [dout], retain_graph=True, create_graph=True + ) + ddx = paddle.to_tensor( + np.ones([3]), stop_gradient=False, dtype='complex64' + ) + dx_double_grad, dy_double_grad, ddout = paddle.grad( + [dx], + [x, y, dout], + [ddx], + retain_graph=True, + create_graph=True, + ) + return dx_double_grad, dy_double_grad, ddout + + def expected(): + dx_double_grad_expected = np.zeros( + [3], dtype="float32" + ) + 0j * np.zeros([3], dtype="float32") + dy_double_grad_expected = np.ones( + [3], dtype="float32" + ) + 0j * np.ones([3], dtype="float32") + ddout_expected = np.matmul( + input_numpy_y_conj, np.ones([3], dtype="float32") + ) + return ( + dx_double_grad_expected, + dy_double_grad_expected, + ddout_expected, + ) + + expected_results = expected() + places = ["cpu"] + if paddle.is_compiled_with_cuda(): + places.append("gpu") + for place in places: + paddle.device.set_device(place) + actual_results = actual() + for expected_result, actual_result in zip( + expected_results, actual_results + ): + np.testing.assert_allclose( + expected_result, actual_result, rtol=1e-6 + ) + + # case8: ddy is none, dims = 1, complex dtype + def test_matmul_double_grad_case8(self): + input_numpy_x = np.random.random([3]).astype( + 'float32' + ) + 1j * np.random.random([3]).astype('float32') + input_numpy_y = np.random.random([3]).astype( + 'float32' + ) + 1j * 
np.random.random([3]).astype('float32') + input_numpy_x_conj = np.conjugate(input_numpy_x) + + def actual(): + x = paddle.to_tensor( + input_numpy_x, stop_gradient=False, dtype='complex64' + ) + y = paddle.to_tensor( + input_numpy_y, stop_gradient=False, dtype='complex64' + ) + out = paddle.matmul(x, y, False, False) + + dout = paddle.to_tensor( + np.ones([1]), stop_gradient=False, dtype='complex64' + ) + (dy,) = paddle.grad( + [out], [y], [dout], retain_graph=True, create_graph=True + ) + ddy = paddle.to_tensor( + np.ones([3]), stop_gradient=False, dtype='complex64' + ) + dx_double_grad, dy_double_grad, ddout = paddle.grad( + [dy], + [x, y, dout], + [ddy], + retain_graph=True, + create_graph=True, + ) + return dx_double_grad, dy_double_grad, ddout + + def expected(): + dx_double_grad_expected = np.ones([3], dtype="float32") + dy_double_grad_expected = np.zeros([3], dtype="float32") + ddout_expected = np.matmul( + input_numpy_x_conj, np.ones([3], dtype="float32") + ) + return ( + dx_double_grad_expected, + dy_double_grad_expected, + ddout_expected, + ) + + expected_results = expected() + places = ["cpu"] + if paddle.is_compiled_with_cuda(): + places.append("gpu") + for place in places: + paddle.device.set_device(place) + actual_results = actual() + for expected_result, actual_result in zip( + expected_results, actual_results + ): + np.testing.assert_allclose( + expected_result, actual_result, rtol=1e-6 + ) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_triple_grad.py b/python/paddle/fluid/tests/unittests/test_imperative_triple_grad.py index 570e6cf5c3..3f1283864d 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_triple_grad.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_triple_grad.py @@ -321,5 +321,1020 @@ class TestDygraphTripleGradBradcastCase(TestCase): fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) +# d_ddout is none, dtype is float32 +class TestDygraphTripleGradMatmulcase1(TestCase): + def setUp(self): + self.input_numpy_x = None + self.input_numpy_y = None + self.input_numpy_dout = None + self.input_numpy_ddx = None + self.input_numpy_ddy = None + self.places = ["cpu"] + if paddle.is_compiled_with_cuda(): + self.places.append("gpu") + + def actual(self): + x = paddle.to_tensor( + self.input_numpy_x, stop_gradient=False, dtype='float32' + ) + y = paddle.to_tensor( + self.input_numpy_y, stop_gradient=False, dtype='float32' + ) + out = paddle.matmul(x, y, False, False) + + dout = paddle.to_tensor( + self.input_numpy_dout, stop_gradient=False, dtype='float32' + ) + (dx, dy) = paddle.grad( + [out], [x, y], [dout], retain_graph=True, create_graph=True + ) + ddx = paddle.to_tensor( + self.input_numpy_ddx, stop_gradient=False, dtype='float32' + ) + ddy = paddle.to_tensor( + self.input_numpy_ddy, stop_gradient=False, dtype='float32' + ) + dx_double_grad, dy_double_grad = paddle.grad( + [dx, dy], + [x, y], + [ddx, ddy], + retain_graph=True, + create_graph=True, + ) + d_x, d_y, d_dout, d_ddx, d_ddy = paddle.grad( + [dx_double_grad, dy_double_grad], + [x, y, dout, ddx, ddy], + retain_graph=False, + create_graph=False, + ) + return d_x, d_y, d_dout, d_ddx, d_ddy + + # case1: d_ddout is none, dims != 1 + def test_matmul_triple_grad_case1(self): + def init_data(): + self.input_numpy_x = np.random.random([3, 3]).astype('float32') + self.input_numpy_y = np.random.random([3, 3]).astype('float32') + self.input_numpy_dout = np.ones([3, 3], dtype="float32") + self.input_numpy_ddx = np.ones([3, 3], 
dtype="float32") + self.input_numpy_ddy = np.ones([3, 3], dtype="float32") + + init_data() + d_x_expected = np.zeros([3, 3], dtype="float32") + d_y_expected = np.zeros([3, 3], dtype="float32") + d_dout_expected = np.ones([3, 3], dtype="float32") * 6 + d_ddx_expected = np.ones([3, 3], dtype="float32") * 3 + d_ddy_expected = np.ones([3, 3], dtype="float32") * 3 + expected_results = ( + d_x_expected, + d_y_expected, + d_dout_expected, + d_ddx_expected, + d_ddy_expected, + ) + + for place in self.places: + paddle.device.set_device(place) + actual_results = self.actual() + for expected_result, actual_result in zip( + expected_results, actual_results + ): + np.testing.assert_allclose( + expected_result, actual_result, rtol=1e-6 + ) + + # case2: d_ddout is none, dims = 1 + def test_matmul_triple_grad_case2(self): + def init_data(): + self.input_numpy_x = np.random.random( + [ + 3, + ] + ).astype('float32') + self.input_numpy_y = np.random.random( + [ + 3, + ] + ).astype('float32') + self.input_numpy_dout = np.ones([1], dtype="float32") + self.input_numpy_ddx = np.ones([3], dtype="float32") + self.input_numpy_ddy = np.ones([3], dtype="float32") + + init_data() + d_x_expected = np.zeros( + [ + 3, + ], + dtype="float32", + ) + d_y_expected = np.zeros( + [ + 3, + ], + dtype="float32", + ) + d_dout_expected = np.ones([1], dtype="float32") * 6 + d_ddx_expected = np.ones( + [ + 3, + ], + dtype="float32", + ) + d_ddy_expected = np.ones( + [ + 3, + ], + dtype="float32", + ) + expected_results = ( + d_x_expected, + d_y_expected, + d_dout_expected, + d_ddx_expected, + d_ddy_expected, + ) + + for place in self.places: + paddle.device.set_device(place) + actual_results = self.actual() + for expected_result, actual_result in zip( + expected_results, actual_results + ): + np.testing.assert_allclose( + expected_result, actual_result, rtol=1e-6 + ) + + # case3: d_ddout is none , with broadcast + def test_matmul_triple_grad_case3(self): + def init_data(): + self.input_numpy_x = np.random.random([3, 1]).astype('float32') + self.input_numpy_y = np.random.random( + [ + 1, + ] + ).astype('float32') + self.input_numpy_dout = np.ones([3], dtype="float32") + self.input_numpy_ddx = np.ones([3, 1], dtype="float32") + self.input_numpy_ddy = np.ones([1], dtype="float32") + + init_data() + d_x_expected = np.zeros([3, 1], dtype="float32") + d_y_expected = np.zeros([1], dtype="float32") + d_dout_expected = ( + np.ones( + [ + 3, + ], + dtype="float32", + ) + * 2 + ) + d_ddx_expected = np.ones([3, 1], dtype="float32") + d_ddy_expected = np.ones([1], dtype="float32") * 3 + expected_results = ( + d_x_expected, + d_y_expected, + d_dout_expected, + d_ddx_expected, + d_ddy_expected, + ) + + for place in self.places: + paddle.device.set_device(place) + actual_results = self.actual() + for expected_result, actual_result in zip( + expected_results, actual_results + ): + np.testing.assert_allclose( + expected_result, actual_result, rtol=1e-6 + ) + + +# d_ddout is none, dtype is complex64 +class TestDygraphTripleGradMatmulcase2(TestCase): + def setUp(self): + self.input_numpy_x = None + self.input_numpy_y = None + self.input_numpy_dout = None + self.input_numpy_ddx = None + self.input_numpy_ddy = None + self.input_numpy_ddx_conj = None + self.input_numpy_ddy_conj = None + self.input_numpy_dout_conj = None + self.places = ["cpu"] + if paddle.is_compiled_with_cuda(): + self.places.append("gpu") + + def actual(self): + x = paddle.to_tensor( + self.input_numpy_x, stop_gradient=False, dtype='complex64' + ) + y = paddle.to_tensor( + 
self.input_numpy_y, stop_gradient=False, dtype='complex64' + ) + out = paddle.matmul(x, y, False, False) + + dout = paddle.to_tensor( + self.input_numpy_dout, stop_gradient=False, dtype='complex64' + ) + (dx, dy) = paddle.grad( + [out], [x, y], [dout], retain_graph=True, create_graph=True + ) + ddx = paddle.to_tensor( + self.input_numpy_ddx, stop_gradient=False, dtype='complex64' + ) + ddy = paddle.to_tensor( + self.input_numpy_ddy, stop_gradient=False, dtype='complex64' + ) + dx_double_grad, dy_double_grad = paddle.grad( + [dx, dy], + [x, y], + [ddx, ddy], + retain_graph=True, + create_graph=True, + ) + d_x, d_y, d_dout, d_ddx, d_ddy = paddle.grad( + [dx_double_grad, dy_double_grad], + [x, y, dout, ddx, ddy], + retain_graph=False, + create_graph=False, + ) + return d_x, d_y, d_dout, d_ddx, d_ddy + + # case1: no d_ddout, dims = 1, dtype is complex64 + def test_matmul_triple_grad_case1(self): + def init_data(): + self.input_numpy_x = np.random.random([3]).astype( + 'float32' + ) + 1j * np.random.random( + [ + 3, + ] + ).astype( + 'float32' + ) + self.input_numpy_y = np.random.random([3]).astype( + 'float32' + ) + 1j * np.random.random( + [ + 3, + ] + ).astype( + 'float32' + ) + self.input_numpy_dout = np.ones( + [ + 1, + ], + dtype="float32", + ) + self.input_numpy_ddx = np.ones( + [ + 3, + ], + dtype="float32", + ) + self.input_numpy_ddy = np.ones( + [ + 3, + ], + dtype="float32", + ) + self.input_numpy_ddx_conj = np.conjugate(self.input_numpy_ddx) + self.input_numpy_ddy_conj = np.conjugate(self.input_numpy_ddy) + self.input_numpy_dout_conj = np.conjugate(self.input_numpy_dout) + + init_data() + d_x_expected = np.zeros( + [ + 3, + ], + dtype="float32", + ) + d_y_expected = np.zeros( + [ + 3, + ], + dtype="float32", + ) + d_dout_expected = np.matmul( + self.input_numpy_ddy_conj, + np.ones( + [ + 3, + ], + dtype="float32", + ), + ) + np.matmul( + self.input_numpy_ddx_conj, + np.ones( + [ + 3, + ], + dtype="float32", + ), + ) + d_ddx_expected = ( + np.ones( + [ + 3, + ], + dtype="float32", + ) + * self.input_numpy_dout_conj[0] + ) + d_ddy_expected = ( + np.ones( + [ + 3, + ], + dtype="float32", + ) + * self.input_numpy_dout_conj[0] + ) + expected_results = ( + d_x_expected, + d_y_expected, + d_dout_expected, + d_ddx_expected, + d_ddy_expected, + ) + + for place in self.places: + paddle.device.set_device(place) + actual_results = self.actual() + for expected_result, actual_result in zip( + expected_results, actual_results + ): + np.testing.assert_allclose( + expected_result, actual_result, rtol=1e-6 + ) + + +# d_ddout is none, d_dx is none, dtype is float32 +class TestDygraphTripleGradMatmulcase3(TestCase): + def setUp(self): + self.input_numpy_x = None + self.input_numpy_y = None + self.input_numpy_dout = None + self.input_numpy_ddx = None + self.input_numpy_ddy = None + self.places = ["cpu"] + if paddle.is_compiled_with_cuda(): + self.places.append("gpu") + + def actual(self): + x = paddle.to_tensor( + self.input_numpy_x, stop_gradient=False, dtype='float32' + ) + y = paddle.to_tensor( + self.input_numpy_y, stop_gradient=False, dtype='float32' + ) + out = paddle.matmul(x, y, False, False) + + dout = paddle.to_tensor( + self.input_numpy_dout, stop_gradient=False, dtype='float32' + ) + (dx, dy) = paddle.grad( + [out], [x, y], [dout], retain_graph=True, create_graph=True + ) + ddx = paddle.to_tensor( + self.input_numpy_ddx, stop_gradient=False, dtype='float32' + ) + ddy = paddle.to_tensor( + self.input_numpy_ddy, stop_gradient=False, dtype='float32' + ) + (dy_double_grad,) = paddle.grad( + [dx, 
dy], + [y], + [ddx, ddy], + retain_graph=True, + create_graph=True, + ) + d_x, d_y, d_dout, d_ddx, d_ddy = paddle.grad( + [dy_double_grad], + [x, y, dout, ddx, ddy], + retain_graph=False, + create_graph=False, + ) + return d_x, d_y, d_dout, d_ddx, d_ddy + + # case1: d_ddout is none, d_dx is none, dims != 1 + def test_matmul_triple_grad_case1(self): + def init_data(): + self.input_numpy_x = np.random.random([3, 3]).astype('float32') + self.input_numpy_y = np.random.random([3, 3]).astype('float32') + self.input_numpy_dout = np.ones([3, 3], dtype="float32") + self.input_numpy_ddx = np.ones([3, 3], dtype="float32") + self.input_numpy_ddy = np.ones([3, 3], dtype="float32") + + init_data() + d_x_expected = np.zeros([3, 3], dtype="float32") + d_y_expected = np.zeros([3, 3], dtype="float32") + d_dout_expected = np.ones([3, 3], dtype="float32") * 3 + d_ddx_expected = np.ones([3, 3], dtype="float32") * 3 + d_ddy_expected = np.zeros([3, 3], dtype="float32") + expected_results = ( + d_x_expected, + d_y_expected, + d_dout_expected, + d_ddx_expected, + d_ddy_expected, + ) + + for place in self.places: + paddle.device.set_device(place) + actual_results = self.actual() + for expected_result, actual_result in zip( + expected_results, actual_results + ): + np.testing.assert_allclose( + expected_result, actual_result, rtol=1e-6 + ) + + # #case2: d_ddout is none, d_dx is none, dims = 1 + def test_matmul_triple_grad_case2(self): + def init_data(): + self.input_numpy_x = np.random.random( + [ + 3, + ] + ).astype('float32') + self.input_numpy_y = np.random.random( + [ + 3, + ] + ).astype('float32') + self.input_numpy_dout = np.ones([1], dtype="float32") + self.input_numpy_ddx = np.ones([3], dtype="float32") + self.input_numpy_ddy = np.ones([3], dtype="float32") + + init_data() + d_x_expected = np.zeros( + [ + 3, + ], + dtype="float32", + ) + d_y_expected = np.zeros( + [ + 3, + ], + dtype="float32", + ) + d_dout_expected = np.ones([1], dtype="float32") * 3 + d_ddx_expected = np.ones( + [ + 3, + ], + dtype="float32", + ) + d_ddy_expected = np.zeros( + [ + 3, + ], + dtype="float32", + ) + expected_results = ( + d_x_expected, + d_y_expected, + d_dout_expected, + d_ddx_expected, + d_ddy_expected, + ) + + for place in self.places: + paddle.device.set_device(place) + actual_results = self.actual() + for expected_result, actual_result in zip( + expected_results, actual_results + ): + np.testing.assert_allclose( + expected_result, actual_result, rtol=1e-6 + ) + + # #case3: d_ddout is none, d_dx is none , with broadcast + def test_matmul_triple_grad_case3(self): + def init_data(): + self.input_numpy_x = np.random.random([3, 1]).astype('float32') + self.input_numpy_y = np.random.random( + [ + 1, + ] + ).astype('float32') + self.input_numpy_dout = np.ones([3], dtype="float32") + self.input_numpy_ddx = np.ones([3, 1], dtype="float32") + self.input_numpy_ddy = np.ones([1], dtype="float32") + + init_data() + d_x_expected = np.zeros([3, 1], dtype="float32") + d_y_expected = np.zeros([1], dtype="float32") + d_dout_expected = np.ones( + [ + 3, + ], + dtype="float32", + ) + d_ddx_expected = np.ones([3, 1], dtype="float32") + d_ddy_expected = np.zeros([1], dtype="float32") + expected_results = ( + d_x_expected, + d_y_expected, + d_dout_expected, + d_ddx_expected, + d_ddy_expected, + ) + + for place in self.places: + paddle.device.set_device(place) + actual_results = self.actual() + for expected_result, actual_result in zip( + expected_results, actual_results + ): + np.testing.assert_allclose( + expected_result, actual_result, 
rtol=1e-6 + ) + + +# d_ddout is none, d_dx is none, dtype is complex64 +class TestDygraphTripleGradMatmulcase4(TestCase): + def setUp(self): + self.input_numpy_x = None + self.input_numpy_y = None + self.input_numpy_dout = None + self.input_numpy_ddx = None + self.input_numpy_ddy = None + self.input_numpy_ddx_conj = None + self.input_numpy_dout_conj = None + self.places = ["cpu"] + if paddle.is_compiled_with_cuda(): + self.places.append("gpu") + + def actual(self): + x = paddle.to_tensor( + self.input_numpy_x, stop_gradient=False, dtype='complex64' + ) + y = paddle.to_tensor( + self.input_numpy_y, stop_gradient=False, dtype='complex64' + ) + out = paddle.matmul(x, y, False, False) + + dout = paddle.to_tensor( + self.input_numpy_dout, stop_gradient=False, dtype='complex64' + ) + (dx, dy) = paddle.grad( + [out], [x, y], [dout], retain_graph=True, create_graph=True + ) + ddx = paddle.to_tensor( + self.input_numpy_ddx, stop_gradient=False, dtype='complex64' + ) + ddy = paddle.to_tensor( + self.input_numpy_ddy, stop_gradient=False, dtype='complex64' + ) + (dy_double_grad,) = paddle.grad( + [dx, dy], + [y], + [ddx, ddy], + retain_graph=True, + create_graph=True, + ) + d_x, d_y, d_dout, d_ddx, d_ddy = paddle.grad( + [dy_double_grad], + [x, y, dout, ddx, ddy], + retain_graph=False, + create_graph=False, + ) + return d_x, d_y, d_dout, d_ddx, d_ddy + + # case1: no d_ddout,no d_dx, dims = 1 + def test_matmul_triple_grad_case1(self): + def init_data(): + self.input_numpy_x = np.random.random([3]).astype( + 'float32' + ) + 1j * np.random.random( + [ + 3, + ] + ).astype( + 'float32' + ) + self.input_numpy_y = np.random.random([3]).astype( + 'float32' + ) + 1j * np.random.random( + [ + 3, + ] + ).astype( + 'float32' + ) + self.input_numpy_dout = np.ones( + [ + 1, + ], + dtype="float32", + ) + self.input_numpy_ddx = np.ones( + [ + 3, + ], + dtype="float32", + ) + self.input_numpy_ddy = np.ones( + [ + 3, + ], + dtype="float32", + ) + self.input_numpy_ddx_conj = np.conjugate(self.input_numpy_ddx) + self.input_numpy_dout_conj = np.conjugate(self.input_numpy_dout) + + init_data() + d_x_expected = np.zeros( + [ + 3, + ], + dtype="float32", + ) + d_y_expected = np.zeros( + [ + 3, + ], + dtype="float32", + ) + d_dout_expected = np.matmul( + self.input_numpy_ddx_conj, + np.ones( + [ + 3, + ], + dtype="float32", + ), + ) + d_ddx_expected = ( + np.ones( + [ + 3, + ], + dtype="float32", + ) + * self.input_numpy_dout_conj[0] + ) + d_ddy_expected = np.zeros( + [ + 3, + ], + dtype="float32", + ) + expected_results = ( + d_x_expected, + d_y_expected, + d_dout_expected, + d_ddx_expected, + d_ddy_expected, + ) + + for place in self.places: + paddle.device.set_device(place) + actual_results = self.actual() + for expected_result, actual_result in zip( + expected_results, actual_results + ): + np.testing.assert_allclose( + expected_result, actual_result, rtol=1e-6 + ) + + +# d_ddout is none, d_dy is none, dtype is float32 +class TestDygraphTripleGradMatmulcase5(TestCase): + def setUp(self): + self.input_numpy_x = None + self.input_numpy_y = None + self.input_numpy_dout = None + self.input_numpy_ddx = None + self.input_numpy_ddy = None + self.places = ["cpu"] + if paddle.is_compiled_with_cuda(): + self.places.append("gpu") + + def actual(self): + x = paddle.to_tensor( + self.input_numpy_x, stop_gradient=False, dtype='float32' + ) + y = paddle.to_tensor( + self.input_numpy_y, stop_gradient=False, dtype='float32' + ) + out = paddle.matmul(x, y, False, False) + + dout = paddle.to_tensor( + self.input_numpy_dout, 
stop_gradient=False, dtype='float32' + ) + (dx, dy) = paddle.grad( + [out], [x, y], [dout], retain_graph=True, create_graph=True + ) + ddx = paddle.to_tensor( + self.input_numpy_ddx, stop_gradient=False, dtype='float32' + ) + ddy = paddle.to_tensor( + self.input_numpy_ddy, stop_gradient=False, dtype='float32' + ) + (dx_double_grad,) = paddle.grad( + [dx, dy], + [x], + [ddx, ddy], + retain_graph=True, + create_graph=True, + ) + d_x, d_y, d_dout, d_ddx, d_ddy = paddle.grad( + [dx_double_grad], + [x, y, dout, ddx, ddy], + retain_graph=False, + create_graph=False, + ) + return d_x, d_y, d_dout, d_ddx, d_ddy + + # case1: d_ddout is none, d_dy is none, dims != 1 + def test_matmul_triple_grad_case1(self): + def init_data(): + self.input_numpy_x = np.random.random([3, 3]).astype('float32') + self.input_numpy_y = np.random.random([3, 3]).astype('float32') + self.input_numpy_dout = np.ones([3, 3], dtype="float32") + self.input_numpy_ddx = np.ones([3, 3], dtype="float32") + self.input_numpy_ddy = np.ones([3, 3], dtype="float32") + + init_data() + d_x_expected = np.zeros([3, 3], dtype="float32") + d_y_expected = np.zeros([3, 3], dtype="float32") + d_dout_expected = np.ones([3, 3], dtype="float32") * 3 + d_ddx_expected = np.zeros([3, 3], dtype="float32") + d_ddy_expected = np.ones([3, 3], dtype="float32") * 3 + expected_results = ( + d_x_expected, + d_y_expected, + d_dout_expected, + d_ddx_expected, + d_ddy_expected, + ) + + for place in self.places: + paddle.device.set_device(place) + actual_results = self.actual() + for expected_result, actual_result in zip( + expected_results, actual_results + ): + np.testing.assert_allclose( + expected_result, actual_result, rtol=1e-6 + ) + + # case2: d_ddout is none, d_dy is none, dims = 1 + def test_matmul_triple_grad_case2(self): + def init_data(): + self.input_numpy_x = np.random.random( + [ + 3, + ] + ).astype('float32') + self.input_numpy_y = np.random.random( + [ + 3, + ] + ).astype('float32') + self.input_numpy_dout = np.ones([1], dtype="float32") + self.input_numpy_ddx = np.ones([3], dtype="float32") + self.input_numpy_ddy = np.ones([3], dtype="float32") + + init_data() + d_x_expected = np.zeros( + [ + 3, + ], + dtype="float32", + ) + d_y_expected = np.zeros( + [ + 3, + ], + dtype="float32", + ) + d_dout_expected = np.ones([1], dtype="float32") * 3 + d_ddx_expected = np.zeros( + [ + 3, + ], + dtype="float32", + ) + d_ddy_expected = np.ones( + [ + 3, + ], + dtype="float32", + ) + expected_results = ( + d_x_expected, + d_y_expected, + d_dout_expected, + d_ddx_expected, + d_ddy_expected, + ) + + for place in self.places: + paddle.device.set_device(place) + actual_results = self.actual() + for expected_result, actual_result in zip( + expected_results, actual_results + ): + np.testing.assert_allclose( + expected_result, actual_result, rtol=1e-6 + ) + + # case3: d_ddout is none, d_dy is none, with broadcast + def test_matmul_triple_grad_case3(self): + def init_data(): + self.input_numpy_x = np.random.random([3, 1]).astype('float32') + self.input_numpy_y = np.random.random( + [ + 1, + ] + ).astype('float32') + self.input_numpy_dout = np.ones([3], dtype="float32") + self.input_numpy_ddx = np.ones([3, 1], dtype="float32") + self.input_numpy_ddy = np.ones([1], dtype="float32") + + init_data() + d_x_expected = np.zeros([3, 1], dtype="float32") + d_y_expected = np.zeros([1], dtype="float32") + d_dout_expected = np.ones( + [ + 3, + ], + dtype="float32", + ) + d_ddx_expected = np.zeros([3, 1], dtype="float32") + d_ddy_expected = np.ones([1], dtype="float32") * 3
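+ # a sketch of why (triple-grad identities, with d_ddout and d_dy
+ # absent): every term of d_x, d_y and d_ddx carries d_ddout or d_dy,
+ # so all three vanish; d_dout = d_dx @ ddy = ones([3]) and
+ # d_ddy = d_dx^T @ dout = 3 for the all-ones inputs above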
+ expected_results = ( + d_x_expected, + d_y_expected, + d_dout_expected, + d_ddx_expected, + d_ddy_expected, + ) + + for place in self.places: + paddle.device.set_device(place) + actual_results = self.actual() + for expected_result, actual_result in zip( + expected_results, actual_results + ): + np.testing.assert_allclose( + expected_result, actual_result, rtol=1e-6 + ) + + +# d_ddout is none, d_dy is none, dtype is complex64 +class TestDygraphTripleGradMatmulcase6(TestCase): + def setUp(self): + self.input_numpy_x = None + self.input_numpy_y = None + self.input_numpy_dout = None + self.input_numpy_ddx = None + self.input_numpy_ddy = None + self.input_numpy_ddy_conj = None + self.input_numpy_dout_conj = None + self.places = ["cpu"] + if paddle.is_compiled_with_cuda(): + self.places.append("gpu") + + def actual(self): + x = paddle.to_tensor( + self.input_numpy_x, stop_gradient=False, dtype='complex64' + ) + y = paddle.to_tensor( + self.input_numpy_y, stop_gradient=False, dtype='complex64' + ) + out = paddle.matmul(x, y, False, False) + + dout = paddle.to_tensor( + self.input_numpy_dout, stop_gradient=False, dtype='complex64' + ) + (dx, dy) = paddle.grad( + [out], [x, y], [dout], retain_graph=True, create_graph=True + ) + ddx = paddle.to_tensor( + self.input_numpy_ddx, stop_gradient=False, dtype='complex64' + ) + ddy = paddle.to_tensor( + self.input_numpy_ddy, stop_gradient=False, dtype='complex64' + ) + (dx_double_grad,) = paddle.grad( + [dx, dy], + [x], + [ddx, ddy], + retain_graph=True, + create_graph=True, + ) + d_x, d_y, d_dout, d_ddx, d_ddy = paddle.grad( + [dx_double_grad], + [x, y, dout, ddx, ddy], + retain_graph=False, + create_graph=False, + ) + return d_x, d_y, d_dout, d_ddx, d_ddy + + # case1: no d_ddout,no d_dy, dims = 1 + def test_matmul_triple_grad_case1(self): + def init_data(): + self.input_numpy_x = np.random.random([3]).astype( + 'float32' + ) + 1j * np.random.random( + [ + 3, + ] + ).astype( + 'float32' + ) + self.input_numpy_y = np.random.random([3]).astype( + 'float32' + ) + 1j * np.random.random( + [ + 3, + ] + ).astype( + 'float32' + ) + self.input_numpy_dout = np.ones( + [ + 1, + ], + dtype="float32", + ) + self.input_numpy_ddx = np.ones( + [ + 3, + ], + dtype="float32", + ) + self.input_numpy_ddy = np.ones( + [ + 3, + ], + dtype="float32", + ) + self.input_numpy_ddy_conj = np.conjugate(self.input_numpy_ddy) + self.input_numpy_dout_conj = np.conjugate(self.input_numpy_dout) + + init_data() + d_x_expected = np.zeros( + [ + 3, + ], + dtype="float32", + ) + d_y_expected = np.zeros( + [ + 3, + ], + dtype="float32", + ) + d_dout_expected = np.matmul( + self.input_numpy_ddy_conj, + np.ones( + [ + 3, + ], + dtype="float32", + ), + ) + d_ddx_expected = np.zeros( + [ + 3, + ], + dtype="float32", + ) + d_ddy_expected = ( + np.ones( + [ + 3, + ], + dtype="float32", + ) + * self.input_numpy_dout_conj[0] + ) + expected_results = ( + d_x_expected, + d_y_expected, + d_dout_expected, + d_ddx_expected, + d_ddy_expected, + ) + + for place in self.places: + paddle.device.set_device(place) + actual_results = self.actual() + for expected_result, actual_result in zip( + expected_results, actual_results + ): + np.testing.assert_allclose( + expected_result, actual_result, rtol=1e-6 + ) + + if __name__ == '__main__': unittest.main() diff --git a/tools/windows/run_unittests.sh b/tools/windows/run_unittests.sh index dd1f0c7e43..8659227153 100644 --- a/tools/windows/run_unittests.sh +++ b/tools/windows/run_unittests.sh @@ -179,7 +179,9 @@ 
disable_win_inference_test="^trt_quant_int8_yolov3_r50_test$|\ ^test_parallel_executor_seresnext_with_reduce_gpu$|\ ^test_api_impl$|\ ^test_tensordot$|\ -^disable_win_inference_test$" +^disable_win_inference_test$|\ +^test_imperative_double_grad$|\ +^test_imperative_triple_grad$" # /*==========Fixed Disabled Windows CPU OPENBLAS((PR-CI-Windows-OPENBLAS)) unittests==============================*/ -- GitLab
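For reviewers, a quick numpy cross-check of the case1 expectations in the new double-grad tests (a sketch under the same assumptions: out = x @ y with all-ones dout and ddx, ddy absent, no transpose or broadcast; plain numpy stands in for the autograd results):

import numpy as np

x = np.random.random([3, 3]).astype('float32')
y = np.random.random([3, 3]).astype('float32')
dout = np.ones([3, 3], dtype='float32')
ddx = np.ones([3, 3], dtype='float32')

# dx = dout @ y.T, so differentiating dx with cotangent ddx gives
#   w.r.t. x:    0 (dx does not depend on x)
#   w.r.t. y:    dout.T @ ddx
#   w.r.t. dout: ddx @ y
dx_double_grad = np.zeros([3, 3], dtype='float32')
dy_double_grad = dout.T @ ddx
ddout = ddx @ y

np.testing.assert_allclose(
    dy_double_grad,
    np.matmul(np.ones([3, 3], dtype='float32'), np.ones([3, 3], dtype='float32')),
    rtol=1e-6,
)
np.testing.assert_allclose(
    ddout, np.matmul(np.ones([3, 3], dtype='float32'), y), rtol=1e-6
)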