Unverified Commit 13c4fd59 authored by Charles-hit, committed by GitHub

fix matmul double and triple grad (#48779)

* fix matmul double and triple grad

* remove some comments

* add matmul_double_grad unit test

* fix matmul triple grad

* fix dot triple grad and add unit test

* modify codestyle

* fix dot_grad

* refactor dot triple grad

* disable some unit tests

* fix unit test

* fix unit test in double grad
Parent a1319074
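The heart of the change, repeated throughout the diff below: the higher-order gradient inputs (ddx, ddy, d_dx, d_dy, d_ddout) become paddle::optional<DenseTensor>, and each kernel now branches on presence, zero-filling any requested output whose contributing input is absent instead of dereferencing a null tensor. A self-contained sketch of the pattern, with std::optional standing in for paddle::optional and std::vector<float> for a 1-D DenseTensor (names here are illustrative, not Paddle API):

#include <optional>
#include <vector>

using Tensor = std::vector<float>;

// dx = dout * ddy for out = dot(x, y); when ddy is absent, dx is identically
// zero, mirroring the FullLikeKernel(..., Scalar(0.0), ...) fallbacks below.
void dot_double_grad_dx(const Tensor& x,
                        const Tensor& dout,                // scalar grad, size 1
                        const std::optional<Tensor>& ddy,  // may be none
                        Tensor* dx) {                      // requested output
  if (!dx) return;  // output not requested
  if (ddy) {
    dx->resize(x.size());
    for (size_t i = 0; i < x.size(); ++i) (*dx)[i] = dout[0] * (*ddy)[i];
  } else {
    dx->assign(x.size(), 0.0f);  // a missing ddy is treated as zeros
  }
}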
@@ -30,9 +30,9 @@ template <typename T, typename Context>
void DotDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& ddx,
const DenseTensor& ddy,
const DenseTensor& dout,
const paddle::optional<DenseTensor>& ddx_opt,
const paddle::optional<DenseTensor>& ddy_opt,
DenseTensor* dx,
DenseTensor* dy,
DenseTensor* ddout);
@@ -41,12 +41,12 @@ template <typename T, typename Context>
void DotTripleGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& ddx,
const DenseTensor& ddy,
const DenseTensor& d_dx,
const DenseTensor& d_dy,
const DenseTensor& dout,
const DenseTensor& d_ddout,
const paddle::optional<DenseTensor>& ddx,
const paddle::optional<DenseTensor>& ddy,
const paddle::optional<DenseTensor>& d_dx,
const paddle::optional<DenseTensor>& d_dy,
const paddle::optional<DenseTensor>& d_ddout,
DenseTensor* d_x,
DenseTensor* d_y,
DenseTensor* d_ddx,
......
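For reference, the identities these declarations implement for out = dot(x, y) with scalar upstream gradient dout, as read off the kernel bodies below (any absent optional input is treated as zero and its term dropped):

\begin{aligned}
dx_i &= ddy_i \, dout, \qquad dy_i = ddx_i \, dout, \qquad
ddout = \sum_i \left( x_i \, ddy_i + y_i \, ddx_i \right), \\
d\_x_i &= ddy_i \, d\_ddout, \qquad d\_y_i = ddx_i \, d\_ddout, \qquad
d\_dout = \sum_i \left( ddx_i \, d\_dy_i + ddy_i \, d\_dx_i \right), \\
d\_ddx_i &= dout \, d\_dy_i + y_i \, d\_ddout, \qquad
d\_ddy_i = dout \, d\_dx_i + x_i \, d\_ddout.
\end{aligned}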
@@ -14,8 +14,10 @@ limitations under the License. */
#pragma once
#include "paddle/phi/common/complex.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/complex_kernel.h"
#include "paddle/phi/kernels/full_kernel.h"
#include "paddle/phi/kernels/funcs/complex_functors.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
@@ -207,8 +209,8 @@ struct DotDoubleGradFunction {
const DenseTensor* tensor_x,
const DenseTensor* tensor_y,
const DenseTensor* tensor_dout,
const DenseTensor* tensor_ddx,
const DenseTensor* tensor_ddy,
const paddle::optional<DenseTensor>* tensor_ddx_opt,
const paddle::optional<DenseTensor>* tensor_ddy_opt,
DenseTensor* tensor_dx,
DenseTensor* tensor_dy,
DenseTensor* tensor_ddout);
@@ -220,11 +222,13 @@ struct DotDoubleGradFunction<DeviceContext, T, phi::funcs::EnableComplex<T>> {
const DenseTensor* tensor_x,
const DenseTensor* tensor_y,
const DenseTensor* tensor_dout,
const DenseTensor* tensor_ddx,
const DenseTensor* tensor_ddy,
const paddle::optional<DenseTensor>* tensor_ddx_opt,
const paddle::optional<DenseTensor>* tensor_ddy_opt,
DenseTensor* tensor_dx,
DenseTensor* tensor_dy,
DenseTensor* tensor_ddout) {
const DenseTensor* tensor_ddx = tensor_ddx_opt->get_ptr();
const DenseTensor* tensor_ddy = tensor_ddy_opt->get_ptr();
#if defined(__NVCC__) || defined(__HIPCC__)
if (1 == tensor_dout->dims().size()) {
DenseTensor tensor_dout_help;
@@ -232,23 +236,32 @@ struct DotDoubleGradFunction<DeviceContext, T, phi::funcs::EnableComplex<T>> {
if (tensor_dx || tensor_dy) {
tensor_dout_help = Conj<T, DeviceContext>(ctx, *tensor_dout);
}
if (tensor_dx) {
if (tensor_dx && tensor_ddy) {
ctx.template Alloc<T>(tensor_dx);
auto ddy = EigenVector<T>::Flatten(*tensor_ddy);
Eigen::DSizes<int, 1> size(tensor_ddy->numel());
auto dx = EigenVector<T>::Flatten(*tensor_dx);
auto dout = EigenVector<T>::Flatten(tensor_dout_help);
dx.device(dev) = ddy * dout.broadcast(size);
} else if (tensor_dx && !tensor_ddy) {
FullLikeKernel<T, DeviceContext>(
ctx, *tensor_x, Scalar(T(0.0, 0.0)), tensor_x->dtype(), tensor_dx);
}
if (tensor_dy) {
if (tensor_dy && tensor_ddx) {
ctx.template Alloc<T>(tensor_dy);
auto ddx = EigenVector<T>::Flatten(*tensor_ddx);
Eigen::DSizes<int, 1> size(tensor_ddx->numel());
auto dy = EigenVector<T>::Flatten(*tensor_dy);
auto dout = EigenVector<T>::Flatten(tensor_dout_help);
dy.device(dev) = ddx * dout.broadcast(size);
} else if (tensor_dy && !tensor_ddx) {
FullLikeKernel<T, DeviceContext>(
ctx, *tensor_y, Scalar(T(0.0, 0.0)), tensor_y->dtype(), tensor_dy);
}
if (tensor_ddout) {
if (tensor_ddout && tensor_ddx && tensor_ddy) {
ctx.template Alloc<T>(tensor_ddout);
DenseTensor tensor_x_help = Conj<T, DeviceContext>(ctx, *tensor_x);
DenseTensor tensor_y_help = Conj<T, DeviceContext>(ctx, *tensor_y);
@@ -258,12 +271,28 @@ struct DotDoubleGradFunction<DeviceContext, T, phi::funcs::EnableComplex<T>> {
auto ddy = EigenVector<T>::Flatten(*tensor_ddy);
auto ddout = EigenVector<T>::Flatten(*tensor_ddout);
ddout.device(dev) = (x * ddy + y * ddx).sum();
} else if (tensor_ddout && tensor_ddx && !tensor_ddy) {
ctx.template Alloc<T>(tensor_ddout);
DenseTensor tensor_y_help = Conj<T, DeviceContext>(ctx, *tensor_y);
auto y = EigenVector<T>::Flatten(tensor_y_help);
auto ddx = EigenVector<T>::Flatten(*tensor_ddx);
auto ddout = EigenVector<T>::Flatten(*tensor_ddout);
ddout.device(dev) = (y * ddx).sum();
} else if (tensor_ddout && !tensor_ddx && tensor_ddy) {
ctx.template Alloc<T>(tensor_ddout);
DenseTensor tensor_x_help = Conj<T, DeviceContext>(ctx, *tensor_x);
auto x = EigenVector<T>::Flatten(tensor_x_help);
auto ddy = EigenVector<T>::Flatten(*tensor_ddy);
auto ddout = EigenVector<T>::Flatten(*tensor_ddout);
ddout.device(dev) = (x * ddy).sum();
}
}
#else
const auto* data_dout = tensor_dout->data<T>();
if (tensor_dx) {
if (tensor_dx && tensor_ddy) {
auto* data_dx = ctx.template Alloc<T>(tensor_dx);
const auto* data_ddy = tensor_ddy->data<T>();
const DDim& dim = tensor_dx->dims();
@@ -276,9 +305,12 @@ struct DotDoubleGradFunction<DeviceContext, T, phi::funcs::EnableComplex<T>> {
if (0 == i % step) ++s;
data_dx[i] = T(data_dout[s].real, -data_dout[s].imag) * data_ddy[i];
}
} else if (tensor_dx && !tensor_ddy) {
FullLikeKernel<T, DeviceContext>(
ctx, *tensor_x, Scalar(T(0.0, 0.0)), tensor_x->dtype(), tensor_dx);
}
if (tensor_dy) {
if (tensor_dy && tensor_ddx) {
auto* data_dy = ctx.template Alloc<T>(tensor_dy);
const auto* data_ddx = tensor_ddx->data<T>();
const DDim& dim = tensor_dy->dims();
@@ -291,9 +323,12 @@ struct DotDoubleGradFunction<DeviceContext, T, phi::funcs::EnableComplex<T>> {
if (0 == i % step) ++s;
data_dy[i] = T(data_dout[s].real, -data_dout[s].imag) * data_ddx[i];
}
} else if (tensor_dy && !tensor_ddx) {
FullLikeKernel<T, DeviceContext>(
ctx, *tensor_y, Scalar(T(0.0, 0.0)), tensor_y->dtype(), tensor_dy);
}
if (tensor_ddout) {
if (tensor_ddout && tensor_ddx && tensor_ddy) {
auto* data_ddout = ctx.template Alloc<T>(tensor_ddout);
auto* data_x = tensor_x->data<T>();
auto* data_y = tensor_y->data<T>();
@@ -320,6 +355,52 @@ struct DotDoubleGradFunction<DeviceContext, T, phi::funcs::EnableComplex<T>> {
}
new_s = false;
}
} else if (tensor_ddout && tensor_ddx && !tensor_ddy) {
auto* data_ddout = ctx.template Alloc<T>(tensor_ddout);
auto* data_y = tensor_y->data<T>();
auto* data_ddx = tensor_ddx->data<T>();
const DDim& dim = tensor_dy->dims();
size_t N = static_cast<size_t>(product(dim));
auto step = dim[dim.size() - 1];
int s = -1;
bool new_s = false;
for (size_t i = 0; i < N; ++i) {
if (0 == i % step) {
++s;
new_s = true;
}
if (new_s) {
data_ddout[s] = T(data_y[i].real, -data_y[i].imag) * data_ddx[i];
} else {
data_ddout[s] += T(data_y[i].real, -data_y[i].imag) * data_ddx[i];
}
new_s = false;
}
} else if (tensor_ddout && !tensor_ddx && tensor_ddy) {
auto* data_ddout = ctx.template Alloc<T>(tensor_ddout);
auto* data_x = tensor_x->data<T>();
auto* data_ddy = tensor_ddy->data<T>();
const DDim& dim = tensor_dx->dims();
size_t N = static_cast<size_t>(product(dim));
auto step = dim[dim.size() - 1];
int s = -1;
bool new_s = false;
for (size_t i = 0; i < N; ++i) {
if (0 == i % step) {
++s;
new_s = true;
}
if (new_s) {
data_ddout[s] = T(data_x[i].real, -data_x[i].imag) * data_ddy[i];
} else {
data_ddout[s] += T(data_x[i].real, -data_x[i].imag) * data_ddy[i];
}
new_s = false;
}
}
#endif
}
@@ -331,88 +412,102 @@ struct DotDoubleGradFunction<DeviceContext, T, phi::funcs::DisableComplex<T>> {
const DenseTensor* tensor_x,
const DenseTensor* tensor_y,
const DenseTensor* tensor_dout,
const DenseTensor* tensor_ddx,
const DenseTensor* tensor_ddy,
const paddle::optional<DenseTensor>* tensor_ddx_opt,
const paddle::optional<DenseTensor>* tensor_ddy_opt,
DenseTensor* tensor_dx,
DenseTensor* tensor_dy,
DenseTensor* tensor_ddout) {
const DenseTensor* tensor_ddx = tensor_ddx_opt->get_ptr();
const DenseTensor* tensor_ddy = tensor_ddy_opt->get_ptr();
#if defined(__NVCC__) || defined(__HIPCC__)
if (1 == tensor_dout->dims().size()) {
auto& dev = *ctx.eigen_device();
auto x = EigenVector<T>::Flatten(*tensor_x);
auto y = EigenVector<T>::Flatten(*tensor_y);
auto dout = EigenVector<T>::Flatten(*tensor_dout);
if (tensor_dx) {
if (tensor_dx && tensor_ddy) {
ctx.template Alloc<T>(tensor_dx);
auto ddy = EigenVector<T>::Flatten(*tensor_ddy);
Eigen::DSizes<int, 1> size(tensor_ddy->numel());
auto dx = EigenVector<T>::Flatten(*tensor_dx);
dx.device(dev) = ddy * dout.broadcast(size);
} else if (tensor_dx && !tensor_ddy) {
FullLikeKernel<T, DeviceContext>(
ctx, *tensor_x, Scalar(0.0), tensor_x->dtype(), tensor_dx);
}
if (tensor_dy) {
if (tensor_dy && tensor_ddx) {
ctx.template Alloc<T>(tensor_dy);
auto ddx = EigenVector<T>::Flatten(*tensor_ddx);
Eigen::DSizes<int, 1> size(tensor_ddx->numel());
auto dy = EigenVector<T>::Flatten(*tensor_dy);
dy.device(dev) = ddx * dout.broadcast(size);
} else if (tensor_dy && !tensor_ddx) {
FullLikeKernel<T, DeviceContext>(
ctx, *tensor_y, Scalar(0.0), tensor_y->dtype(), tensor_dy);
}
if (tensor_ddout) {
if (tensor_ddout && tensor_ddx && tensor_ddy) {
ctx.template Alloc<T>(tensor_ddout);
auto x = EigenVector<T>::Flatten(*tensor_x);
auto y = EigenVector<T>::Flatten(*tensor_y);
auto ddx = EigenVector<T>::Flatten(*tensor_ddx);
auto ddy = EigenVector<T>::Flatten(*tensor_ddy);
auto ddout = EigenVector<T>::Flatten(*tensor_ddout);
ddout.device(dev) = (x * ddy + y * ddx).sum();
} else if (tensor_ddout && tensor_ddx && !tensor_ddy) {
ctx.template Alloc<T>(tensor_ddout);
auto ddx = EigenVector<T>::Flatten(*tensor_ddx);
auto ddout = EigenVector<T>::Flatten(*tensor_ddout);
ddout.device(dev) = (y * ddx).sum();
} else if (tensor_ddout && !tensor_ddx && tensor_ddy) {
ctx.template Alloc<T>(tensor_ddout);
auto ddy = EigenVector<T>::Flatten(*tensor_ddy);
auto ddout = EigenVector<T>::Flatten(*tensor_ddout);
ddout.device(dev) = (x * ddy).sum();
}
}
#else
const auto* data_dout = tensor_dout->data<T>();
if (tensor_dx) {
const T* data_x = tensor_x->data<T>();
const T* data_y = tensor_y->data<T>();
const T* data_dout = tensor_dout->data<T>();
const T* data_ddx = tensor_ddx ? tensor_ddx->data<T>() : nullptr;
const T* data_ddy = tensor_ddy ? tensor_ddy->data<T>() : nullptr;
if (tensor_dx && tensor_ddy) {
auto* data_dx = ctx.template Alloc<T>(tensor_dx);
const auto* data_ddy = tensor_ddy->data<T>();
const DDim& dim = tensor_dx->dims();
size_t N = static_cast<size_t>(product(dim));
auto step = dim[dim.size() - 1];
int s = -1;
for (size_t i = 0; i < N; ++i) {
if (0 == i % step) ++s;
data_dx[i] = data_dout[s] * data_ddy[i];
}
} else if (tensor_dx && !tensor_ddy) {
FullLikeKernel<T, DeviceContext>(
ctx, *tensor_x, Scalar(0.0), tensor_x->dtype(), tensor_dx);
}
if (tensor_dy) {
if (tensor_dy && tensor_ddx) {
auto* data_dy = ctx.template Alloc<T>(tensor_dy);
const auto* data_ddx = tensor_ddx->data<T>();
const DDim& dim = tensor_dy->dims();
size_t N = static_cast<size_t>(product(dim));
auto step = dim[dim.size() - 1];
int s = -1;
for (size_t i = 0; i < N; ++i) {
if (0 == i % step) ++s;
data_dy[i] = data_dout[s] * data_ddx[i];
}
} else if (tensor_dy) {
FullLikeKernel<T, DeviceContext>(
ctx, *tensor_y, Scalar(0.0), tensor_y->dtype(), tensor_dy);
}
if (tensor_ddout) {
if (tensor_ddout && tensor_ddx && tensor_ddy) {
auto* data_ddout = ctx.template Alloc<T>(tensor_ddout);
auto* data_x = tensor_x->data<T>();
auto* data_y = tensor_y->data<T>();
auto* data_ddx = tensor_ddx->data<T>();
auto* data_ddy = tensor_ddy->data<T>();
const DDim& dim = tensor_dy->dims();
size_t N = static_cast<size_t>(product(dim));
auto step = dim[dim.size() - 1];
int s = -1;
bool new_s = false;
for (size_t i = 0; i < N; ++i) {
if (0 == i % step) {
++s;
@@ -425,6 +520,44 @@ struct DotDoubleGradFunction<DeviceContext, T, phi::funcs::DisableComplex<T>> {
}
new_s = false;
}
} else if (tensor_ddout && tensor_ddx && !tensor_ddy) {
auto* data_ddout = ctx.template Alloc<T>(tensor_ddout);
const DDim& dim = tensor_dy->dims();
size_t N = static_cast<size_t>(product(dim));
auto step = dim[dim.size() - 1];
int s = -1;
bool new_s = false;
for (size_t i = 0; i < N; ++i) {
if (0 == i % step) {
++s;
new_s = true;
}
if (new_s) {
data_ddout[s] = data_y[i] * data_ddx[i];
} else {
data_ddout[s] += data_y[i] * data_ddx[i];
}
new_s = false;
}
} else if (tensor_ddout && !tensor_ddx && tensor_ddy) {
auto* data_ddout = ctx.template Alloc<T>(tensor_ddout);
const DDim& dim = tensor_dx->dims();
size_t N = static_cast<size_t>(product(dim));
auto step = dim[dim.size() - 1];
int s = -1;
bool new_s = false;
for (size_t i = 0; i < N; ++i) {
if (0 == i % step) {
++s;
new_s = true;
}
if (new_s) {
data_ddout[s] = data_x[i] * data_ddy[i];
} else {
data_ddout[s] += data_x[i] * data_ddy[i];
}
new_s = false;
}
}
#endif
}
@@ -435,12 +568,12 @@ struct DotTripleGradFunction {
void operator()(const DeviceContext& ctx,
const DenseTensor* in_tensor_x,
const DenseTensor* in_tensor_y,
const DenseTensor* in_tensor_ddx,
const DenseTensor* in_tensor_ddy,
const DenseTensor* in_tensor_d_dx,
const DenseTensor* in_tensor_d_dy,
const DenseTensor* in_tensor_dout,
const DenseTensor* in_tensor_d_ddout,
const paddle::optional<DenseTensor>* in_tensor_ddx_opt,
const paddle::optional<DenseTensor>* in_tensor_ddy_opt,
const paddle::optional<DenseTensor>* in_tensor_d_dx_opt,
const paddle::optional<DenseTensor>* in_tensor_d_dy_opt,
const paddle::optional<DenseTensor>* in_tensor_d_ddout_opt,
DenseTensor* out_tensor_d_x,
DenseTensor* out_tensor_d_y,
DenseTensor* out_tensor_d_dout,
@@ -455,133 +588,254 @@ struct DotTripleGradFunction<DeviceContext, T, phi::funcs::EnableComplex<T>> {
void operator()(const DeviceContext& ctx,
const DenseTensor* in_tensor_x,
const DenseTensor* in_tensor_y,
const DenseTensor* in_tensor_ddx,
const DenseTensor* in_tensor_ddy,
const DenseTensor* in_tensor_d_dx,
const DenseTensor* in_tensor_d_dy,
const DenseTensor* in_tensor_dout,
const DenseTensor* in_tensor_d_ddout,
const paddle::optional<DenseTensor>* in_tensor_ddx_opt,
const paddle::optional<DenseTensor>* in_tensor_ddy_opt,
const paddle::optional<DenseTensor>* in_tensor_d_dx_opt,
const paddle::optional<DenseTensor>* in_tensor_d_dy_opt,
const paddle::optional<DenseTensor>* in_tensor_d_ddout_opt,
DenseTensor* out_tensor_d_x,
DenseTensor* out_tensor_d_y,
DenseTensor* out_tensor_d_dout,
DenseTensor* out_tensor_d_ddx,
DenseTensor* out_tensor_d_ddy) {
const DenseTensor* in_tensor_ddx = in_tensor_ddx_opt->get_ptr();
const DenseTensor* in_tensor_ddy = in_tensor_ddy_opt->get_ptr();
const DenseTensor* in_tensor_d_dx = in_tensor_d_dx_opt->get_ptr();
const DenseTensor* in_tensor_d_dy = in_tensor_d_dy_opt->get_ptr();
const DenseTensor* in_tensor_d_ddout = in_tensor_d_ddout_opt->get_ptr();
#if defined(__NVCC__) || defined(__HIPCC__)
if (1 == in_tensor_d_ddout->dims().size()) {
DenseTensor in_tensor_d_ddout_help;
if (1 == in_tensor_dout->dims().size()) {
auto& dev = *ctx.eigen_device();
if (out_tensor_d_x || out_tensor_d_y) {
in_tensor_d_ddout_help =
Conj<T, DeviceContext>(ctx, *in_tensor_d_ddout);
DenseTensor in_tensor_x_help = Conj<T, DeviceContext>(ctx, *in_tensor_x);
DenseTensor in_tensor_y_help = Conj<T, DeviceContext>(ctx, *in_tensor_y);
DenseTensor in_tensor_dout_help =
Conj<T, DeviceContext>(ctx, *in_tensor_dout);
DenseTensor in_tensor_ddx_help;
DenseTensor in_tensor_ddy_help;
if (in_tensor_ddx) {
in_tensor_ddx_help = Conj<T, DeviceContext>(ctx, *in_tensor_ddx);
}
if (out_tensor_d_x) {
auto ddy = EigenVector<T>::Flatten(*in_tensor_ddy);
Eigen::DSizes<int, 1> size(in_tensor_ddy->numel());
auto d_x = EigenVector<T>::Flatten(*out_tensor_d_x);
auto d_ddout = EigenVector<T>::Flatten(in_tensor_d_ddout_help);
d_x.device(dev) = ddy * d_ddout.broadcast(size);
if (in_tensor_ddy) {
in_tensor_ddy_help = Conj<T, DeviceContext>(ctx, *in_tensor_ddy);
}
if (out_tensor_d_y) {
auto ddx = EigenVector<T>::Flatten(*in_tensor_ddx);
bool d_dout_flag = false;
bool d_ddx_flag = false;
bool d_ddy_flag = false;
if (in_tensor_ddx) {
if (out_tensor_d_y && in_tensor_d_ddout) {
ctx.template Alloc<T>(out_tensor_d_y);
auto ddx = EigenVector<T>::Flatten(in_tensor_ddx_help);
Eigen::DSizes<int, 1> size(in_tensor_ddx->numel());
auto d_y = EigenVector<T>::Flatten(*out_tensor_d_y);
auto d_ddout = EigenVector<T>::Flatten(in_tensor_d_ddout_help);
auto d_ddout = EigenVector<T>::Flatten(*in_tensor_d_ddout);
d_y.device(dev) = ddx * d_ddout.broadcast(size);
}
if (out_tensor_d_dout) {
DenseTensor in_tensor_ddx_help =
Conj<T, DeviceContext>(ctx, *in_tensor_ddx);
DenseTensor in_tensor_ddy_help =
Conj<T, DeviceContext>(ctx, *in_tensor_ddy);
if (out_tensor_d_dout && in_tensor_d_dy) {
ctx.template Alloc<T>(out_tensor_d_dout);
auto ddx = EigenVector<T>::Flatten(in_tensor_ddx_help);
auto d_dy = EigenVector<T>::Flatten(*in_tensor_d_dy);
auto d_dout = EigenVector<T>::Flatten(*out_tensor_d_dout);
d_dout.device(dev) = (ddx * d_dy).sum();
d_dout_flag = true;
}
}
if (in_tensor_ddy) {
if (out_tensor_d_x && in_tensor_d_ddout) {
ctx.template Alloc<T>(out_tensor_d_x);
auto ddy = EigenVector<T>::Flatten(in_tensor_ddy_help);
Eigen::DSizes<int, 1> size(in_tensor_ddy->numel());
auto d_x = EigenVector<T>::Flatten(*out_tensor_d_x);
auto d_ddout = EigenVector<T>::Flatten(*in_tensor_d_ddout);
d_x.device(dev) = ddy * d_ddout.broadcast(size);
}
if (out_tensor_d_dout && in_tensor_d_dx) {
ctx.template Alloc<T>(out_tensor_d_dout);
auto ddy = EigenVector<T>::Flatten(in_tensor_ddy_help);
auto d_dx = EigenVector<T>::Flatten(*in_tensor_d_dx);
auto d_dy = EigenVector<T>::Flatten(*in_tensor_d_dy);
auto d_dout = EigenVector<T>::Flatten(*out_tensor_d_dout);
d_dout.device(dev) = (ddx * d_dy + ddy * d_dx).sum();
if (d_dout_flag) {
d_dout.device(dev) += (ddy * d_dx).sum();
} else {
d_dout.device(dev) = (ddy * d_dx).sum();
}
}
}
if (out_tensor_d_ddx) {
DenseTensor in_tensor_dout_help =
Conj<T, DeviceContext>(ctx, *in_tensor_dout);
DenseTensor in_tensor_y_help =
Conj<T, DeviceContext>(ctx, *in_tensor_y);
if (in_tensor_d_dx) {
if (out_tensor_d_ddy) {
ctx.template Alloc<T>(out_tensor_d_ddy);
auto dout = EigenVector<T>::Flatten(in_tensor_dout_help);
auto d_dx = EigenVector<T>::Flatten(*in_tensor_d_dx);
auto d_ddy = EigenVector<T>::Flatten(*out_tensor_d_ddy);
Eigen::DSizes<int, 1> size(in_tensor_x->numel());
d_ddy.device(dev) = (dout.broadcast(size) * d_dx);
d_ddy_flag = true;
}
}
if (in_tensor_d_dy) {
if (out_tensor_d_ddx) {
ctx.template Alloc<T>(out_tensor_d_ddx);
auto dout = EigenVector<T>::Flatten(in_tensor_dout_help);
auto y = EigenVector<T>::Flatten(in_tensor_y_help);
auto d_ddout = EigenVector<T>::Flatten(*in_tensor_d_ddout);
auto d_dy = EigenVector<T>::Flatten(*in_tensor_d_dy);
auto d_ddx = EigenVector<T>::Flatten(*out_tensor_d_ddx);
Eigen::DSizes<int, 1> size(in_tensor_y->numel());
d_ddx.device(dev) =
(dout.broadcast(size) * d_dy + y * d_ddout.broadcast(size));
d_ddx.device(dev) = (dout.broadcast(size) * d_dy);
d_ddx_flag = true;
}
}
if (in_tensor_d_ddout) {
if (out_tensor_d_ddx) {
ctx.template Alloc<T>(out_tensor_d_ddx);
auto y = EigenVector<T>::Flatten(in_tensor_y_help);
auto d_ddout = EigenVector<T>::Flatten(*in_tensor_d_ddout);
Eigen::DSizes<int, 1> size(in_tensor_y->numel());
auto d_ddx = EigenVector<T>::Flatten(*out_tensor_d_ddx);
if (d_ddx_flag) {
d_ddx.device(dev) += (y * d_ddout.broadcast(size));
} else {
d_ddx.device(dev) = (y * d_ddout.broadcast(size));
}
}
if (out_tensor_d_ddy) {
DenseTensor in_tensor_dout_help =
Conj<T, DeviceContext>(ctx, *in_tensor_dout);
DenseTensor in_tensor_x_help =
Conj<T, DeviceContext>(ctx, *in_tensor_x);
auto dout = EigenVector<T>::Flatten(in_tensor_dout_help);
ctx.template Alloc<T>(out_tensor_d_ddy);
auto x = EigenVector<T>::Flatten(in_tensor_x_help);
auto d_ddout = EigenVector<T>::Flatten(*in_tensor_d_ddout);
auto d_dx = EigenVector<T>::Flatten(*in_tensor_d_dx);
auto d_ddy = EigenVector<T>::Flatten(*out_tensor_d_ddy);
Eigen::DSizes<int, 1> size(in_tensor_x->numel());
d_ddy.device(dev) =
(dout.broadcast(size) * d_dx + x * d_ddout.broadcast(size));
auto d_ddy = EigenVector<T>::Flatten(*out_tensor_d_ddy);
if (d_ddy_flag) {
d_ddy.device(dev) += (x * d_ddout.broadcast(size));
} else {
d_ddy.device(dev) = (x * d_ddout.broadcast(size));
}
}
}
if (out_tensor_d_x && !out_tensor_d_x->IsInitialized()) {
FullLikeKernel<T, DeviceContext>(ctx,
*in_tensor_x,
Scalar(T(0.0, 0.0)),
in_tensor_x->dtype(),
out_tensor_d_x);
}
if (out_tensor_d_y && !out_tensor_d_y->IsInitialized()) {
FullLikeKernel<T, DeviceContext>(ctx,
*in_tensor_y,
Scalar(T(0.0, 0.0)),
in_tensor_y->dtype(),
out_tensor_d_y);
}
if (out_tensor_d_dout && !out_tensor_d_dout->IsInitialized()) {
FullLikeKernel<T, DeviceContext>(ctx,
*in_tensor_dout,
Scalar(T(0.0, 0.0)),
in_tensor_dout->dtype(),
out_tensor_d_dout);
}
if (out_tensor_d_ddx && !out_tensor_d_ddx->IsInitialized()) {
FullLikeKernel<T, DeviceContext>(ctx,
*in_tensor_x,
Scalar(T(0.0, 0.0)),
in_tensor_x->dtype(),
out_tensor_d_ddx);
}
if (out_tensor_d_ddy && !out_tensor_d_ddy->IsInitialized()) {
FullLikeKernel<T, DeviceContext>(ctx,
*in_tensor_y,
Scalar(T(0.0, 0.0)),
in_tensor_y->dtype(),
out_tensor_d_ddy);
}
}
#else
const auto* data_d_ddout = in_tensor_d_ddout->data<T>();
if (out_tensor_d_x) {
auto* data_d_x = ctx.template Alloc<T>(out_tensor_d_x);
const auto* data_ddy = in_tensor_ddy->data<T>();
const DDim& dim = out_tensor_d_x->dims();
const T* data_x = in_tensor_x->data<T>();
const T* data_y = in_tensor_y->data<T>();
const T* data_dout = in_tensor_dout->data<T>();
const T* data_ddx = in_tensor_ddx ? in_tensor_ddx->data<T>() : nullptr;
const T* data_ddy = in_tensor_ddy ? in_tensor_ddy->data<T>() : nullptr;
const T* data_d_dx = in_tensor_d_dx ? in_tensor_d_dx->data<T>() : nullptr;
const T* data_d_dy = in_tensor_d_dy ? in_tensor_d_dy->data<T>() : nullptr;
const T* data_d_ddout =
in_tensor_d_ddout ? in_tensor_d_ddout->data<T>() : nullptr;
bool d_dout_flag = false;
bool d_ddx_flag = false;
bool d_ddy_flag = false;
if (data_ddx) {
if (out_tensor_d_y && data_d_ddout) {
auto* data_d_y = ctx.template Alloc<T>(out_tensor_d_y);
const DDim& dim = out_tensor_d_y->dims();
size_t N = static_cast<size_t>(product(dim));
auto step = dim[dim.size() - 1];
int s = -1;
for (size_t i = 0; i < N; ++i) {
if (0 == i % step) ++s;
data_d_x[i] = T(data_ddy[i].real, -data_ddy[i].imag) * data_d_ddout[s];
data_d_y[i] =
T(data_ddx[i].real, -data_ddx[i].imag) * data_d_ddout[s];
}
}
if (out_tensor_d_y) {
auto* data_d_y = ctx.template Alloc<T>(out_tensor_d_y);
const auto* data_ddx = in_tensor_ddx->data<T>();
if (out_tensor_d_dout && data_d_dy) {
auto* data_d_dout = ctx.template Alloc<T>(out_tensor_d_dout);
const DDim& dim = in_tensor_x->dims();
size_t N = static_cast<size_t>(product(dim));
auto step = dim[dim.size() - 1];
int s = -1;
bool new_s = false;
for (size_t i = 0; i < N; ++i) {
if (0 == i % step) {
++s;
new_s = true;
}
if (new_s) {
data_d_dout[s] =
T(data_ddx[i].real, -data_ddx[i].imag) * data_d_dy[i];
} else {
data_d_dout[s] +=
T(data_ddx[i].real, -data_ddx[i].imag) * data_d_dy[i];
}
new_s = false;
}
d_dout_flag = true;
}
}
const DDim& dim = out_tensor_d_y->dims();
if (data_ddy) {
if (out_tensor_d_x && data_d_ddout) {
auto* data_d_x = ctx.template Alloc<T>(out_tensor_d_x);
const DDim& dim = out_tensor_d_x->dims();
size_t N = static_cast<size_t>(product(dim));
auto step = dim[dim.size() - 1];
int s = -1;
for (size_t i = 0; i < N; ++i) {
if (0 == i % step) ++s;
data_d_y[i] = T(data_ddx[i].real, -data_ddx[i].imag) * data_d_ddout[s];
data_d_x[i] =
T(data_ddy[i].real, -data_ddy[i].imag) * data_d_ddout[s];
}
}
if (out_tensor_d_dout) {
if (out_tensor_d_dout && data_d_dx) {
auto* data_d_dout = ctx.template Alloc<T>(out_tensor_d_dout);
auto* data_ddx = in_tensor_ddx->data<T>();
auto* data_ddy = in_tensor_ddy->data<T>();
auto* data_d_dx = in_tensor_d_dx->data<T>();
auto* data_d_dy = in_tensor_d_dy->data<T>();
const DDim& dim = out_tensor_d_dout->dims();
const DDim& dim = in_tensor_x->dims();
size_t N = static_cast<size_t>(product(dim));
auto step = dim[dim.size() - 1];
int s = -1;
bool new_s = false;
if (d_dout_flag) {
for (size_t i = 0; i < N; ++i) {
if (0 == i % step) {
++s;
}
data_d_dout[s] +=
T(data_ddy[i].real, -data_ddy[i].imag) * data_d_dx[i];
}
} else {
for (size_t i = 0; i < N; ++i) {
if (0 == i % step) {
++s;
@@ -589,56 +843,128 @@ struct DotTripleGradFunction<DeviceContext, T, phi::funcs::EnableComplex<T>> {
}
if (new_s) {
data_d_dout[s] =
T(data_ddy[i].real, -data_ddy[i].imag) * data_d_dx[i] +
T(data_ddx[i].real, -data_ddx[i].imag) * data_d_dy[i];
T(data_ddy[i].real, -data_ddy[i].imag) * data_d_dx[i];
} else {
data_d_dout[s] +=
T(data_ddy[i].real, -data_ddy[i].imag) * data_d_dx[i] +
T(data_ddx[i].real, -data_ddx[i].imag) * data_d_dy[i];
T(data_ddy[i].real, -data_ddy[i].imag) * data_d_dx[i];
}
new_s = false;
}
}
}
}
if (data_d_dx) {
if (out_tensor_d_ddy) {
auto* data_d_ddy = ctx.template Alloc<T>(out_tensor_d_ddy);
const DDim& dim = out_tensor_d_ddy->dims();
size_t N = static_cast<size_t>(product(dim));
auto step = dim[dim.size() - 1];
int s = -1;
for (size_t i = 0; i < N; ++i) {
if (0 == i % step) ++s;
data_d_ddy[i] =
T(data_dout[s].real, -data_dout[s].imag) * data_d_dx[i];
}
d_ddy_flag = true;
}
}
if (data_d_dy) {
if (out_tensor_d_ddx) {
auto* data_d_ddx = ctx.template Alloc<T>(out_tensor_d_ddx);
auto* data_dout = in_tensor_dout->data<T>();
auto* data_d_dy = in_tensor_d_dy->data<T>();
auto* data_y = in_tensor_y->data<T>();
auto* data_d_ddout = in_tensor_d_ddout->data<T>();
const DDim& dim = out_tensor_d_ddx->dims();
size_t N = static_cast<size_t>(product(dim));
auto step = dim[dim.size() - 1];
int s = -1;
for (size_t i = 0; i < N; ++i) {
if (0 == i % step) ++s;
data_d_ddx[i] =
T(data_dout[s].real, -data_dout[s].imag) * data_d_dy[i];
}
}
d_ddx_flag = true;
}
if (data_d_ddout) {
if (out_tensor_d_ddx) {
auto* data_d_ddx = ctx.template Alloc<T>(out_tensor_d_ddx);
const DDim& dim = out_tensor_d_ddx->dims();
size_t N = static_cast<size_t>(product(dim));
auto step = dim[dim.size() - 1];
int s = -1;
if (d_ddx_flag) {
for (size_t i = 0; i < N; ++i) {
if (0 == i % step) ++s;
data_d_ddx[i] +=
T(data_y[i].real, -data_y[i].imag) * data_d_ddout[s];
}
} else {
for (size_t i = 0; i < N; ++i) {
if (0 == i % step) ++s;
data_d_ddx[i] =
T(data_dout[s].real, -data_dout[s].imag) * data_d_dy[i] +
T(data_y[i].real, -data_y[i].imag) * data_d_ddout[s];
}
}
}
if (out_tensor_d_ddy) {
auto* data_d_ddy = ctx.template Alloc<T>(out_tensor_d_ddy);
auto* data_dout = in_tensor_dout->data<T>();
auto* data_d_dx = in_tensor_d_dx->data<T>();
auto* data_x = in_tensor_x->data<T>();
auto* data_d_ddout = in_tensor_d_ddout->data<T>();
const DDim& dim = out_tensor_d_ddy->dims();
size_t N = static_cast<size_t>(product(dim));
auto step = dim[dim.size() - 1];
int s = -1;
if (d_ddy_flag) {
for (size_t i = 0; i < N; ++i) {
if (0 == i % step) ++s;
data_d_ddy[i] +=
T(data_x[i].real, -data_x[i].imag) * data_d_ddout[s];
}
} else {
for (size_t i = 0; i < N; ++i) {
if (0 == i % step) ++s;
data_d_ddy[i] =
T(data_dout[s].real, -data_dout[s].imag) * data_d_dx[i] +
T(data_x[i].real, -data_x[i].imag) * data_d_ddout[s];
}
}
}
}
if (out_tensor_d_x && !out_tensor_d_x->IsInitialized()) {
FullLikeKernel<T, DeviceContext>(ctx,
*in_tensor_x,
Scalar(T(0.0, 0.0)),
in_tensor_x->dtype(),
out_tensor_d_x);
}
if (out_tensor_d_y && !out_tensor_d_y->IsInitialized()) {
FullLikeKernel<T, DeviceContext>(ctx,
*in_tensor_y,
Scalar(T(0.0, 0.0)),
in_tensor_y->dtype(),
out_tensor_d_y);
}
if (out_tensor_d_dout && !out_tensor_d_dout->IsInitialized()) {
FullLikeKernel<T, DeviceContext>(ctx,
*in_tensor_dout,
Scalar(T(0.0, 0.0)),
in_tensor_dout->dtype(),
out_tensor_d_dout);
}
if (out_tensor_d_ddx && !out_tensor_d_ddx->IsInitialized()) {
FullLikeKernel<T, DeviceContext>(ctx,
*in_tensor_x,
Scalar(T(0.0, 0.0)),
in_tensor_x->dtype(),
out_tensor_d_ddx);
}
if (out_tensor_d_ddy && !out_tensor_d_ddy->IsInitialized()) {
FullLikeKernel<T, DeviceContext>(ctx,
*in_tensor_y,
Scalar(T(0.0, 0.0)),
in_tensor_y->dtype(),
out_tensor_d_ddy);
}
#endif
}
};
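Note on the complex (EnableComplex) specializations above: every held-constant factor enters conjugated — the code either calls Conj<T, DeviceContext> up front or builds T(v.real, -v.imag) inline — so each identity from the real case picks up a conjugate on the constant factor. For the double grad, for example:

\begin{aligned}
dx_i &= \overline{dout} \, ddy_i, \qquad dy_i = \overline{dout} \, ddx_i, \qquad
ddout = \sum_i \left( \overline{x_i} \, ddy_i + \overline{y_i} \, ddx_i \right).
\end{aligned}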
@@ -648,170 +974,348 @@ struct DotTripleGradFunction<DeviceContext, T, phi::funcs::DisableComplex<T>> {
void operator()(const DeviceContext& ctx,
const DenseTensor* in_tensor_x,
const DenseTensor* in_tensor_y,
const DenseTensor* in_tensor_ddx,
const DenseTensor* in_tensor_ddy,
const DenseTensor* in_tensor_d_dx,
const DenseTensor* in_tensor_d_dy,
const DenseTensor* in_tensor_dout,
const DenseTensor* in_tensor_d_ddout,
const paddle::optional<DenseTensor>* in_tensor_ddx_opt,
const paddle::optional<DenseTensor>* in_tensor_ddy_opt,
const paddle::optional<DenseTensor>* in_tensor_d_dx_opt,
const paddle::optional<DenseTensor>* in_tensor_d_dy_opt,
const paddle::optional<DenseTensor>* in_tensor_d_ddout_opt,
DenseTensor* out_tensor_d_x,
DenseTensor* out_tensor_d_y,
DenseTensor* out_tensor_d_dout,
DenseTensor* out_tensor_d_ddx,
DenseTensor* out_tensor_d_ddy) {
const DenseTensor* in_tensor_ddx = in_tensor_ddx_opt->get_ptr();
const DenseTensor* in_tensor_ddy = in_tensor_ddy_opt->get_ptr();
const DenseTensor* in_tensor_d_dx = in_tensor_d_dx_opt->get_ptr();
const DenseTensor* in_tensor_d_dy = in_tensor_d_dy_opt->get_ptr();
const DenseTensor* in_tensor_d_ddout = in_tensor_d_ddout_opt->get_ptr();
#if defined(__NVCC__) || defined(__HIPCC__)
if (1 == in_tensor_d_ddout->dims().size()) {
if (1 == in_tensor_dout->dims().size()) {
auto& dev = *ctx.eigen_device();
auto d_ddout = EigenVector<T>::Flatten(*in_tensor_d_ddout);
if (out_tensor_d_x) {
ctx.template Alloc<T>(out_tensor_d_x);
auto ddy = EigenVector<T>::Flatten(*in_tensor_ddy);
Eigen::DSizes<int, 1> size(in_tensor_ddy->numel());
auto d_x = EigenVector<T>::Flatten(*out_tensor_d_x);
d_x.device(dev) = ddy * d_ddout.broadcast(size);
}
bool d_dout_flag = false;
bool d_ddx_flag = false;
bool d_ddy_flag = false;
if (out_tensor_d_y) {
if (in_tensor_ddx) {
if (out_tensor_d_y && in_tensor_d_ddout) {
ctx.template Alloc<T>(out_tensor_d_y);
auto ddx = EigenVector<T>::Flatten(*in_tensor_ddx);
Eigen::DSizes<int, 1> size(in_tensor_ddx->numel());
auto d_y = EigenVector<T>::Flatten(*out_tensor_d_y);
auto d_ddout = EigenVector<T>::Flatten(*in_tensor_d_ddout);
d_y.device(dev) = ddx * d_ddout.broadcast(size);
}
if (out_tensor_d_dout) {
if (out_tensor_d_dout && in_tensor_d_dy) {
ctx.template Alloc<T>(out_tensor_d_dout);
auto ddx = EigenVector<T>::Flatten(*in_tensor_ddx);
auto d_dy = EigenVector<T>::Flatten(*in_tensor_d_dy);
auto d_dout = EigenVector<T>::Flatten(*out_tensor_d_dout);
d_dout.device(dev) = (ddx * d_dy).sum();
d_dout_flag = true;
}
}
if (in_tensor_ddy) {
if (out_tensor_d_x && in_tensor_d_ddout) {
ctx.template Alloc<T>(out_tensor_d_x);
auto ddy = EigenVector<T>::Flatten(*in_tensor_ddy);
Eigen::DSizes<int, 1> size(in_tensor_ddy->numel());
auto d_x = EigenVector<T>::Flatten(*out_tensor_d_x);
auto d_ddout = EigenVector<T>::Flatten(*in_tensor_d_ddout);
d_x.device(dev) = ddy * d_ddout.broadcast(size);
}
if (out_tensor_d_dout && in_tensor_d_dx) {
ctx.template Alloc<T>(out_tensor_d_dout);
auto ddy = EigenVector<T>::Flatten(*in_tensor_ddy);
auto d_dx = EigenVector<T>::Flatten(*in_tensor_d_dx);
auto d_dy = EigenVector<T>::Flatten(*in_tensor_d_dy);
auto d_dout = EigenVector<T>::Flatten(*out_tensor_d_dout);
d_dout.device(dev) = (ddx * d_dy + ddy * d_dx).sum();
if (d_dout_flag) {
d_dout.device(dev) += (ddy * d_dx).sum();
} else {
d_dout.device(dev) = (ddy * d_dx).sum();
}
}
}
if (in_tensor_d_dx) {
if (out_tensor_d_ddy) {
ctx.template Alloc<T>(out_tensor_d_ddy);
auto dout = EigenVector<T>::Flatten(*in_tensor_dout);
auto d_dx = EigenVector<T>::Flatten(*in_tensor_d_dx);
auto d_ddy = EigenVector<T>::Flatten(*out_tensor_d_ddy);
Eigen::DSizes<int, 1> size(in_tensor_x->numel());
d_ddy.device(dev) = (dout.broadcast(size) * d_dx);
d_ddy_flag = true;
}
}
if (in_tensor_d_dy) {
if (out_tensor_d_ddx) {
ctx.template Alloc<T>(out_tensor_d_ddx);
auto dout = EigenVector<T>::Flatten(*in_tensor_dout);
auto y = EigenVector<T>::Flatten(*in_tensor_y);
auto d_ddout = EigenVector<T>::Flatten(*in_tensor_d_ddout);
auto d_dy = EigenVector<T>::Flatten(*in_tensor_d_dy);
auto d_ddx = EigenVector<T>::Flatten(*out_tensor_d_ddx);
Eigen::DSizes<int, 1> size(in_tensor_y->numel());
d_ddx.device(dev) =
(dout.broadcast(size) * d_dy + y * d_ddout.broadcast(size));
d_ddx.device(dev) = (dout.broadcast(size) * d_dy);
d_ddx_flag = true;
}
}
if (in_tensor_d_ddout) {
if (out_tensor_d_ddx) {
ctx.template Alloc<T>(out_tensor_d_ddx);
auto y = EigenVector<T>::Flatten(*in_tensor_y);
auto d_ddout = EigenVector<T>::Flatten(*in_tensor_d_ddout);
Eigen::DSizes<int, 1> size(in_tensor_y->numel());
auto d_ddx = EigenVector<T>::Flatten(*out_tensor_d_ddx);
if (d_ddx_flag) {
d_ddx.device(dev) += (y * d_ddout.broadcast(size));
} else {
d_ddx.device(dev) = (y * d_ddout.broadcast(size));
}
}
if (out_tensor_d_ddy) {
ctx.template Alloc<T>(out_tensor_d_ddy);
auto dout = EigenVector<T>::Flatten(*in_tensor_dout);
auto x = EigenVector<T>::Flatten(*in_tensor_x);
auto d_ddout = EigenVector<T>::Flatten(*in_tensor_d_ddout);
auto d_dx = EigenVector<T>::Flatten(*in_tensor_d_dx);
auto d_ddy = EigenVector<T>::Flatten(*out_tensor_d_ddy);
Eigen::DSizes<int, 1> size(in_tensor_x->numel());
d_ddy.device(dev) =
(dout.broadcast(size) * d_dx + x * d_ddout.broadcast(size));
auto d_ddy = EigenVector<T>::Flatten(*out_tensor_d_ddy);
if (d_ddy_flag) {
d_ddy.device(dev) += (x * d_ddout.broadcast(size));
} else {
d_ddy.device(dev) = (x * d_ddout.broadcast(size));
}
}
}
if (out_tensor_d_x && !out_tensor_d_x->IsInitialized()) {
FullLikeKernel<T, DeviceContext>(ctx,
*in_tensor_x,
Scalar(0.0),
in_tensor_x->dtype(),
out_tensor_d_x);
}
if (out_tensor_d_y && !out_tensor_d_y->IsInitialized()) {
FullLikeKernel<T, DeviceContext>(ctx,
*in_tensor_y,
Scalar(0.0),
in_tensor_y->dtype(),
out_tensor_d_y);
}
if (out_tensor_d_dout && !out_tensor_d_dout->IsInitialized()) {
FullLikeKernel<T, DeviceContext>(ctx,
*in_tensor_dout,
Scalar(0.0),
in_tensor_dout->dtype(),
out_tensor_d_dout);
}
if (out_tensor_d_ddx && !out_tensor_d_ddx->IsInitialized()) {
FullLikeKernel<T, DeviceContext>(ctx,
*in_tensor_x,
Scalar(0.0),
in_tensor_x->dtype(),
out_tensor_d_ddx);
}
if (out_tensor_d_ddy && !out_tensor_d_ddy->IsInitialized()) {
FullLikeKernel<T, DeviceContext>(ctx,
*in_tensor_y,
Scalar(0.0),
in_tensor_y->dtype(),
out_tensor_d_ddy);
}
}
#else
const auto* data_d_ddout = in_tensor_d_ddout->data<T>();
if (out_tensor_d_x) {
auto* data_d_x = ctx.template Alloc<T>(out_tensor_d_x);
const auto* data_ddy = in_tensor_ddy->data<T>();
const DDim& dim = out_tensor_d_x->dims();
const T* data_x = in_tensor_x->data<T>();
const T* data_y = in_tensor_y->data<T>();
const T* data_dout = in_tensor_dout->data<T>();
const T* data_ddx = in_tensor_ddx ? in_tensor_ddx->data<T>() : nullptr;
const T* data_ddy = in_tensor_ddy ? in_tensor_ddy->data<T>() : nullptr;
const T* data_d_dx = in_tensor_d_dx ? in_tensor_d_dx->data<T>() : nullptr;
const T* data_d_dy = in_tensor_d_dy ? in_tensor_d_dy->data<T>() : nullptr;
const T* data_d_ddout =
in_tensor_d_ddout ? in_tensor_d_ddout->data<T>() : nullptr;
bool d_dout_flag = false;
bool d_ddx_flag = false;
bool d_ddy_flag = false;
if (data_ddx) {
if (out_tensor_d_y && data_d_ddout) {
auto* data_d_y = ctx.template Alloc<T>(out_tensor_d_y);
const DDim& dim = out_tensor_d_y->dims();
size_t N = static_cast<size_t>(product(dim));
auto step = dim[dim.size() - 1];
int s = -1;
for (size_t i = 0; i < N; ++i) {
if (0 == i % step) ++s;
data_d_x[i] = data_ddy[i] * data_d_ddout[s];
data_d_y[i] = data_ddx[i] * data_d_ddout[s];
}
}
if (out_tensor_d_y) {
auto* data_d_y = ctx.template Alloc<T>(out_tensor_d_y);
const auto* data_ddx = in_tensor_ddx->data<T>();
const DDim& dim = out_tensor_d_y->dims();
if (out_tensor_d_dout && data_d_dy) {
auto* data_d_dout = ctx.template Alloc<T>(out_tensor_d_dout);
const DDim& dim = in_tensor_x->dims();
size_t N = static_cast<size_t>(product(dim));
auto step = dim[dim.size() - 1];
int s = -1;
bool new_s = false;
for (size_t i = 0; i < N; ++i) {
if (0 == i % step) {
++s;
new_s = true;
}
if (new_s) {
data_d_dout[s] = data_ddx[i] * data_d_dy[i];
} else {
data_d_dout[s] += data_ddx[i] * data_d_dy[i];
}
new_s = false;
}
d_dout_flag = true;
}
}
if (data_ddy) {
if (out_tensor_d_x && data_d_ddout) {
auto* data_d_x = ctx.template Alloc<T>(out_tensor_d_x);
const DDim& dim = out_tensor_d_x->dims();
size_t N = static_cast<size_t>(product(dim));
auto step = dim[dim.size() - 1];
int s = -1;
for (size_t i = 0; i < N; ++i) {
if (0 == i % step) ++s;
data_d_y[i] = data_ddx[i] * data_d_ddout[s];
data_d_x[i] = data_ddy[i] * data_d_ddout[s];
}
}
if (out_tensor_d_dout) {
if (out_tensor_d_dout && data_d_dx) {
auto* data_d_dout = ctx.template Alloc<T>(out_tensor_d_dout);
auto* data_ddx = in_tensor_ddx->data<T>();
auto* data_ddy = in_tensor_ddy->data<T>();
auto* data_d_dx = in_tensor_d_dx->data<T>();
auto* data_d_dy = in_tensor_d_dy->data<T>();
const DDim& dim = in_tensor_ddx->dims();
const DDim& dim = in_tensor_x->dims();
size_t N = static_cast<size_t>(product(dim));
auto step = dim[dim.size() - 1];
int s = -1;
bool new_s = false;
if (d_dout_flag) {
for (size_t i = 0; i < N; ++i) {
if (0 == i % step) {
++s;
}
data_d_dout[s] += data_ddy[i] * data_d_dx[i];
}
} else {
for (size_t i = 0; i < N; ++i) {
if (0 == i % step) {
++s;
new_s = true;
}
if (new_s) {
data_d_dout[s] =
data_ddy[i] * data_d_dx[i] + data_ddx[i] * data_d_dy[i];
data_d_dout[s] = data_ddy[i] * data_d_dx[i];
} else {
data_d_dout[s] +=
data_ddy[i] * data_d_dx[i] + data_ddx[i] * data_d_dy[i];
data_d_dout[s] += data_ddy[i] * data_d_dx[i];
}
new_s = false;
}
}
}
}
if (data_d_dx) {
if (out_tensor_d_ddy) {
auto* data_d_ddy = ctx.template Alloc<T>(out_tensor_d_ddy);
const DDim& dim = out_tensor_d_ddy->dims();
size_t N = static_cast<size_t>(product(dim));
auto step = dim[dim.size() - 1];
int s = -1;
for (size_t i = 0; i < N; ++i) {
if (0 == i % step) ++s;
data_d_ddy[i] = data_dout[s] * data_d_dx[i];
}
d_ddy_flag = true;
}
}
if (data_d_dy) {
if (out_tensor_d_ddx) {
auto* data_d_ddx = ctx.template Alloc<T>(out_tensor_d_ddx);
auto* data_dout = in_tensor_dout->data<T>();
auto* data_d_dy = in_tensor_d_dy->data<T>();
auto* data_y = in_tensor_y->data<T>();
auto* data_d_ddout = in_tensor_d_ddout->data<T>();
const DDim& dim = out_tensor_d_ddx->dims();
size_t N = static_cast<size_t>(product(dim));
auto step = dim[dim.size() - 1];
int s = -1;
for (size_t i = 0; i < N; ++i) {
if (0 == i % step) ++s;
data_d_ddx[i] =
data_dout[s] * data_d_dy[i] + data_y[i] * data_d_ddout[s];
data_d_ddx[i] = data_dout[s] * data_d_dy[i];
}
}
d_ddx_flag = true;
}
if (data_d_ddout) {
if (out_tensor_d_ddx) {
auto* data_d_ddx = ctx.template Alloc<T>(out_tensor_d_ddx);
const DDim& dim = out_tensor_d_ddx->dims();
size_t N = static_cast<size_t>(product(dim));
auto step = dim[dim.size() - 1];
int s = -1;
if (d_ddx_flag) {
for (size_t i = 0; i < N; ++i) {
if (0 == i % step) ++s;
data_d_ddx[i] += data_y[i] * data_d_ddout[s];
}
} else {
for (size_t i = 0; i < N; ++i) {
if (0 == i % step) ++s;
data_d_ddx[i] = data_y[i] * data_d_ddout[s];
}
}
}
if (out_tensor_d_ddy) {
auto* data_d_ddy = ctx.template Alloc<T>(out_tensor_d_ddy);
auto* data_dout = in_tensor_dout->data<T>();
auto* data_d_dx = in_tensor_d_dx->data<T>();
auto* data_x = in_tensor_x->data<T>();
auto* data_d_ddout = in_tensor_d_ddout->data<T>();
const DDim& dim = out_tensor_d_ddy->dims();
size_t N = static_cast<size_t>(product(dim));
auto step = dim[dim.size() - 1];
int s = -1;
if (d_ddy_flag) {
for (size_t i = 0; i < N; ++i) {
if (0 == i % step) ++s;
data_d_ddy[i] =
data_dout[s] * data_d_dx[i] + data_x[i] * data_d_ddout[s];
data_d_ddy[i] += data_x[i] * data_d_ddout[s];
}
} else {
for (size_t i = 0; i < N; ++i) {
if (0 == i % step) ++s;
data_d_ddy[i] = data_x[i] * data_d_ddout[s];
}
}
}
}
if (out_tensor_d_x && !out_tensor_d_x->IsInitialized()) {
FullLikeKernel<T, DeviceContext>(
ctx, *in_tensor_x, Scalar(0.0), in_tensor_x->dtype(), out_tensor_d_x);
}
if (out_tensor_d_y && !out_tensor_d_y->IsInitialized()) {
FullLikeKernel<T, DeviceContext>(
ctx, *in_tensor_y, Scalar(0.0), in_tensor_y->dtype(), out_tensor_d_y);
}
if (out_tensor_d_dout && !out_tensor_d_dout->IsInitialized()) {
FullLikeKernel<T, DeviceContext>(ctx,
*in_tensor_dout,
Scalar(0.0),
in_tensor_dout->dtype(),
out_tensor_d_dout);
}
if (out_tensor_d_ddx && !out_tensor_d_ddx->IsInitialized()) {
FullLikeKernel<T, DeviceContext>(ctx,
*in_tensor_x,
Scalar(0.0),
in_tensor_x->dtype(),
out_tensor_d_ddx);
}
if (out_tensor_d_ddy && !out_tensor_d_ddy->IsInitialized()) {
FullLikeKernel<T, DeviceContext>(ctx,
*in_tensor_y,
Scalar(0.0),
in_tensor_y->dtype(),
out_tensor_d_ddy);
}
#endif
}
};
@@ -836,65 +1340,40 @@ template <typename T, typename Context>
void DotDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& ddx,
const DenseTensor& ddy,
const DenseTensor& dout,
const paddle::optional<DenseTensor>& ddx,
const paddle::optional<DenseTensor>& ddy,
DenseTensor* dx,
DenseTensor* dy,
DenseTensor* ddout) {
if (dx) {
dev_ctx.template Alloc<T>(dx);
}
if (dy) {
dev_ctx.template Alloc<T>(dy);
}
if (ddout) {
dev_ctx.template Alloc<T>(ddout);
}
DotDoubleGradFunction<Context, T>()(
dev_ctx, &x, &y, &dout, ddx, ddy, dx, dy, ddout);
dev_ctx, &x, &y, &dout, ddx.get_ptr(), ddy.get_ptr(), dx, dy, ddout);
}
template <typename T, typename Context>
void DotTripleGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& ddx,
const DenseTensor& ddy,
const DenseTensor& d_dx,
const DenseTensor& d_dy,
const DenseTensor& dout,
const DenseTensor& d_ddout,
const paddle::optional<DenseTensor>& ddx,
const paddle::optional<DenseTensor>& ddy,
const paddle::optional<DenseTensor>& d_dx,
const paddle::optional<DenseTensor>& d_dy,
const paddle::optional<DenseTensor>& d_ddout,
DenseTensor* d_x,
DenseTensor* d_y,
DenseTensor* d_ddx,
DenseTensor* d_ddy,
DenseTensor* d_dout) {
if (d_x) {
dev_ctx.template Alloc<T>(d_x);
}
if (d_y) {
dev_ctx.template Alloc<T>(d_y);
}
if (d_ddx) {
dev_ctx.template Alloc<T>(d_ddx);
}
if (d_ddy) {
dev_ctx.template Alloc<T>(d_ddy);
}
if (d_dout) {
dev_ctx.template Alloc<T>(d_dout);
}
DotTripleGradFunction<Context, T>()(dev_ctx,
&x,
&y,
ddx,
ddy,
d_dx,
d_dy,
dout,
d_ddout,
&dout,
ddx.get_ptr(),
ddy.get_ptr(),
d_dx.get_ptr(),
d_dy.get_ptr(),
d_ddout.get_ptr(),
d_x,
d_y,
d_dout,
......
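The triple-grad functors above also introduce an accumulate-or-assign idiom: an output such as d_dout collects two independent contributions (ddx * d_dy and ddy * d_dx), either of which may be absent, so a flag (d_dout_flag, d_ddx_flag, d_ddy_flag) records whether the buffer was already written and the later pass uses += instead of =. A minimal, self-contained sketch of that idiom (assuming out is pre-sized; illustrative, not Paddle API):

#include <vector>

// The first present contribution assigns, later ones accumulate; if none are
// present, the output is zero-filled, as in the kernels above.
void accumulate_optional_terms(std::vector<float>* out,
                               const std::vector<float>* term_a,
                               const std::vector<float>* term_b) {
  bool written = false;  // plays the role of d_dout_flag and friends
  for (const auto* term : {term_a, term_b}) {
    if (term == nullptr) continue;  // this contribution's input was not given
    for (size_t i = 0; i < out->size(); ++i) {
      if (written) {
        (*out)[i] += (*term)[i];
      } else {
        (*out)[i] = (*term)[i];
      }
    }
    written = true;
  }
  if (!written) out->assign(out->size(), 0.0f);
}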
@@ -473,27 +473,13 @@ void MatmulDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& dout,
const paddle::optional<DenseTensor>& ddx_opt,
const paddle::optional<DenseTensor>& ddy_opt,
const paddle::optional<DenseTensor>& ddx,
const paddle::optional<DenseTensor>& ddy,
bool transpose_x,
bool transpose_y,
DenseTensor* dx,
DenseTensor* dy,
DenseTensor* ddout) {
paddle::optional<DenseTensor> ddx;
paddle::optional<DenseTensor> ddy;
if (!ddx_opt && (dy || ddout)) {
DenseTensor ddx_tmp = phi::FullLike<T, Context>(dev_ctx, x, Scalar(0.0));
ddx = paddle::make_optional<DenseTensor>(ddx_tmp);
} else {
ddx = ddx_opt;
}
if (!ddy_opt && (dx || ddout)) {
DenseTensor ddy_tmp = phi::FullLike<T, Context>(dev_ctx, y, Scalar(0.0));
ddy = paddle::make_optional<DenseTensor>(ddy_tmp);
} else {
ddy = ddy_opt;
}
// Get dims from the input x, y, output_grad
std::vector<std::int64_t> x_dims = vectorize(x.dims());
std::vector<std::int64_t> y_dims = vectorize(y.dims());
@@ -506,7 +492,7 @@ void MatmulDoubleGradKernel(const Context& dev_ctx,
// Case1: x's or y's dim = 1
if (x_ndim == 1 && y_ndim == 1) {
DotDoubleGradFunction<Context, T>()(
dev_ctx, &x, &y, &dout, ddx.get_ptr(), ddy.get_ptr(), dx, dy, ddout);
dev_ctx, &x, &y, &dout, &ddx, &ddy, dx, dy, ddout);
return;
}
@@ -608,6 +594,8 @@ void MatmulDoubleGradKernel(const Context& dev_ctx,
ddout_flag);
ddout_flag = true;
}
} else if (!ddx && dy) {
FullLikeKernel<T, Context>(dev_ctx, y, Scalar(0.0), y.dtype(), dy);
}
if (ddy) {
auto ddy_mat = ddy.get();
@@ -666,6 +654,12 @@ void MatmulDoubleGradKernel(const Context& dev_ctx,
ddout,
ddout_flag);
}
} else if (!ddy && dx) {
FullLikeKernel<T, Context>(dev_ctx, x, Scalar(0.0), x.dtype(), dx);
}
if (ddout && !ddx && !ddy) {
FullLikeKernel<T, Context>(
dev_ctx, dout, Scalar(0.0), dout.dtype(), ddout);
}
if (dx) {
@@ -821,7 +815,7 @@ void MatmulDoubleGradKernel(const Context& dev_ctx,
}
}
// Reduce sum to get grad by ReduceSum
if (dx) {
if (dx && dx_help.initialized()) {
if (dx_reduce_dims.empty()) {
*dx = std::move(dx_help);
} else {
@@ -829,8 +823,10 @@ void MatmulDoubleGradKernel(const Context& dev_ctx,
dev_ctx, dx_help, dx, dx_reduce_dims);
}
dx->Resize(x.dims());
} else if (dx && !dx_help.initialized()) {
FullLikeKernel<T, Context>(dev_ctx, x, Scalar(0.0), x.dtype(), dx);
}
if (dy) {
if (dy && dy_help.initialized()) {
if (dy_reduce_dims.empty()) {
*dy = std::move(dy_help);
} else {
@@ -838,6 +834,8 @@ void MatmulDoubleGradKernel(const Context& dev_ctx,
dev_ctx, dy_help, dy, dy_reduce_dims);
}
dy->Resize(y.dims());
} else if (dy && !dy_help.initialized()) {
FullLikeKernel<T, Context>(dev_ctx, y, Scalar(0.0), y.dtype(), dy);
}
if (ddout) {
@@ -873,11 +871,11 @@ void MatmulTripleGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& dout,
const paddle::optional<DenseTensor>& ddx_opt,
const paddle::optional<DenseTensor>& ddy_opt,
const paddle::optional<DenseTensor>& d_dx_opt,
const paddle::optional<DenseTensor>& d_dy_opt,
const paddle::optional<DenseTensor>& d_ddout_opt,
const paddle::optional<DenseTensor>& ddx,
const paddle::optional<DenseTensor>& ddy,
const paddle::optional<DenseTensor>& d_dx,
const paddle::optional<DenseTensor>& d_dy,
const paddle::optional<DenseTensor>& d_ddout,
bool transpose_x,
bool transpose_y,
DenseTensor* out_d_x,
@@ -885,50 +883,6 @@ void MatmulTripleGradKernel(const Context& dev_ctx,
DenseTensor* out_d_dout,
DenseTensor* out_d_ddx,
DenseTensor* out_d_ddy) {
paddle::optional<DenseTensor> ddx;
paddle::optional<DenseTensor> ddy;
paddle::optional<DenseTensor> d_dx;
paddle::optional<DenseTensor> d_dy;
paddle::optional<DenseTensor> d_ddout;
if (!ddx_opt && (out_d_y || out_d_dout)) {
DenseTensor ddx_tmp =
phi::FullLike<T, Context>(dev_ctx, x, static_cast<T>(0.0));
ddx = paddle::make_optional<DenseTensor>(ddx_tmp);
} else {
ddx = ddx_opt;
}
if (!ddy_opt && (out_d_x || out_d_dout)) {
DenseTensor ddy_tmp =
phi::FullLike<T, Context>(dev_ctx, y, static_cast<T>(0.0));
ddy = paddle::make_optional<DenseTensor>(ddy_tmp);
} else {
ddy = ddy_opt;
}
if (!d_ddout_opt && (out_d_y || out_d_x || out_d_ddy || out_d_ddx)) {
DenseTensor d_ddout_tmp =
phi::FullLike<T, Context>(dev_ctx, dout, static_cast<T>(0.0));
d_ddout = paddle::make_optional<DenseTensor>(d_ddout_tmp);
} else {
d_ddout = d_ddout_opt;
}
if (!d_dx_opt && (out_d_ddy || out_d_dout)) {
DenseTensor d_dx_tmp =
phi::FullLike<T, Context>(dev_ctx, x, static_cast<T>(0.0));
d_dx = paddle::make_optional<DenseTensor>(d_dx_tmp);
} else {
d_dx = d_dx_opt;
}
if (!d_dy_opt && (out_d_ddx || out_d_dout)) {
DenseTensor d_dy_tmp =
phi::FullLike<T, Context>(dev_ctx, y, static_cast<T>(0.0));
d_dy = paddle::make_optional<DenseTensor>(d_dy_tmp);
} else {
d_dy = d_dy_opt;
}
// Get dims from the input x, y, output_grad
std::vector<std::int64_t> x_dims = vectorize(x.dims());
std::vector<std::int64_t> y_dims = vectorize(y.dims());
@@ -944,12 +898,12 @@ void MatmulTripleGradKernel(const Context& dev_ctx,
DotTripleGradFunction<Context, T>()(dev_ctx,
&x,
&y,
ddx.get_ptr(),
ddy.get_ptr(),
d_dx.get_ptr(),
d_dy.get_ptr(),
&dout,
d_ddout.get_ptr(),
&ddx,
&ddy,
&d_dx,
&d_dy,
&d_ddout,
out_d_x,
out_d_y,
out_d_dout,
@@ -1047,7 +1001,7 @@ void MatmulTripleGradKernel(const Context& dev_ctx,
if (out_d_ddy_dims != y_help.dims()) {
out_d_ddy->Resize(y_help.dims());
}
if (dout_conj.IsInitialized()) {
if (!dout_conj.IsInitialized()) {
dout_conj = Conj<T>(dev_ctx, dout_help);
}
x_conj = Conj<T>(dev_ctx, x_help);
@@ -1108,6 +1062,8 @@ void MatmulTripleGradKernel(const Context& dev_ctx,
out_d_y,
false);
}
} else if (out_d_y) {
FullLikeKernel<T, Context>(dev_ctx, y, Scalar(0.0), y.dtype(), out_d_y);
}
if (out_d_x && ddy) {
if (transpose_x && transpose_y) {
@@ -1155,6 +1111,8 @@ void MatmulTripleGradKernel(const Context& dev_ctx,
out_d_x,
false);
}
} else if (out_d_x) {
FullLikeKernel<T, Context>(dev_ctx, x, Scalar(0.0), x.dtype(), out_d_x);
}
// equations:
@@ -1269,6 +1227,15 @@ void MatmulTripleGradKernel(const Context& dev_ctx,
}
d_ddy_flag = true;
}
} else {
// d_ddout is none
if (out_d_x) {
FullLikeKernel<T, Context>(dev_ctx, x, Scalar(0.0), x.dtype(), out_d_x);
}
if (out_d_y) {
FullLikeKernel<T, Context>(dev_ctx, y, Scalar(0.0), y.dtype(), out_d_y);
}
}
if (d_dy) {
@@ -1439,6 +1406,19 @@ void MatmulTripleGradKernel(const Context& dev_ctx,
out_d_ddy->Resize(out_d_ddy_dims);
}
}
if (out_d_dout && !out_d_dout->IsInitialized()) {
FullLikeKernel<T, Context>(
dev_ctx, dout, Scalar(0.0), dout.dtype(), out_d_dout);
}
if (out_d_ddx && !out_d_ddx->IsInitialized()) {
FullLikeKernel<T, Context>(dev_ctx, x, Scalar(0.0), x.dtype(), out_d_ddx);
}
if (out_d_ddy && !out_d_ddy->IsInitialized()) {
FullLikeKernel<T, Context>(dev_ctx, y, Scalar(0.0), y.dtype(), out_d_ddy);
}
} else {
// Case3: broadcast. The reduce-sum needed for the broadcast costs much
// time and wastes memory.
@@ -1585,7 +1565,7 @@ void MatmulTripleGradKernel(const Context& dev_ctx,
}
// Reduce sum to get grad by ReduceSum
if (out_d_x) {
if (out_d_x && out_dx_help.initialized()) {
if (dx_reduce_dims.empty()) {
*out_d_x = std::move(out_dx_help);
} else {
@@ -1593,9 +1573,11 @@ void MatmulTripleGradKernel(const Context& dev_ctx,
dev_ctx, out_dx_help, out_d_x, dx_reduce_dims);
}
out_d_x->Resize(x.dims());
} else if (out_d_x) {
FullLikeKernel<T, Context>(dev_ctx, x, Scalar(0.0), x.dtype(), out_d_x);
}
if (out_d_y) {
if (out_d_y && out_dy_help.initialized()) {
if (dy_reduce_dims.empty()) {
*out_d_y = std::move(out_dy_help);
} else {
@@ -1603,6 +1585,8 @@ void MatmulTripleGradKernel(const Context& dev_ctx,
dev_ctx, out_dy_help, out_d_y, dy_reduce_dims);
}
out_d_y->Resize(y.dims());
} else if (out_d_y) {
FullLikeKernel<T, Context>(dev_ctx, y, Scalar(0.0), y.dtype(), out_d_y);
}
// compute d_dout
@@ -1628,6 +1612,11 @@ void MatmulTripleGradKernel(const Context& dev_ctx,
transpose_y,
true);
}
if (!out_d_dout->initialized()) {
FullLikeKernel<T, Context>(
dev_ctx, dout, Scalar(0.0), dout.dtype(), out_d_dout);
}
}
// compute d_ddx
@@ -1735,13 +1724,18 @@ void MatmulTripleGradKernel(const Context& dev_ctx,
true);
}
}
if (out_d_ddx_help.initialized()) {
if (dx_reduce_dims.empty()) {
*out_d_ddx = std::move(out_d_ddx_help);
} else {
ReduceSumForMatmulGrad<Context, T>()(
dev_ctx, out_d_ddx_help, out_d_ddx, dx_reduce_dims);
}
} else {
FullLikeKernel<T, Context>(
dev_ctx, x, Scalar(0.0), x.dtype(), out_d_ddx);
}
out_d_ddx->Resize(x.dims());
}
@@ -1852,12 +1846,18 @@ void MatmulTripleGradKernel(const Context& dev_ctx,
}
}
if (out_d_ddy_help.initialized()) {
if (dy_reduce_dims.empty()) {
*out_d_ddy = std::move(out_d_ddy_help);
} else {
ReduceSumForMatmulGrad<Context, T>()(
dev_ctx, out_d_ddy_help, out_d_ddy, dy_reduce_dims);
}
} else {
FullLikeKernel<T, Context>(
dev_ctx, y, Scalar(0.0), y.dtype(), out_d_ddy);
}
out_d_ddy->Resize(y.dims());
}
}
......
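The unit tests below exercise the matmul double-grad identities for the non-transposed case tested here: with out = x y, the first-order grads are dx = dout yᵀ and dy = xᵀ dout, so seeding ddx on dx (or ddy on dy) gives

\begin{aligned}
dx^{(2)} &= dout \, ddy^{\mathsf T}, \qquad
dy^{(2)} = dout^{\mathsf T} ddx, \qquad
ddout = ddx \, y + x \, ddy,
\end{aligned}

with each term dropped (zero) when the corresponding seed is none — exactly the zero-filled expectations in the cases that follow.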
@@ -688,5 +688,489 @@ class TestDoubleGradBasics(TestCase):
np.testing.assert_array_equal(grad_out.grad.numpy(), grad_out_grad_ref)
class TestDygraphDoubleGradMatmul(TestCase):
# case1: ddy is none, no broadcast, dims != 1
def test_matmul_double_grad_case1(self):
input_numpy_x = np.random.random([3, 3]).astype('float32')
input_numpy_y = np.random.random([3, 3]).astype('float32')
def actual():
x = paddle.to_tensor(
input_numpy_x, stop_gradient=False, dtype='float32'
)
y = paddle.to_tensor(
input_numpy_y, stop_gradient=False, dtype='float32'
)
out = paddle.matmul(x, y, False, False)
dout = paddle.to_tensor(
np.ones([3, 3]), stop_gradient=False, dtype='float32'
)
(dx,) = paddle.grad(
[out], [x], [dout], retain_graph=True, create_graph=True
)
ddx = paddle.to_tensor(
np.ones([3, 3]), stop_gradient=False, dtype='float32'
)
dx_double_grad, dy_double_grad, ddout = paddle.grad(
[dx],
[x, y, dout],
[ddx],
retain_graph=True,
create_graph=True,
)
return dx_double_grad, dy_double_grad, ddout
def expected():
dx_double_grad_expected = np.zeros([3, 3], dtype="float32")
dy_double_grad_expected = np.matmul(
np.ones([3, 3], dtype="float32"),
np.ones([3, 3], dtype="float32"),
)
ddout_expected = np.matmul(
np.ones([3, 3], dtype="float32"), input_numpy_y
)
return (
dx_double_grad_expected,
dy_double_grad_expected,
ddout_expected,
)
expected_results = expected()
places = ["cpu"]
if paddle.is_compiled_with_cuda():
places.append("gpu")
for place in places:
paddle.device.set_device(place)
actual_results = actual()
for expected_result, actual_result in zip(
expected_results, actual_results
):
np.testing.assert_allclose(
expected_result, actual_result, rtol=1e-6
)
# case2: ddx is none, no broadcast, dims != 1
def test_matmul_double_grad_case2(self):
input_numpy_x = np.random.random([3, 3]).astype('float32')
input_numpy_y = np.random.random([3, 3]).astype('float32')
def actual():
x = paddle.to_tensor(
input_numpy_x, stop_gradient=False, dtype='float32'
)
y = paddle.to_tensor(
input_numpy_y, stop_gradient=False, dtype='float32'
)
out = paddle.matmul(x, y, False, False)
dout = paddle.to_tensor(
np.ones([3, 3]), stop_gradient=False, dtype='float32'
)
(dy,) = paddle.grad(
[out], [y], [dout], retain_graph=True, create_graph=True
)
ddy = paddle.to_tensor(
np.ones([3, 3]), stop_gradient=False, dtype='float32'
)
dx_double_grad, dy_double_grad, ddout = paddle.grad(
[dy],
[x, y, dout],
[ddy],
retain_graph=True,
create_graph=True,
)
return dx_double_grad, dy_double_grad, ddout
def expected():
dx_double_grad_expected = np.matmul(
np.ones([3, 3], dtype="float32"),
np.ones([3, 3], dtype="float32"),
)
dy_double_grad_expected = np.zeros([3, 3], dtype="float32")
ddout_expected = np.matmul(
input_numpy_x, np.ones([3, 3], dtype="float32")
)
return (
dx_double_grad_expected,
dy_double_grad_expected,
ddout_expected,
)
expected_results = expected()
places = ["cpu"]
if paddle.is_compiled_with_cuda():
places.append("gpu")
for place in places:
paddle.device.set_device(place)
actual_results = actual()
for expected_result, actual_result in zip(
expected_results, actual_results
):
np.testing.assert_allclose(
expected_result, actual_result, rtol=1e-6
)
# case3: ddx is none, dims = 1
def test_matmul_double_grad_case3(self):
input_numpy_x = np.random.random([3]).astype('float32')
input_numpy_y = np.random.random([3]).astype('float32')
def actual():
x = paddle.to_tensor(
input_numpy_x, stop_gradient=False, dtype='float32'
)
y = paddle.to_tensor(
input_numpy_y, stop_gradient=False, dtype='float32'
)
out = paddle.matmul(x, y, False, False)
dout = paddle.to_tensor(
np.ones([1]), stop_gradient=False, dtype='float32'
)
(dy,) = paddle.grad(
[out], [y], [dout], retain_graph=True, create_graph=True
)
ddy = paddle.to_tensor(
np.ones([3]), stop_gradient=False, dtype='float32'
)
dx_double_grad, dy_double_grad, ddout = paddle.grad(
[dy],
[x, y, dout],
[ddy],
retain_graph=True,
create_graph=True,
)
return dx_double_grad, dy_double_grad, ddout
def expected():
dx_double_grad_expected = np.ones([3], dtype="float32")
dy_double_grad_expected = np.zeros([3], dtype="float32")
ddout_expected = np.matmul(
input_numpy_x, np.ones([3], dtype="float32")
)
return (
dx_double_grad_expected,
dy_double_grad_expected,
ddout_expected,
)
expected_results = expected()
places = ["cpu"]
if paddle.is_compiled_with_cuda():
places.append("gpu")
for place in places:
paddle.device.set_device(place)
actual_results = actual()
for expected_result, actual_result in zip(
expected_results, actual_results
):
np.testing.assert_allclose(
expected_result, actual_result, rtol=1e-6
)
# case4: ddy is none, dims = 1
def test_matmul_double_grad_case4(self):
input_numpy_x = np.random.random([3]).astype('float32')
input_numpy_y = np.random.random([3]).astype('float32')
def actual():
x = paddle.to_tensor(
input_numpy_x, stop_gradient=False, dtype='float32'
)
y = paddle.to_tensor(
input_numpy_y, stop_gradient=False, dtype='float32'
)
out = paddle.matmul(x, y, False, False)
dout = paddle.to_tensor(
np.ones([1]), stop_gradient=False, dtype='float32'
)
(dx,) = paddle.grad(
[out], [x], [dout], retain_graph=True, create_graph=True
)
ddx = paddle.to_tensor(
np.ones([3]), stop_gradient=False, dtype='float32'
)
dx_double_grad, dy_double_grad, ddout = paddle.grad(
[dx],
[x, y, dout],
[ddx],
retain_graph=True,
create_graph=True,
)
return dx_double_grad, dy_double_grad, ddout
def expected():
dx_double_grad_expected = np.zeros([3], dtype="float32")
dy_double_grad_expected = np.ones([3], dtype="float32")
ddout_expected = np.matmul(
input_numpy_y, np.ones([3], dtype="float32")
)
return (
dx_double_grad_expected,
dy_double_grad_expected,
ddout_expected,
)
expected_results = expected()
places = ["cpu"]
if paddle.is_compiled_with_cuda():
places.append("gpu")
for place in places:
paddle.device.set_device(place)
actual_results = actual()
for expected_result, actual_result in zip(
expected_results, actual_results
):
np.testing.assert_allclose(
expected_result, actual_result, rtol=1e-6
)
# case5: ddx is none, broadcast, dims != 1
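# With x: [2, 1] and y: [1], out has shape [2] and dy reduces over the
# broadcast axis. Feeding ddy gives dx_double_grad = outer(dout, ddy)
# (all ones here), dy_double_grad = 0, and ddout = matmul(x, ddy).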
def test_matmul_double_grad_case5(self):
input_numpy_x = np.random.random([2, 1]).astype('float32')
input_numpy_y = np.random.random([1]).astype('float32')
def actual():
x = paddle.to_tensor(
input_numpy_x, stop_gradient=False, dtype='float32'
)
y = paddle.to_tensor(
input_numpy_y, stop_gradient=False, dtype='float32'
)
out = paddle.matmul(x, y, False, False)
dout = paddle.to_tensor(
np.ones([2]), stop_gradient=False, dtype='float32'
)
(dy,) = paddle.grad(
[out], [y], [dout], retain_graph=True, create_graph=True
)
ddy = paddle.to_tensor(
np.ones([1]), stop_gradient=False, dtype='float32'
)
dx_double_grad, dy_double_grad, ddout = paddle.grad(
[dy],
[x, y, dout],
[ddy],
retain_graph=True,
create_graph=True,
)
return dx_double_grad, dy_double_grad, ddout
def expected():
dx_double_grad_expected = np.ones([2, 1], dtype="float32")
dy_double_grad_expected = np.zeros([1], dtype="float32")
ddout_expected = np.matmul(
input_numpy_x, np.ones([1], dtype="float32")
)
return (
dx_double_grad_expected,
dy_double_grad_expected,
ddout_expected,
)
expected_results = expected()
places = ["cpu"]
if paddle.is_compiled_with_cuda():
places.append("gpu")
for place in places:
paddle.device.set_device(place)
actual_results = actual()
for expected_result, actual_result in zip(
expected_results, actual_results
):
np.testing.assert_allclose(
expected_result, actual_result, rtol=1e-6
)
# case6: ddy is none, broadcast, dims != 1
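# Mirror of case5: dx_double_grad = 0, dy_double_grad sums ddx * dout
# over the broadcast axis (hence the factor 2), and ddout = ddx * y
# broadcast back to shape [2].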
def test_matmul_double_grad_case6(self):
input_numpy_x = np.random.random([2, 1]).astype('float32')
input_numpy_y = np.random.random([1]).astype('float32')
def actual():
x = paddle.to_tensor(
input_numpy_x, stop_gradient=False, dtype='float32'
)
y = paddle.to_tensor(
input_numpy_y, stop_gradient=False, dtype='float32'
)
out = paddle.matmul(x, y, False, False)
dout = paddle.to_tensor(
np.ones([2]), stop_gradient=False, dtype='float32'
)
(dx,) = paddle.grad(
[out], [x], [dout], retain_graph=True, create_graph=True
)
ddx = paddle.to_tensor(
np.ones([2, 1]), stop_gradient=False, dtype='float32'
)
dx_double_grad, dy_double_grad, ddout = paddle.grad(
[dx],
[x, y, dout],
[ddx],
retain_graph=True,
create_graph=True,
)
return dx_double_grad, dy_double_grad, ddout
def expected():
dx_double_grad_expected = np.zeros([2, 1], dtype="float32")
dy_double_grad_expected = np.ones([1], dtype="float32") * 2
ddout_expected = np.ones([2], dtype="float32") * input_numpy_y[0]
return (
dx_double_grad_expected,
dy_double_grad_expected,
ddout_expected,
)
expected_results = expected()
places = ["cpu"]
if paddle.is_compiled_with_cuda():
places.append("gpu")
for place in places:
paddle.device.set_device(place)
actual_results = actual()
for expected_result, actual_result in zip(
expected_results, actual_results
):
np.testing.assert_allclose(
expected_result, actual_result, rtol=1e-6
)
# case7: ddy is none, dims = 1, complex dtype
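# For complex inputs the saved tensors enter the grads conjugated:
# dy_double_grad = conj(dout) * ddx (= ones here) and
# ddout = dot(conj(y), ddx), matching the Conj() calls in the kernel.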
def test_matmul_double_grad_case7(self):
input_numpy_x = np.random.random([3]).astype(
'float32'
) + 1j * np.random.random([3]).astype('float32')
input_numpy_y = np.random.random([3]).astype(
'float32'
) + 1j * np.random.random([3]).astype('float32')
input_numpy_y_conj = np.conjugate(input_numpy_y)
def actual():
x = paddle.to_tensor(
input_numpy_x, stop_gradient=False, dtype='complex64'
)
y = paddle.to_tensor(
input_numpy_y, stop_gradient=False, dtype='complex64'
)
out = paddle.matmul(x, y, False, False)
dout = paddle.to_tensor(
np.ones([1]), stop_gradient=False, dtype='complex64'
)
(dx,) = paddle.grad(
[out], [x], [dout], retain_graph=True, create_graph=True
)
ddx = paddle.to_tensor(
np.ones([3]), stop_gradient=False, dtype='complex64'
)
dx_double_grad, dy_double_grad, ddout = paddle.grad(
[dx],
[x, y, dout],
[ddx],
retain_graph=True,
create_graph=True,
)
return dx_double_grad, dy_double_grad, ddout
def expected():
dx_double_grad_expected = np.zeros([3], dtype="complex64")
dy_double_grad_expected = np.ones([3], dtype="complex64")
ddout_expected = np.matmul(
input_numpy_y_conj, np.ones([3], dtype="float32")
)
return (
dx_double_grad_expected,
dy_double_grad_expected,
ddout_expected,
)
expected_results = expected()
places = ["cpu"]
if paddle.is_compiled_with_cuda():
places.append("gpu")
for place in places:
paddle.device.set_device(place)
actual_results = actual()
for expected_result, actual_result in zip(
expected_results, actual_results
):
np.testing.assert_allclose(
expected_result, actual_result, rtol=1e-6
)
# case8: ddx is none, dims = 1, complex dtype
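# Mirror of case7 with the roles of x and y swapped:
# ddout = dot(conj(x), ddy).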
def test_matmul_double_grad_case8(self):
input_numpy_x = np.random.random([3]).astype(
'float32'
) + 1j * np.random.random([3]).astype('float32')
input_numpy_y = np.random.random([3]).astype(
'float32'
) + 1j * np.random.random([3]).astype('float32')
input_numpy_x_conj = np.conjugate(input_numpy_x)
def actual():
x = paddle.to_tensor(
input_numpy_x, stop_gradient=False, dtype='complex64'
)
y = paddle.to_tensor(
input_numpy_y, stop_gradient=False, dtype='complex64'
)
out = paddle.matmul(x, y, False, False)
dout = paddle.to_tensor(
np.ones([1]), stop_gradient=False, dtype='complex64'
)
(dy,) = paddle.grad(
[out], [y], [dout], retain_graph=True, create_graph=True
)
ddy = paddle.to_tensor(
np.ones([3]), stop_gradient=False, dtype='complex64'
)
dx_double_grad, dy_double_grad, ddout = paddle.grad(
[dy],
[x, y, dout],
[ddy],
retain_graph=True,
create_graph=True,
)
return dx_double_grad, dy_double_grad, ddout
def expected():
dx_double_grad_expected = np.ones([3], dtype="float32")
dy_double_grad_expected = np.zeros([3], dtype="float32")
ddout_expected = np.matmul(
input_numpy_x_conj, np.ones([3], dtype="float32")
)
return (
dx_double_grad_expected,
dy_double_grad_expected,
ddout_expected,
)
expected_results = expected()
places = ["cpu"]
if paddle.is_compiled_with_cuda():
places.append("gpu")
for place in places:
paddle.device.set_device(place)
actual_results = actual()
for expected_result, actual_result in zip(
expected_results, actual_results
):
np.testing.assert_allclose(
expected_result, actual_result, rtol=1e-6
)
if __name__ == '__main__':
unittest.main()
......@@ -321,5 +321,1020 @@ class TestDygraphTripleGradBradcastCase(TestCase):
fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
# d_ddout is none, dtype is float32
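# The second paddle.grad call below differentiates only dx and dy, so no
# grad flows into ddout and the triple-grad kernel receives
# d_ddout = None, the optional-input path fixed by this PR.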
class TestDygraphTripleGradMatmulcase1(TestCase):
def setUp(self):
self.input_numpy_x = None
self.input_numpy_y = None
self.input_numpy_dout = None
self.input_numpy_ddx = None
self.input_numpy_ddy = None
self.places = ["cpu"]
if paddle.is_compiled_with_cuda():
self.places.append("gpu")
def actual(self):
x = paddle.to_tensor(
self.input_numpy_x, stop_gradient=False, dtype='float32'
)
y = paddle.to_tensor(
self.input_numpy_y, stop_gradient=False, dtype='float32'
)
out = paddle.matmul(x, y, False, False)
dout = paddle.to_tensor(
self.input_numpy_dout, stop_gradient=False, dtype='float32'
)
(dx, dy) = paddle.grad(
[out], [x, y], [dout], retain_graph=True, create_graph=True
)
ddx = paddle.to_tensor(
self.input_numpy_ddx, stop_gradient=False, dtype='float32'
)
ddy = paddle.to_tensor(
self.input_numpy_ddy, stop_gradient=False, dtype='float32'
)
dx_double_grad, dy_double_grad = paddle.grad(
[dx, dy],
[x, y],
[ddx, ddy],
retain_graph=True,
create_graph=True,
)
d_x, d_y, d_dout, d_ddx, d_ddy = paddle.grad(
[dx_double_grad, dy_double_grad],
[x, y, dout, ddx, ddy],
retain_graph=False,
create_graph=False,
)
return d_x, d_y, d_dout, d_ddx, d_ddy
# case1: d_ddout is none, dims != 1
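# Here dx_double_grad = matmul(dout, ddy^T) and
# dy_double_grad = matmul(ddx^T, dout). Back-propagating ones through
# both products gives d_x = d_y = 0, d_dout = 3 + 3 = 6 (one 3 from
# each branch), and d_ddx = d_ddy = 3 * ones.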
def test_matmul_triple_grad_case1(self):
def init_data():
self.input_numpy_x = np.random.random([3, 3]).astype('float32')
self.input_numpy_y = np.random.random([3, 3]).astype('float32')
self.input_numpy_dout = np.ones([3, 3], dtype="float32")
self.input_numpy_ddx = np.ones([3, 3], dtype="float32")
self.input_numpy_ddy = np.ones([3, 3], dtype="float32")
init_data()
d_x_expected = np.zeros([3, 3], dtype="float32")
d_y_expected = np.zeros([3, 3], dtype="float32")
d_dout_expected = np.ones([3, 3], dtype="float32") * 6
d_ddx_expected = np.ones([3, 3], dtype="float32") * 3
d_ddy_expected = np.ones([3, 3], dtype="float32") * 3
expected_results = (
d_x_expected,
d_y_expected,
d_dout_expected,
d_ddx_expected,
d_ddy_expected,
)
for place in self.places:
paddle.device.set_device(place)
actual_results = self.actual()
for expected_result, actual_result in zip(
expected_results, actual_results
):
np.testing.assert_allclose(
expected_result, actual_result, rtol=1e-6
)
# case2: d_ddout is none, dims = 1
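# Same contraction in 1-D: d_dout = dot(ddy, 1) + dot(ddx, 1) = 6,
# while d_ddx and d_ddy each reduce to dout * ones = ones.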
def test_matmul_triple_grad_case2(self):
def init_data():
self.input_numpy_x = np.random.random([3]).astype('float32')
self.input_numpy_y = np.random.random([3]).astype('float32')
self.input_numpy_dout = np.ones([1], dtype="float32")
self.input_numpy_ddx = np.ones([3], dtype="float32")
self.input_numpy_ddy = np.ones([3], dtype="float32")
init_data()
d_x_expected = np.zeros([3], dtype="float32")
d_y_expected = np.zeros([3], dtype="float32")
d_dout_expected = np.ones([1], dtype="float32") * 6
d_ddx_expected = np.ones([3], dtype="float32")
d_ddy_expected = np.ones([3], dtype="float32")
expected_results = (
d_x_expected,
d_y_expected,
d_dout_expected,
d_ddx_expected,
d_ddy_expected,
)
for place in self.places:
paddle.device.set_device(place)
actual_results = self.actual()
for expected_result, actual_result in zip(
expected_results, actual_results
):
np.testing.assert_allclose(
expected_result, actual_result, rtol=1e-6
)
# case3: d_ddout is none, with broadcast
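# With broadcasting, d_dout picks up one contribution per branch
# (ddy + ddx = 2), d_ddx = dout broadcast to [3, 1], and d_ddy sums
# dout over the broadcast axis (= 3).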
def test_matmul_triple_grad_case3(self):
def init_data():
self.input_numpy_x = np.random.random([3, 1]).astype('float32')
self.input_numpy_y = np.random.random([1]).astype('float32')
self.input_numpy_dout = np.ones([3], dtype="float32")
self.input_numpy_ddx = np.ones([3, 1], dtype="float32")
self.input_numpy_ddy = np.ones([1], dtype="float32")
init_data()
d_x_expected = np.zeros([3, 1], dtype="float32")
d_y_expected = np.zeros([1], dtype="float32")
d_dout_expected = np.ones([3], dtype="float32") * 2
d_ddx_expected = np.ones([3, 1], dtype="float32")
d_ddy_expected = np.ones([1], dtype="float32") * 3
expected_results = (
d_x_expected,
d_y_expected,
d_dout_expected,
d_ddx_expected,
d_ddy_expected,
)
for place in self.places:
paddle.device.set_device(place)
actual_results = self.actual()
for expected_result, actual_result in zip(
expected_results, actual_results
):
np.testing.assert_allclose(
expected_result, actual_result, rtol=1e-6
)
# d_ddout is none, dtype is complex64
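# Same structure as the float32 class above, except that the conjugates
# of the saved tensors appear in each product: d_dout uses conj(ddx) and
# conj(ddy), while d_ddx and d_ddy are scaled by conj(dout).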
class TestDygraphTripleGradMatmulcase2(TestCase):
def setUp(self):
self.input_numpy_x = None
self.input_numpy_y = None
self.input_numpy_dout = None
self.input_numpy_ddx = None
self.input_numpy_ddy = None
self.input_numpy_ddx_conj = None
self.input_numpy_ddy_conj = None
self.input_numpy_dout_conj = None
self.places = ["cpu"]
if paddle.is_compiled_with_cuda():
self.places.append("gpu")
def actual(self):
x = paddle.to_tensor(
self.input_numpy_x, stop_gradient=False, dtype='complex64'
)
y = paddle.to_tensor(
self.input_numpy_y, stop_gradient=False, dtype='complex64'
)
out = paddle.matmul(x, y, False, False)
dout = paddle.to_tensor(
self.input_numpy_dout, stop_gradient=False, dtype='complex64'
)
(dx, dy) = paddle.grad(
[out], [x, y], [dout], retain_graph=True, create_graph=True
)
ddx = paddle.to_tensor(
self.input_numpy_ddx, stop_gradient=False, dtype='complex64'
)
ddy = paddle.to_tensor(
self.input_numpy_ddy, stop_gradient=False, dtype='complex64'
)
dx_double_grad, dy_double_grad = paddle.grad(
[dx, dy],
[x, y],
[ddx, ddy],
retain_graph=True,
create_graph=True,
)
d_x, d_y, d_dout, d_ddx, d_ddy = paddle.grad(
[dx_double_grad, dy_double_grad],
[x, y, dout, ddx, ddy],
retain_graph=False,
create_graph=False,
)
return d_x, d_y, d_dout, d_ddx, d_ddy
# case1: no d_ddout, dims = 1, dtype is complex64
def test_matmul_triple_grad_case1(self):
def init_data():
self.input_numpy_x = np.random.random([3]).astype(
'float32'
) + 1j * np.random.random([3]).astype('float32')
self.input_numpy_y = np.random.random([3]).astype(
'float32'
) + 1j * np.random.random([3]).astype('float32')
self.input_numpy_dout = np.ones([1], dtype="float32")
self.input_numpy_ddx = np.ones([3], dtype="float32")
self.input_numpy_ddy = np.ones([3], dtype="float32")
self.input_numpy_ddx_conj = np.conjugate(self.input_numpy_ddx)
self.input_numpy_ddy_conj = np.conjugate(self.input_numpy_ddy)
self.input_numpy_dout_conj = np.conjugate(self.input_numpy_dout)
init_data()
d_x_expected = np.zeros([3], dtype="float32")
d_y_expected = np.zeros([3], dtype="float32")
d_dout_expected = np.matmul(
self.input_numpy_ddy_conj, np.ones([3], dtype="float32")
) + np.matmul(self.input_numpy_ddx_conj, np.ones([3], dtype="float32"))
d_ddx_expected = (
np.ones([3], dtype="float32") * self.input_numpy_dout_conj[0]
)
d_ddy_expected = (
np.ones([3], dtype="float32") * self.input_numpy_dout_conj[0]
)
expected_results = (
d_x_expected,
d_y_expected,
d_dout_expected,
d_ddx_expected,
d_ddy_expected,
)
for place in self.places:
paddle.device.set_device(place)
actual_results = self.actual()
for expected_result, actual_result in zip(
expected_results, actual_results
):
np.testing.assert_allclose(
expected_result, actual_result, rtol=1e-6
)
# d_ddout is none, d_dx is none, dtype is float32
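# Only dy_double_grad is requested from the second grad call, so inside
# the triple-grad kernel both d_ddout and d_dx are None. Since
# dy_double_grad = matmul(ddx^T, dout) involves neither y nor ddy,
# d_y and d_ddy come back as zeros.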
class TestDygraphTripleGradMatmulcase3(TestCase):
def setUp(self):
self.input_numpy_x = None
self.input_numpy_y = None
self.input_numpy_dout = None
self.input_numpy_ddx = None
self.input_numpy_ddy = None
self.places = ["cpu"]
if paddle.is_compiled_with_cuda():
self.places.append("gpu")
def actual(self):
x = paddle.to_tensor(
self.input_numpy_x, stop_gradient=False, dtype='float32'
)
y = paddle.to_tensor(
self.input_numpy_y, stop_gradient=False, dtype='float32'
)
out = paddle.matmul(x, y, False, False)
dout = paddle.to_tensor(
self.input_numpy_dout, stop_gradient=False, dtype='float32'
)
(dx, dy) = paddle.grad(
[out], [x, y], [dout], retain_graph=True, create_graph=True
)
ddx = paddle.to_tensor(
self.input_numpy_ddx, stop_gradient=False, dtype='float32'
)
ddy = paddle.to_tensor(
self.input_numpy_ddy, stop_gradient=False, dtype='float32'
)
(dy_double_grad,) = paddle.grad(
[dx, dy],
[y],
[ddx, ddy],
retain_graph=True,
create_graph=True,
)
d_x, d_y, d_dout, d_ddx, d_ddy = paddle.grad(
[dy_double_grad],
[x, y, dout, ddx, ddy],
retain_graph=False,
create_graph=False,
)
return d_x, d_y, d_dout, d_ddx, d_ddy
# case1: d_ddout is none, d_dx is none, dims != 1
def test_matmul_triple_grad_case1(self):
def init_data():
self.input_numpy_x = np.random.random([3, 3]).astype('float32')
self.input_numpy_y = np.random.random([3, 3]).astype('float32')
self.input_numpy_dout = np.ones([3, 3], dtype="float32")
self.input_numpy_ddx = np.ones([3, 3], dtype="float32")
self.input_numpy_ddy = np.ones([3, 3], dtype="float32")
init_data()
d_x_expected = np.zeros([3, 3], dtype="float32")
d_y_expected = np.zeros([3, 3], dtype="float32")
d_dout_expected = np.ones([3, 3], dtype="float32") * 3
d_ddx_expected = np.ones([3, 3], dtype="float32") * 3
d_ddy_expected = np.zeros([3, 3], dtype="float32")
expected_results = (
d_x_expected,
d_y_expected,
d_dout_expected,
d_ddx_expected,
d_ddy_expected,
)
for place in self.places:
paddle.device.set_device(place)
actual_results = self.actual()
for expected_result, actual_result in zip(
expected_results, actual_results
):
np.testing.assert_allclose(
expected_result, actual_result, rtol=1e-6
)
# case2: d_ddout is none, d_dx is none, dims = 1
def test_matmul_triple_grad_case2(self):
def init_data():
self.input_numpy_x = np.random.random([3]).astype('float32')
self.input_numpy_y = np.random.random([3]).astype('float32')
self.input_numpy_dout = np.ones([1], dtype="float32")
self.input_numpy_ddx = np.ones([3], dtype="float32")
self.input_numpy_ddy = np.ones([3], dtype="float32")
init_data()
d_x_expected = np.zeros([3], dtype="float32")
d_y_expected = np.zeros([3], dtype="float32")
d_dout_expected = np.ones([1], dtype="float32") * 3
d_ddx_expected = np.ones([3], dtype="float32")
d_ddy_expected = np.zeros([3], dtype="float32")
expected_results = (
d_x_expected,
d_y_expected,
d_dout_expected,
d_ddx_expected,
d_ddy_expected,
)
for place in self.places:
paddle.device.set_device(place)
actual_results = self.actual()
for expected_result, actual_result in zip(
expected_results, actual_results
):
np.testing.assert_allclose(
expected_result, actual_result, rtol=1e-6
)
# case3: d_ddout is none, d_dx is none, with broadcast
def test_matmul_triple_grad_case3(self):
def init_data():
self.input_numpy_x = np.random.random([3, 1]).astype('float32')
self.input_numpy_y = np.random.random([1]).astype('float32')
self.input_numpy_dout = np.ones([3], dtype="float32")
self.input_numpy_ddx = np.ones([3, 1], dtype="float32")
self.input_numpy_ddy = np.ones([1], dtype="float32")
init_data()
d_x_expected = np.zeros([3, 1], dtype="float32")
d_y_expected = np.zeros([1], dtype="float32")
d_dout_expected = np.ones([3], dtype="float32")
d_ddx_expected = np.ones([3, 1], dtype="float32")
d_ddy_expected = np.zeros([1], dtype="float32")
expected_results = (
d_x_expected,
d_y_expected,
d_dout_expected,
d_ddx_expected,
d_ddy_expected,
)
for place in self.places:
paddle.device.set_device(place)
actual_results = self.actual()
for expected_result, actual_result in zip(
expected_results, actual_results
):
np.testing.assert_allclose(
expected_result, actual_result, rtol=1e-6
)
# d_ddout is none, d_dx is none, dtype is complex64
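# Complex variant of the class above: d_dout = dot(conj(ddx), ones) and
# d_ddx = conj(dout) * ones, with d_ddy zero because dy_double_grad
# does not involve ddy.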
class TestDygraphTripleGradMatmulcase4(TestCase):
def setUp(self):
self.input_numpy_x = None
self.input_numpy_y = None
self.input_numpy_dout = None
self.input_numpy_ddx = None
self.input_numpy_ddy = None
self.input_numpy_ddx_conj = None
self.input_numpy_dout_conj = None
self.places = ["cpu"]
if paddle.is_compiled_with_cuda():
self.places.append("gpu")
def actual(self):
x = paddle.to_tensor(
self.input_numpy_x, stop_gradient=False, dtype='complex64'
)
y = paddle.to_tensor(
self.input_numpy_y, stop_gradient=False, dtype='complex64'
)
out = paddle.matmul(x, y, False, False)
dout = paddle.to_tensor(
self.input_numpy_dout, stop_gradient=False, dtype='complex64'
)
(dx, dy) = paddle.grad(
[out], [x, y], [dout], retain_graph=True, create_graph=True
)
ddx = paddle.to_tensor(
self.input_numpy_ddx, stop_gradient=False, dtype='complex64'
)
ddy = paddle.to_tensor(
self.input_numpy_ddy, stop_gradient=False, dtype='complex64'
)
(dy_double_grad,) = paddle.grad(
[dx, dy],
[y],
[ddx, ddy],
retain_graph=True,
create_graph=True,
)
d_x, d_y, d_dout, d_ddx, d_ddy = paddle.grad(
[dy_double_grad],
[x, y, dout, ddx, ddy],
retain_graph=False,
create_graph=False,
)
return d_x, d_y, d_dout, d_ddx, d_ddy
# case1: no d_ddout, no d_dx, dims = 1
def test_matmul_triple_grad_case1(self):
def init_data():
self.input_numpy_x = np.random.random([3]).astype(
'float32'
) + 1j * np.random.random([3]).astype('float32')
self.input_numpy_y = np.random.random([3]).astype(
'float32'
) + 1j * np.random.random([3]).astype('float32')
self.input_numpy_dout = np.ones([1], dtype="float32")
self.input_numpy_ddx = np.ones([3], dtype="float32")
self.input_numpy_ddy = np.ones([3], dtype="float32")
self.input_numpy_ddx_conj = np.conjugate(self.input_numpy_ddx)
self.input_numpy_dout_conj = np.conjugate(self.input_numpy_dout)
init_data()
d_x_expected = np.zeros([3], dtype="float32")
d_y_expected = np.zeros([3], dtype="float32")
d_dout_expected = np.matmul(
self.input_numpy_ddx_conj, np.ones([3], dtype="float32")
)
d_ddx_expected = (
np.ones([3], dtype="float32") * self.input_numpy_dout_conj[0]
)
d_ddy_expected = np.zeros([3], dtype="float32")
expected_results = (
d_x_expected,
d_y_expected,
d_dout_expected,
d_ddx_expected,
d_ddy_expected,
)
for place in self.places:
paddle.device.set_device(place)
actual_results = self.actual()
for expected_result, actual_result in zip(
expected_results, actual_results
):
np.testing.assert_allclose(
expected_result, actual_result, rtol=1e-6
)
# d_ddout is none, d_dy is none, dtype is float32
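# Mirror of the d_dx-is-none class: only dx_double_grad =
# matmul(dout, ddy^T) is requested, so d_ddx comes back as zeros while
# d_dout and d_ddy collect the single remaining branch.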
class TestDygraphTripleGradMatmulcase5(TestCase):
def setUp(self):
self.input_numpy_x = None
self.input_numpy_y = None
self.input_numpy_dout = None
self.input_numpy_ddx = None
self.input_numpy_ddy = None
self.places = ["cpu"]
if paddle.is_compiled_with_cuda():
self.places.append("gpu")
def actual(self):
x = paddle.to_tensor(
self.input_numpy_x, stop_gradient=False, dtype='float32'
)
y = paddle.to_tensor(
self.input_numpy_y, stop_gradient=False, dtype='float32'
)
out = paddle.matmul(x, y, False, False)
dout = paddle.to_tensor(
self.input_numpy_dout, stop_gradient=False, dtype='float32'
)
(dx, dy) = paddle.grad(
[out], [x, y], [dout], retain_graph=True, create_graph=True
)
ddx = paddle.to_tensor(
self.input_numpy_ddx, stop_gradient=False, dtype='float32'
)
ddy = paddle.to_tensor(
self.input_numpy_ddy, stop_gradient=False, dtype='float32'
)
(dx_double_grad,) = paddle.grad(
[dx, dy],
[x],
[ddx, ddy],
retain_graph=True,
create_graph=True,
)
d_x, d_y, d_dout, d_ddx, d_ddy = paddle.grad(
[dx_double_grad],
[x, y, dout, ddx, ddy],
retain_graph=False,
create_graph=False,
)
return d_x, d_y, d_dout, d_ddx, d_ddy
# case1: d_ddout is none, d_dy is none, dims != 1
def test_matmul_triple_grad_case1(self):
def init_data():
self.input_numpy_x = np.random.random([3, 3]).astype('float32')
self.input_numpy_y = np.random.random([3, 3]).astype('float32')
self.input_numpy_dout = np.ones([3, 3], dtype="float32")
self.input_numpy_ddx = np.ones([3, 3], dtype="float32")
self.input_numpy_ddy = np.ones([3, 3], dtype="float32")
init_data()
d_x_expected = np.zeros([3, 3], dtype="float32")
d_y_expected = np.zeros([3, 3], dtype="float32")
d_dout_expected = np.ones([3, 3], dtype="float32") * 3
d_ddx_expected = np.zeros([3, 3], dtype="float32")
d_ddy_expected = np.ones([3, 3], dtype="float32") * 3
expected_results = (
d_x_expected,
d_y_expected,
d_dout_expected,
d_ddx_expected,
d_ddy_expected,
)
for place in self.places:
paddle.device.set_device(place)
actual_results = self.actual()
for expected_result, actual_result in zip(
expected_results, actual_results
):
np.testing.assert_allclose(
expected_result, actual_result, rtol=1e-6
)
# case2: d_ddout is none, d_dy is none, dims = 1
def test_matmul_triple_grad_case2(self):
def init_data():
self.input_numpy_x = np.random.random([3]).astype('float32')
self.input_numpy_y = np.random.random([3]).astype('float32')
self.input_numpy_dout = np.ones([1], dtype="float32")
self.input_numpy_ddx = np.ones([3], dtype="float32")
self.input_numpy_ddy = np.ones([3], dtype="float32")
init_data()
d_x_expected = np.zeros([3], dtype="float32")
d_y_expected = np.zeros([3], dtype="float32")
d_dout_expected = np.ones([1], dtype="float32") * 3
d_ddx_expected = np.zeros([3], dtype="float32")
d_ddy_expected = np.ones([3], dtype="float32")
expected_results = (
d_x_expected,
d_y_expected,
d_dout_expected,
d_ddx_expected,
d_ddy_expected,
)
for place in self.places:
paddle.device.set_device(place)
actual_results = self.actual()
for expected_result, actual_result in zip(
expected_results, actual_results
):
np.testing.assert_allclose(
expected_result, actual_result, rtol=1e-6
)
# case3: d_ddout is none, d_dy is none, with broadcast
def test_matmul_triple_grad_case3(self):
def init_data():
self.input_numpy_x = np.random.random([3, 1]).astype('float32')
self.input_numpy_y = np.random.random([1]).astype('float32')
self.input_numpy_dout = np.ones([3], dtype="float32")
self.input_numpy_ddx = np.ones([3, 1], dtype="float32")
self.input_numpy_ddy = np.ones([1], dtype="float32")
init_data()
d_x_expected = np.zeros([3, 1], dtype="float32")
d_y_expected = np.zeros([1], dtype="float32")
d_dout_expected = np.ones([3], dtype="float32")
d_ddx_expected = np.zeros([3, 1], dtype="float32")
d_ddy_expected = np.ones([1], dtype="float32") * 3
expected_results = (
d_x_expected,
d_y_expected,
d_dout_expected,
d_ddx_expected,
d_ddy_expected,
)
for place in self.places:
paddle.device.set_device(place)
actual_results = self.actual()
for expected_result, actual_result in zip(
expected_results, actual_results
):
np.testing.assert_allclose(
expected_result, actual_result, rtol=1e-6
)
# d_ddout is none, d_dy is none, dtype is complex64
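# Complex variant of the class above: d_dout = dot(conj(ddy), ones) and
# d_ddy = conj(dout) * ones, with d_ddx zero because dx_double_grad
# does not involve ddx.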
class TestDygraphTripleGradMatmulcase6(TestCase):
def setUp(self):
self.input_numpy_x = None
self.input_numpy_y = None
self.input_numpy_dout = None
self.input_numpy_ddx = None
self.input_numpy_ddy = None
self.input_numpy_ddy_conj = None
self.input_numpy_dout_conj = None
self.places = ["cpu"]
if paddle.is_compiled_with_cuda():
self.places.append("gpu")
def actual(self):
x = paddle.to_tensor(
self.input_numpy_x, stop_gradient=False, dtype='complex64'
)
y = paddle.to_tensor(
self.input_numpy_y, stop_gradient=False, dtype='complex64'
)
out = paddle.matmul(x, y, False, False)
dout = paddle.to_tensor(
self.input_numpy_dout, stop_gradient=False, dtype='complex64'
)
(dx, dy) = paddle.grad(
[out], [x, y], [dout], retain_graph=True, create_graph=True
)
ddx = paddle.to_tensor(
self.input_numpy_ddx, stop_gradient=False, dtype='complex64'
)
ddy = paddle.to_tensor(
self.input_numpy_ddy, stop_gradient=False, dtype='complex64'
)
(dx_double_grad,) = paddle.grad(
[dx, dy],
[x],
[ddx, ddy],
retain_graph=True,
create_graph=True,
)
d_x, d_y, d_dout, d_ddx, d_ddy = paddle.grad(
[dx_double_grad],
[x, y, dout, ddx, ddy],
retain_graph=False,
create_graph=False,
)
return d_x, d_y, d_dout, d_ddx, d_ddy
# case1: no d_ddout, no d_dy, dims = 1
def test_matmul_triple_grad_case1(self):
def init_data():
self.input_numpy_x = np.random.random([3]).astype(
'float32'
) + 1j * np.random.random([3]).astype('float32')
self.input_numpy_y = np.random.random([3]).astype(
'float32'
) + 1j * np.random.random([3]).astype('float32')
self.input_numpy_dout = np.ones([1], dtype="float32")
self.input_numpy_ddx = np.ones([3], dtype="float32")
self.input_numpy_ddy = np.ones([3], dtype="float32")
self.input_numpy_ddy_conj = np.conjugate(self.input_numpy_ddy)
self.input_numpy_dout_conj = np.conjugate(self.input_numpy_dout)
init_data()
d_x_expected = np.zeros([3], dtype="float32")
d_y_expected = np.zeros([3], dtype="float32")
d_dout_expected = np.matmul(
self.input_numpy_ddy_conj, np.ones([3], dtype="float32")
)
d_ddx_expected = np.zeros([3], dtype="float32")
d_ddy_expected = (
np.ones([3], dtype="float32") * self.input_numpy_dout_conj[0]
)
expected_results = (
d_x_expected,
d_y_expected,
d_dout_expected,
d_ddx_expected,
d_ddy_expected,
)
for place in self.places:
paddle.device.set_device(place)
actual_results = self.actual()
for expected_result, actual_result in zip(
expected_results, actual_results
):
np.testing.assert_allclose(
expected_result, actual_result, rtol=1e-6
)
if __name__ == '__main__':
unittest.main()
......@@ -179,7 +179,9 @@ disable_win_inference_test="^trt_quant_int8_yolov3_r50_test$|\
^test_parallel_executor_seresnext_with_reduce_gpu$|\
^test_api_impl$|\
^test_tensordot$|\
^disable_win_inference_test$"
^disable_win_inference_test$|\
^test_imperative_double_grad$|\
^test_imperative_triple_grad$"
# /*==========Fixed Disabled Windows CPU OPENBLAS((PR-CI-Windows-OPENBLAS)) unittests==============================*/
......