未验证 提交 c52a664e 编写于 作者: Y YuanRisheng 提交者: GitHub

[Phi]Move elementwise_div grad/double grad Kernel to Phi (#40172)

* move elementwise_div grad

* change mutable_data to alloc

* fix compile bugs
上级 0fb6bca4
......@@ -54,7 +54,7 @@ USE_OP(slice_grad);
USE_OP(lookup_table_grad);
USE_OP(sqrt);
USE_OP(elementwise_max);
USE_OP(elementwise_div);
USE_OP_ITSELF(elementwise_div);
USE_OP(sgd);
USE_OP(squared_l2_norm);
USE_OP(memcpy_h2d);
......
......@@ -102,42 +102,6 @@ REGISTER_OPERATOR(
REGISTER_OPERATOR(elementwise_div_grad_grad, ops::ElementwiseDivOpDoubleGrad,
ops::ElementwiseDoubleGradOpInplaceInferer);
// CPU kernel registration for the forward op `elementwise_div`.
// Element types: float, double, int, int64_t, complex<float>, complex<double>.
REGISTER_OP_CPU_KERNEL(
elementwise_div,
ops::ElementwiseDivKernel<paddle::platform::CPUDeviceContext, float>,
ops::ElementwiseDivKernel<paddle::platform::CPUDeviceContext, double>,
ops::ElementwiseDivKernel<paddle::platform::CPUDeviceContext, int>,
ops::ElementwiseDivKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::ElementwiseDivKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::ElementwiseDivKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>);
// CPU kernel registration for the first-order gradient op
// `elementwise_div_grad`; same element types as the forward op.
REGISTER_OP_CPU_KERNEL(
elementwise_div_grad,
ops::ElementwiseDivGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::ElementwiseDivGradKernel<paddle::platform::CPUDeviceContext, double>,
ops::ElementwiseDivGradKernel<paddle::platform::CPUDeviceContext, int>,
ops::ElementwiseDivGradKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::ElementwiseDivGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::ElementwiseDivGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>);
// CPU kernel registration for the second-order gradient op
// `elementwise_div_grad_grad`; same element types as the forward op.
REGISTER_OP_CPU_KERNEL(
elementwise_div_grad_grad,
ops::ElementwiseDivDoubleGradKernel<paddle::platform::CPUDeviceContext,
float>,
ops::ElementwiseDivDoubleGradKernel<paddle::platform::CPUDeviceContext,
double>,
ops::ElementwiseDivDoubleGradKernel<paddle::platform::CPUDeviceContext,
int>,
ops::ElementwiseDivDoubleGradKernel<paddle::platform::CPUDeviceContext,
int64_t>,
ops::ElementwiseDivDoubleGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::ElementwiseDivDoubleGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>);
REGISTER_OP_VERSION(elementwise_div)
.AddCheckpoint(
R"ROC(Register elementwise_div for adding the attribute of Scale_y)ROC",
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_div_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
namespace paddle {
namespace operators {
// CUDA overload of the elementwise_div backward computation.
//
// Reads the "axis" attribute from `ctx` and fills whichever of `dx` / `dy`
// is non-null. Nothing is computed when both are null. `x` is accepted for
// signature parity but is not read here.
template <typename DeviceContext, typename T>
typename std::enable_if<
    std::is_same<DeviceContext, platform::CUDADeviceContext>::value>::type
ElementwiseDivGrad(const framework::ExecutionContext& ctx,
                   const framework::Tensor* x, const framework::Tensor* y,
                   const framework::Tensor* out, const framework::Tensor* dout,
                   framework::Tensor* dx, framework::Tensor* dy) {
  int axis = ctx.Attr<int>("axis");
  const auto& dev_ctx = ctx.template device_context<DeviceContext>();
  const auto place = ctx.GetPlace();

  const bool want_dx = (dx != nullptr);
  const bool want_dy = (dy != nullptr);

  if (want_dx && want_dy) {
    // Both gradients: one fused ternary kernel over (dout, out, y).
    std::vector<const framework::Tensor*> operands = {dout, out, y};
    GetGradXAndYOut<ElementwiseType::kTernary, T>(
        dev_ctx, place, axis, operands, dout, dx, dy,
        DivGradXYFunctor<T, T>());
    return;
  }

  if (want_dx) {
    // Only dx: binary kernel over (dout, y).
    std::vector<const framework::Tensor*> operands = {dout, y};
    GetGradXOrYOut<ElementwiseType::kBinary, T>(
        dev_ctx, place, axis, operands, dout, dx, DivGradXFunctor<T>());
    return;
  }

  if (want_dy) {
    // Only dy: ternary kernel over (dout, out, y).
    std::vector<const framework::Tensor*> operands = {dout, out, y};
    GetGradXOrYOut<ElementwiseType::kTernary, T>(
        dev_ctx, place, axis, operands, dout, dy, DivGradYFunctor<T>());
  }
}
} // namespace operators
} // namespace paddle
// CUDA kernel registration for the forward op `elementwise_div`.
// `plat` is the file-level alias for paddle::platform.
REGISTER_OP_CUDA_KERNEL(
    elementwise_div,
    ops::ElementwiseDivKernel<plat::CUDADeviceContext, float>,
    ops::ElementwiseDivKernel<plat::CUDADeviceContext, plat::float16>,
    ops::ElementwiseDivKernel<plat::CUDADeviceContext, plat::bfloat16>,
    ops::ElementwiseDivKernel<plat::CUDADeviceContext, double>,
    ops::ElementwiseDivKernel<plat::CUDADeviceContext, int>,
    ops::ElementwiseDivKernel<plat::CUDADeviceContext, int64_t>,
    ops::ElementwiseDivKernel<plat::CUDADeviceContext, plat::complex<float>>,
    ops::ElementwiseDivKernel<plat::CUDADeviceContext, plat::complex<double>>);
// CUDA kernel registration for the first-order gradient op
// `elementwise_div_grad`. `plat` is the file-level alias for paddle::platform.
REGISTER_OP_CUDA_KERNEL(
    elementwise_div_grad,
    ops::ElementwiseDivGradKernel<plat::CUDADeviceContext, float>,
    ops::ElementwiseDivGradKernel<plat::CUDADeviceContext, plat::float16>,
    ops::ElementwiseDivGradKernel<plat::CUDADeviceContext, plat::bfloat16>,
    ops::ElementwiseDivGradKernel<plat::CUDADeviceContext, double>,
    ops::ElementwiseDivGradKernel<plat::CUDADeviceContext, int>,
    ops::ElementwiseDivGradKernel<plat::CUDADeviceContext, int64_t>,
    ops::ElementwiseDivGradKernel<plat::CUDADeviceContext,
                                  plat::complex<float>>,
    ops::ElementwiseDivGradKernel<plat::CUDADeviceContext,
                                  plat::complex<double>>);
// CUDA kernel registration for the second-order gradient op
// `elementwise_div_grad_grad`. `plat` is the file-level alias for
// paddle::platform.
REGISTER_OP_CUDA_KERNEL(
    elementwise_div_grad_grad,
    ops::ElementwiseDivDoubleGradKernel<plat::CUDADeviceContext, float>,
    ops::ElementwiseDivDoubleGradKernel<plat::CUDADeviceContext,
                                        plat::float16>,
    ops::ElementwiseDivDoubleGradKernel<plat::CUDADeviceContext,
                                        plat::bfloat16>,
    ops::ElementwiseDivDoubleGradKernel<plat::CUDADeviceContext, double>,
    ops::ElementwiseDivDoubleGradKernel<plat::CUDADeviceContext, int>,
    ops::ElementwiseDivDoubleGradKernel<plat::CUDADeviceContext, int64_t>,
    ops::ElementwiseDivDoubleGradKernel<plat::CUDADeviceContext,
                                        plat::complex<float>>,
    ops::ElementwiseDivDoubleGradKernel<plat::CUDADeviceContext,
                                        plat::complex<double>>);
......@@ -20,142 +20,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
// Elementwise subtraction of `x` and `y` into `z`, using the "axis"
// attribute from `ctx`. When `x` has lower rank than `y` the inverse functor
// is used; otherwise the regular one.
template <typename DeviceContext, typename T>
void default_elementwise_sub(const framework::ExecutionContext& ctx,
                             const framework::Tensor* x,
                             const framework::Tensor* y, framework::Tensor* z) {
  int axis = ctx.Attr<int>("axis");
  const bool x_has_lower_rank = x->dims().size() < y->dims().size();
  if (x_has_lower_rank) {
    ElementwiseComputeEx<InverseSubFunctor<T>, DeviceContext, T>(
        ctx, x, y, axis, InverseSubFunctor<T>(), z);
    return;
  }
  ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
                                                        SubFunctor<T>(), z);
}
// Elementwise division of `x` by `y` into `z`, using the "axis" attribute
// from `ctx`. When `x` has lower rank than `y` the inverse functor is used;
// otherwise the regular one.
template <typename DeviceContext, typename T>
void default_elementwise_div(const framework::ExecutionContext& ctx,
                             const framework::Tensor* x,
                             const framework::Tensor* y, framework::Tensor* z) {
  int axis = ctx.Attr<int>("axis");
  const bool x_has_lower_rank = x->dims().size() < y->dims().size();
  if (x_has_lower_rank) {
    ElementwiseComputeEx<InverseDivFunctor<T>, DeviceContext, T>(
        ctx, x, y, axis, InverseDivFunctor<T>(), z);
    return;
  }
  ElementwiseComputeEx<DivFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
                                                        DivFunctor<T>(), z);
}
// Fluid op kernel for elementwise_div. It allocates the output, wraps the
// fluid tensors as phi dense tensors and forwards to phi::DivideRawKernel.
template <typename DeviceContext, typename T>
class ElementwiseDivKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    using PhiContext =
        typename framework::ConvertToPhiContext<DeviceContext>::TYPE;

    auto* lhs = ctx.Input<framework::LoDTensor>("X");
    auto* rhs = ctx.Input<framework::LoDTensor>("Y");
    auto* result = ctx.Output<framework::LoDTensor>("Out");
    result->mutable_data<T>(ctx.GetPlace());

    auto& fluid_ctx = ctx.device_context<DeviceContext>();
    int axis = ctx.Attr<int>("axis");

    auto phi_lhs = paddle::experimental::MakePhiDenseTensor(*lhs);
    auto phi_rhs = paddle::experimental::MakePhiDenseTensor(*rhs);
    auto phi_result = paddle::experimental::MakePhiDenseTensor(*result);

    phi::DivideRawKernel<T>(static_cast<const PhiContext&>(fluid_ctx),
                            *phi_lhs.get(), *phi_rhs.get(), axis,
                            phi_result.get());
  }
};
// Gradient of out = x / y with respect to x: dx = dout / y.
// `x` and `out` are part of the functor signature but are not used.
template <typename T>
struct DivGradDX {
  HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
    return dout / y;
  }
};

// Complex specialization: divides dout by the complex conjugate of y.
template <typename T>
struct DivGradDX<paddle::platform::complex<T>> {
  using ComplexT = paddle::platform::complex<T>;

  HOSTDEVICE ComplexT operator()(ComplexT x, ComplexT y, ComplexT out,
                                 ComplexT dout) const {
    ComplexT conj_y(y.real, -y.imag);
    return dout / conj_y;
  }
};
// Gradient of out = x / y with respect to y: dy = -dout * out / y.
// `x` is part of the functor signature but is not used.
template <typename T>
struct DivGradDY {
  HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
    return -dout * out / y;
  }
};

// Complex specialization: dy = -dout * conj(out / y).
template <typename T>
struct DivGradDY<paddle::platform::complex<T>> {
  using ComplexT = paddle::platform::complex<T>;

  HOSTDEVICE ComplexT operator()(ComplexT x, ComplexT y, ComplexT out,
                                 ComplexT dout) const {
    // The quotient is needed for both components; evaluate it once.
    ComplexT quotient = out / y;
    ComplexT conj_quotient(quotient.real, -quotient.imag);
    return -dout * conj_quotient;
  }
};
// Functor returning y * out * dout - x * dout for one element.
// The double-grad kernel in this file passes (ddX, ddY, Out, dX / Y) as
// (x, y, out, dout), which yields dY = Out * dX * ddY / Y - dX * ddX / Y.
template <typename T>
struct DivDoubleDY {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
return y * out * dout - x * dout;
}
};
// CPU overload of the elementwise_div backward computation. Reads "axis"
// from `ctx` and delegates to ElemwiseGradCompute with the DivGradDX /
// DivGradDY functors.
template <typename DeviceContext, typename T>
typename std::enable_if<
    std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
ElementwiseDivGrad(const framework::ExecutionContext& ctx,
                   const framework::Tensor* x, const framework::Tensor* y,
                   const framework::Tensor* out, const framework::Tensor* dout,
                   framework::Tensor* dx, framework::Tensor* dy) {
  using DxOp = DivGradDX<T>;
  using DyOp = DivGradDY<T>;
  int axis = ctx.Attr<int>("axis");
  ElemwiseGradCompute<DeviceContext, T, DxOp, DyOp>(
      ctx, *x, *y, *out, *dout, axis, dx, dy, DxOp(), DyOp());
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// Declaration only: the overload enabled when DeviceContext is
// platform::CUDADeviceContext. Compiled only for CUDA/HIP builds; the body is
// not provided in this header.
template <typename DeviceContext, typename T>
typename std::enable_if<
std::is_same<DeviceContext, platform::CUDADeviceContext>::value>::type
ElementwiseDivGrad(const framework::ExecutionContext& ctx,
const framework::Tensor* x, const framework::Tensor* y,
const framework::Tensor* out, const framework::Tensor* dout,
framework::Tensor* dx, framework::Tensor* dy);
#endif
// Fluid op kernel for elementwise_div_grad. Runs the shared
// ElemwiseGradKernel step first, then dispatches to the device-specific
// ElementwiseDivGrad overload.
template <typename DeviceContext, typename T>
class ElementwiseDivGradKernel : public ElemwiseGradKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    ElemwiseGradKernel<T>::Compute(ctx);

    using Tensor = framework::Tensor;
    // Forward-pass tensors.
    auto* in_x = ctx.Input<Tensor>("X");
    auto* in_y = ctx.Input<Tensor>("Y");
    auto* fwd_out = ctx.Input<Tensor>("Out");
    // Incoming and outgoing gradients.
    auto* grad_out = ctx.Input<Tensor>(framework::GradVarName("Out"));
    auto* grad_x = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto* grad_y = ctx.Output<Tensor>(framework::GradVarName("Y"));

    ElementwiseDivGrad<DeviceContext, T>(ctx, in_x, in_y, fwd_out, grad_out,
                                         grad_x, grad_y);
  }
};
class ElementwiseDivOpDoubleGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
......@@ -206,80 +70,5 @@ class ElementwiseDivOpDoubleGrad : public framework::OperatorWithKernel {
}
};
// Fluid op kernel for the second-order gradient of elementwise_div.
//
// Inputs read from the context: "Y", "Out", "DDX", "DDY", "DX".
// Outputs (each one is skipped when its pointer is null): the gradient
// variable of "Y" (dY), "DOut" and "DDOut".
template <typename DeviceContext, typename T>
class ElementwiseDivDoubleGradKernel : public framework::OpKernel<T> {
using Tensor = framework::Tensor;
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* Y = ctx.Input<Tensor>("Y");
auto* Out = ctx.Input<Tensor>("Out");
auto* ddX = ctx.Input<Tensor>("DDX");
auto* ddY = ctx.Input<Tensor>("DDY");
auto* dX = ctx.Input<Tensor>("DX");
auto* dY = ctx.Output<Tensor>(framework::GradVarName("Y"));
auto* dOut = ctx.Output<Tensor>("DOut");
auto* ddOut = ctx.Output<Tensor>("DDOut");
int axis = ctx.Attr<int>("axis");
// Allocate every requested output: dY takes Y's dims, dOut and ddOut take
// Out's dims.
if (dY) dY->mutable_data<T>(Y->dims(), ctx.GetPlace());
if (dOut) dOut->mutable_data<T>(Out->dims(), ctx.GetPlace());
if (ddOut) ddOut->mutable_data<T>(Out->dims(), ctx.GetPlace());
// ddX_safe = (ddX == null) ? 0 : ddX
// ddY_safe = (ddY == null) ? 0 : ddY
// NOTE(review): the zero-fill semantics come from GetDoubleGradSafeTensor,
// whose body is not visible here -- confirm.
Tensor ddX_safe, ddY_safe;
GetDoubleGradSafeTensor<DeviceContext, T>(ctx, dX, ddX, &ddX_safe);
GetDoubleGradSafeTensor<DeviceContext, T>(ctx, Y, ddY, &ddY_safe);
// ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y
// dY = Out * dX * ddY / Y - dX * ddX / Y
// dOut = - dX * ddY
// To save memory, (1) dOut can be used as the 'tmp' tensor, (2) ddOut can
// be computed in place of ddX.
// dOut itself is written last (see the final branch), after every use of
// tmp, so reusing it as scratch does not lose results.
Tensor tmp;
if (dOut) {
tmp = *dOut;
} else {
auto& dev_ctx = ctx.template device_context<DeviceContext>();
tmp = ctx.AllocateTmpTensor<T, DeviceContext>(Out->dims(), dev_ctx);
}
if (dY) {
// dX_div_Y = dX / Y;
Tensor dX_div_Y = tmp;
default_elementwise_div<DeviceContext, T>(ctx, dX, Y, &dX_div_Y);
// NOTE(dengkaipeng): in the following ElemwiseGradCompute, the first
// output tensor is nullptr, so the branch that computes the first
// output is not activated and DivGradDX is never called; passing it
// has little effect on running speed.
// dY = Out * dX * ddY / Y - dX * ddX / Y
ElemwiseGradCompute<DeviceContext, T, DivGradDX<T>, DivDoubleDY<T>>(
ctx, ddX_safe, ddY_safe, *Out, dX_div_Y, axis, nullptr, dY,
DivGradDX<T>(), DivDoubleDY<T>());
}
if (ddOut) {
// ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y
// tmp is overwritten in two steps: Out * ddY, then ddX - tmp.
default_elementwise_mul<DeviceContext, T>(ctx, Out, &ddY_safe, &tmp);
default_elementwise_sub<DeviceContext, T>(ctx, &ddX_safe, &tmp, &tmp);
default_elementwise_div<DeviceContext, T>(ctx, &tmp, Y, ddOut);
}
if (dOut) {
// dOut = - dX * ddY
// Computed as dX * ddY followed by an in-place multiplication by -1.
default_elementwise_mul<DeviceContext, T>(ctx, dX, &ddY_safe, dOut);
auto& place =
*ctx.template device_context<DeviceContext>().eigen_device();
auto dout = framework::EigenVector<T>::Flatten(*dOut);
dout.device(place) = static_cast<T>(-1) * dout;
}
}
};
} // namespace operators
} // namespace paddle
......@@ -90,67 +90,6 @@ struct MinFunctor {
// Shorthand for the platform complex type, used by the functor
// specializations below.
template <typename T>
using Complex = paddle::platform::complex<T>;
// Fused gradient functor for out = x / y producing both gradients at once.
// Arguments follow the formulas below: a = dout, b = out, c = y.
//   result[0] = dx = dout / y
//   result[1] = dy = -dout * out / y
template <typename InT, typename OutT>
struct DivGradXYFunctor {
  inline HOSTDEVICE phi::Array<OutT, 2> operator()(const InT a, const InT b,
                                                   const InT c) {
    phi::Array<OutT, 2> grads;
    grads[0] = a / c;
    grads[1] = -a * b / c;
    return grads;
  }
};

// Complex specialization: dx = a / conj(c), dy = -a * conj(b / c).
template <typename InT, typename OutT>
struct DivGradXYFunctor<Complex<InT>, Complex<OutT>> {
  inline HOSTDEVICE phi::Array<Complex<OutT>, 2> operator()(
      const Complex<InT> a, const Complex<InT> b, const Complex<InT> c) {
    // b / c feeds both components of the conjugate; evaluate it once.
    const Complex<InT> quotient = b / c;
    Complex<InT> conj_c(c.real, -c.imag);
    Complex<InT> conj_quotient(quotient.real, -quotient.imag);

    phi::Array<Complex<OutT>, 2> grads;
    grads[0] = a / conj_c;
    grads[1] = -a * conj_quotient;
    return grads;
  }
};
// dx functor for out = x / y on non-complex types: returns a / b.
template <typename T>
struct DivGradXFunctor {
  inline HOSTDEVICE T operator()(const T a, const T b) const {
    return a / b;
  }
};

// dx functor for complex types: returns a / conj(b).
template <typename T>
struct DivGradXFunctor<Complex<T>> {
  inline HOSTDEVICE Complex<T> operator()(const Complex<T> a,
                                          const Complex<T> b) const {
    Complex<T> conj_b(b.real, -b.imag);
    return a / conj_b;
  }
};
// dy functor for out = x / y on non-complex types: returns -a * b / c.
template <typename T>
struct DivGradYFunctor {
  inline HOSTDEVICE T operator()(const T a, const T b, const T c) const {
    return -a * b / c;
  }
};

// dy functor for complex types: returns -a * conj(b / c).
template <typename T>
struct DivGradYFunctor<Complex<T>> {
  inline HOSTDEVICE Complex<T> operator()(const Complex<T> a,
                                          const Complex<T> b,
                                          const Complex<T> c) const {
    // b / c feeds both components of the conjugate; evaluate it once.
    const Complex<T> quotient = b / c;
    Complex<T> conj_quotient(quotient.real, -quotient.imag);
    return -a * conj_quotient;
  }
};
// Fmax
template <typename T>
struct FMaxFunctor {
......
......@@ -45,6 +45,7 @@ limitations under the License. */
#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/kernels/gpu/elementwise_grad.h"
#endif
......@@ -145,17 +146,9 @@ void ElemwiseGradCompute(const framework::ExecutionContext &ctx,
const framework::Tensor &dout, int axis,
framework::Tensor *dx, framework::Tensor *dy,
DX_OP dx_op, DY_OP dy_op) {
const framework::DDim &x_dim = x.dims();
const framework::DDim &y_dim = y.dims();
const auto &dev_ctx = ctx.template device_context<DeviceContext>();
if (x.dims() == y.dims()) {
phi::funcs::ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP,
Tout>(
dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
} else {
phi::funcs::ElemwiseGradComputeWithBroadcast<T, DX_OP, DY_OP, Tout>(
dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
}
phi::funcs::ElemwiseGradCompute<DeviceContext, T, DX_OP, DY_OP, Tout>(
dev_ctx, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
}
// It is a common implementation to compute binary calculation with the support
......@@ -1174,14 +1167,6 @@ static inline std::vector<int> GetReduceDim(const framework::DDim &in,
}
#if defined(__NVCC__) || defined(__HIPCC__)
// Sum-reduces `src` into `dst`. The axes to reduce are obtained from
// GetReduceDim using both tensors' dims and `axis`; the reduction runs on
// the stream of `dev_ctx`.
template <typename T>
void ReduceWrapper(const platform::CUDADeviceContext &dev_ctx, int axis,
                   framework::Tensor *src, framework::Tensor *dst) {
  using Identity = kps::IdentityFunctor<T>;
  std::vector<int> axes_to_reduce =
      GetReduceDim(dst->dims(), src->dims(), axis);
  TensorReduceImpl<T, T, kps::AddFunctor, Identity>(
      dev_ctx, *src, dst, Identity(), axes_to_reduce, dev_ctx.stream());
}
template <ElementwiseType ET, typename T, typename Functor>
void GetGradXAndYOut(const platform::CUDADeviceContext &dev_ctx,
......@@ -1189,36 +1174,8 @@ void GetGradXAndYOut(const platform::CUDADeviceContext &dev_ctx,
std::vector<const framework::Tensor *> ins,
const framework::Tensor *dout, framework::Tensor *dx,
framework::Tensor *dy, Functor func) {
framework::Tensor tmp_dx;
framework::Tensor tmp_dy;
dx->mutable_data<T>(place);
dy->mutable_data<T>(place);
std::vector<framework::Tensor *> outs;
if (dx->dims() == dout->dims() && dy->dims() == dout->dims()) {
outs = {dx, dy};
} else if (dx->dims() != dout->dims() && dy->dims() == dout->dims()) {
tmp_dx.mutable_data<T>(dout->dims(), place);
outs = {&tmp_dx, dy};
} else if (dx->dims() == dout->dims() && dy->dims() != dout->dims()) {
tmp_dy.mutable_data<T>(dout->dims(), place);
outs = {dx, &tmp_dy};
} else if (dx->dims() != dout->dims() && dy->dims() != dout->dims()) {
tmp_dy.mutable_data<T>(dout->dims(), place);
tmp_dx.mutable_data<T>(dout->dims(), place);
outs = {&tmp_dx, &tmp_dy};
}
paddle::operators::LaunchElementwiseCudaKernel<ET, T, T, decltype(func), 2>(
dev_ctx, ins, &outs, axis, func);
if (dx->dims() != dout->dims() && dy->dims() == dout->dims()) {
ReduceWrapper<T>(dev_ctx, axis, &tmp_dx, dx);
} else if (dx->dims() == dout->dims() && dy->dims() != dout->dims()) {
ReduceWrapper<T>(dev_ctx, axis, &tmp_dy, dy);
} else if (dx->dims() != dout->dims() && dy->dims() != dout->dims()) {
ReduceWrapper<T>(dev_ctx, axis, &tmp_dx, dx);
ReduceWrapper<T>(dev_ctx, axis, &tmp_dy, dy);
}
phi::GetGradXAndYOut<ET, T, Functor>(dev_ctx, place, axis, ins, *dout, dx, dy,
func);
}
template <ElementwiseType ET, typename T, typename Functor>
......@@ -1227,22 +1184,8 @@ void GetGradXOrYOut(const platform::CUDADeviceContext &dev_ctx,
std::vector<const framework::Tensor *> ins,
const framework::Tensor *dout, framework::Tensor *dxy,
Functor func) {
framework::Tensor tmp_dxy;
dxy->mutable_data<T>(place);
std::vector<framework::Tensor *> outs;
if (dxy->dims() != dout->dims()) {
tmp_dxy.mutable_data<T>(dout->dims(), place);
outs = {&tmp_dxy};
} else {
outs = {dxy};
}
paddle::operators::LaunchElementwiseCudaKernel<ET, T, T>(dev_ctx, ins, &outs,
axis, func);
if (dxy->dims() != dout->dims()) {
ReduceWrapper<T>(dev_ctx, axis, &tmp_dxy, dxy);
}
phi::GetGradXOrYOut<ET, T, Functor>(dev_ctx, place, axis, ins, *dout, dxy,
func);
}
#endif
......
......@@ -28,7 +28,7 @@
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
USE_OP(elementwise_div);
USE_OP_ITSELF(elementwise_div);
namespace paddle {
namespace operators {
......
......@@ -18,7 +18,6 @@
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/cpu/elementwise_grad.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
#include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h"
......@@ -108,6 +107,20 @@ void SubtractDoubleGradKernel(const Context& dev_ctx,
phi::SubtractDoubleGradImpl<T>(dev_ctx, y, ddx, ddy, dout, axis, ddout);
}
template <typename T, typename Context>
void DivideGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& out,
const DenseTensor& dout,
int axis,
DenseTensor* dx,
DenseTensor* dy) {
funcs::ElementwiseGradPreProcess(dout, dx);
phi::funcs::ElemwiseGradCompute<Context, T, DivGradDX<T>, DivGradDY<T>>(
dev_ctx, x, y, out, dout, axis, dx, dy, DivGradDX<T>(), DivGradDY<T>());
}
} // namespace phi
PD_REGISTER_KERNEL(add_grad,
......@@ -171,3 +184,25 @@ PD_REGISTER_KERNEL(subtract_double_grad,
phi::dtype::bfloat16,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
// CPU registration of phi::DivideGradKernel.
// The complex element types are spelled with phi::dtype::complex, matching
// the other registrations in this file, so phi code does not depend on the
// fluid-layer paddle::platform alias.
PD_REGISTER_KERNEL(divide_grad,
                   CPU,
                   ALL_LAYOUT,
                   phi::DivideGradKernel,
                   float,
                   double,
                   int,
                   int64_t,
                   phi::dtype::complex<float>,
                   phi::dtype::complex<double>) {}
// CPU registration of phi::DivideDoubleGradKernel.
// The complex element types are spelled with phi::dtype::complex, matching
// the other registrations in this file, so phi code does not depend on the
// fluid-layer paddle::platform alias.
PD_REGISTER_KERNEL(divide_double_grad,
                   CPU,
                   ALL_LAYOUT,
                   phi::DivideDoubleGradKernel,
                   float,
                   double,
                   int,
                   int64_t,
                   phi::dtype::complex<float>,
                   phi::dtype::complex<double>) {}
......@@ -64,4 +64,25 @@ void SubtractDoubleGradKernel(const Context& dev_ctx,
int axis,
DenseTensor* ddout);
// First-order gradient kernel for divide (out = x / y).
//   x, y, out : forward inputs and forward output
//   dout      : gradient flowing in from `out`
//   axis      : broadcast axis
//   dx, dy    : output gradients; the implementations check each for null
template <typename T, typename Context>
void DivideGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& out,
const DenseTensor& dout,
int axis,
DenseTensor* dx,
DenseTensor* dy);
// Second-order gradient kernel for divide.
//   y, out       : forward input `y` and forward output
//   dx           : first-order gradient of x
//   ddx, ddy     : optional second-order input gradients
//   axis         : broadcast axis
//   dy, dout, ddout : outputs; the implementation checks each for null
template <typename T, typename Context>
void DivideDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& y,
const DenseTensor& out,
const DenseTensor& dx,
paddle::optional<const DenseTensor&> ddx,
paddle::optional<const DenseTensor&> ddy,
int axis,
DenseTensor* dy,
DenseTensor* dout,
DenseTensor* ddout);
} // namespace phi
......@@ -592,5 +592,25 @@ void ElementwiseCompute(const GPUContext &dev_ctx,
#endif
// Generic binary elementwise helper: allocates `z` as T on `dev_ctx`, then
// applies `Functor` when x's rank is at least y's, or `InverseFunctor` when
// x has the lower rank.
template <typename DeviceContext,
          typename T,
          typename Functor,
          typename InverseFunctor>
void DefaultElementwiseOperator(const DeviceContext &dev_ctx,
                                const DenseTensor &x,
                                const DenseTensor &y,
                                DenseTensor *z,
                                int axis = -1) {
  const bool x_has_lower_rank = x.dims().size() < y.dims().size();
  dev_ctx.template Alloc<T>(z);
  if (x_has_lower_rank) {
    funcs::ElementwiseCompute<InverseFunctor, T>(
        dev_ctx, x, y, axis, InverseFunctor(), z);
    return;
  }
  funcs::ElementwiseCompute<Functor, T>(dev_ctx, x, y, axis, Functor(), z);
}
} // namespace funcs
} // namespace phi
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include "paddle/phi/common/complex.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/hostdevice.h"
......@@ -92,5 +93,72 @@ struct InverseDivideFunctor {
inline HOSTDEVICE T operator()(const T a, const T b) const { return b / a; }
};
// Shorthand for phi's complex number type, used by the functor
// specializations below.
template <typename T>
using ComplexType = phi::dtype::complex<T>;
// Fused gradient functor for out = x / y producing both gradients at once.
// Arguments follow the formulas below: a = dout, b = out, c = y.
//   result[0] = dx = dout / y
//   result[1] = dy = -dout * out / y
template <typename InT, typename OutT>
struct DivGradXYFunctor {
  inline HOSTDEVICE phi::Array<OutT, 2> operator()(const InT a,
                                                   const InT b,
                                                   const InT c) {
    phi::Array<OutT, 2> grads;
    grads[0] = a / c;
    grads[1] = -a * b / c;
    return grads;
  }
};

// Complex specialization: dx = a / conj(c), dy = -a * conj(b / c).
template <typename InT, typename OutT>
struct DivGradXYFunctor<ComplexType<InT>, ComplexType<OutT>> {
  inline HOSTDEVICE phi::Array<ComplexType<OutT>, 2> operator()(
      const ComplexType<InT> a,
      const ComplexType<InT> b,
      const ComplexType<InT> c) {
    // b / c feeds both components of the conjugate; evaluate it once.
    const ComplexType<InT> quotient = b / c;
    ComplexType<InT> conj_c(c.real, -c.imag);
    ComplexType<InT> conj_quotient(quotient.real, -quotient.imag);

    phi::Array<ComplexType<OutT>, 2> grads;
    grads[0] = a / conj_c;
    grads[1] = -a * conj_quotient;
    return grads;
  }
};
// dx functor for out = x / y on non-complex types: returns a / b.
template <typename T>
struct DivGradXFunctor {
  inline HOSTDEVICE T operator()(const T a, const T b) const {
    return a / b;
  }
};

// dx functor for complex types: returns a / conj(b).
template <typename T>
struct DivGradXFunctor<ComplexType<T>> {
  inline HOSTDEVICE ComplexType<T> operator()(const ComplexType<T> a,
                                              const ComplexType<T> b) const {
    ComplexType<T> conj_b(b.real, -b.imag);
    return a / conj_b;
  }
};
// dy functor for out = x / y on non-complex types: returns -a * b / c.
template <typename T>
struct DivGradYFunctor {
  inline HOSTDEVICE T operator()(const T a, const T b, const T c) const {
    return -a * b / c;
  }
};

// dy functor for complex types: returns -a * conj(b / c).
template <typename T>
struct DivGradYFunctor<ComplexType<T>> {
  inline HOSTDEVICE ComplexType<T> operator()(const ComplexType<T> a,
                                              const ComplexType<T> b,
                                              const ComplexType<T> c) const {
    // b / c feeds both components of the conjugate; evaluate it once.
    const ComplexType<T> quotient = b / c;
    ComplexType<T> conj_quotient(quotient.real, -quotient.imag);
    return -a * conj_quotient;
  }
};
} // namespace funcs
} // namespace phi
......@@ -24,6 +24,7 @@ limitations under the License. */
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/phi/kernels/primitive/kernel_primitives.h"
#endif
......@@ -1758,5 +1759,31 @@ void ElemwiseGradComputeWithBroadcast(const GPUContext &ctx,
#endif
// Entry point for elementwise gradient computation. Picks the no-broadcast
// implementation when x and y have equal dims, and the broadcasting
// implementation otherwise; all arguments are forwarded unchanged.
template <typename DeviceContext,
          typename T,
          typename DX_OP,
          typename DY_OP,
          typename Tout = T>
void ElemwiseGradCompute(const DeviceContext &dev_ctx,
                         const DenseTensor &x,
                         const DenseTensor &y,
                         const DenseTensor &out,
                         const DenseTensor &dout,
                         int axis,
                         DenseTensor *dx,
                         DenseTensor *dy,
                         DX_OP dx_op,
                         DY_OP dy_op) {
  const DDim &x_dim = x.dims();
  const DDim &y_dim = y.dims();
  const bool same_shape = (x_dim == y_dim);

  if (!same_shape) {
    ElemwiseGradComputeWithBroadcast<T, DX_OP, DY_OP, Tout>(
        dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
    return;
  }
  ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP, Tout>(
      dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
}
} // namespace funcs
} // namespace phi
......@@ -14,12 +14,101 @@ limitations under the License. */
#pragma once
#include "paddle/phi/common/place.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/broadcast_function.h"
#include "paddle/phi/kernels/funcs/elementwise_grad_base.h"
#include "paddle/phi/kernels/funcs/reduce_function.h"
namespace phi {
template <typename T>
void ReduceWrapper(const GPUContext &dev_ctx,
int axis,
DenseTensor *src,
DenseTensor *dst) {
std::vector<int> reduce_dims =
funcs::GetReduceDim(dst->dims(), src->dims(), axis);
funcs::TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
dev_ctx,
*src,
dst,
kps::IdentityFunctor<T>(),
reduce_dims,
dev_ctx.stream());
}
template <ElementwiseType ET, typename T, typename Functor>
void GetGradXAndYOut(const GPUContext &dev_ctx,
const Place &place,
int axis,
std::vector<const DenseTensor *> ins,
const DenseTensor &dout,
DenseTensor *dx,
DenseTensor *dy,
Functor func) {
DenseTensor tmp_dx;
DenseTensor tmp_dy;
dev_ctx.Alloc<T>(dx);
dev_ctx.Alloc<T>(dy);
std::vector<DenseTensor *> outs;
if (dx->dims() == dout.dims() && dy->dims() == dout.dims()) {
outs = {dx, dy};
} else if (dx->dims() != dout.dims() && dy->dims() == dout.dims()) {
tmp_dx.Resize(dout.dims());
dev_ctx.Alloc<T>(&tmp_dx);
outs = {&tmp_dx, dy};
} else if (dx->dims() == dout.dims() && dy->dims() != dout.dims()) {
tmp_dy.Resize(dout.dims());
dev_ctx.Alloc<T>(&tmp_dy);
outs = {dx, &tmp_dy};
} else if (dx->dims() != dout.dims() && dy->dims() != dout.dims()) {
tmp_dy.Resize(dout.dims());
dev_ctx.Alloc<T>(&tmp_dy);
tmp_dx.Resize(dout.dims());
dev_ctx.Alloc<T>(&tmp_dx);
outs = {&tmp_dx, &tmp_dy};
}
funcs::BroadcastKernel<ET, T, T, decltype(func), 2>(
dev_ctx, ins, &outs, axis, func);
if (dx->dims() != dout.dims() && dy->dims() == dout.dims()) {
ReduceWrapper<T>(dev_ctx, axis, &tmp_dx, dx);
} else if (dx->dims() == dout.dims() && dy->dims() != dout.dims()) {
ReduceWrapper<T>(dev_ctx, axis, &tmp_dy, dy);
} else if (dx->dims() != dout.dims() && dy->dims() != dout.dims()) {
ReduceWrapper<T>(dev_ctx, axis, &tmp_dx, dx);
ReduceWrapper<T>(dev_ctx, axis, &tmp_dy, dy);
}
}
template <ElementwiseType ET, typename T, typename Functor>
void GetGradXOrYOut(const GPUContext &dev_ctx,
const Place &place,
int axis,
std::vector<const DenseTensor *> ins,
const DenseTensor &dout,
DenseTensor *dxy,
Functor func) {
DenseTensor tmp_dxy;
dev_ctx.Alloc<T>(dxy);
std::vector<DenseTensor *> outs;
if (dxy->dims() != dout.dims()) {
tmp_dxy.Resize(dout.dims());
dev_ctx.Alloc<T>(&tmp_dxy);
outs = {&tmp_dxy};
} else {
outs = {dxy};
}
funcs::BroadcastKernel<ET, T, T>(dev_ctx, ins, &outs, axis, func);
if (dxy->dims() != dout.dims()) {
ReduceWrapper<T>(dev_ctx, axis, &tmp_dxy, dxy);
}
}
/*
******************************
Add Grad
......@@ -243,4 +332,41 @@ void elementwise_sub_grad(const GPUContext &ctx,
dx->mutable_data<T>(ctx.GetPlace()),
dy->mutable_data<T>(ctx.GetPlace()));
}
/*
******************************
Div Grad
******************************
*/
// GPU backward computation for out = x / y.
//
// Fills whichever of `dx` / `dy` is non-null; nothing is computed when both
// are null. `x` is accepted for signature parity but is not read here.
template <typename T>
void ElementwiseDivGrad(const GPUContext &dev_ctx,
                        const DenseTensor &x,
                        const DenseTensor &y,
                        const DenseTensor &out,
                        const DenseTensor &dout,
                        DenseTensor *dx,
                        DenseTensor *dy,
                        int axis = -1) {
  const auto place = dev_ctx.GetPlace();
  const bool want_dx = (dx != nullptr);
  const bool want_dy = (dy != nullptr);

  if (want_dx && want_dy) {
    // Both gradients: one fused ternary kernel over (dout, out, y).
    std::vector<const DenseTensor *> operands = {&dout, &out, &y};
    GetGradXAndYOut<ElementwiseType::kTernary, T>(
        dev_ctx,
        place,
        axis,
        operands,
        dout,
        dx,
        dy,
        funcs::DivGradXYFunctor<T, T>());
    return;
  }

  if (want_dx) {
    // Only dx: binary kernel over (dout, y).
    std::vector<const DenseTensor *> operands = {&dout, &y};
    GetGradXOrYOut<ElementwiseType::kBinary, T>(
        dev_ctx, place, axis, operands, dout, dx, funcs::DivGradXFunctor<T>());
    return;
  }

  if (want_dy) {
    // Only dy: ternary kernel over (dout, out, y).
    std::vector<const DenseTensor *> operands = {&dout, &out, &y};
    GetGradXOrYOut<ElementwiseType::kTernary, T>(
        dev_ctx, place, axis, operands, dout, dy, funcs::DivGradYFunctor<T>());
  }
}
} // namespace phi
......@@ -15,9 +15,11 @@
#include "paddle/phi/kernels/elementwise_grad_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/complex.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
#include "paddle/phi/kernels/gpu/elementwise_grad.h"
#include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h"
......@@ -102,6 +104,38 @@ void SubtractDoubleGradKernel(const Context& dev_ctx,
phi::SubtractDoubleGradImpl<T>(dev_ctx, y, ddx, ddy, dout, axis, ddout);
}
template <typename T, typename Context>
void DivideGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& out,
const DenseTensor& dout,
int axis,
DenseTensor* dx,
DenseTensor* dy) {
const auto place = dev_ctx.GetPlace();
if (dx != nullptr && dy != nullptr) {
std::vector<const DenseTensor*> ins = {&dout, &out, &y};
GetGradXAndYOut<ElementwiseType::kTernary, T>(
dev_ctx,
place,
axis,
ins,
dout,
dx,
dy,
funcs::DivGradXYFunctor<T, T>());
} else if (dx != nullptr && dy == nullptr) {
std::vector<const DenseTensor*> ins = {&dout, &y};
GetGradXOrYOut<ElementwiseType::kBinary, T>(
dev_ctx, place, axis, ins, dout, dx, funcs::DivGradXFunctor<T>());
} else if (dy != nullptr && dx == nullptr) {
std::vector<const DenseTensor*> ins = {&dout, &out, &y};
GetGradXOrYOut<ElementwiseType::kTernary, T>(
dev_ctx, place, axis, ins, dout, dy, funcs::DivGradYFunctor<T>());
}
}
} // namespace phi
PD_REGISTER_KERNEL(add_grad,
......@@ -168,3 +202,29 @@ PD_REGISTER_KERNEL(subtract_double_grad,
phi::dtype::bfloat16,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
// GPU registration of phi::DivideGradKernel for all layouts.
// Element types: float, float16, bfloat16, double, int, int64_t and both
// complex precisions.
PD_REGISTER_KERNEL(divide_grad,
GPU,
ALL_LAYOUT,
phi::DivideGradKernel,
float,
phi::dtype::float16,
phi::dtype::bfloat16,
double,
int,
int64_t,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
// GPU registration of phi::DivideDoubleGradKernel for all layouts.
// Element types: float, float16, bfloat16, double, int, int64_t and both
// complex precisions.
PD_REGISTER_KERNEL(divide_double_grad,
GPU,
ALL_LAYOUT,
phi::DivideDoubleGradKernel,
float,
phi::dtype::float16,
phi::dtype::bfloat16,
double,
int,
int64_t,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
......@@ -14,8 +14,11 @@ limitations under the License. */
#pragma once
#include "paddle/phi/common/complex.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/broadcast_function.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
namespace phi {
......@@ -103,4 +106,157 @@ void SubtractDoubleGradImpl(const Context& dev_ctx,
}
}
/*
******************************
Divide Grad
******************************
*/
// Element-wise functor for the gradient of a division w.r.t. its numerator:
// returns dout / y. The `x` and `out` arguments are accepted only so that the
// call signature is (x, y, out, dout); they are not used.
template <typename T>
struct DivGradDX {
  HOSTDEVICE T operator()(T /*x*/, T y, T /*out*/, T dout) const {
    return dout / y;
  }
};
// Complex specialization of DivGradDX: divides dout by the complex conjugate
// of y (real part kept, imaginary part negated). `x` and `out` are unused.
template <typename T>
struct DivGradDX<phi::dtype::complex<T>> {
  HOSTDEVICE phi::dtype::complex<T> operator()(
      phi::dtype::complex<T> /*x*/,
      phi::dtype::complex<T> y,
      phi::dtype::complex<T> /*out*/,
      phi::dtype::complex<T> dout) const {
    phi::dtype::complex<T> conj_of_y(y.real, -y.imag);
    return dout / conj_of_y;
  }
};
// Element-wise functor for the gradient of a division w.r.t. its denominator:
// returns (-dout * out) / y, evaluated left to right. `x` is not used.
template <typename T>
struct DivGradDY {
  HOSTDEVICE T operator()(T /*x*/, T y, T out, T dout) const {
    return ((-dout) * out) / y;
  }
};
template <typename T>
struct DivGradDY<paddle::platform::complex<T>> {
HOSTDEVICE phi::dtype::complex<T> operator()(
phi::dtype::complex<T> x,
phi::dtype::complex<T> y,
phi::dtype::complex<T> out,
phi::dtype::complex<T> dout) const {
phi::dtype::complex<T> out_div_y_conj((out / y).real, -(out / y).imag);
return -dout * out_div_y_conj;
}
};
// Element-wise functor returning y * out * dout - x * dout.
// NOTE(review): DivideDoubleGradKernel passes this functor to
// ElemwiseGradCompute together with (ddX, ddY, Out, dX / Y); presumably those
// arrive here as (x, y, out, dout), which would make the result
// Out * dX * ddY / Y - dX * ddX / Y -- confirm against ElemwiseGradCompute.
template <typename T>
struct DivDoubleDY {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
return y * out * dout - x * dout;
}
};
// Second-order gradient kernel for elementwise divide.
//
// Inputs:
//   y, out   - denominator and result of the forward division.
//   dx       - first-order gradient w.r.t. the numerator.
//   ddx, ddy - optional incoming second-order gradients; a missing one is
//              replaced by a "safe" tensor via GetDoubleGradSafeTensor.
//   axis     - forwarded unchanged to the element-wise helpers.
// Outputs (each is computed only when its pointer is non-null):
//   ddout = (ddX - Out * ddY) / Y
//   dy    = Out * dX * ddY / Y - dX * ddX / Y
//   dout  = -dX * ddY
//
// NOTE(review): `tmp` is assigned from `*dout` when dout is requested;
// presumably this shares dout's buffer (see the "dout can be used as 'tmp'
// tensor" remark below). If so, the three stages must stay in the order
// dy -> ddout -> dout, because dout's final value is written last and
// overwrites the scratch contents -- confirm DenseTensor copy semantics
// before reordering anything here.
template <typename T, typename Context>
void DivideDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& y,
const DenseTensor& out,
const DenseTensor& dx,
paddle::optional<const DenseTensor&> ddx,
paddle::optional<const DenseTensor&> ddy,
int axis,
DenseTensor* dy,
DenseTensor* dout,
DenseTensor* ddout) {
// Size and allocate every requested output up front: dy takes y's dims,
// dout and ddout take out's dims.
if (dy) {
dy->Resize(y.dims());
dev_ctx.template Alloc<T>(dy);
}
if (dout) {
dout->Resize(out.dims());
dev_ctx.template Alloc<T>(dout);
}
if (ddout) {
ddout->Resize(out.dims());
dev_ctx.template Alloc<T>(ddout);
}
// ddX_safe == null ? 0 : ddX
// ddY_safe == null ? 0 : ddY
// (dx and y are passed alongside the optional tensors as reference tensors.)
DenseTensor ddX_safe, ddY_safe;
phi::funcs::GetDoubleGradSafeTensor<Context, T>(
dev_ctx, dx, ddx.get_ptr(), &ddX_safe);
phi::funcs::GetDoubleGradSafeTensor<Context, T>(
dev_ctx, y, ddy.get_ptr(), &ddY_safe);
// ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y
// dY = Out * dX * ddY / Y - dX * ddX / Y
// dOut = - dX * ddY
// To save memory, (1) dout can be used as 'tmp' tensor, (2) ddout can
// inplace ddx
DenseTensor tmp;
if (dout) {
// Reuse dout as scratch space until its real value is written at the end.
tmp = *dout;
} else {
// No dout requested: allocate a dedicated scratch tensor shaped like out.
tmp.Resize(out.dims());
dev_ctx.template Alloc<T>(&tmp);
}
if (dy) {
// dX_div_Y = dX / Y; stored in the scratch tensor.
DenseTensor dX_div_Y = tmp;
funcs::DefaultElementwiseOperator<Context,
T,
funcs::DivideFunctor<T>,
funcs::InverseDivideFunctor<T>>(
dev_ctx, dx, y, &dX_div_Y, axis);
// NOTE(dengkaipeng): in the following ElemwiseGradCompute, for the
// first output tensor is nullptr, the branch to calculate first
// output tensor will not be activated, DivGradDx function will not
// be called and can be ignored, the first branch has little effect
// on running speed.
// dY = Out * dX * ddY / Y - dX * ddX / Y
phi::funcs::ElemwiseGradCompute<Context, T, DivGradDX<T>, DivDoubleDY<T>>(
dev_ctx,
ddX_safe,
ddY_safe,
out,
dX_div_Y,
axis,
nullptr,
dy,
DivGradDX<T>(),
DivDoubleDY<T>());
}
if (ddout) {
// ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y
// Step 1: tmp = Out * ddY
funcs::DefaultElementwiseOperator<Context,
T,
funcs::MultiplyFunctor<T>,
funcs::InverseMultiplyFunctor<T>>(
dev_ctx, out, ddY_safe, &tmp, axis);
// Step 2: tmp = ddX - tmp (written back into tmp)
funcs::DefaultElementwiseOperator<Context,
T,
funcs::SubtractFunctor<T>,
funcs::InverseSubtractFunctor<T>>(
dev_ctx, ddX_safe, tmp, &tmp, axis);
// Step 3: ddOut = tmp / Y
funcs::DefaultElementwiseOperator<Context,
T,
funcs::DivideFunctor<T>,
funcs::InverseDivideFunctor<T>>(
dev_ctx, tmp, y, ddout, axis);
}
if (dout) {
// dOut = - dX * ddY
// First write dX * ddY into dout, then negate it in place through Eigen.
funcs::DefaultElementwiseOperator<Context,
T,
funcs::MultiplyFunctor<T>,
funcs::InverseMultiplyFunctor<T>>(
dev_ctx, dx, ddY_safe, dout, axis);
auto& place = *dev_ctx.eigen_device();
auto dout_result = phi::EigenVector<T>::Flatten(*dout);
dout_result.device(place) = static_cast<T>(-1) * dout_result;
}
}
} // namespace phi
......@@ -208,6 +208,7 @@ PD_REGISTER_KERNEL(divide,
int,
int64_t,
phi::dtype::float16,
phi::dtype::bfloat16,
complex64,
complex128) {}
PD_REGISTER_KERNEL(multiply,
......
......@@ -106,6 +106,22 @@ KernelSignature ElementwiseSubDoubleGradOpArgumentMapping(
"subtract_double_grad", {"Y", "DDX", "DDY", "DOut"}, {"axis"}, {"DDOut"});
}
// Builds the kernel signature for the "divide_grad" kernel:
//   inputs  : X, Y, Out, grad(Out)
//   attrs   : axis
//   outputs : grad(X), grad(Y)
// The mapping context is not consulted; the signature is always the same.
KernelSignature ElementwiseDivGradOpArgumentMapping(
    const ArgumentMappingContext& ctx) {
  KernelSignature signature("divide_grad",
                            {"X", "Y", "Out", GradVarName("Out")},
                            {"axis"},
                            {GradVarName("X"), GradVarName("Y")});
  return signature;
}
// Builds the kernel signature for the "divide_double_grad" kernel:
//   inputs  : Y, Out, DX, DDX, DDY
//   attrs   : axis
//   outputs : grad(Y), DOut, DDOut
// The mapping context is not consulted; the signature is always the same.
KernelSignature ElementwiseDivDoubleGradOpArgumentMapping(
    const ArgumentMappingContext& ctx) {
  KernelSignature signature("divide_double_grad",
                            {"Y", "Out", "DX", "DDX", "DDY"},
                            {"axis"},
                            {GradVarName("Y"), "DOut", "DDOut"});
  return signature;
}
} // namespace phi
PD_REGISTER_BASE_KERNEL_NAME(elementwise_add, add);
......@@ -117,6 +133,8 @@ PD_REGISTER_BASE_KERNEL_NAME(elementwise_add_grad_grad, add_double_grad);
// Each PD_REGISTER_BASE_KERNEL_NAME line pairs an elementwise_* operator name
// (first argument) with a kernel base name (second argument); the
// elementwise_div_grad / elementwise_div_grad_grad entries pair with
// divide_grad / divide_double_grad.
PD_REGISTER_BASE_KERNEL_NAME(elementwise_add_triple_grad, add_triple_grad);
PD_REGISTER_BASE_KERNEL_NAME(elementwise_sub_grad, subtract_grad);
PD_REGISTER_BASE_KERNEL_NAME(elementwise_sub_grad_grad, subtract_double_grad);
PD_REGISTER_BASE_KERNEL_NAME(elementwise_div_grad, divide_grad);
PD_REGISTER_BASE_KERNEL_NAME(elementwise_div_grad_grad, divide_double_grad);
// Registers phi::ElementwiseAddOpArgumentMapping as the argument-mapping
// function for the elementwise_add operator.
PD_REGISTER_ARG_MAPPING_FN(elementwise_add,
phi::ElementwiseAddOpArgumentMapping);
......@@ -136,3 +154,7 @@ PD_REGISTER_ARG_MAPPING_FN(elementwise_sub_grad,
phi::ElementwiseSubGradOpArgumentMapping);
// Argument-mapping registrations: each operator name is bound to the function
// that produces its KernelSignature.
PD_REGISTER_ARG_MAPPING_FN(elementwise_sub_grad_grad,
phi::ElementwiseSubDoubleGradOpArgumentMapping);
// elementwise_div_grad -> ElementwiseDivGradOpArgumentMapping ("divide_grad").
PD_REGISTER_ARG_MAPPING_FN(elementwise_div_grad,
phi::ElementwiseDivGradOpArgumentMapping);
// elementwise_div_grad_grad -> ElementwiseDivDoubleGradOpArgumentMapping
// ("divide_double_grad").
PD_REGISTER_ARG_MAPPING_FN(elementwise_div_grad_grad,
phi::ElementwiseDivDoubleGradOpArgumentMapping);
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册