Unverified commit 5898e9ab, authored by YuanRisheng, committed by GitHub

[Phi]Move elementwise function to funcs directory (#39986)

* move elementwise function to funcs directory

* fix compile bugs

* modify according to comment
Parent 66196573
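The net effect of this commit, sketched below for illustration only (simplified; the real call sites and signatures are in the diff that follows), is that the shared elementwise helpers move into the phi::funcs namespace under paddle/phi/kernels/funcs/, so kernels switch from the old backend-specific headers to the funcs headers:

    // Before (illustrative): helpers lived in backend-specific headers under phi::
    #include "paddle/phi/kernels/cpu/elementwise.h"
    phi::ElementwiseCompute<Functor, T>(dev_ctx, x, y, axis, func, out);

    // After (illustrative): helpers live in phi::funcs under kernels/funcs/
    #include "paddle/phi/kernels/funcs/elementwise_base.h"
    phi::funcs::ElementwiseCompute<Functor, T>(dev_ctx, x, y, axis, func, out);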
......@@ -39,7 +39,7 @@ limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
#else
#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
#include "paddle/phi/kernels/gpu/elementwise.h"
#include "paddle/phi/kernels/gpu/elementwise_grad.h"
#endif
namespace ops = paddle::operators;
......
......@@ -16,9 +16,6 @@
#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
// can only include headers from the paddle/top/api dirs
#include "paddle/phi/kernels/gpu/elementwise.h"
namespace paddle {
namespace operators {
......
......@@ -31,6 +31,7 @@ limitations under the License. */
#include "paddle/phi/api/lib/utils/tensor_utils.h"
#include "paddle/phi/kernels/cpu/elementwise.h"
#include "paddle/phi/kernels/cpu/elementwise_grad.h"
#if defined(__NVCC__) || defined(__HIPCC__)
#ifdef __NVCC__
......@@ -133,7 +134,7 @@ inline void GetBroadcastDimsArrays(const framework::DDim &x_dims,
inline framework::DDim trim_trailing_singular_dims(
const framework::DDim &dims) {
return phi::funcs::trim_trailing_singular_dims(dims);
return phi::funcs::TrimTrailingSingularDims(dims);
}
template <typename DeviceContext, typename T, typename DX_OP, typename DY_OP,
......@@ -152,7 +153,7 @@ void ElemwiseGradCompute(const framework::ExecutionContext &ctx,
Tout>(
dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
} else {
phi::ElemwiseGradComputeWithBroadcast<T, DX_OP, DY_OP, Tout>(
phi::funcs::ElemwiseGradComputeWithBroadcast<T, DX_OP, DY_OP, Tout>(
dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
}
}
......@@ -173,19 +174,9 @@ void ElementwiseComputeEx(const framework::ExecutionContext &ctx,
const framework::Tensor *y, int axis, Functor func,
framework::Tensor *z) {
z->mutable_data<OutType>(ctx.GetPlace());
if (platform::is_gpu_place(ctx.GetPlace())) {
#if defined(__NVCC__) || defined(__HIPCC__)
const auto &dev_ctx =
ctx.template device_context<platform::CUDADeviceContext>();
phi::ElementwiseCompute<Functor, T, OutType>(dev_ctx, *x, *y, axis, func,
z);
#endif
return;
}
const auto &dev_ctx =
ctx.template device_context<platform::CPUDeviceContext>();
phi::ElementwiseCompute<Functor, T, OutType>(dev_ctx, *x, *y, axis, func, z);
const auto &dev_ctx = ctx.template device_context<DeviceContext>();
phi::funcs::ElementwiseCompute<Functor, T, OutType>(dev_ctx, *x, *y, axis,
func, z);
}
// FusedElemwiseAndAct
......@@ -443,8 +434,8 @@ void FusedElemwiseAndActComputeWithBroadcast(
axis = (y_dim.size() == 0) ? x_dim.size() : axis;
int pre, n, post, is_run_common_broadcast;
phi::funcs::get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post,
&is_run_common_broadcast);
phi::funcs::GetMidDims(x_dim, y_dim, axis, &pre, &n, &post,
&is_run_common_broadcast);
if (post == 1) {
int h = pre;
int w = n;
......@@ -991,8 +982,8 @@ void FusedElemwiseAndActGradComputeWithBroadcast(
axis = (y_dim.size() == 0) ? x_dim.size() : axis;
int pre, n, post, is_run_common_broadcast;
phi::funcs::get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post,
&is_run_common_broadcast);
phi::funcs::GetMidDims(x_dim, y_dim, axis, &pre, &n, &post,
&is_run_common_broadcast);
const T *x_data = nullptr;
const T *y_data = nullptr;
if (x->IsInitialized()) x_data = x->data<T>();
......
......@@ -19,7 +19,7 @@ limitations under the License. */
// can only include headers from the paddle/top/api dirs
#include "paddle/phi/api/lib/utils/tensor_utils.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"
#include "paddle/phi/kernels/funcs/broadcast_function.h"
namespace paddle {
namespace operators {
......
......@@ -151,12 +151,12 @@ struct GetInputIndex<false> {
const std::vector<int>& output_strides, int output_idx,
int* index_array, int* lhs_idx, int* rhs_idx) {
int out_dims_size = output_strides.size();
*lhs_idx =
phi::GetElementwiseIndex(lhs_dims.data(), out_dims_size, index_array);
*rhs_idx =
phi::GetElementwiseIndex(rhs_dims.data(), out_dims_size, index_array);
phi::UpdateElementwiseIndexArray(output_dims.data(), out_dims_size,
index_array);
*lhs_idx = phi::funcs::GetElementwiseIndex(lhs_dims.data(), out_dims_size,
index_array);
*rhs_idx = phi::funcs::GetElementwiseIndex(rhs_dims.data(), out_dims_size,
index_array);
phi::funcs::UpdateElementwiseIndexArray(output_dims.data(), out_dims_size,
index_array);
}
};
......
......@@ -16,8 +16,8 @@ limitations under the License. */
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/broadcast_function.h"
#include "paddle/phi/kernels/funcs/common_shape.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
......@@ -189,250 +189,6 @@ struct SameDimsMultiplyFunctor<
}
};
inline void UpdateElementwiseIndexArray(const int* out_dims_array,
const int max_dim,
int* index_array) {
for (int i = max_dim - 1; i >= 0; --i) {
++index_array[i];
if (index_array[i] >= out_dims_array[i]) {
index_array[i] -= out_dims_array[i];
} else {
break;
}
}
}
inline int GetElementwiseIndex(const int* x_dims_array,
const int max_dim,
const int* index_array) {
int index_ = 0;
for (int i = 0; i < max_dim; i++) {
if (x_dims_array[i] > 1) {
index_ = index_ * x_dims_array[i] + index_array[i];
}
}
return index_;
}
template <typename T, typename DX_OP, typename DY_OP, typename Tout = T>
void CommonGradBroadcastCPU(const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& out,
const DenseTensor& dout,
DenseTensor* dx,
DenseTensor* dy,
int* x_dims_array,
int* y_dims_array,
int* out_dims_array,
int max_dim,
const CPUContext& ctx,
DX_OP dx_op,
DY_OP dy_op) {
std::vector<int> index_array(max_dim, 0);
const T* x_data = x.data<T>();
const T* y_data = y.data<T>();
const Tout* out_data = out.data<Tout>();
const Tout* dout_data = dout.data<Tout>();
T* dx_data = dx == nullptr ? nullptr : ctx.Alloc<T>(dx);
T* dy_data = dy == nullptr ? nullptr : ctx.Alloc<T>(dy);
if (dx_data != nullptr) {
memset(dx_data, 0, dx->numel() * sizeof(T));
}
if (dy_data != nullptr) {
memset(dy_data, 0, dy->numel() * sizeof(T));
}
const int out_size = std::accumulate(
out_dims_array, out_dims_array + max_dim, 1, std::multiplies<int>());
int x_index, y_index;
for (int out_index = 0; out_index < out_size; ++out_index) {
x_index = GetElementwiseIndex(x_dims_array, max_dim, index_array.data());
y_index = GetElementwiseIndex(y_dims_array, max_dim, index_array.data());
if (dx_data != nullptr) {
dx_data[x_index] += dx_op(x_data[x_index],
y_data[y_index],
out_data[out_index],
dout_data[out_index]);
}
if (dy_data != nullptr) {
dy_data[y_index] += dy_op(x_data[x_index],
y_data[y_index],
out_data[out_index],
dout_data[out_index]);
}
UpdateElementwiseIndexArray(out_dims_array, max_dim, index_array.data());
}
}
template <typename Functor, typename T, typename OutType = T>
void CommonForwardBroadcastCPU(const DenseTensor& x,
const DenseTensor& y,
DenseTensor* z,
int* x_dims_array,
int* y_dims_array,
int* out_dims_array,
int max_dim,
const CPUContext& ctx,
Functor func,
const bool is_xsize_larger = true) {
std::vector<int> index_array(max_dim, 0);
const T* x_data = x.data<T>();
const T* y_data = y.data<T>();
PADDLE_ENFORCE_NOT_NULL(
x_data, phi::errors::InvalidArgument("The input X should not be empty."));
PADDLE_ENFORCE_NOT_NULL(
y_data, phi::errors::InvalidArgument("The input Y should not be empty."));
OutType* out_data = ctx.Alloc<OutType>(z);
const int out_size = std::accumulate(
out_dims_array, out_dims_array + max_dim, 1, std::multiplies<int>());
int x_index, y_index;
for (int out_index = 0; out_index < out_size; ++out_index) {
x_index = GetElementwiseIndex(x_dims_array, max_dim, index_array.data());
y_index = GetElementwiseIndex(y_dims_array, max_dim, index_array.data());
if (is_xsize_larger) {
out_data[out_index] = func(x_data[x_index], y_data[y_index]);
} else {
out_data[out_index] = func(y_data[y_index], x_data[x_index]);
}
UpdateElementwiseIndexArray(out_dims_array, max_dim, index_array.data());
}
}
template <typename Functor, typename T, typename OutType = T>
void CommonElementwiseBroadcastForward(const CPUContext& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* z,
const DDim& x_dims,
const DDim& y_dims,
Functor func,
int axis,
const bool is_xsize_larger = true) {
int max_dim = (std::max)(x_dims.size(), y_dims.size());
axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
PADDLE_ENFORCE_GE(
axis,
0,
phi::errors::InvalidArgument(
"Axis should be great than or equal to 0, but received axis is %d.",
axis));
PADDLE_ENFORCE_LT(axis,
max_dim,
phi::errors::InvalidArgument(
"Axis should be less than %d, but received axis is %d.",
max_dim,
axis));
std::vector<int> x_dims_array(max_dim);
std::vector<int> y_dims_array(max_dim);
std::vector<int> out_dims_array(max_dim);
funcs::GetBroadcastDimsArrays(x_dims,
y_dims,
x_dims_array.data(),
y_dims_array.data(),
out_dims_array.data(),
max_dim,
axis);
CommonForwardBroadcastCPU<Functor, T, OutType>(x,
y,
z,
x_dims_array.data(),
y_dims_array.data(),
out_dims_array.data(),
max_dim,
dev_ctx,
func,
is_xsize_larger);
}
// It is a common CPU implementation to compute binary calculation with the
// support of broadcast. Note:
// 1. The CPU implementation cannot support the case where x needs broadcasting,
// so this function needs to be called with XxxFunctor and XxxInverseFunctor,
// like AddFunctor and InverseAddFunctor.
// 2. The corresponding GPU implementation supports all the broadcast cases,
// thus there is no need to define and call with XxxInverseFunctor.
// TODO(liuyiqun): optimize the CPU implementation to support all broadcast
// cases and avoid the need of XxxInverseFunctor.
template <typename Functor, typename T, typename OutType = T>
void ElementwiseCompute(const CPUContext& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
Functor func,
DenseTensor* z) {
dev_ctx.Alloc<OutType>(z);
auto x_dims = x.dims();
auto y_dims = y.dims();
bool is_xsize_larger = true;
int max_dim = x_dims.size();
if (x_dims.size() < y_dims.size()) {
is_xsize_larger = false;
max_dim = y_dims.size();
}
funcs::TransformFunctor<Functor, T, CPUContext, OutType> functor(
x, y, z, dev_ctx, func, is_xsize_larger);
if (x_dims == y_dims) {
functor.Run();
return;
}
axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
PADDLE_ENFORCE_GE(
axis,
0,
phi::errors::InvalidArgument(
"Axis should be great than or equal to 0, but received axis is %d.",
axis));
PADDLE_ENFORCE_LT(axis,
max_dim,
phi::errors::InvalidArgument(
"Axis should be less than %d, but received axis is %d.",
max_dim,
axis));
int pre, n, post, is_run_common_broadcast, axis_trim = 0;
if (is_xsize_larger) {
auto y_dims_trimed = funcs::trim_trailing_singular_dims(y_dims);
axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis;
funcs::get_mid_dims(x_dims,
y_dims_trimed,
axis_trim,
&pre,
&n,
&post,
&is_run_common_broadcast);
} else {
auto x_dims_trimed = funcs::trim_trailing_singular_dims(x_dims);
axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis;
funcs::get_mid_dims(y_dims,
x_dims_trimed,
axis_trim,
&pre,
&n,
&post,
&is_run_common_broadcast);
}
// special case for common implementation.
// case 1: x=[2,3,1,5], y=[2,1,4,1]
// case 2: x=[2,3,4], y=[1,1,4]
if (is_run_common_broadcast == 1) {
CommonElementwiseBroadcastForward<Functor, T, OutType>(
dev_ctx, x, y, z, x_dims, y_dims, func, axis, is_xsize_larger);
return;
}
if (post == 1) {
functor.RunRowWise(n, pre);
return;
} else {
functor.RunMidWise(n, pre, post);
return;
}
}
template <typename Functor>
struct SameDimsElementwiseCompute {
void operator()(const CPUContext& dev_ctx,
......@@ -443,377 +199,4 @@ struct SameDimsElementwiseCompute {
}
};
// BACKWARD CODE
template <typename T, typename DX_OP, typename DY_OP, typename Tout = T>
static void ElemwiseGradBroadcast1CPU(const T* x,
const T* y,
const Tout* out,
const Tout* dout,
int h,
int w,
bool is_xsize_larger,
DX_OP dx_op,
DY_OP dy_op,
T* dx,
T* dy) {
if (is_xsize_larger) {
for (int i = 0; i < h; ++i) {
for (int j = 0; j < w; ++j) {
int x_offset = i * w + j;
if (dx != nullptr) {
dx[x_offset] =
dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
}
if (dy != nullptr) {
T tmp = dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
if (i == 0) {
dy[j] = tmp;
} else {
dy[j] += tmp;
}
}
}
}
} else { // x.dims < y.dims, broadcast for x.
for (int i = 0; i < h; ++i) {
for (int j = 0; j < w; ++j) {
int y_offset = i * w + j;
if (dy != nullptr) {
dy[y_offset] =
dy_op(x[j], y[y_offset], out[y_offset], dout[y_offset]);
}
if (dx != nullptr) {
T tmp = dx_op(x[j], y[y_offset], out[y_offset], dout[y_offset]);
if (i == 0) {
dx[j] = tmp;
} else {
dx[j] += tmp;
}
}
}
}
}
}
template <typename T, typename DX_OP, typename DY_OP, typename Tout = T>
static void ElemwiseGradBroadcast2CPU(const T* x,
const T* y,
const Tout* out,
const Tout* dout,
int pre,
int n,
int post,
bool is_xsize_larger,
DX_OP dx_op,
DY_OP dy_op,
T* dx,
T* dy) {
if (is_xsize_larger) {
for (int i = 0; i < pre; ++i) {
for (int j = 0; j < n; ++j) {
for (int k = 0; k < post; ++k) {
int x_offset = i * n * post + j * post + k;
if (dx != nullptr) {
dx[x_offset] =
dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
}
if (dy != nullptr) {
T tmp = dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
if (i == 0 && k == 0) {
dy[j] = tmp;
} else {
dy[j] += tmp;
}
}
}
}
}
} else { // x.dims < y.dims, broadcast for x.
for (int i = 0; i < pre; ++i) {
for (int j = 0; j < n; ++j) {
for (int k = 0; k < post; ++k) {
int y_offset = i * n * post + j * post + k;
if (dy != nullptr) {
dy[y_offset] =
dy_op(x[j], y[y_offset], out[y_offset], dout[y_offset]);
}
if (dx != nullptr) {
T tmp = dx_op(x[j], y[y_offset], out[y_offset], dout[y_offset]);
if (i == 0 && k == 0) {
dx[j] = tmp;
} else {
dx[j] += tmp;
}
}
}
}
}
}
}
template <typename T, typename DX_OP, typename DY_OP, typename Tout = T>
void CommonElementwiseBroadcastBackward(const CPUContext& ctx,
const DDim& x_dims,
const DDim& y_dims,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& out,
const DenseTensor& dout,
int axis,
DenseTensor* dx,
DenseTensor* dy,
DX_OP dx_op,
DY_OP dy_op) {
int max_dim = std::max(x_dims.size(), y_dims.size());
axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
std::vector<int> x_dims_array(max_dim);
std::vector<int> y_dims_array(max_dim);
std::vector<int> out_dims_array(max_dim);
funcs::GetBroadcastDimsArrays(x_dims,
y_dims,
x_dims_array.data(),
y_dims_array.data(),
out_dims_array.data(),
max_dim,
axis);
// For the inplace strategy: memset would clear dx and dout and produce a
// wrong result.
if (dx && dx->IsSharedBufferWith(dout)) {
dx->clear();
dx->mutable_data<T>(x_dims, ctx.GetPlace());
}
VLOG(3) << "CommonElementwiseBroadcastBackward xdims:"
<< phi::make_ddim(x_dims_array)
<< " ydim:" << phi::make_ddim(y_dims_array);
CommonGradBroadcastCPU<T, DX_OP, DY_OP, Tout>(x,
y,
out,
dout,
dx,
dy,
x_dims_array.data(),
y_dims_array.data(),
out_dims_array.data(),
max_dim,
ctx,
dx_op,
dy_op);
}
template <typename T, typename DX_OP, typename DY_OP, typename Tout = T>
void ElemwiseGradComputeWithBroadcast(const CPUContext& ctx,
const DDim& x_dims,
const DDim& y_dims,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& out,
const DenseTensor& dout,
int axis,
DenseTensor* dx,
DenseTensor* dy,
DX_OP dx_op,
DY_OP dy_op) {
bool is_xsize_larger = true;
int max_dim = x_dims.size();
if (x_dims.size() < y_dims.size()) {
is_xsize_larger = false;
max_dim = y_dims.size();
}
axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
PADDLE_ENFORCE_GE(
axis,
0,
phi::errors::InvalidArgument(
"Axis should be great than or equal to 0, but received axis is %d.",
axis));
PADDLE_ENFORCE_LT(axis,
max_dim,
phi::errors::InvalidArgument(
"Axis should be less than %d, but received axis is %d.",
max_dim,
axis));
int pre, n, post, is_run_common_broadcast, axis_trim = 0;
if (is_xsize_larger) {
auto y_dims_trimed = funcs::trim_trailing_singular_dims(y_dims);
axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis;
funcs::get_mid_dims(x_dims,
y_dims_trimed,
axis_trim,
&pre,
&n,
&post,
&is_run_common_broadcast);
} else {
auto x_dims_trimed = funcs::trim_trailing_singular_dims(x_dims);
axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis;
funcs::get_mid_dims(y_dims,
x_dims_trimed,
axis_trim,
&pre,
&n,
&post,
&is_run_common_broadcast);
}
// special case for common backward implementation.
if (is_run_common_broadcast) {
CommonElementwiseBroadcastBackward<T, DX_OP, DY_OP, Tout>(
ctx, x_dims, y_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
return;
}
if (post == 1) {
ElemwiseGradBroadcast1CPU(x.data<T>(),
y.data<T>(),
out.data<Tout>(),
dout.data<Tout>(),
pre,
n,
is_xsize_larger,
dx_op,
dy_op,
dx == nullptr ? nullptr : ctx.Alloc<T>(dx),
dy == nullptr ? nullptr : ctx.Alloc<T>(dy));
} else {
ElemwiseGradBroadcast2CPU(x.data<T>(),
y.data<T>(),
out.data<Tout>(),
dout.data<Tout>(),
pre,
n,
post,
is_xsize_larger,
dx_op,
dy_op,
dx == nullptr ? nullptr : ctx.Alloc<T>(dx),
dy == nullptr ? nullptr : ctx.Alloc<T>(dy));
}
}
// NOTE(dzhwinter): Only used in elementwise_add, elementwise_sub.
// The explicit gradient can cut X, Y, and Out off from the gradient op.
// In elementwise_add, elementwise_sub, we use dout as fake X, Y, Out to reuse
// elementwise code.
template <typename T, typename DX_OP, typename DY_OP>
void ElemwiseExplicitGradCompute(const CPUContext& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& out,
const DenseTensor& dout,
int axis,
DenseTensor* dx,
DenseTensor* dy,
DX_OP dx_op,
DY_OP dy_op) {
const DDim& x_dim = x.dims();
const DDim& y_dim = y.dims();
if (x.dims() == y.dims()) {
phi::funcs::ElemwiseGradComputeNoBroadcast<CPUContext, T, DX_OP, DY_OP>(
dev_ctx,
x_dim,
y_dim,
dout,
dout,
out,
dout,
axis,
dx,
dy,
dx_op,
dy_op);
} else {
ElemwiseGradComputeWithBroadcast<T, DX_OP, DY_OP>(dev_ctx,
x_dim,
y_dim,
dout,
dout,
out,
dout,
axis,
dx,
dy,
dx_op,
dy_op);
}
}
/*
******************************
Add Grad
******************************
*/
template <typename T>
struct IdentityGrad {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout; }
};
template <typename T>
typename std::enable_if<std::is_floating_point<T>::value>::type
elementwise_add_grad(const CPUContext& ctx,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& out,
const DenseTensor& dout,
DenseTensor* dx,
DenseTensor* dy,
int axis = -1) {
auto blas = phi::funcs::GetBlas<CPUContext, T>(ctx);
if (dx) {
blas.VCOPY(
dout.numel(), dout.data<T>(), dx->mutable_data<T>(ctx.GetPlace()));
}
if (dy) {
blas.VCOPY(
dout.numel(), dout.data<T>(), dy->mutable_data<T>(ctx.GetPlace()));
}
}
template <typename T>
typename std::enable_if<!std::is_floating_point<T>::value>::type
elementwise_add_grad(const CPUContext& ctx,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& out,
const DenseTensor& dout,
DenseTensor* dx,
DenseTensor* dy,
int axis = -1) {
ElemwiseExplicitGradCompute<T, IdentityGrad<T>, IdentityGrad<T>>(
ctx, x, y, out, dout, axis, dx, dy, IdentityGrad<T>(), IdentityGrad<T>());
}
/*
******************************
Sub Grad
******************************
*/
template <typename T>
struct SubGradDX {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout; }
};
template <typename T>
struct SubGradDY {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return -dout; }
};
template <typename T>
void elementwise_sub_grad(const CPUContext& ctx,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& out,
const DenseTensor& dout,
DenseTensor* dx,
DenseTensor* dy,
int axis = -1) {
ElemwiseExplicitGradCompute<T, SubGradDX<T>, SubGradDY<T>>(
ctx, x, y, out, dout, axis, dx, dy, SubGradDX<T>(), SubGradDY<T>());
}
} // namespace phi
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/elementwise_grad_base.h"
namespace phi {
// NOTE(dzhwinter): Only used in elementwise_add, elementwise_sub.
// The explicit gradient can cut X, Y, and Out off from the gradient op.
// In elementwise_add, elementwise_sub, we use dout as fake X, Y, Out to reuse
// elementwise code.
template <typename T, typename DX_OP, typename DY_OP>
void ElemwiseExplicitGradCompute(const CPUContext& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& out,
const DenseTensor& dout,
int axis,
DenseTensor* dx,
DenseTensor* dy,
DX_OP dx_op,
DY_OP dy_op) {
const DDim& x_dim = x.dims();
const DDim& y_dim = y.dims();
if (x.dims() == y.dims()) {
funcs::ElemwiseGradComputeNoBroadcast<CPUContext, T, DX_OP, DY_OP>(dev_ctx,
x_dim,
y_dim,
dout,
dout,
out,
dout,
axis,
dx,
dy,
dx_op,
dy_op);
} else {
funcs::ElemwiseGradComputeWithBroadcast<T, DX_OP, DY_OP>(dev_ctx,
x_dim,
y_dim,
dout,
dout,
out,
dout,
axis,
dx,
dy,
dx_op,
dy_op);
}
}
/*
******************************
Add Grad
******************************
*/
template <typename T>
struct IdentityGrad {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout; }
};
template <typename T>
typename std::enable_if<std::is_floating_point<T>::value>::type
ElementwiseAddGrad(const CPUContext& ctx,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& out,
const DenseTensor& dout,
DenseTensor* dx,
DenseTensor* dy,
int axis = -1) {
auto blas = phi::funcs::GetBlas<CPUContext, T>(ctx);
if (dx) {
blas.VCOPY(
dout.numel(), dout.data<T>(), dx->mutable_data<T>(ctx.GetPlace()));
}
if (dy) {
blas.VCOPY(
dout.numel(), dout.data<T>(), dy->mutable_data<T>(ctx.GetPlace()));
}
}
template <typename T>
typename std::enable_if<!std::is_floating_point<T>::value>::type
ElementwiseAddGrad(const CPUContext& ctx,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& out,
const DenseTensor& dout,
DenseTensor* dx,
DenseTensor* dy,
int axis = -1) {
ElemwiseExplicitGradCompute<T, IdentityGrad<T>, IdentityGrad<T>>(
ctx, x, y, out, dout, axis, dx, dy, IdentityGrad<T>(), IdentityGrad<T>());
}
/*
******************************
Sub Grad
******************************
*/
template <typename T>
struct SubGradDX {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout; }
};
template <typename T>
struct SubGradDY {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return -dout; }
};
template <typename T>
void ElementwiseSubGrad(const CPUContext& ctx,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& out,
const DenseTensor& dout,
DenseTensor* dx,
DenseTensor* dy,
int axis = -1) {
ElemwiseExplicitGradCompute<T, SubGradDX<T>, SubGradDY<T>>(
ctx, x, y, out, dout, axis, dx, dy, SubGradDX<T>(), SubGradDY<T>());
}
} // namespace phi
......@@ -17,7 +17,8 @@
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/cpu/elementwise.h"
#include "paddle/phi/kernels/cpu/elementwise_grad.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
#include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h"
......@@ -33,7 +34,7 @@ void AddGradFunc(const CPUContext& dev_ctx,
DenseTensor* dy,
int axis = -1) {
if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) {
elementwise_add_grad<T>(dev_ctx, x, y, out, dout, dx, dy);
ElementwiseAddGrad<T>(dev_ctx, x, y, out, dout, dx, dy);
} else {
ElemwiseExplicitGradCompute<T, IdentityGrad<T>, IdentityGrad<T>>(
dev_ctx,
......@@ -68,15 +69,7 @@ void AddDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& dout,
int axis,
DenseTensor* ddout) {
phi::AddDoubleGradImpl<T>(dev_ctx,
y,
ddx,
ddy,
dout,
axis,
ddout,
ElementwiseCompute<funcs::AddFunctor<T>, T>,
ElementwiseCompute<funcs::InverseAddFunctor<T>, T>);
phi::AddDoubleGradImpl<T>(dev_ctx, y, ddx, ddy, dout, axis, ddout);
}
template <typename T, typename Context>
......@@ -101,7 +94,7 @@ void SubtractGradKernel(const Context& dev_ctx,
DenseTensor* dy) {
// skip out
auto* out = &dout;
elementwise_sub_grad<T>(dev_ctx, x, y, *out, dout, dx, dy, axis);
ElementwiseSubGrad<T>(dev_ctx, x, y, *out, dout, dx, dy, axis);
}
template <typename T, typename Context>
......@@ -112,15 +105,7 @@ void SubtractDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& dout,
int axis,
DenseTensor* ddout) {
phi::SubtractDoubleGradImpl<T>(
dev_ctx,
y,
ddx,
ddy,
dout,
axis,
ddout,
ElementwiseCompute<funcs::SubtractFunctor<T>, T>);
phi::SubtractDoubleGradImpl<T>(dev_ctx, y, ddx, ddy, dout, axis, ddout);
}
} // namespace phi
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/cpu/elementwise.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"
#include "paddle/phi/kernels/funcs/logical_functor.h"
// See Note [ Why still include the fluid headers? ]
......@@ -24,15 +24,15 @@
namespace phi {
#define DEFINE_LOGICAL_BINARY_KERNEL(type) \
template <typename T, typename Context> \
void Logical##type##Kernel(const Context& dev_ctx, \
const DenseTensor& x, \
const DenseTensor& y, \
DenseTensor* out) { \
funcs::Logical##type##Functor<T> binary_func; \
ElementwiseCompute<funcs::Logical##type##Functor<T>, T, bool>( \
dev_ctx, x, y, -1, binary_func, out); \
#define DEFINE_LOGICAL_BINARY_KERNEL(type) \
template <typename T, typename Context> \
void Logical##type##Kernel(const Context& dev_ctx, \
const DenseTensor& x, \
const DenseTensor& y, \
DenseTensor* out) { \
funcs::Logical##type##Functor<T> binary_func; \
funcs::ElementwiseCompute<funcs::Logical##type##Functor<T>, T, bool>( \
dev_ctx, x, y, -1, binary_func, out); \
}
DEFINE_LOGICAL_BINARY_KERNEL(And)
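// Illustrative sketch (not part of this diff): DEFINE_LOGICAL_BINARY_KERNEL(And)
// expands to roughly the following kernel, now routed through
// funcs::ElementwiseCompute instead of the old phi::ElementwiseCompute:
//
//   template <typename T, typename Context>
//   void LogicalAndKernel(const Context& dev_ctx,
//                         const DenseTensor& x,
//                         const DenseTensor& y,
//                         DenseTensor* out) {
//     funcs::LogicalAndFunctor<T> binary_func;
//     funcs::ElementwiseCompute<funcs::LogicalAndFunctor<T>, T, bool>(
//         dev_ctx, x, y, -1, binary_func, out);
//   }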
......
......@@ -20,6 +20,7 @@
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/cpu/elementwise.h"
#include "paddle/phi/kernels/cpu/reduce.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
#include "paddle/phi/kernels/funcs/reduce_functor.h"
......@@ -45,10 +46,10 @@ namespace phi {
auto x_dims = x.dims(); \
auto y_dims = y.dims(); \
if (x_dims.size() >= y_dims.size()) { \
ElementwiseCompute<funcs::name##Functor<T>, T>( \
funcs::ElementwiseCompute<funcs::name##Functor<T>, T>( \
dev_ctx, x, y, axis, funcs::name##Functor<T>(), out); \
} else { \
ElementwiseCompute<funcs::Inverse##name##Functor<T>, T>( \
funcs::ElementwiseCompute<funcs::Inverse##name##Functor<T>, T>( \
dev_ctx, x, y, axis, funcs::Inverse##name##Functor<T>(), out); \
} \
} \
......@@ -93,10 +94,10 @@ void DivideRawKernel(const Context& dev_ctx,
auto x_dims = x.dims();
auto y_dims = y.dims();
if (x_dims.size() >= y_dims.size()) {
ElementwiseCompute<funcs::DivideFunctor<T>, T>(
funcs::ElementwiseCompute<funcs::DivideFunctor<T>, T>(
dev_ctx, x, y, axis, funcs::DivideFunctor<T>(), out);
} else {
ElementwiseCompute<funcs::InverseDivideFunctor<T>, T>(
funcs::ElementwiseCompute<funcs::InverseDivideFunctor<T>, T>(
dev_ctx, x, y, axis, funcs::InverseDivideFunctor<T>(), out);
}
}
......
......@@ -25,6 +25,8 @@ namespace kps = phi::kps;
namespace phi {
namespace funcs {
#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__)
struct DimensionsTransform {
using DimVector = std::vector<int64_t>;
typedef void (*MergeFunctor)(
......@@ -183,8 +185,6 @@ struct DimensionsTransform {
}
};
#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__)
template <typename T, int VecSize, int Rank, bool IsBoundary = false>
__device__ __forceinline__ void LoadData(
T *dst,
......@@ -578,6 +578,20 @@ void BroadcastKernel(const KPDevice &ctx,
}
}
template <typename Functor, typename T, typename OutType = T>
void ElementwiseCompute(const GPUContext &dev_ctx,
const DenseTensor &x,
const DenseTensor &y,
int axis,
Functor func,
DenseTensor *z) {
std::vector<const DenseTensor *> ins = {&x, &y};
std::vector<DenseTensor *> outs = {z};
z->mutable_data<OutType>(dev_ctx.GetPlace());
BroadcastKernel<ElementwiseType::kBinary, T, OutType, Functor, 1>(
dev_ctx, ins, &outs, axis, func);
}
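// Illustrative usage sketch only (not part of this diff): on GPU this
// overload handles every broadcast case directly, so a kernel can call e.g.
//
//   funcs::ElementwiseCompute<funcs::AddFunctor<T>, T>(
//       dev_ctx, x, y, axis, funcs::AddFunctor<T>(), out);
//
// without the XxxInverseFunctor fallback that the CPU implementation needs.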
#endif
} // namespace funcs
......
......@@ -18,7 +18,8 @@ limitations under the License. */
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/for_range.h"
#include "paddle/phi/kernels/funcs/common_shape.h"
#include "paddle/phi/kernels/funcs/elementwise_utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__)
......@@ -44,28 +45,6 @@ using ConditionalT =
namespace funcs {
using DDim = phi::DDim;
template <typename T, typename DX_OP, typename DY_OP, typename Tout = T>
struct ElemwiseGradNoBroadcast {
const T *x_;
const T *y_;
const Tout *out_;
const Tout *dout_;
HOSTDEVICE void operator()(size_t i) {
if (dx_ != nullptr) {
dx_[i] = dx_op_(x_[i], y_[i], out_[i], dout_[i]);
}
if (dy_ != nullptr) {
dy_[i] = dy_op_(x_[i], y_[i], out_[i], dout_[i]);
}
}
DX_OP dx_op_;
DY_OP dy_op_;
T *dx_;
T *dy_;
};
template <typename T, typename DeviceContext>
class RowwiseTransformIterator;
......@@ -293,73 +272,172 @@ class TransformFunctor {
bool is_xsize_larger_;
};
inline DDim trim_trailing_singular_dims(const DDim &dims) {
// Remove trailing dimensions of size 1 for y
auto actual_dims_size = dims.size();
for (; actual_dims_size != 0; --actual_dims_size) {
if (dims[actual_dims_size - 1] != 1) break;
}
if (actual_dims_size == dims.size()) return dims;
std::vector<int> trim_dims;
trim_dims.resize(actual_dims_size);
for (int i = 0; i < actual_dims_size; ++i) {
trim_dims[i] = dims[i];
}
if (trim_dims.size() == 0) {
return DDim(phi::make_dim());
template <typename Functor, typename T, typename OutType = T>
void CommonForwardBroadcastCPU(const DenseTensor &x,
const DenseTensor &y,
DenseTensor *z,
int *x_dims_array,
int *y_dims_array,
int *out_dims_array,
int max_dim,
const CPUContext &ctx,
Functor func,
const bool is_xsize_larger = true) {
std::vector<int> index_array(max_dim, 0);
const T *x_data = x.data<T>();
const T *y_data = y.data<T>();
PADDLE_ENFORCE_NOT_NULL(
x_data, errors::InvalidArgument("The input X should not be empty."));
PADDLE_ENFORCE_NOT_NULL(
y_data, errors::InvalidArgument("The input Y should not be empty."));
OutType *out_data = ctx.Alloc<OutType>(z);
const int out_size = std::accumulate(
out_dims_array, out_dims_array + max_dim, 1, std::multiplies<int>());
int x_index, y_index;
for (int out_index = 0; out_index < out_size; ++out_index) {
x_index = GetElementwiseIndex(x_dims_array, max_dim, index_array.data());
y_index = GetElementwiseIndex(y_dims_array, max_dim, index_array.data());
if (is_xsize_larger) {
out_data[out_index] = func(x_data[x_index], y_data[y_index]);
} else {
out_data[out_index] = func(y_data[y_index], x_data[x_index]);
}
UpdateElementwiseIndexArray(out_dims_array, max_dim, index_array.data());
}
DDim actual_dims = phi::make_ddim(trim_dims);
return actual_dims;
}
/*
* Out = X ⊙ Y
* If Y's shape does not match X's shape, they will be reshaped.
* For example:
* 1. shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
* pre=2, n=3*4, post=5
* x.shape(2, 12, 5) * y.shape(1, 12, 1).broadcast(2, 12, 5)
* 2. shape(X) = (2, 3, 4, 5), shape(Y) = (4,5)
* pre=2*3, n=4*5, post=1
* x.shape(6, 20, 1) * y.shape(1, 20, 1).broadcast(6, 20, 1)
*
* New parameter: *is_run_common_broadcast* is a flag to record whether to run
* common broadcast code.
*/
inline void get_mid_dims(const DDim &x_dims,
const DDim &y_dims,
const int axis,
int *pre,
int *n,
int *post,
int *is_run_common_broadcast) {
*pre = 1;
*n = 1;
*post = 1;
*is_run_common_broadcast = 0;
for (int i = 0; i < axis; ++i) {
(*pre) *= x_dims[i];
}
for (int i = 0; i < y_dims.size(); ++i) {
if (x_dims[i + axis] != y_dims[i]) {
PADDLE_ENFORCE_EQ(y_dims[i] == 1 || x_dims[i + axis] == 1,
true,
phi::errors::InvalidArgument(
"Broadcast dimension mismatch. Operands "
"could not be broadcast together with the shape of "
"X = [%s] and the shape of Y = [%s]. Received [%d] "
"in X is not equal to [%d] in Y.",
x_dims,
y_dims,
x_dims[i + axis],
y_dims[i]));
*is_run_common_broadcast = 1;
return;
}
(*n) *= y_dims[i];
}
for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) {
(*post) *= x_dims[i];
template <typename Functor, typename T, typename OutType = T>
void CommonElementwiseBroadcastForward(const CPUContext &dev_ctx,
const DenseTensor &x,
const DenseTensor &y,
DenseTensor *z,
const DDim &x_dims,
const DDim &y_dims,
Functor func,
int axis,
const bool is_xsize_larger = true) {
int max_dim = (std::max)(x_dims.size(), y_dims.size());
axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
PADDLE_ENFORCE_GE(
axis,
0,
phi::errors::InvalidArgument(
"Axis should be great than or equal to 0, but received axis is %d.",
axis));
PADDLE_ENFORCE_LT(axis,
max_dim,
phi::errors::InvalidArgument(
"Axis should be less than %d, but received axis is %d.",
max_dim,
axis));
std::vector<int> x_dims_array(max_dim);
std::vector<int> y_dims_array(max_dim);
std::vector<int> out_dims_array(max_dim);
GetBroadcastDimsArrays(x_dims,
y_dims,
x_dims_array.data(),
y_dims_array.data(),
out_dims_array.data(),
max_dim,
axis);
CommonForwardBroadcastCPU<Functor, T, OutType>(x,
y,
z,
x_dims_array.data(),
y_dims_array.data(),
out_dims_array.data(),
max_dim,
dev_ctx,
func,
is_xsize_larger);
}
// It is a common CPU implementation to compute binary calculation with the
// support of broadcast. Note:
// 1. The CPU implementation cannot support the case where x needs broadcasting,
// so this function needs to be called with XxxFunctor and XxxInverseFunctor,
// like AddFunctor and InverseAddFunctor.
// 2. The corresponding GPU implementation supports all the broadcast cases,
// thus there is no need to define and call with XxxInverseFunctor.
// TODO(liuyiqun): optimize the CPU implementation to support all broadcast
// cases and avoid the need of XxxInverseFunctor.
template <typename Functor, typename T, typename OutType = T>
void ElementwiseCompute(const CPUContext &dev_ctx,
const DenseTensor &x,
const DenseTensor &y,
int axis,
Functor func,
DenseTensor *z) {
dev_ctx.Alloc<OutType>(z);
auto x_dims = x.dims();
auto y_dims = y.dims();
bool is_xsize_larger = true;
int max_dim = x_dims.size();
if (x_dims.size() < y_dims.size()) {
is_xsize_larger = false;
max_dim = y_dims.size();
}
TransformFunctor<Functor, T, CPUContext, OutType> functor(
x, y, z, dev_ctx, func, is_xsize_larger);
if (x_dims == y_dims) {
functor.Run();
return;
}
axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
PADDLE_ENFORCE_GE(
axis,
0,
errors::InvalidArgument(
"Axis should be great than or equal to 0, but received axis is %d.",
axis));
PADDLE_ENFORCE_LT(axis,
max_dim,
errors::InvalidArgument(
"Axis should be less than %d, but received axis is %d.",
max_dim,
axis));
int pre, n, post, is_run_common_broadcast, axis_trim = 0;
if (is_xsize_larger) {
auto y_dims_trimed = TrimTrailingSingularDims(y_dims);
axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis;
GetMidDims(x_dims,
y_dims_trimed,
axis_trim,
&pre,
&n,
&post,
&is_run_common_broadcast);
} else {
auto x_dims_trimed = TrimTrailingSingularDims(x_dims);
axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis;
GetMidDims(y_dims,
x_dims_trimed,
axis_trim,
&pre,
&n,
&post,
&is_run_common_broadcast);
}
// special case for common implementation.
// case 1: x=[2,3,1,5], y=[2,1,4,1]
// case 2: x=[2,3,4], y=[1,1,4]
if (is_run_common_broadcast == 1) {
CommonElementwiseBroadcastForward<Functor, T, OutType>(
dev_ctx, x, y, z, x_dims, y_dims, func, axis, is_xsize_larger);
return;
}
if (post == 1) {
functor.RunRowWise(n, pre);
return;
} else {
functor.RunMidWise(n, pre, post);
return;
}
}
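// Illustrative usage sketch only (not part of this diff): because the CPU
// path above cannot broadcast x, callers choose the functor by comparing the
// ranks of the operands, as the elementwise kernels in this commit do:
//
//   if (x.dims().size() >= y.dims().size()) {
//     funcs::ElementwiseCompute<funcs::AddFunctor<T>, T>(
//         dev_ctx, x, y, axis, funcs::AddFunctor<T>(), out);
//   } else {
//     funcs::ElementwiseCompute<funcs::InverseAddFunctor<T>, T>(
//         dev_ctx, x, y, axis, funcs::InverseAddFunctor<T>(), out);
//   }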
......@@ -395,41 +473,11 @@ static inline void GetDoubleGradSafeTensor(const DeviceContext &dev_ctx,
auto meta = phi::DenseTensorMeta(x.dtype(), x.dims(), x.layout());
*ddx_safe = phi::Empty(dev_ctx, std::move(meta));
ddx_safe->mutable_data(dev_ctx.GetPlace());
phi::funcs::SetConstant<DeviceContext, T> set_zero;
SetConstant<DeviceContext, T> set_zero;
set_zero(dev_ctx, ddx_safe, static_cast<T>(0));
}
}
template <typename DeviceContext,
typename T,
typename DX_OP,
typename DY_OP,
typename Tout = T>
void ElemwiseGradComputeNoBroadcast(const DeviceContext &dev_ctx,
const DDim &x_dim,
const DDim &y_dim,
const DenseTensor &x,
const DenseTensor &y,
const DenseTensor &out,
const DenseTensor &dout,
int axis,
DenseTensor *dx,
DenseTensor *dy,
DX_OP dx_op,
DY_OP dy_op) {
size_t N = static_cast<size_t>(phi::product(x_dim));
phi::funcs::ForRange<DeviceContext> for_range(dev_ctx, N);
for_range(ElemwiseGradNoBroadcast<T, DX_OP, DY_OP, Tout>{
x.data<T>(),
y.data<T>(),
out.data<Tout>(),
dout.data<Tout>(),
dx_op,
dy_op,
dx == nullptr ? nullptr : dev_ctx.template Alloc<T>(dx),
dy == nullptr ? nullptr : dev_ctx.template Alloc<T>(dy)});
}
inline void ElementwiseGradPreProcess(const DenseTensor &dout,
DenseTensor *dx) {
if (dx != nullptr) {
......@@ -806,6 +854,7 @@ void ElementwiseKernel(const KPDevice &ctx,
}
}
}
#endif
} // namespace funcs
......
......@@ -14,16 +14,25 @@ limitations under the License. */
#pragma once
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/broadcast_function.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/common_shape.h"
#include "paddle/phi/kernels/gpu/reduce.h"
#include "paddle/phi/kernels/funcs/elementwise_utils.h"
#include "paddle/phi/kernels/funcs/for_range.h"
#if defined(__NVCC__) || defined(__HIPCC__)
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#endif
#ifdef __HIPCC__
constexpr int ELEMWISE_MAX_BLOCK_DIM = 256;
#else
constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024;
#endif
#define BLOCK_X 32
#define BLOCK_Y 32
......@@ -36,21 +45,361 @@ constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024;
namespace phi {
// General binary elementwise computation with broadcast support.
template <typename Functor, typename T, typename OutType = T>
void ElementwiseCompute(const GPUContext &dev_ctx,
const DenseTensor &x,
const DenseTensor &y,
int axis,
Functor func,
DenseTensor *z) {
std::vector<const DenseTensor *> ins = {&x, &y};
std::vector<DenseTensor *> outs = {z};
z->mutable_data<OutType>(dev_ctx.GetPlace());
phi::funcs::BroadcastKernel<ElementwiseType::kBinary, T, OutType>(
dev_ctx, ins, &outs, axis, func);
namespace funcs {
using DDim = phi::DDim;
template <typename T, typename DX_OP, typename DY_OP, typename Tout = T>
void CommonGradBroadcastCPU(const DenseTensor &x,
const DenseTensor &y,
const DenseTensor &out,
const DenseTensor &dout,
DenseTensor *dx,
DenseTensor *dy,
int *x_dims_array,
int *y_dims_array,
int *out_dims_array,
int max_dim,
const CPUContext &ctx,
DX_OP dx_op,
DY_OP dy_op) {
std::vector<int> index_array(max_dim, 0);
const T *x_data = x.data<T>();
const T *y_data = y.data<T>();
const Tout *out_data = out.data<Tout>();
const Tout *dout_data = dout.data<Tout>();
T *dx_data = dx == nullptr ? nullptr : ctx.Alloc<T>(dx);
T *dy_data = dy == nullptr ? nullptr : ctx.Alloc<T>(dy);
if (dx_data != nullptr) {
memset(dx_data, 0, dx->numel() * sizeof(T));
}
if (dy_data != nullptr) {
memset(dy_data, 0, dy->numel() * sizeof(T));
}
const int out_size = std::accumulate(
out_dims_array, out_dims_array + max_dim, 1, std::multiplies<int>());
int x_index, y_index;
for (int out_index = 0; out_index < out_size; ++out_index) {
x_index = GetElementwiseIndex(x_dims_array, max_dim, index_array.data());
y_index = GetElementwiseIndex(y_dims_array, max_dim, index_array.data());
if (dx_data != nullptr) {
dx_data[x_index] += dx_op(x_data[x_index],
y_data[y_index],
out_data[out_index],
dout_data[out_index]);
}
if (dy_data != nullptr) {
dy_data[y_index] += dy_op(x_data[x_index],
y_data[y_index],
out_data[out_index],
dout_data[out_index]);
}
UpdateElementwiseIndexArray(out_dims_array, max_dim, index_array.data());
}
}
template <typename T, typename DX_OP, typename DY_OP, typename Tout = T>
static void ElemwiseGradBroadcast1CPU(const T *x,
const T *y,
const Tout *out,
const Tout *dout,
int h,
int w,
bool is_xsize_larger,
DX_OP dx_op,
DY_OP dy_op,
T *dx,
T *dy) {
if (is_xsize_larger) {
for (int i = 0; i < h; ++i) {
for (int j = 0; j < w; ++j) {
int x_offset = i * w + j;
if (dx != nullptr) {
dx[x_offset] =
dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
}
if (dy != nullptr) {
T tmp = dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
if (i == 0) {
dy[j] = tmp;
} else {
dy[j] += tmp;
}
}
}
}
} else { // x.dims < y.dims, broadcast for x.
for (int i = 0; i < h; ++i) {
for (int j = 0; j < w; ++j) {
int y_offset = i * w + j;
if (dy != nullptr) {
dy[y_offset] =
dy_op(x[j], y[y_offset], out[y_offset], dout[y_offset]);
}
if (dx != nullptr) {
T tmp = dx_op(x[j], y[y_offset], out[y_offset], dout[y_offset]);
if (i == 0) {
dx[j] = tmp;
} else {
dx[j] += tmp;
}
}
}
}
}
}
template <typename T, typename DX_OP, typename DY_OP, typename Tout = T>
static void ElemwiseGradBroadcast2CPU(const T *x,
const T *y,
const Tout *out,
const Tout *dout,
int pre,
int n,
int post,
bool is_xsize_larger,
DX_OP dx_op,
DY_OP dy_op,
T *dx,
T *dy) {
if (is_xsize_larger) {
for (int i = 0; i < pre; ++i) {
for (int j = 0; j < n; ++j) {
for (int k = 0; k < post; ++k) {
int x_offset = i * n * post + j * post + k;
if (dx != nullptr) {
dx[x_offset] =
dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
}
if (dy != nullptr) {
T tmp = dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
if (i == 0 && k == 0) {
dy[j] = tmp;
} else {
dy[j] += tmp;
}
}
}
}
}
} else { // x.dims < y.dims, broadcast for x.
for (int i = 0; i < pre; ++i) {
for (int j = 0; j < n; ++j) {
for (int k = 0; k < post; ++k) {
int y_offset = i * n * post + j * post + k;
if (dy != nullptr) {
dy[y_offset] =
dy_op(x[j], y[y_offset], out[y_offset], dout[y_offset]);
}
if (dx != nullptr) {
T tmp = dx_op(x[j], y[y_offset], out[y_offset], dout[y_offset]);
if (i == 0 && k == 0) {
dx[j] = tmp;
} else {
dx[j] += tmp;
}
}
}
}
}
}
}
template <typename T, typename DX_OP, typename DY_OP, typename Tout = T>
void CommonElementwiseBroadcastBackward(const CPUContext &ctx,
const DDim &x_dims,
const DDim &y_dims,
const DenseTensor &x,
const DenseTensor &y,
const DenseTensor &out,
const DenseTensor &dout,
int axis,
DenseTensor *dx,
DenseTensor *dy,
DX_OP dx_op,
DY_OP dy_op) {
int max_dim = std::max(x_dims.size(), y_dims.size());
axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
std::vector<int> x_dims_array(max_dim);
std::vector<int> y_dims_array(max_dim);
std::vector<int> out_dims_array(max_dim);
GetBroadcastDimsArrays(x_dims,
y_dims,
x_dims_array.data(),
y_dims_array.data(),
out_dims_array.data(),
max_dim,
axis);
// For the inplace strategy: memset would clear dx and dout and produce a
// wrong result.
if (dx && dx->IsSharedBufferWith(dout)) {
dx->clear();
dx->mutable_data<T>(x_dims, ctx.GetPlace());
}
VLOG(3) << "CommonElementwiseBroadcastBackward xdims:"
<< phi::make_ddim(x_dims_array)
<< " ydim:" << phi::make_ddim(y_dims_array);
CommonGradBroadcastCPU<T, DX_OP, DY_OP, Tout>(x,
y,
out,
dout,
dx,
dy,
x_dims_array.data(),
y_dims_array.data(),
out_dims_array.data(),
max_dim,
ctx,
dx_op,
dy_op);
}
template <typename T, typename DX_OP, typename DY_OP, typename Tout = T>
void ElemwiseGradComputeWithBroadcast(const CPUContext &ctx,
const DDim &x_dims,
const DDim &y_dims,
const DenseTensor &x,
const DenseTensor &y,
const DenseTensor &out,
const DenseTensor &dout,
int axis,
DenseTensor *dx,
DenseTensor *dy,
DX_OP dx_op,
DY_OP dy_op) {
bool is_xsize_larger = true;
int max_dim = x_dims.size();
if (x_dims.size() < y_dims.size()) {
is_xsize_larger = false;
max_dim = y_dims.size();
}
axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
PADDLE_ENFORCE_GE(
axis,
0,
errors::InvalidArgument(
"Axis should be great than or equal to 0, but received axis is %d.",
axis));
PADDLE_ENFORCE_LT(axis,
max_dim,
errors::InvalidArgument(
"Axis should be less than %d, but received axis is %d.",
max_dim,
axis));
int pre, n, post, is_run_common_broadcast, axis_trim = 0;
if (is_xsize_larger) {
auto y_dims_trimed = TrimTrailingSingularDims(y_dims);
axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis;
GetMidDims(x_dims,
y_dims_trimed,
axis_trim,
&pre,
&n,
&post,
&is_run_common_broadcast);
} else {
auto x_dims_trimed = TrimTrailingSingularDims(x_dims);
axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis;
GetMidDims(y_dims,
x_dims_trimed,
axis_trim,
&pre,
&n,
&post,
&is_run_common_broadcast);
}
// special case for common backward implementation.
if (is_run_common_broadcast) {
CommonElementwiseBroadcastBackward<T, DX_OP, DY_OP, Tout>(
ctx, x_dims, y_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
return;
}
if (post == 1) {
ElemwiseGradBroadcast1CPU(x.data<T>(),
y.data<T>(),
out.data<Tout>(),
dout.data<Tout>(),
pre,
n,
is_xsize_larger,
dx_op,
dy_op,
dx == nullptr ? nullptr : ctx.Alloc<T>(dx),
dy == nullptr ? nullptr : ctx.Alloc<T>(dy));
} else {
ElemwiseGradBroadcast2CPU(x.data<T>(),
y.data<T>(),
out.data<Tout>(),
dout.data<Tout>(),
pre,
n,
post,
is_xsize_larger,
dx_op,
dy_op,
dx == nullptr ? nullptr : ctx.Alloc<T>(dx),
dy == nullptr ? nullptr : ctx.Alloc<T>(dy));
}
}
template <typename T, typename DX_OP, typename DY_OP, typename Tout = T>
struct ElemwiseGradNoBroadcast {
const T *x_;
const T *y_;
const Tout *out_;
const Tout *dout_;
HOSTDEVICE void operator()(size_t i) {
if (dx_ != nullptr) {
dx_[i] = dx_op_(x_[i], y_[i], out_[i], dout_[i]);
}
if (dy_ != nullptr) {
dy_[i] = dy_op_(x_[i], y_[i], out_[i], dout_[i]);
}
}
DX_OP dx_op_;
DY_OP dy_op_;
T *dx_;
T *dy_;
};
template <typename DeviceContext,
typename T,
typename DX_OP,
typename DY_OP,
typename Tout = T>
void ElemwiseGradComputeNoBroadcast(const DeviceContext &dev_ctx,
const DDim &x_dim,
const DDim &y_dim,
const DenseTensor &x,
const DenseTensor &y,
const DenseTensor &out,
const DenseTensor &dout,
int axis,
DenseTensor *dx,
DenseTensor *dy,
DX_OP dx_op,
DY_OP dy_op) {
size_t N = static_cast<size_t>(phi::product(x_dim));
phi::funcs::ForRange<DeviceContext> for_range(dev_ctx, N);
for_range(ElemwiseGradNoBroadcast<T, DX_OP, DY_OP, Tout>{
x.data<T>(),
y.data<T>(),
out.data<Tout>(),
dout.data<Tout>(),
dx_op,
dy_op,
dx == nullptr ? nullptr : dev_ctx.template Alloc<T>(dx),
dy == nullptr ? nullptr : dev_ctx.template Alloc<T>(dy)});
}
#if defined(__NVCC__) || defined(__HIPCC__)
// Check whether broadcast_pos only contains contiguous dims
static inline bool CheckContiguousDims(const std::vector<int> &broadcast_pos) {
for (int i = 1; i < broadcast_pos.size(); ++i) {
......@@ -114,7 +463,6 @@ inline void ComputeBroadcastKernelSize(int *x_dims_array,
}
}
#ifndef __xpu__
template <typename T, typename OP, typename Tout = T>
static __global__ void FastCommonGradBroadcastOneCUDAKernel(const T *x,
const T *y,
......@@ -1282,13 +1630,13 @@ void CommonElementwiseBroadcastBackward(const GPUContext &ctx,
std::vector<int> x_dims_array(max_dim);
std::vector<int> y_dims_array(max_dim);
std::vector<int> out_dims_array(max_dim);
funcs::GetBroadcastDimsArrays(x_dims,
y_dims,
x_dims_array.data(),
y_dims_array.data(),
out_dims_array.data(),
max_dim,
axis);
GetBroadcastDimsArrays(x_dims,
y_dims,
x_dims_array.data(),
y_dims_array.data(),
out_dims_array.data(),
max_dim,
axis);
// For the inplace strategy: memset would clear dx and dout and produce a
// wrong result.
if (dx && dx->IsSharedBufferWith(dout)) {
......@@ -1340,37 +1688,37 @@ void ElemwiseGradComputeWithBroadcast(const GPUContext &ctx,
PADDLE_ENFORCE_GE(
axis,
0,
phi::errors::InvalidArgument(
errors::InvalidArgument(
"Axis should be great than or equal to 0, but received axis is %d.",
axis));
PADDLE_ENFORCE_LT(axis,
max_dim,
phi::errors::InvalidArgument(
errors::InvalidArgument(
"Axis should be less than %d, but received axis is %d.",
max_dim,
axis));
int pre, n, post, is_run_common_broadcast, axis_trim = 0;
if (is_xsize_larger) {
auto y_dims_trimed = funcs::trim_trailing_singular_dims(y_dims);
auto y_dims_trimed = TrimTrailingSingularDims(y_dims);
axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis;
funcs::get_mid_dims(x_dims,
y_dims_trimed,
axis_trim,
&pre,
&n,
&post,
&is_run_common_broadcast);
GetMidDims(x_dims,
y_dims_trimed,
axis_trim,
&pre,
&n,
&post,
&is_run_common_broadcast);
} else {
auto x_dims_trimed = funcs::trim_trailing_singular_dims(x_dims);
auto x_dims_trimed = TrimTrailingSingularDims(x_dims);
axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis;
funcs::get_mid_dims(y_dims,
x_dims_trimed,
axis_trim,
&pre,
&n,
&post,
&is_run_common_broadcast);
GetMidDims(y_dims,
x_dims_trimed,
axis_trim,
&pre,
&n,
&post,
&is_run_common_broadcast);
}
// special case for common backward implementation.
if (is_run_common_broadcast) {
......@@ -1408,228 +1756,7 @@ void ElemwiseGradComputeWithBroadcast(const GPUContext &ctx,
}
}
/*
******************************
Add Grad
******************************
*/
template <typename T>
static __global__ void SimpleElemwiseAddGradCUDAKernel(
const T *__restrict__ dout, int size, int vec_size, T *dx, T *dy) {
int tid = BLOCK_ID_X * BLOCK_NUM_X + THREAD_ID_X;
int stride = GRID_NUM_X * BLOCK_NUM_X;
int loop = size / vec_size;
int remainder = size % vec_size;
const float4 *dout_vec = reinterpret_cast<const float4 *>(dout);
float4 *dx_vec = reinterpret_cast<float4 *>(dx);
float4 *dy_vec = reinterpret_cast<float4 *>(dy);
float4 tmp_loop;
for (int i = tid; i < loop; i += stride) {
tmp_loop = dout_vec[i];
dx_vec[i] = tmp_loop;
dy_vec[i] = tmp_loop;
}
if (tid == loop && remainder != 0) {
T tmp_rem;
while (remainder) {
int idx = size - remainder;
remainder--;
tmp_rem = dout[idx];
dx[idx] = tmp_rem;
dy[idx] = tmp_rem;
}
}
}
template <typename T>
void default_elementwise_add_grad(const GPUContext &ctx,
const DenseTensor &x,
const DenseTensor &y,
const DenseTensor &out,
const DenseTensor &dout,
DenseTensor *dx,
DenseTensor *dy,
int axis = -1) {
auto *dout_data = dout.data<T>();
// dx
if (dx != nullptr) {
auto *dx_data = dx->mutable_data<T>(ctx.GetPlace());
if (dx->dims() == dout.dims()) {
if (dx_data != dout_data) {
phi::Copy(ctx, dout, ctx.GetPlace(), false, dx);
}
} else {
// For inplace strategy, dx will be stored in addr of dout, which makes
// the result of dy wrong.
if (dx->IsSharedBufferWith(dout)) {
dx->clear();
dx->mutable_data<T>(x.dims(), ctx.GetPlace());
}
std::vector<int> reduce_dims =
funcs::GetReduceDim(x.dims(), out.dims(), axis);
gpuStream_t stream = ctx.stream();
kernels::TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
ctx, dout, dx, kps::IdentityFunctor<T>(), reduce_dims, stream);
}
}
// dy
if (dy != nullptr) {
auto *dy_data = dy->mutable_data<T>(ctx.GetPlace());
if (dy->dims() == dout.dims()) {
if (dy_data != dout_data) {
phi::Copy(ctx, dout, ctx.GetPlace(), false, dy);
}
} else {
std::vector<int> reduce_dims =
funcs::GetReduceDim(y.dims(), out.dims(), axis);
gpuStream_t stream = ctx.stream();
kernels::TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
ctx, dout, dy, kps::IdentityFunctor<T>(), reduce_dims, stream);
}
}
}
template <typename T>
void elementwise_add_grad(const GPUContext &ctx,
const DenseTensor &x,
const DenseTensor &y,
const DenseTensor &out,
const DenseTensor &dout,
DenseTensor *dx,
DenseTensor *dy) {
auto *dx_data = dx->mutable_data<T>(ctx.GetPlace());
auto *dy_data = dy->mutable_data<T>(ctx.GetPlace());
auto *dout_data = dout.data<T>();
if (dx_data == dout_data && dy_data != dout_data) {
VLOG(4) << "Special case when dx_data is the same as dout_data, "
"only need copy dout to dy";
phi::Copy(ctx, dout, ctx.GetPlace(), false, dy);
} else if (dx_data != dout_data && dy_data == dout_data) {
VLOG(4) << "Special case when dy_data is the same as dout_data, "
"only need copy dout to dx";
phi::Copy(ctx, dout, ctx.GetPlace(), false, dx);
} else if (dx_data != dout_data && dy_data != dout_data) {
auto size = x.numel();
int vec_size = max(static_cast<int>(sizeof(float4) / sizeof(T)), 1);
dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1);
dim3 grid_size =
dim3(((size + vec_size - 1) / vec_size + PREDEFINED_BLOCK_SIZE - 1) /
PREDEFINED_BLOCK_SIZE,
1);
SimpleElemwiseAddGradCUDAKernel<
T><<<grid_size, block_size, 0, ctx.stream()>>>(
dout.data<T>(),
size,
vec_size,
dx->mutable_data<T>(ctx.GetPlace()),
dy->mutable_data<T>(ctx.GetPlace()));
} else {
VLOG(4) << "Special case when dy_data is the same as dout_data, "
"and dx_data is the same as dout_data, do not need "
"any operator";
}
}
/*
******************************
Sub Grad
******************************
*/
template <typename T>
static __global__ void SimpleElemwiseSubGradCUDAKernel(const T *dout,
int64_t size,
T *dx,
T *dy) {
int col = BLOCK_ID_X * BLOCK_NUM_X + THREAD_ID_X;
while (col < size) {
if (dx != nullptr) {
dx[col] = dout[col];
}
dy[col] = -dout[col];
col += BLOCK_NUM_X * GRID_NUM_X;
}
}
template <typename T>
void default_elementwise_sub_grad(const GPUContext &ctx,
const DenseTensor &x,
const DenseTensor &y,
const DenseTensor &out,
const DenseTensor &dout,
DenseTensor *dx,
DenseTensor *dy,
int axis = -1) {
auto *dout_data = dout.data<T>();
// dx
if (dx != nullptr) {
auto *dx_data = dx->mutable_data<T>(ctx.GetPlace());
if (dx->dims() == dout.dims()) {
if (dx_data != dout_data) {
phi::Copy(ctx, dout, ctx.GetPlace(), false, dx);
}
} else {
// For inplace strategy, dx will be stored in addr of dout, which makes
// the result of dy wrong.
if (dx->IsSharedBufferWith(dout)) {
dx->clear();
dx->mutable_data<T>(x.dims(), ctx.GetPlace());
}
std::vector<int> reduce_dims =
funcs::GetReduceDim(x.dims(), out.dims(), axis);
gpuStream_t stream = ctx.stream();
kernels::TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
ctx, dout, dx, kps::IdentityFunctor<T>(), reduce_dims, stream);
}
}
// dy
if (dy != nullptr) {
auto *dy_data = dy->mutable_data<T>(ctx.GetPlace());
if (dy->dims() == dout.dims()) {
if (dy_data != dout_data) {
dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1);
auto size = dy->numel();
dim3 grid_size =
dim3((size + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1);
SimpleElemwiseSubGradCUDAKernel<
T><<<grid_size, block_size, 0, ctx.stream()>>>(
dout.data<T>(), size, nullptr, dy->mutable_data<T>(ctx.GetPlace()));
}
} else {
std::vector<int> reduce_dims =
funcs::GetReduceDim(y.dims(), out.dims(), axis);
gpuStream_t stream = ctx.stream();
kernels::TensorReduceImpl<T, T, kps::AddFunctor, kps::InverseFunctor<T>>(
ctx, dout, dy, kps::InverseFunctor<T>(), reduce_dims, stream);
}
}
}
template <typename T>
void elementwise_sub_grad(const GPUContext &ctx,
const DenseTensor &x,
const DenseTensor &y,
const DenseTensor &out,
const DenseTensor &dout,
DenseTensor *dx,
DenseTensor *dy) {
dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1);
auto size = x.numel();
dim3 grid_size =
dim3((size + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1);
SimpleElemwiseSubGradCUDAKernel<
T><<<grid_size, block_size, 0, ctx.stream()>>>(
dout.data<T>(),
size,
dx->mutable_data<T>(ctx.GetPlace()),
dy->mutable_data<T>(ctx.GetPlace()));
}
#endif
} // namespace funcs
} // namespace phi
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/enforce.h"
namespace phi {
namespace funcs {
using DDim = phi::DDim;
/*
* Out = X ⊙ Y
* If Y's shape does not match X's shape, they will be reshaped.
* For example:
* 1. shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
* pre=2, n=3*4, post=5
* x.shape(2, 12, 5) * y.shape(1, 12, 1).broadcast(2, 12, 5)
* 2. shape(X) = (2, 3, 4, 5), shape(Y) = (4,5)
* pre=2*3, n=4*5, post=1
* x.shape(6, 20, 1) * y.shape(1, 20, 1).broadcast(6, 20, 1)
*
* New parameter: *is_run_common_broadcast* is a flag to record whether to run
* common broadcast code.
*/
inline void GetMidDims(const DDim &x_dims,
const DDim &y_dims,
const int axis,
int *pre,
int *n,
int *post,
int *is_run_common_broadcast) {
*pre = 1;
*n = 1;
*post = 1;
*is_run_common_broadcast = 0;
for (int i = 0; i < axis; ++i) {
(*pre) *= x_dims[i];
}
for (int i = 0; i < y_dims.size(); ++i) {
if (x_dims[i + axis] != y_dims[i]) {
PADDLE_ENFORCE_EQ(y_dims[i] == 1 || x_dims[i + axis] == 1,
true,
phi::errors::InvalidArgument(
"Broadcast dimension mismatch. Operands "
"could not be broadcast together with the shape of "
"X = [%s] and the shape of Y = [%s]. Received [%d] "
"in X is not equal to [%d] in Y.",
x_dims,
y_dims,
x_dims[i + axis],
y_dims[i]));
*is_run_common_broadcast = 1;
return;
}
(*n) *= y_dims[i];
}
for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) {
(*post) *= x_dims[i];
}
}
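// A minimal usage sketch (illustrative comment, not part of the original
// header), assuming the caller has already normalized a negative axis the way
// the elementwise compute path does. For the first example in the comment
// above:
//
//   int pre, n, post, is_run_common_broadcast;
//   GetMidDims(phi::make_ddim({2, 3, 4, 5}), phi::make_ddim({3, 4}),
//              /*axis=*/1, &pre, &n, &post, &is_run_common_broadcast);
//   // pre == 2, n == 3 * 4 == 12, post == 5, is_run_common_broadcast == 0
//
// If a mismatched dimension pair is still broadcastable (one side is 1),
// is_run_common_broadcast is set to 1 and callers fall back to the generic
// broadcast path instead of the pre/n/post fast path.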
inline DDim TrimTrailingSingularDims(const DDim &dims) {
// Remove trailing dimensions of size 1 for y
auto actual_dims_size = dims.size();
for (; actual_dims_size != 0; --actual_dims_size) {
if (dims[actual_dims_size - 1] != 1) break;
}
if (actual_dims_size == dims.size()) return dims;
std::vector<int> trim_dims;
trim_dims.resize(actual_dims_size);
for (int i = 0; i < actual_dims_size; ++i) {
trim_dims[i] = dims[i];
}
if (trim_dims.size() == 0) {
return DDim(phi::make_dim());
}
DDim actual_dims = phi::make_ddim(trim_dims);
return actual_dims;
}
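// Behavior examples (illustrative comment): TrimTrailingSingularDims turns
// (3, 4, 1, 1) into (3, 4) but leaves (3, 1, 4) unchanged, since only
// trailing size-1 dims are stripped; an all-ones shape such as (1, 1)
// collapses to a 0-D DDim.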
inline int GetElementwiseIndex(const int *x_dims_array,
const int max_dim,
const int *index_array) {
int index_ = 0;
for (int i = 0; i < max_dim; i++) {
if (x_dims_array[i] > 1) {
index_ = index_ * x_dims_array[i] + index_array[i];
}
}
return index_;
}
inline void UpdateElementwiseIndexArray(const int *out_dims_array,
const int max_dim,
int *index_array) {
for (int i = max_dim - 1; i >= 0; --i) {
++index_array[i];
if (index_array[i] >= out_dims_array[i]) {
index_array[i] -= out_dims_array[i];
} else {
break;
}
}
}
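// How the two helpers above are combined in the slow (common-broadcast) path;
// a minimal sketch for illustration only, assuming x_dims_array, y_dims_array
// and out_dims_array have already been padded to max_dim (e.g. by
// GetBroadcastDimsArrays) and that `functor` is a binary element functor:
//
//   std::vector<int> index_array(max_dim, 0);
//   int out_numel = 1;
//   for (int i = 0; i < max_dim; ++i) out_numel *= out_dims_array[i];
//   for (int out_idx = 0; out_idx < out_numel; ++out_idx) {
//     int x_idx = GetElementwiseIndex(x_dims_array, max_dim, index_array.data());
//     int y_idx = GetElementwiseIndex(y_dims_array, max_dim, index_array.data());
//     out_data[out_idx] = functor(x_data[x_idx], y_data[y_idx]);
//     UpdateElementwiseIndexArray(out_dims_array, max_dim, index_array.data());
//   }
//
// GetElementwiseIndex skips size-1 dims, which is what realizes the broadcast.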
} // namespace funcs
} // namespace phi
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/elementwise_grad_base.h"
#include "paddle/phi/kernels/gpu/reduce.h"
namespace phi {
/*
******************************
Add Grad
******************************
*/
template <typename T>
static __global__ void SimpleElemwiseAddGradCUDAKernel(
const T *__restrict__ dout, int size, int vec_size, T *dx, T *dy) {
int tid = BLOCK_ID_X * BLOCK_NUM_X + THREAD_ID_X;
int stride = GRID_NUM_X * BLOCK_NUM_X;
int loop = size / vec_size;
int remainder = size % vec_size;
const float4 *dout_vec = reinterpret_cast<const float4 *>(dout);
float4 *dx_vec = reinterpret_cast<float4 *>(dx);
float4 *dy_vec = reinterpret_cast<float4 *>(dy);
float4 tmp_loop;
for (int i = tid; i < loop; i += stride) {
tmp_loop = dout_vec[i];
dx_vec[i] = tmp_loop;
dy_vec[i] = tmp_loop;
}
if (tid == loop && remainder != 0) {
T tmp_rem;
while (remainder) {
int idx = size - remainder;
remainder--;
tmp_rem = dout[idx];
dx[idx] = tmp_rem;
dy[idx] = tmp_rem;
}
}
}
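// Note on the kernel above (added comment): dout, dx and dy are reinterpreted
// as float4, so each iteration moves sizeof(float4) / sizeof(T) elements and
// the launch in ElementwiseAddGrad below sizes its grid on size / vec_size
// work items; the size % vec_size tail elements are copied one by one by the
// single thread whose tid equals `loop`. This relies on the buffers being at
// least 16-byte aligned, which cudaMalloc/hipMalloc-backed allocations are.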
template <typename T>
void DefaultElementwiseAddGrad(const GPUContext &ctx,
const DenseTensor &x,
const DenseTensor &y,
const DenseTensor &out,
const DenseTensor &dout,
DenseTensor *dx,
DenseTensor *dy,
int axis = -1) {
auto *dout_data = dout.data<T>();
// dx
if (dx != nullptr) {
auto *dx_data = dx->mutable_data<T>(ctx.GetPlace());
if (dx->dims() == dout.dims()) {
if (dx_data != dout_data) {
phi::Copy(ctx, dout, ctx.GetPlace(), false, dx);
}
} else {
      // For the inplace strategy, dx shares dout's buffer; reducing into that
      // buffer would corrupt dout before dy is computed, so give dx fresh storage.
if (dx->IsSharedBufferWith(dout)) {
dx->clear();
dx->mutable_data<T>(x.dims(), ctx.GetPlace());
}
std::vector<int> reduce_dims =
funcs::GetReduceDim(x.dims(), out.dims(), axis);
gpuStream_t stream = ctx.stream();
kernels::TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
ctx, dout, dx, kps::IdentityFunctor<T>(), reduce_dims, stream);
}
}
// dy
if (dy != nullptr) {
auto *dy_data = dy->mutable_data<T>(ctx.GetPlace());
if (dy->dims() == dout.dims()) {
if (dy_data != dout_data) {
phi::Copy(ctx, dout, ctx.GetPlace(), false, dy);
}
} else {
std::vector<int> reduce_dims =
funcs::GetReduceDim(y.dims(), out.dims(), axis);
gpuStream_t stream = ctx.stream();
kernels::TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
ctx, dout, dy, kps::IdentityFunctor<T>(), reduce_dims, stream);
}
}
}
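// Note on DefaultElementwiseAddGrad above (added comment): when dx or dy is
// smaller than dout, the forward add broadcast that input, so its gradient is
// the sum of dout over the broadcast axes; GetReduceDim picks those axes and
// TensorReduceImpl performs the reduction with an identity transform.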
template <typename T>
void ElementwiseAddGrad(const GPUContext &ctx,
const DenseTensor &x,
const DenseTensor &y,
const DenseTensor &out,
const DenseTensor &dout,
DenseTensor *dx,
DenseTensor *dy) {
ctx.template Alloc<T>(dx);
ctx.template Alloc<T>(dy);
auto *dx_data = dx->data<T>();
auto *dy_data = dy->data<T>();
auto *dout_data = dout.data<T>();
if (dx_data == dout_data && dy_data != dout_data) {
VLOG(4) << "Special case when dx_data is the same as dout_data, "
"only need copy dout to dy";
phi::Copy(ctx, dout, ctx.GetPlace(), false, dy);
} else if (dx_data != dout_data && dy_data == dout_data) {
VLOG(4) << "Special case when dy_data is the same as dout_data, "
"only need copy dout to dx";
phi::Copy(ctx, dout, ctx.GetPlace(), false, dx);
} else if (dx_data != dout_data && dy_data != dout_data) {
auto size = x.numel();
int vec_size = max(static_cast<int>(sizeof(float4) / sizeof(T)), 1);
dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1);
dim3 grid_size =
dim3(((size + vec_size - 1) / vec_size + PREDEFINED_BLOCK_SIZE - 1) /
PREDEFINED_BLOCK_SIZE,
1);
SimpleElemwiseAddGradCUDAKernel<
T><<<grid_size, block_size, 0, ctx.stream()>>>(
dout.data<T>(),
size,
vec_size,
dx->mutable_data<T>(ctx.GetPlace()),
dy->mutable_data<T>(ctx.GetPlace()));
} else {
VLOG(4) << "Special case when dy_data is the same as dout_data, "
"and dx_data is the same as dout_data, do not need "
"any operator";
}
}
/*
******************************
Sub Grad
******************************
*/
template <typename T>
static __global__ void SimpleElemwiseSubGradCUDAKernel(const T *dout,
int64_t size,
T *dx,
T *dy) {
int col = BLOCK_ID_X * BLOCK_NUM_X + THREAD_ID_X;
while (col < size) {
if (dx != nullptr) {
dx[col] = dout[col];
}
dy[col] = -dout[col];
col += BLOCK_NUM_X * GRID_NUM_X;
}
}
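// Descriptive note (added comment): the kernel above uses a grid-stride loop,
// so any launch configuration covers the whole tensor; dx may be null and is
// then skipped, while dy is written unconditionally, so callers must always
// pass a valid dy buffer when launching it.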
template <typename T>
void default_elementwise_sub_grad(const GPUContext &ctx,
const DenseTensor &x,
const DenseTensor &y,
const DenseTensor &out,
const DenseTensor &dout,
DenseTensor *dx,
DenseTensor *dy,
int axis = -1) {
auto *dout_data = dout.data<T>();
// dx
if (dx != nullptr) {
auto *dx_data = dx->mutable_data<T>(ctx.GetPlace());
if (dx->dims() == dout.dims()) {
if (dx_data != dout_data) {
phi::Copy(ctx, dout, ctx.GetPlace(), false, dx);
}
} else {
      // For the inplace strategy, dx shares dout's buffer; reducing into that
      // buffer would corrupt dout before dy is computed, so give dx fresh storage.
if (dx->IsSharedBufferWith(dout)) {
dx->clear();
dx->mutable_data<T>(x.dims(), ctx.GetPlace());
}
std::vector<int> reduce_dims =
funcs::GetReduceDim(x.dims(), out.dims(), axis);
gpuStream_t stream = ctx.stream();
kernels::TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
ctx, dout, dx, kps::IdentityFunctor<T>(), reduce_dims, stream);
}
}
// dy
if (dy != nullptr) {
auto *dy_data = dy->mutable_data<T>(ctx.GetPlace());
if (dy->dims() == dout.dims()) {
if (dy_data != dout_data) {
dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1);
auto size = dy->numel();
dim3 grid_size =
dim3((size + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1);
SimpleElemwiseSubGradCUDAKernel<
T><<<grid_size, block_size, 0, ctx.stream()>>>(
dout.data<T>(), size, nullptr, dy->mutable_data<T>(ctx.GetPlace()));
}
} else {
std::vector<int> reduce_dims =
funcs::GetReduceDim(y.dims(), out.dims(), axis);
gpuStream_t stream = ctx.stream();
kernels::TensorReduceImpl<T, T, kps::AddFunctor, kps::InverseFunctor<T>>(
ctx, dout, dy, kps::InverseFunctor<T>(), reduce_dims, stream);
}
}
}
template <typename T>
void elementwise_sub_grad(const GPUContext &ctx,
const DenseTensor &x,
const DenseTensor &y,
const DenseTensor &out,
const DenseTensor &dout,
DenseTensor *dx,
DenseTensor *dy) {
dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1);
auto size = x.numel();
dim3 grid_size =
dim3((size + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1);
SimpleElemwiseSubGradCUDAKernel<
T><<<grid_size, block_size, 0, ctx.stream()>>>(
dout.data<T>(),
size,
dx->mutable_data<T>(ctx.GetPlace()),
dy->mutable_data<T>(ctx.GetPlace()));
}
} // namespace phi
......@@ -17,8 +17,9 @@
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
#include "paddle/phi/kernels/gpu/elementwise.h"
#include "paddle/phi/kernels/gpu/elementwise_grad.h"
#include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h"
namespace phi {
......@@ -33,9 +34,9 @@ void AddGradFunc(const GPUContext& dev_ctx,
DenseTensor* dy,
int axis = -1) {
if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) {
elementwise_add_grad<T>(dev_ctx, x, y, out, dout, dx, dy);
ElementwiseAddGrad<T>(dev_ctx, x, y, out, dout, dx, dy);
} else {
default_elementwise_add_grad<T>(dev_ctx, x, y, out, dout, dx, dy, axis);
DefaultElementwiseAddGrad<T>(dev_ctx, x, y, out, dout, dx, dy, axis);
}
}
......@@ -58,15 +59,7 @@ void AddDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& dout,
int axis,
DenseTensor* ddout) {
phi::AddDoubleGradImpl<T>(dev_ctx,
y,
ddx,
ddy,
dout,
axis,
ddout,
ElementwiseCompute<funcs::AddFunctor<T>, T>,
ElementwiseCompute<funcs::InverseAddFunctor<T>, T>);
phi::AddDoubleGradImpl<T>(dev_ctx, y, ddx, ddy, dout, axis, ddout);
}
template <typename T, typename Context>
......@@ -106,15 +99,7 @@ void SubtractDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& dout,
int axis,
DenseTensor* ddout) {
phi::SubtractDoubleGradImpl<T>(
dev_ctx,
y,
ddx,
ddy,
dout,
axis,
ddout,
ElementwiseCompute<funcs::SubtractFunctor<T>, T>);
phi::SubtractDoubleGradImpl<T>(dev_ctx, y, ddx, ddy, dout, axis, ddout);
}
} // namespace phi
......
......@@ -16,9 +16,8 @@
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"
#include "paddle/phi/kernels/funcs/broadcast_function.h"
#include "paddle/phi/kernels/funcs/logical_functor.h"
#include "paddle/phi/kernels/gpu/elementwise.h"
namespace phi {
......
......@@ -15,8 +15,8 @@ limitations under the License. */
#include "paddle/phi/kernels/math_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/kernels/funcs/broadcast_function.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
#include "paddle/phi/kernels/gpu/elementwise.h"
#include "paddle/phi/kernels/gpu/reduce.h"
#ifdef __NVCC__
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"
#include "paddle/phi/kernels/funcs/broadcast_function.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
namespace phi {
......@@ -47,19 +47,14 @@ void AddGradImpl(const Context& dev_ctx,
}
}
template <typename T,
typename Context,
typename GradFunc,
typename GradInverseFunc>
template <typename T, typename Context>
void AddDoubleGradImpl(const Context& dev_ctx,
const DenseTensor& y,
const paddle::optional<const DenseTensor&>& ddx,
const paddle::optional<const DenseTensor&>& ddy,
const DenseTensor& dout,
int axis,
DenseTensor* ddout,
GradFunc grad_func,
GradInverseFunc grad_inverse_func) {
DenseTensor* ddout) {
// ddOut = ddx + ddy
if (ddout) {
DenseTensor ddx_safe, ddy_safe;
......@@ -72,28 +67,28 @@ void AddDoubleGradImpl(const Context& dev_ctx,
auto ddx_dims = ddx_safe.dims();
auto ddy_dims = ddy_safe.dims();
if (ddx_dims.size() >= ddy_dims.size()) {
grad_func(
funcs::ElementwiseCompute<funcs::AddFunctor<T>, T>(
dev_ctx, ddx_safe, ddy_safe, axis, funcs::AddFunctor<T>(), ddout);
} else {
grad_inverse_func(dev_ctx,
ddx_safe,
ddy_safe,
axis,
funcs::InverseAddFunctor<T>(),
ddout);
funcs::ElementwiseCompute<funcs::InverseAddFunctor<T>, T>(
dev_ctx,
ddx_safe,
ddy_safe,
axis,
funcs::InverseAddFunctor<T>(),
ddout);
}
}
}
template <typename T, typename Context, typename GradFunc>
template <typename T, typename Context>
void SubtractDoubleGradImpl(const Context& dev_ctx,
const DenseTensor& y,
const paddle::optional<const DenseTensor&>& ddx,
const paddle::optional<const DenseTensor&>& ddy,
const DenseTensor& dout,
int axis,
DenseTensor* ddout,
GradFunc grad_func) {
DenseTensor* ddout) {
// DDOut = ddx - ddy
if (ddout) {
DenseTensor ddx_safe, ddy_safe;
......@@ -103,7 +98,7 @@ void SubtractDoubleGradImpl(const Context& dev_ctx,
dev_ctx, y, ddy.get_ptr(), &ddy_safe);
ddout->mutable_data<T>(dev_ctx.GetPlace());
grad_func(
funcs::ElementwiseCompute<funcs::SubtractFunctor<T>, T>(
dev_ctx, ddx_safe, ddy_safe, axis, funcs::SubtractFunctor<T>(), ddout);
}
}
......