Unverified commit 9b48199a, authored by niuliling123, committed by GitHub

modified reduce_all_op reduce_any_op for higher performance (#33267)

Parent 4c352033
@@ -13,7 +13,9 @@
 // limitations under the License.
 #include "paddle/fluid/operators/reduce_ops/reduce_all_op.h"
+#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h"
+// reduce_prod
 REGISTER_OP_CUDA_KERNEL(
-    reduce_all, ops::BoolReduceKernel<paddle::platform::CUDADeviceContext, bool,
-                                      ops::AllFunctor>);
+    reduce_all,
+    ops::ReduceCudaKernel<bool, paddle::operators::CustomLogicalAnd>);
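
Note: the hunk above switches reduce_all from the Eigen-based BoolReduceKernel to the functor-driven ReduceCudaKernel. The functor itself (CustomLogicalAnd, presumably defined in reduce_functor_op.h, which this diff starts including) is not part of the diff. The sketch below only illustrates the shape such a functor appears to need, inferred from how LaunchReduceKernel and ReduceKernelImpl use it further down: a nested Transformer type constructible from reduce_num, an initial() value, and a binary device operator(). The name LogicalAndSketch and every detail of the body are illustrative assumptions, not Paddle's actual implementation.

```cpp
// Illustrative sketch only -- NOT Paddle's CustomLogicalAnd.
// Shape inferred from this diff: ReduceOp::Transformer, ReduceOp::initial(),
// and a binary reduction operator usable on the device.
template <typename Tx, typename Ty = Tx>
struct LogicalAndSketch {
  // Per-element transform applied before the reduction; identity cast here.
  struct Transformer {
    explicit Transformer(int /*reduce_num*/) {}
    __host__ __device__ __forceinline__ Ty operator()(const Tx& x) const {
      return static_cast<Ty>(x);
    }
  };

  // Identity element of logical AND: reducing an empty range yields true.
  __host__ __device__ __forceinline__ Ty initial() {
    return static_cast<Ty>(true);
  }

  // Combine two partial results with logical AND.
  __device__ __forceinline__ Ty operator()(const Ty& a, const Ty& b) const {
    return a && b;
  }
};
```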
@@ -13,7 +13,10 @@
 // limitations under the License.
 #include "paddle/fluid/operators/reduce_ops/reduce_any_op.h"
+#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h"
+#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
+// reduce_prod
 REGISTER_OP_CUDA_KERNEL(
-    reduce_any, ops::BoolReduceKernel<paddle::platform::CUDADeviceContext, bool,
-                                      ops::AnyFunctor>);
+    reduce_any,
+    ops::ReduceCudaKernel<bool, paddle::operators::CustomLogicalOr>);
@@ -62,27 +62,6 @@ struct DivideFunctor {
   T n_inv;
 };
-static inline std::vector<int> GetReduceDim(const std::vector<int>& dims,
-                                            int dim_size, bool reduce_all) {
-  std::vector<int> reduce_dims;
-  if (reduce_all) {
-    reduce_dims.resize(dim_size);
-    for (int i = 0; i < reduce_dims.size(); ++i) {
-      reduce_dims[i] = i;
-    }
-  } else {
-    for (auto e : dims) {
-      PADDLE_ENFORCE_LT(e, dim_size,
-                        paddle::platform::errors::InvalidArgument(
-                            "ReduceOp: invalid axis, when x_dims is %d, "
-                            "axis[i] should less than x_dims, but got %d.",
-                            dim_size, e));
-      reduce_dims.push_back(e >= 0 ? e : e + dim_size);
-    }
-  }
-  return reduce_dims;
-}
 static inline int GetLastPow2(int n) {
   n |= (n >> 1);
   n |= (n >> 2);
@@ -167,8 +146,9 @@ enum ReduceType {
 // reduce config
 template <typename Ty>
 struct ReduceConfig {
-  ReduceConfig(std::vector<int> origin_reduce_dims, std::vector<int> x_dim)
-      : reduce_dims_origin(origin_reduce_dims), x_dim(x_dim) {}
+  ReduceConfig(const std::vector<int>& origin_reduce_dims,
+               const std::vector<int>& origin_x_dim)
+      : reduce_dims_origin(origin_reduce_dims), x_dim(origin_x_dim) {}
   // get the parameters of reduceKernel
   void Run() {
@@ -530,22 +510,22 @@ __device__ __forceinline__ void ReduceAny(
 // module function designed for global function
 template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp,
-          int BlockDim, int Rank, int ReduceRank, int ReduceType>
+          int BlockDim, int Rank, int ReduceRank>
 __device__ __forceinline__ void ReduceModule(
     const Tx* x, Ty* y, ReduceOp reducer, TransformOp transformer, Ty init,
-    int reduce_num, int left_num, int blocking_size,
+    int reduce_num, int left_num, int blocking_size, int reduce_type,
     paddle::framework::Array<int, Rank> x_strides,
     paddle::framework::Array<int, ReduceRank> reduce_dim,
     paddle::framework::Array<int, ReduceRank> reduce_strides,
     paddle::framework::Array<int, Rank - ReduceRank> left_dim,
     paddle::framework::Array<int, Rank - ReduceRank> left_strides) {
   // reduce_rank == 1 && reduce_dim[0] == x_dim.size() - 1
-  if (ReduceType == ReduceType::kReduceLastDim) {
+  if (reduce_type == ReduceType::kReduceLastDim) {
     ReduceLastDim<Tx, Ty, ReduceOp, TransformOp, BlockDim>(
         x, y, reducer, transformer, init, reduce_num);
   // reduce_rank == 1 && reduce_dim[0] != x_dim.size() - 1
-  } else if (ReduceType == ReduceType::kReduceHigherDim) {
+  } else if (reduce_type == ReduceType::kReduceHigherDim) {
     ReduceHigherDim<Tx, Ty, ReduceOp, TransformOp>(
         x, y, reducer, transformer, init, reduce_num, left_num, blocking_size);
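
Note: in the hunk above the compile-time ReduceType template parameter becomes a runtime reduce_type argument, so ReduceModule is instantiated once per (Rank, ReduceRank, BlockDim) combination instead of three times, and the host-side switch over reduce types disappears (see the LaunchReduceKernel hunk below). A standalone sketch of that dispatch pattern, not Paddle code; all names here are made up:

```cpp
#include <cstdio>
#include <cuda_runtime.h>

// Standalone sketch of runtime dispatch inside a single kernel instantiation.
// The enum values mirror the 1/2/3 cases of the old CUB_REDUCE_TYPE_CASE switch.
enum ReduceTypeSketch { kLastDim = 1, kHigherDim = 2, kAny = 3 };

__global__ void ReduceDispatchSketch(const float* x, float* y, int n,
                                     int reduce_type) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= n) return;
  if (reduce_type == kLastDim) {
    y[i] = x[i];         // stand-in for the last-dimension path
  } else if (reduce_type == kHigherDim) {
    y[i] = x[i] * 2.0f;  // stand-in for the higher-dimension path
  } else {
    y[i] = x[i] + 1.0f;  // stand-in for the generic path
  }
}

int main() {
  const int n = 8;
  float *x, *y;
  cudaMallocManaged(&x, n * sizeof(float));
  cudaMallocManaged(&y, n * sizeof(float));
  for (int i = 0; i < n; ++i) x[i] = 1.0f;
  // One instantiation serves all three reduce types; the branch is uniform
  // across the launch, so divergence cost is negligible.
  ReduceDispatchSketch<<<1, n>>>(x, y, n, kHigherDim);
  cudaDeviceSynchronize();
  std::printf("y[0] = %f\n", y[0]);  // 2.0 for the higher-dim stand-in
  cudaFree(x);
  cudaFree(y);
  return 0;
}
```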
@@ -558,57 +538,47 @@ __device__ __forceinline__ void ReduceModule(
 }
 
 template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp,
-          int BlockDim, int Rank, int ReduceRank, int ReduceType>
+          int BlockDim, int Rank, int ReduceRank>
 __global__ void ReduceKernelFunction(
     const Tx* x, Ty* y, ReduceOp reducer, TransformOp transformer, Ty init,
-    int reduce_num, int left_num, int block_size,
+    int reduce_num, int left_num, int block_size, int reduce_type,
     paddle::framework::Array<int, Rank> x_strides,
     paddle::framework::Array<int, ReduceRank> reduce_dim,
     paddle::framework::Array<int, ReduceRank> reduce_strides,
     paddle::framework::Array<int, Rank - ReduceRank> left_dim,
     paddle::framework::Array<int, Rank - ReduceRank> left_strides) {
-  ReduceModule<Tx, Ty, ReduceOp, TransformOp, BlockDim, Rank, ReduceRank,
-               ReduceType>(x, y, reducer, transformer, init, reduce_num,
-                           left_num, block_size, x_strides, reduce_dim,
-                           reduce_strides, left_dim, left_strides);
+  ReduceModule<Tx, Ty, ReduceOp, TransformOp, BlockDim, Rank, ReduceRank>(
+      x, y, reducer, transformer, init, reduce_num, left_num, block_size,
+      reduce_type, x_strides, reduce_dim, reduce_strides, left_dim,
+      left_strides);
 }
 
-template <typename Tx, typename Ty, int BlockDim, typename ReduceOp,
-          typename TransformOp, int kRank, int kReduceRank>
-static void LaunchKernel(const Tx* x_data, Ty* y_data, const ReduceOp& reducer,
-                         const TransformOp& transformer, Ty init,
-                         gpuStream_t stream, ReduceConfig<Ty> config) {
-#define CUB_REDUCE_TYPE_CASE(type)                                           \
-  case type: {                                                               \
-    constexpr auto kReduceType = type;                                       \
-    ReduceKernelFunction<                                                    \
-        Tx, Ty, ReduceOp, TransformOp, BlockDim, kRank, kReduceRank,         \
-        kReduceType><<<config.grid, config.block, 0, stream>>>(              \
-        x_data, config.output_data, reducer, transformer, init,              \
-        config.reduce_num, config.left_num, config.blocking_size,            \
-        detail::VectorToArray<int, kRank>(config.x_strides),                 \
-        detail::VectorToArray<int, kReduceRank>(config.reduce_dim),          \
-        detail::VectorToArray<int, kReduceRank>(config.reduce_strides),      \
-        detail::VectorToArray<int, kRank - kReduceRank>(config.left_dim),    \
-        detail::VectorToArray<int, kRank - kReduceRank>(config.left_strides)); \
-  } break
-
-  switch (config.reduce_type) {
-    CUB_REDUCE_TYPE_CASE(1);  // reduceLastDim
-    CUB_REDUCE_TYPE_CASE(2);  // ReduceHigherDim
-    CUB_REDUCE_TYPE_CASE(3);  // reduceAny
-  }
+template <typename Tx, typename Ty, int BlockDim, typename ReduceOp, int kRank,
+          int kReduceRank>
+static void LaunchReduceKernel(const Tx* x_data, Ty* y_data,
+                               const ReduceOp& reducer, Ty init,
+                               gpuStream_t stream, ReduceConfig<Ty> config) {
+  using TransformOp = typename ReduceOp::Transformer;
+
+  ReduceKernelFunction<Tx, Ty, ReduceOp, TransformOp, BlockDim, kRank,
+                       kReduceRank><<<config.grid, config.block, 0, stream>>>(
+      x_data, config.output_data, reducer, TransformOp(config.reduce_num), init,
+      config.reduce_num, config.left_num, config.blocking_size,
+      config.reduce_type, detail::VectorToArray<int, kRank>(config.x_strides),
+      detail::VectorToArray<int, kReduceRank>(config.reduce_dim),
+      detail::VectorToArray<int, kReduceRank>(config.reduce_strides),
+      detail::VectorToArray<int, kRank - kReduceRank>(config.left_dim),
+      detail::VectorToArray<int, kRank - kReduceRank>(config.left_strides));
 
   if (config.should_reduce_again) {
     dim3 block(config.block.x, 1, 1);
     dim3 grid(config.grid.x, 1, config.grid.z);
-    ReduceKernelFunction<
-        Ty, Ty, ReduceOp, detail::IdentityFunctor<Ty>, 128, kRank, kReduceRank,
-        ReduceType::kReduceHigherDim><<<grid, block, 0, stream>>>(
+    ReduceKernelFunction<Ty, Ty, ReduceOp, detail::IdentityFunctor<Ty>, 128,
+                         kRank, kReduceRank><<<grid, block, 0, stream>>>(
         config.output_data, y_data, reducer,
         detail::IdentityFunctor<Ty>(config.grid.y), init, config.grid.y,
-        config.left_num, config.grid.y,
+        config.left_num, config.grid.y, ReduceType::kReduceHigherDim,
        detail::VectorToArray<int, kRank>(config.x_strides),
        detail::VectorToArray<int, kReduceRank>(config.reduce_dim),
        detail::VectorToArray<int, kReduceRank>(config.reduce_strides),
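
Note: LaunchReduceKernel no longer receives a TransformOp argument; it pulls the type from ReduceOp::Transformer and constructs it from config.reduce_num on the spot. The DivideFunctor fragment near the top of this file (its `T n_inv;` member) suggests what such a transformer looks like for mean-style reductions; the sketch below is an illustrative reconstruction under that assumption, not the file's actual definition.

```cpp
// Illustrative reconstruction, assuming DivideFunctor's role from its n_inv
// member: a per-element transform constructible from reduce_num, applied
// before the binary reduction (turning a sum into a mean, for example).
template <typename T>
struct DivideTransformerSketch {
  explicit DivideTransformerSketch(int n)
      : n_inv_(static_cast<T>(1.0) / static_cast<T>(n)) {}

  __host__ __device__ __forceinline__ T operator()(const T& x) const {
    return x * n_inv_;
  }

  T n_inv_;
};
```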
@@ -617,11 +587,9 @@ static void LaunchKernel(const Tx* x_data, Ty* y_data, const ReduceOp& reducer,
   }
 }
 
-template <typename Tx, typename Ty, int BlockDim, typename ReduceOp,
-          typename TransformOp>
-static void LaunchReduceKernel(const Tx* x_data, Ty* y_data,
-                               const ReduceOp& reducer,
-                               const TransformOp& transformer, Ty init,
-                               gpuStream_t stream, ReduceConfig<Ty> config) {
+template <typename Tx, typename Ty, int BlockDim, typename ReduceOp>
+static void ReduceKernelImpl(const Tx* x_data, Ty* y_data,
+                             const ReduceOp& reducer, Ty init,
+                             gpuStream_t stream, ReduceConfig<Ty> config) {
   int reduce_rank = config.reduce_strides.size();
   int rank = config.x_strides.size();
@@ -635,8 +603,8 @@ static void LaunchReduceKernel(const Tx* x_data, Ty* y_data,
 #define CUB_REDUCE_RANK_CASE(i, ...)                                         \
   case i: {                                                                  \
     constexpr auto kReduceRank = i;                                          \
-    LaunchKernel<Tx, Ty, BlockDim, ReduceOp, TransformOp, kRank, kReduceRank>( \
-        x_data, y_data, reducer, transformer, init, stream, config);         \
+    LaunchReduceKernel<Tx, Ty, BlockDim, ReduceOp, kRank, kReduceRank>(      \
+        x_data, y_data, reducer, init, stream, config);                      \
   } break
 
   detail::CheckReduceRank(reduce_rank, rank);
@@ -671,15 +639,13 @@ void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y,
   auto config = ReduceConfig<Ty>(origin_reduce_dims, x_dim);
   config.Run();  // get the parameters of LaunchReduceKernel
 
-  auto x_data = x.data<Tx>();
-  auto y_data = y->mutable_data<Ty>(x.place());
-
   // after config.run()
   // SetOutputData for ReduceHigherDim when should_reduce_again is true,
   // temp_output should be stored temp_data in output_data space or stored in
   // y_data;
   framework::Tensor tmp;
-  config.SetOutputData(y_data, x.place(), &tmp);
+  auto x_data = x.data<Tx>();
+  auto y_data = y->mutable_data<Ty>(x.place());
 
   if (config.reduce_num == 1) {
     auto out_dims = y->dims();
@@ -687,6 +653,9 @@ void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y,
     y->Resize(out_dims);
     return;
   }
 
+  config.SetOutputData(y_data, x.place(), &tmp);
+
   using TransformOp = typename ReduceOp<Tx, Ty>::Transformer;
   auto reducer = ReduceOp<Tx, Ty>();
   // launch CUB::Reduce
@@ -711,9 +680,8 @@ void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y,
 #define CUB_BLOCK_DIM_CASE(block_dim)                                        \
   case block_dim: {                                                          \
     constexpr auto kBlockDim = block_dim;                                    \
-    LaunchReduceKernel<Tx, Ty, block_dim, ReduceOp<Tx, Ty>, TransformOp>(    \
-        x_data, y_data, reducer, TransformOp(config.reduce_num),             \
-        reducer.initial(), stream, config);                                  \
+    ReduceKernelImpl<Tx, Ty, block_dim, ReduceOp<Tx, Ty>>(                   \
+        x_data, y_data, reducer, reducer.initial(), stream, config);         \
   } break
 
   switch (detail::GetBlockDim(config.reduce_num)) {
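
Note: the CUB_BLOCK_DIM_CASE switch above maps a runtime block size (chosen by detail::GetBlockDim from reduce_num) onto a compile-time BlockDim template argument, so ReduceKernelImpl can be specialized per power of two. A standalone C++ sketch of that dispatch pattern, with made-up names and printf standing in for the kernel launch:

```cpp
#include <cstdio>

// Standalone sketch of runtime-to-compile-time block-dim dispatch.
// In the real file the case body launches ReduceKernelImpl<..., BlockDim, ...>;
// here it only reports which specialization was picked.
template <int BlockDim>
void LaunchWithBlockDimSketch() {
  std::printf("would launch the BlockDim = %d specialization\n", BlockDim);
}

void DispatchBlockDimSketch(int block_dim) {
  switch (block_dim) {
    case 512: LaunchWithBlockDimSketch<512>(); break;
    case 256: LaunchWithBlockDimSketch<256>(); break;
    case 128: LaunchWithBlockDimSketch<128>(); break;
    case 64:  LaunchWithBlockDimSketch<64>();  break;
    default:  LaunchWithBlockDimSketch<32>();  break;
  }
}

int main() {
  DispatchBlockDimSketch(256);  // picks the 256-thread instantiation
  return 0;
}
```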
@@ -745,30 +713,5 @@ struct TensorReduceFunc {
   }
 };
 
-template <typename T, template <typename, typename> class ReduceOp>
-class ReduceCudaKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    bool reduce_all = context.Attr<bool>("reduce_all");
-    const Tensor* input = context.Input<Tensor>("X");
-    Tensor* output = context.Output<Tensor>("Out");
-    auto out_dtype = context.Attr<int>("out_dtype");
-    std::vector<int> dims = context.Attr<std::vector<int>>("dim");
-
-    std::vector<int> reduce_dims =
-        detail::GetReduceDim(dims, input->dims().size(), reduce_all);
-
-    gpuStream_t stream = context.cuda_device_context().stream();
-    if (out_dtype >= 0) {
-      framework::VisitDataTypeSmall(
-          static_cast<framework::proto::VarType::Type>(out_dtype),
-          TensorReduceFunc<T, ReduceOp>(*input, output, reduce_dims, stream));
-    } else {
-      TensorReduceFunctorImpl<T, T, ReduceOp>(*input, output, reduce_dims,
-                                              stream);
-    }
-  }
-};
-
 }  // namespace operators
 }  // namespace paddle
@@ -23,6 +23,9 @@ limitations under the License. */
 #include "paddle/fluid/operators/cast_op.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/reduce_ops/reduce_op_function.h"
+#if defined(__HIPCC__) || defined(__NVCC__)
+#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
+#endif
 
 namespace paddle {
 namespace operators {
@@ -60,6 +63,27 @@ inline void GetShuffledDim(const DDim& src_dims, DDim* dst_dims,
   }
 }
 
+static inline std::vector<int> GetReduceDim(const std::vector<int>& dims,
+                                            int dim_size, bool reduce_all) {
+  std::vector<int> reduce_dims;
+  if (reduce_all) {
+    reduce_dims.resize(dim_size);
+    int reduce_size = reduce_dims.size();
+    for (int i = 0; i < reduce_size; ++i) {
+      reduce_dims[i] = i;
+    }
+  } else {
+    for (auto e : dims) {
+      PADDLE_ENFORCE_LT(e, dim_size,
+                        paddle::platform::errors::InvalidArgument(
+                            "ReduceOp: invalid axis, when x_dims is %d, "
+                            "axis[i] should less than x_dims, but got %d.",
+                            dim_size, e));
+      reduce_dims.push_back(e >= 0 ? e : e + dim_size);
+    }
+  }
+  return reduce_dims;
+}
+
 template <typename DeviceContext, typename OutT>
 void GetShuffledInput(const framework::ExecutionContext& context,
                       const Tensor* input, Tensor* shuffled_input,
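
Note: GetReduceDim, moved into this header from the CUDA reduce header (with the signed/unsigned loop comparison tidied via the new reduce_size variable), normalizes the user-supplied axis list: reduce_all enumerates every axis, and negative axes are wrapped by adding dim_size. A self-contained re-implementation of just that normalization, useful for a quick behavioral check (assert stands in for PADDLE_ENFORCE_LT; the function name is mine):

```cpp
#include <cassert>
#include <cstdio>
#include <vector>

// Standalone re-implementation of the normalization in GetReduceDim above.
std::vector<int> NormalizeReduceDims(const std::vector<int>& dims, int dim_size,
                                     bool reduce_all) {
  std::vector<int> reduce_dims;
  if (reduce_all) {
    for (int i = 0; i < dim_size; ++i) reduce_dims.push_back(i);
  } else {
    for (int e : dims) {
      assert(e < dim_size && "axis must be smaller than the input rank");
      reduce_dims.push_back(e >= 0 ? e : e + dim_size);
    }
  }
  return reduce_dims;
}

int main() {
  // For a 4-D input, axis -1 wraps to 3 and axis 1 stays as-is.
  auto partial = NormalizeReduceDims({-1, 1}, 4, /*reduce_all=*/false);
  std::printf("partial: %d %d\n", partial[0], partial[1]);  // 3 1

  // reduce_all enumerates every axis: 0 1 2 3.
  auto all = NormalizeReduceDims({}, 4, /*reduce_all=*/true);
  for (int d : all) std::printf("%d ", d);
  std::printf("\n");
  return 0;
}
```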
@@ -308,6 +332,7 @@ class BoolReduceKernel : public framework::OpKernel<OutT> {
     }
   }
 };
+
 template <typename DeviceContext, typename T, typename Functor,
           bool kNoNeedBufferX = false, bool kNoNeedBufferY = false>
 class ReduceGradKernel : public framework::OpKernel<T> {
@@ -636,6 +661,33 @@ If reduce_all is true, just reduce along all dimensions and output a scalar.
   virtual std::string GetOpType() const = 0;
 };
 
+#if defined(__HIPCC__) || defined(__NVCC__)
+template <typename T, template <typename, typename> class ReduceOp>
+class ReduceCudaKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    bool reduce_all = context.Attr<bool>("reduce_all");
+    const Tensor* input = context.Input<Tensor>("X");
+    Tensor* output = context.Output<Tensor>("Out");
+    auto out_dtype = context.Attr<int>("out_dtype");
+    std::vector<int> dims = context.Attr<std::vector<int>>("dim");
+
+    std::vector<int> reduce_dims =
+        GetReduceDim(dims, input->dims().size(), reduce_all);
+
+    gpuStream_t stream = context.cuda_device_context().stream();
+    if (out_dtype >= 0) {
+      framework::VisitDataTypeSmall(
+          static_cast<framework::proto::VarType::Type>(out_dtype),
+          TensorReduceFunc<T, ReduceOp>(*input, output, reduce_dims, stream));
+    } else {
+      TensorReduceFunctorImpl<T, T, ReduceOp>(*input, output, reduce_dims,
+                                              stream);
+    }
+  }
+};
+#endif
+
 }  // namespace operators
 }  // namespace paddle
@@ -16,18 +16,8 @@
 #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
 #include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h"
 
 // reduce_prod
-#ifdef __HIPCC__
-// Eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h:922
-// do not support double in HIPCC platform (Eigen3 to be fixed)
-REGISTER_OP_CUDA_KERNEL(
-    reduce_prod, ops::ReduceCudaKernel<float, paddle::operators::CustomMul>,
-    ops::ReduceCudaKernel<int, paddle::operators::CustomMul>,
-    ops::ReduceCudaKernel<int64_t, paddle::operators::CustomMul>);
-#else
 REGISTER_OP_CUDA_KERNEL(
     reduce_prod, ops::ReduceCudaKernel<float, paddle::operators::CustomMul>,
     ops::ReduceCudaKernel<int, paddle::operators::CustomMul>,
     ops::ReduceCudaKernel<double, paddle::operators::CustomMul>,
     ops::ReduceCudaKernel<int64_t, paddle::operators::CustomMul>);
-#endif