Unverified commit 6a9fac14, authored by niuliling123, committed by GitHub

modified reduce_sum_op and reduce_mean_op for higher_performance (#32885)

Parent: bb01b120
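This commit replaces the hand-written ReduceSumKernel/ReduceMeanKernel CUDA kernels with the shared ReduceCudaKernel path and threads a separate accumulation type (MPType) through the device-side reduce helpers, so low-precision inputs such as float16 can accumulate in a wider type. The following is a minimal host-side sketch of that accumulation pattern, not the patched kernel itself; MPTypeTraitSketch and ReduceSketch are hypothetical names (the real trait is details::MPTypeTrait from fp16_type_traits.h).

// Minimal sketch: reduce in the wide MPType, cast back to Ty only at the end.
#include <cstddef>

template <typename T>
struct MPTypeTraitSketch {
  using Type = T;  // default: accumulate in the value type itself
};
// A half-precision specialization would widen to float, e.g.
//   template <> struct MPTypeTraitSketch<platform::float16> { using Type = float; };

template <typename Ty, typename ReduceOp>
Ty ReduceSketch(const Ty* x, std::size_t n, ReduceOp reducer) {
  using MPType = typename MPTypeTraitSketch<Ty>::Type;
  MPType acc = MPType(0);
  for (std::size_t i = 0; i < n; ++i) {
    acc = reducer(acc, static_cast<MPType>(x[i]));  // accumulate in the wide type
  }
  return static_cast<Ty>(acc);  // narrow only when writing the result
}

Usage would look like: float out = ReduceSketch(data, 4, [](float a, float b) { return a + b; });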
@@ -15,7 +15,6 @@
#include "paddle/fluid/operators/reduce_ops/reduce_all_op.h"
#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h"
// reduce_prod
REGISTER_OP_CUDA_KERNEL(
reduce_all,
ops::ReduceCudaKernel<bool, paddle::operators::CustomLogicalAnd>);
@@ -16,7 +16,6 @@
#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h"
#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
// reduce_prod
REGISTER_OP_CUDA_KERNEL(
reduce_any,
ops::ReduceCudaKernel<bool, paddle::operators::CustomLogicalOr>);
@@ -13,58 +13,11 @@
// limitations under the License.
#include <vector>
#include "paddle/fluid/operators/reduce_ops/cub_reduce.h"
#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h"
#include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h"
#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
namespace paddle {
namespace operators {
template <typename T>
struct DivideFunctor {
HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {}
HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; }
private:
T n_inv;
};
template <typename T>
class ReduceMeanKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
bool reduce_all = context.Attr<bool>("reduce_all");
auto* input = context.Input<Tensor>("X");
auto* output = context.Output<Tensor>("Out");
auto dims = context.Attr<std::vector<int>>("dim");
bool keep_dim = context.Attr<bool>("keep_dim");
std::vector<int> reduce_dims;
if (reduce_all) {
reduce_dims.resize(input->dims().size());
for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i;
} else {
for (auto e : dims) {
reduce_dims.push_back(e >= 0 ? e : e + input->dims().size());
}
}
int reduce_num = 1;
for (int i = 0; i < reduce_dims.size(); ++i) {
reduce_num *= input->dims()[reduce_dims[i]];
}
auto stream = context.cuda_device_context().stream();
TensorReduce<T, T, cub::Sum, DivideFunctor<T>>(
*input, output, reduce_dims, static_cast<T>(0), cub::Sum(),
DivideFunctor<T>(reduce_num), stream);
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(reduce_mean, ops::ReduceMeanKernel<bool>,
ops::ReduceMeanKernel<float>,
ops::ReduceMeanKernel<double>);
REGISTER_OP_CUDA_KERNEL(
reduce_mean, ops::ReduceCudaKernel<bool, paddle::operators::CustomMean>,
ops::ReduceCudaKernel<float, paddle::operators::CustomMean>,
ops::ReduceCudaKernel<double, paddle::operators::CustomMean>);
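The reduce_mean change above deletes the bespoke ReduceMeanKernel and its DivideFunctor and registers the shared ReduceCudaKernel with a CustomMean functor instead. CustomMean itself is not shown in this diff (it lives in reduce_functor_op.h); the sketch below is only an assumption about its general shape, inferred from how the patch later uses reducer.initial() and typename ReduceOp::Transformer. All "Sketch"-suffixed names are invented for illustration.

// Hypothetical sketch of a mean reduce functor in the style this patch expects.
template <typename T>
struct DivideFunctorSketch {
  explicit DivideFunctorSketch(int n) : n_inv(static_cast<T>(1.0f / n)) {}
  __host__ __device__ T operator()(const T& x) const { return x * n_inv; }

 private:
  T n_inv;
};

template <typename Tx, typename Ty>
struct CustomMeanSketch {
  // Pre-divides every element by reduce_num, so the reduction itself is a sum.
  using Transformer = DivideFunctorSketch<Tx>;

  Ty initial() const { return static_cast<Ty>(0.0f); }

  __host__ __device__ Ty operator()(const Ty& a, const Ty& b) const {
    return a + b;
  }
};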
@@ -33,6 +33,7 @@ namespace cub = hipcub;
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/fast_divmod.h"
@@ -145,7 +146,6 @@ using Tensor = framework::Tensor;
constexpr int kMaxRank = framework::DDim::kMaxRank;
enum ReduceType {
kReduceAll = 0x00, // when reduce_rank == x_rank
kReduceLastDim = 0x01, // when reduce_dim[0] == x_dim.size() - 1;
kReduceHigherDim = 0x02, // ReduceFirstDim or reduceSecondDim
kReduceAny = 0x03, // when reduce_dim.size() > 1
@@ -339,15 +339,11 @@ struct ReduceConfig {
void SetReduceType() {
int rank = x_dim.size();
int reduce_rank = reduce_dim.size();
bool is_large_enough = (reduce_num > REDUCE_SPLIT_BOUNDARY / 2) ||
(left_num > REDUCE_SPLIT_BOUNDARY);
if (rank == reduce_rank) {
reduce_type = static_cast<int>(ReduceType::kReduceAll);
} else if (rank == 2 && reduce_rank == 1 && reduce_dim[0] == 1) {
bool is_last_dim =
(rank == 2) && (reduce_rank == 1) && (reduce_dim[0] == 1);
if (rank == reduce_rank || is_last_dim) {
reduce_type = static_cast<int>(ReduceType::kReduceLastDim);
} else if (reduce_rank == 1 &&
((rank == 2 && is_large_enough) || rank != 2)) {
} else if (reduce_rank == 1) {
// ReduceFirstDim and reduceSecondDim
reduce_type = static_cast<int>(ReduceType::kReduceHigherDim);
} else {
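For clarity, the simplified dispatch that SetReduceType performs after this change can be restated as a small standalone function. This is a sketch for readability only; the member function shown above is the real implementation.

// Standalone restatement of the new SetReduceType decision. The old
// is_large_enough heuristic is gone: a rank-2 tensor reduced over its last
// dimension now always takes the kReduceLastDim path.
static int ChooseReduceTypeSketch(int rank, int reduce_rank,
                                  int first_reduce_dim) {
  bool is_last_dim =
      (rank == 2) && (reduce_rank == 1) && (first_reduce_dim == 1);
  if (rank == reduce_rank || is_last_dim) {
    return static_cast<int>(ReduceType::kReduceLastDim);
  } else if (reduce_rank == 1) {
    // reduce over a single non-trailing dimension
    return static_cast<int>(ReduceType::kReduceHigherDim);
  }
  return static_cast<int>(ReduceType::kReduceAny);
}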
@@ -577,14 +573,15 @@ static __device__ T BlockYReduce(T val, ReduceOp reducer) {
// eg: x_dim = {nz, ny, nx}, nx != 1, axis can be 0 or 1
// if axis = 1 then grid.z = nz, grid.y = ny / block_size, grid.x = nx / 32
// else grid.z = 1, grid.y = ny / block_size, grid.x = nx /32
template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp>
template <typename Tx, typename Ty, typename MPType, typename ReduceOp,
typename TransformOp>
__device__ void ReduceHigherDim(const Tx* x, Ty* y, ReduceOp reducer,
TransformOp transformer, Ty init,
TransformOp transformer, MPType init,
int reduce_num, int left_num, int block_size) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int idy = blockIdx.y * block_size;
Ty reduce_var = init;
MPType reduce_var = init;
if (idx < left_num) {
int loop = reduce_num - idy;
@@ -592,24 +589,24 @@ __device__ void ReduceHigherDim(const Tx* x, Ty* y, ReduceOp reducer,
for (int iy = 0; iy < loop; iy++) {
int id = (idy + iy) * left_num + idx + blockIdx.z * reduce_num * left_num;
reduce_var = reducer(reduce_var, static_cast<Ty>(transformer(x[id])));
reduce_var = reducer(reduce_var, static_cast<MPType>(transformer(x[id])));
}
y[idx + blockIdx.y * left_num + blockIdx.z * gridDim.y * left_num] =
reduce_var;
static_cast<Ty>(reduce_var);
}
}
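To make the indexing in ReduceHigherDim concrete, the host-side sketch below computes one output element for an input laid out as {nz, ny, nx} with axis 1 reduced (reduce_num = ny, left_num = nx). The real kernel additionally tiles the reduced dimension by blocking_size and may require a second pass; this sketch omits both and uses a plain sum as the example reducer.

// Host-side sketch of the offset arithmetic used by ReduceHigherDim for a
// single output element. nz_index plays the role of blockIdx.z and idx the
// role of the global thread index along x; illustrative only.
template <typename T>
T HigherDimReduceOneOutputSketch(const T* x, int nz_index, int reduce_num,
                                 int left_num, int idx) {
  T acc = T(0);
  for (int iy = 0; iy < reduce_num; ++iy) {
    // Same flattened offset as in the kernel:
    // id = (idy + iy) * left_num + idx + blockIdx.z * reduce_num * left_num
    int id = iy * left_num + idx + nz_index * reduce_num * left_num;
    acc += x[id];
  }
  return acc;
}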
// when reduce_dim.size() == 1 and reduce_dim[0] == x_dim.size() - 1, or
// when reduce_dim.size() != 1 and reduce_dim.size() != x_dim.size(), this
// function will be used
template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp,
typename ReduceIndexCal, typename LeftIndexCal>
template <typename Tx, typename Ty, typename MPType, typename ReduceOp,
typename TransformOp>
__device__ void ReduceAny(const Tx* x, Ty* y, ReduceOp reducer,
TransformOp transformer, Ty init, int reduce_num,
TransformOp transformer, MPType init, int reduce_num,
int left_num, bool reduce_lastdim,
ReduceIndexCal reduce_index_calculator,
LeftIndexCal left_index_calculator) {
const IndexCalculator& reduce_index_calculator,
const IndexCalculator& left_index_calculator) {
int input_idx, left_idx, stride;
// the last dim gets involved in reduction
if (reduce_lastdim) {
@@ -622,9 +619,9 @@ __device__ void ReduceAny(const Tx* x, Ty* y, ReduceOp reducer,
stride = gridDim.y * blockDim.y;
}
// calculate the offset, means the addr where each thread really start.
int input_offset = left_index_calculator(left_idx);
int input_offset = left_index_calculator.Get(left_idx);
const Tx* input = x + input_offset;
Ty reduce_var = init;
MPType reduce_var = init;
// 1. reduce for each thread
if (left_idx < left_num) {
@@ -635,12 +632,13 @@ __device__ void ReduceAny(const Tx* x, Ty* y, ReduceOp reducer,
#pragma unroll
for (int i = 0; i < REDUCE_VEC_SIZE; ++i) {
int reduce_idx = input_idx + i * stride;
int idx_x = reduce_index_calculator(reduce_idx);
int idx_x = reduce_index_calculator.Get(reduce_idx);
input_reg[i] = input[idx_x];
}
#pragma unroll
for (int i = 0; i < REDUCE_VEC_SIZE; ++i) {
reduce_var = reducer(reduce_var, transformer(input_reg[i]));
reduce_var =
reducer(reduce_var, static_cast<MPType>(transformer(input_reg[i])));
}
input_idx += REDUCE_VEC_SIZE * stride;
}
@@ -653,7 +651,7 @@ __device__ void ReduceAny(const Tx* x, Ty* y, ReduceOp reducer,
break;
}
int reduce_idx = input_idx;
int idx_x = reduce_index_calculator(reduce_idx);
int idx_x = reduce_index_calculator.Get(reduce_idx);
input_reg[i] = input[idx_x];
input_idx += stride;
}
@@ -663,7 +661,8 @@ __device__ void ReduceAny(const Tx* x, Ty* y, ReduceOp reducer,
if (input_idx >= reduce_num) {
break;
}
reduce_var = reducer(reduce_var, transformer(input_reg[i]));
reduce_var =
reducer(reduce_var, static_cast<MPType>(transformer(input_reg[i])));
input_idx += stride;
}
}
@@ -678,63 +677,56 @@ __device__ void ReduceAny(const Tx* x, Ty* y, ReduceOp reducer,
// 3. reduce in block x
reduce_var = BlockXReduce(reduce_var, reducer);
if (left_idx < left_num && threadIdx.x == 0) {
y[blockIdx.y * left_num + left_idx] = reduce_var;
y[blockIdx.y * left_num + left_idx] = static_cast<Ty>(reduce_var);
}
} else {
if (left_idx < left_num && threadIdx.y == 0) {
y[blockIdx.y * left_num + left_idx] = reduce_var;
y[blockIdx.y * left_num + left_idx] = static_cast<Ty>(reduce_var);
}
}
}
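ReduceAny now takes concrete IndexCalculator objects by const reference instead of lambdas and calls their Get method to map a logical reduce/left index onto an offset into x. The class itself is not shown in this hunk; the sketch below is an assumption about what Get conceptually computes, using plain integer division for readability where the real class relies on platform::FastDivMod.

// Hedged, simplified sketch of the mapping IndexCalculator::Get appears to
// perform: decompose a logical index over the reduce (or left) dims using
// their strides, then re-project each coordinate through the corresponding
// stride of the full input tensor.
struct IndexCalculatorSketch {
  int rank;
  int strides[kMaxRank];    // strides of the logical reduce/left space
  int x_strides[kMaxRank];  // strides of the original input tensor

  __host__ __device__ int Get(int logical_idx) const {
    int offset = 0;
    for (int i = 0; i < rank; ++i) {
      int coord = logical_idx / strides[i];  // coordinate along dim i
      logical_idx -= coord * strides[i];
      offset += coord * x_strides[i];        // offset contribution in x
    }
    return offset;
  }
};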
// module function designed for global function
template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp>
template <typename Tx, typename Ty, typename MPType, typename ReduceOp,
typename TransformOp>
__device__ void ReduceModule(const Tx* x, Ty* y, ReduceOp reducer,
TransformOp transformer, Ty init, int reduce_num,
int left_num, int blocking_size, int reduce_type,
bool reduce_lastdim,
TransformOp transformer, MPType init,
int reduce_num, int left_num, int blocking_size,
int reduce_type, bool reduce_lastdim,
const IndexCalculator& reduce_index_calculator,
const IndexCalculator& left_index_calculator) {
if (reduce_type == ReduceType::kReduceLastDim) {
ReduceAny<Tx, Ty, ReduceOp, TransformOp>(
if (reduce_type == ReduceType::kReduceLastDim ||
reduce_type == ReduceType::kReduceAny) {
ReduceAny<Tx, Ty, MPType, ReduceOp, TransformOp>(
x, y, reducer, transformer, init, reduce_num, left_num, reduce_lastdim,
[&](int idx) { return idx; },
[&](int idx) { return idx * reduce_num; });
reduce_index_calculator, left_index_calculator);
// reduce_rank == 1 && reduce_dim[0] != x_dim.size() - 1
} else if (reduce_type == ReduceType::kReduceHigherDim) {
ReduceHigherDim<Tx, Ty, ReduceOp, TransformOp>(
ReduceHigherDim<Tx, Ty, MPType, ReduceOp, TransformOp>(
x, y, reducer, transformer, init, reduce_num, left_num, blocking_size);
// reduce_rank >= 2
} else {
ReduceAny<Tx, Ty, ReduceOp, TransformOp>(
x, y, reducer, transformer, init, reduce_num, left_num, reduce_lastdim,
[&](int idx) { return reduce_index_calculator.Get(idx); },
[&](int idx) { return left_index_calculator.Get(idx); });
}
}
template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp>
template <typename Tx, typename Ty, typename MPType, typename ReduceOp,
typename TransformOp>
__global__ void ReduceKernelFunction(const Tx* x, Ty* y, ReduceOp reducer,
TransformOp transformer, Ty init,
TransformOp transformer, MPType init,
int reduce_num, int left_num,
int blocking_size, int reduce_type,
bool reduce_lastdim,
IndexCalculator reduce_index_calculator,
IndexCalculator left_index_calculator) {
ReduceModule<Tx, Ty, ReduceOp, TransformOp>(
ReduceModule<Tx, Ty, MPType, ReduceOp, TransformOp>(
x, y, reducer, transformer, init, reduce_num, left_num, blocking_size,
reduce_type, reduce_lastdim, reduce_index_calculator,
left_index_calculator);
}
template <typename Tx, typename Ty, typename ReduceOp>
template <typename Tx, typename Ty, typename MPType, typename ReduceOp>
static void LaunchReduceKernel(const Tx* x_data, Ty* y_data,
const ReduceOp& reducer, Ty init,
const ReduceOp& reducer, MPType init,
gpuStream_t stream, ReduceConfig<Ty> config) {
using TransformOp = typename ReduceOp::Transformer;
int reduce_rank = config.reduce_strides.size();
int left_rank = config.left_strides.size();
auto reduce_index_calculator = IndexCalculator(
@@ -742,7 +734,7 @@ static void LaunchReduceKernel(const Tx* x_data, Ty* y_data,
auto left_index_calculator = IndexCalculator(
left_rank, config.left_dim, config.left_strides, config.x_strides);
ReduceKernelFunction<Tx, Ty, ReduceOp,
ReduceKernelFunction<Tx, Ty, MPType, ReduceOp,
TransformOp><<<config.grid, config.block, 0, stream>>>(
x_data, config.output_data, reducer, TransformOp(config.reduce_num), init,
config.reduce_num, config.left_num, config.blocking_size,
@@ -760,10 +752,11 @@ static void LaunchReduceKernel(const Tx* x_data, Ty* y_data,
grid = dim3(config.grid.x, 1, config.grid.z);
}
ReduceKernelFunction<Ty, Ty, ReduceOp, detail::IdentityFunctor<
Ty>><<<grid, block, 0, stream>>>(
ReduceKernelFunction<
Ty, Ty, MPType, ReduceOp,
detail::IdentityFunctor<Ty, MPType>><<<grid, block, 0, stream>>>(
config.output_data, y_data, reducer,
detail::IdentityFunctor<Ty>(config.grid.y), init, config.grid.y,
detail::IdentityFunctor<Ty, MPType>(config.grid.y), init, config.grid.y,
config.left_num, config.grid.y, ReduceType::kReduceHigherDim,
config.reduce_lastdim, reduce_index_calculator, left_index_calculator);
}
@@ -794,11 +787,12 @@ void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y,
}
config.SetOutputData(y_data, x.place(), &tmp);
using TransformOp = typename ReduceOp<Tx, Ty>::Transformer;
auto reducer = ReduceOp<Tx, Ty>();
// launch CUB::Reduce
if (config.reduce_type == static_cast<int>(ReduceType::kReduceAll)) {
bool use_cub_reduce = (config.left_num == 1) &&
(!std::is_same<Tx, paddle::platform::float16>::value);
if (use_cub_reduce) {
// launch CUB::Reduce
using TransformOp = typename ReduceOp<Tx, Ty>::Transformer;
auto reducer = ReduceOp<Tx, Ty>();
cub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(
x_data, TransformOp(config.reduce_num));
size_t temp_storage_bytes = 0;
@@ -816,7 +810,9 @@ void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y,
return;
}
LaunchReduceKernel<Tx, Ty, ReduceOp<Tx, Ty>>(
using MPType = typename details::MPTypeTrait<Ty>::Type;
auto reducer = ReduceOp<Tx, MPType>();
LaunchReduceKernel<Tx, Ty, MPType, ReduceOp<Tx, MPType>>(
x_data, y_data, reducer, reducer.initial(), stream, config);
}
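With this hunk, TensorReduceFunctorImpl takes the cub::DeviceReduce path only for full reductions on non-float16 inputs, and builds the reducer over MPType rather than Ty for everything else. The helper below simply restates the use_cub_reduce condition shown above as a sketch; it is not a new API in the patch.

// Sketch of the dispatch condition: cub handles full reductions
// (left_num == 1) on non-float16 types; float16 goes through
// LaunchReduceKernel, which accumulates in the wider MPType.
#include <type_traits>

template <typename Tx>
bool UseCubReduceSketch(int left_num) {
  return (left_num == 1) &&
         !std::is_same<Tx, paddle::platform::float16>::value;
}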
@@ -11,72 +11,18 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_ops/cub_reduce.h"
#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h"
#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
#include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h"
namespace paddle {
namespace operators {
template <typename Tout>
struct IdentityFunctor {
HOSTDEVICE explicit inline IdentityFunctor() {}
template <typename U>
HOSTDEVICE inline Tout operator()(const U& x) const {
return static_cast<Tout>(x);
}
};
template <typename T>
class ReduceSumKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
bool reduce_all = context.Attr<bool>("reduce_all");
auto* input = context.Input<Tensor>("X");
auto* output = context.Output<Tensor>("Out");
auto out_dtype = context.Attr<int>("out_dtype");
auto dims = context.Attr<std::vector<int>>("dim");
bool keep_dim = context.Attr<bool>("keep_dim");
std::vector<int> reduce_dims;
if (reduce_all) {
reduce_dims.resize(input->dims().size());
for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i;
} else {
for (auto e : dims) {
reduce_dims.push_back(e >= 0 ? e : e + input->dims().size());
}
}
int reduce_num = 1;
for (int i = 0; i < reduce_dims.size(); ++i) {
reduce_num *= input->dims()[reduce_dims[i]];
}
auto stream = context.cuda_device_context().stream();
if (out_dtype >= 0) {
framework::VisitDataTypeSmall(
static_cast<framework::proto::VarType::Type>(out_dtype),
TensorReduceFunctor<T, cub::Sum, IdentityFunctor>(
*input, output, reduce_dims, static_cast<double>(0.0), cub::Sum(),
stream));
} else {
TensorReduce<T, T, cub::Sum, IdentityFunctor<T>>(
*input, output, reduce_dims, static_cast<T>(0), cub::Sum(),
IdentityFunctor<T>(), stream);
}
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(
reduce_sum, ops::ReduceSumKernel<bool>, ops::ReduceSumKernel<float>,
ops::ReduceSumKernel<double>,
ops::ReduceSumKernel<paddle::platform::float16>, ops::ReduceSumKernel<int>,
ops::ReduceSumKernel<int64_t>,
ops::ReduceSumKernel<paddle::platform::complex<float>>,
ops::ReduceSumKernel<paddle::platform::complex<double>>);
reduce_sum, ops::ReduceCudaKernel<bool, paddle::operators::CustomSum>,
ops::ReduceCudaKernel<float, paddle::operators::CustomSum>,
ops::ReduceCudaKernel<double, paddle::operators::CustomSum>,
ops::ReduceCudaKernel<paddle::platform::float16,
paddle::operators::CustomSum>,
ops::ReduceCudaKernel<int, paddle::operators::CustomSum>,
ops::ReduceCudaKernel<int64_t, paddle::operators::CustomSum>,
ops::ReduceCudaKernel<paddle::platform::complex<float>,
paddle::operators::CustomSum>,
ops::ReduceCudaKernel<paddle::platform::complex<double>,
paddle::operators::CustomSum>);