diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu index 99a5caaad6ab802facaec6a3b5c4c5e2384945d4..674326f90c504d2944a9a8d0ec26a099d0eba271 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu @@ -15,7 +15,6 @@ #include "paddle/fluid/operators/reduce_ops/reduce_all_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" -// reduce_prod REGISTER_OP_CUDA_KERNEL( reduce_all, ops::ReduceCudaKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu index c7eafa2ac8760a3edde56a9f2411c6faaac454f1..b7b0eb598249b1d330f8c6a827c55887287702fe 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu @@ -16,7 +16,6 @@ #include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.h" -// reduce_prod REGISTER_OP_CUDA_KERNEL( reduce_any, ops::ReduceCudaKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu index 50d2fcdee23bd9e830f32e0cff4d367c3ad5ba66..b5d5bb33d0a8804fa30720bc2884a029714bab74 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu @@ -13,58 +13,11 @@ // limitations under the License. #include -#include "paddle/fluid/operators/reduce_ops/cub_reduce.h" +#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" -namespace paddle { -namespace operators { - -template -struct DivideFunctor { - HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {} - - HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; } - - private: - T n_inv; -}; - -template -class ReduceMeanKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - bool reduce_all = context.Attr("reduce_all"); - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - - auto dims = context.Attr>("dim"); - bool keep_dim = context.Attr("keep_dim"); - - std::vector reduce_dims; - if (reduce_all) { - reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; - } else { - for (auto e : dims) { - reduce_dims.push_back(e >= 0 ? 
e : e + input->dims().size()); - } - } - - int reduce_num = 1; - for (int i = 0; i < reduce_dims.size(); ++i) { - reduce_num *= input->dims()[reduce_dims[i]]; - } - - auto stream = context.cuda_device_context().stream(); - TensorReduce>( - *input, output, reduce_dims, static_cast(0), cub::Sum(), - DivideFunctor(reduce_num), stream); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL(reduce_mean, ops::ReduceMeanKernel, - ops::ReduceMeanKernel, - ops::ReduceMeanKernel); +REGISTER_OP_CUDA_KERNEL( + reduce_mean, ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index 2dfc2af94116cbe4021cf51ab86cd5ef68d14932..fe77d3158ed27c93c5226e4c55941616b84407e1 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -33,6 +33,7 @@ namespace cub = hipcub; #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/fast_divmod.h" @@ -145,7 +146,6 @@ using Tensor = framework::Tensor; constexpr int kMaxRank = framework::DDim::kMaxRank; enum ReduceType { - kReduceAll = 0x00, // when reduce_rank == x_rank kReduceLastDim = 0x01, // when reduce_dim[0] == x_dim.size() - 1; kReduceHigherDim = 0x02, // ReduceFirstDim or reduceSecondDim kReduceAny = 0x03, // when reduce_dim.size() > 1 @@ -339,15 +339,11 @@ struct ReduceConfig { void SetReduceType() { int rank = x_dim.size(); int reduce_rank = reduce_dim.size(); - bool is_large_enough = (reduce_num > REDUCE_SPLIT_BOUNDARY / 2) || - (left_num > REDUCE_SPLIT_BOUNDARY); - - if (rank == reduce_rank) { - reduce_type = static_cast(ReduceType::kReduceAll); - } else if (rank == 2 && reduce_rank == 1 && reduce_dim[0] == 1) { + bool is_last_dim = + (rank == 2) && (reduce_rank == 1) && (reduce_dim[0] == 1); + if (rank == reduce_rank || is_last_dim) { reduce_type = static_cast(ReduceType::kReduceLastDim); - } else if (reduce_rank == 1 && - ((rank == 2 && is_large_enough) || rank != 2)) { + } else if (reduce_rank == 1) { // ReduceFirstDim and reduceSecondDim reduce_type = static_cast(ReduceType::kReduceHigherDim); } else { @@ -577,14 +573,15 @@ static __device__ T BlockYReduce(T val, ReduceOp reducer) { // eg: x_dim = {nz, ny, nx}, nx != 1, axis can be 0 or 1 // if axis = 1 then grid.z = nz, grid.y = ny / block_size, grid.x = nx / 32 // else grid.z = 1, grid.y = ny / block_size, grid.x = nx /32 -template +template __device__ void ReduceHigherDim(const Tx* x, Ty* y, ReduceOp reducer, - TransformOp transformer, Ty init, + TransformOp transformer, MPType init, int reduce_num, int left_num, int block_size) { int idx = blockIdx.x * blockDim.x + threadIdx.x; int idy = blockIdx.y * block_size; - Ty reduce_var = init; + MPType reduce_var = init; if (idx < left_num) { int loop = reduce_num - idy; @@ -592,24 +589,24 @@ __device__ void ReduceHigherDim(const Tx* x, Ty* y, ReduceOp reducer, for (int iy = 0; iy < loop; iy++) { int id = (idy + iy) * left_num + idx + blockIdx.z * reduce_num * left_num; - reduce_var = reducer(reduce_var, static_cast(transformer(x[id]))); + reduce_var = reducer(reduce_var, static_cast(transformer(x[id]))); } y[idx + blockIdx.y * left_num + blockIdx.z * gridDim.y * left_num] = - reduce_var; + 
static_cast(reduce_var); } } // when reduce_dim.size() == 1 and reduce_dim[0] == x_dim.size() - 1, or // when reduce_dim.size() != 1 and reduce_dim.size() != x_dim.size(), this // function will be used -template +template __device__ void ReduceAny(const Tx* x, Ty* y, ReduceOp reducer, - TransformOp transformer, Ty init, int reduce_num, + TransformOp transformer, MPType init, int reduce_num, int left_num, bool reduce_lastdim, - ReduceIndexCal reduce_index_calculator, - LeftIndexCal left_index_calculator) { + const IndexCalculator& reduce_index_calculator, + const IndexCalculator& left_index_calculator) { int input_idx, left_idx, stride; // the last dim gets involved in reduction if (reduce_lastdim) { @@ -622,9 +619,9 @@ __device__ void ReduceAny(const Tx* x, Ty* y, ReduceOp reducer, stride = gridDim.y * blockDim.y; } // calculate the offset, means the addr where each thread really start. - int input_offset = left_index_calculator(left_idx); + int input_offset = left_index_calculator.Get(left_idx); const Tx* input = x + input_offset; - Ty reduce_var = init; + MPType reduce_var = init; // 1. reduce for each thread if (left_idx < left_num) { @@ -635,12 +632,13 @@ __device__ void ReduceAny(const Tx* x, Ty* y, ReduceOp reducer, #pragma unroll for (int i = 0; i < REDUCE_VEC_SIZE; ++i) { int reduce_idx = input_idx + i * stride; - int idx_x = reduce_index_calculator(reduce_idx); + int idx_x = reduce_index_calculator.Get(reduce_idx); input_reg[i] = input[idx_x]; } #pragma unroll for (int i = 0; i < REDUCE_VEC_SIZE; ++i) { - reduce_var = reducer(reduce_var, transformer(input_reg[i])); + reduce_var = + reducer(reduce_var, static_cast(transformer(input_reg[i]))); } input_idx += REDUCE_VEC_SIZE * stride; } @@ -653,7 +651,7 @@ __device__ void ReduceAny(const Tx* x, Ty* y, ReduceOp reducer, break; } int reduce_idx = input_idx; - int idx_x = reduce_index_calculator(reduce_idx); + int idx_x = reduce_index_calculator.Get(reduce_idx); input_reg[i] = input[idx_x]; input_idx += stride; } @@ -663,7 +661,8 @@ __device__ void ReduceAny(const Tx* x, Ty* y, ReduceOp reducer, if (input_idx >= reduce_num) { break; } - reduce_var = reducer(reduce_var, transformer(input_reg[i])); + reduce_var = + reducer(reduce_var, static_cast(transformer(input_reg[i]))); input_idx += stride; } } @@ -678,63 +677,56 @@ __device__ void ReduceAny(const Tx* x, Ty* y, ReduceOp reducer, // 3. 
reduce in block x reduce_var = BlockXReduce(reduce_var, reducer); if (left_idx < left_num && threadIdx.x == 0) { - y[blockIdx.y * left_num + left_idx] = reduce_var; + y[blockIdx.y * left_num + left_idx] = static_cast(reduce_var); } } else { if (left_idx < left_num && threadIdx.y == 0) { - y[blockIdx.y * left_num + left_idx] = reduce_var; + y[blockIdx.y * left_num + left_idx] = static_cast(reduce_var); } } } // module function designed for global function -template +template __device__ void ReduceModule(const Tx* x, Ty* y, ReduceOp reducer, - TransformOp transformer, Ty init, int reduce_num, - int left_num, int blocking_size, int reduce_type, - bool reduce_lastdim, + TransformOp transformer, MPType init, + int reduce_num, int left_num, int blocking_size, + int reduce_type, bool reduce_lastdim, const IndexCalculator& reduce_index_calculator, const IndexCalculator& left_index_calculator) { - if (reduce_type == ReduceType::kReduceLastDim) { - ReduceAny( + if (reduce_type == ReduceType::kReduceLastDim || + reduce_type == ReduceType::kReduceAny) { + ReduceAny( x, y, reducer, transformer, init, reduce_num, left_num, reduce_lastdim, - [&](int idx) { return idx; }, - [&](int idx) { return idx * reduce_num; }); - + reduce_index_calculator, left_index_calculator); // reduce_rank == 1 && reduce_dim[0] != x_dim.size() - 1 } else if (reduce_type == ReduceType::kReduceHigherDim) { - ReduceHigherDim( + ReduceHigherDim( x, y, reducer, transformer, init, reduce_num, left_num, blocking_size); - - // reduce_rank >= 2 - } else { - ReduceAny( - x, y, reducer, transformer, init, reduce_num, left_num, reduce_lastdim, - [&](int idx) { return reduce_index_calculator.Get(idx); }, - [&](int idx) { return left_index_calculator.Get(idx); }); } } -template +template __global__ void ReduceKernelFunction(const Tx* x, Ty* y, ReduceOp reducer, - TransformOp transformer, Ty init, + TransformOp transformer, MPType init, int reduce_num, int left_num, int blocking_size, int reduce_type, bool reduce_lastdim, IndexCalculator reduce_index_calculator, IndexCalculator left_index_calculator) { - ReduceModule( + ReduceModule( x, y, reducer, transformer, init, reduce_num, left_num, blocking_size, reduce_type, reduce_lastdim, reduce_index_calculator, left_index_calculator); } -template +template static void LaunchReduceKernel(const Tx* x_data, Ty* y_data, - const ReduceOp& reducer, Ty init, + const ReduceOp& reducer, MPType init, gpuStream_t stream, ReduceConfig config) { using TransformOp = typename ReduceOp::Transformer; - int reduce_rank = config.reduce_strides.size(); int left_rank = config.left_strides.size(); auto reduce_index_calculator = IndexCalculator( @@ -742,7 +734,7 @@ static void LaunchReduceKernel(const Tx* x_data, Ty* y_data, auto left_index_calculator = IndexCalculator( left_rank, config.left_dim, config.left_strides, config.x_strides); - ReduceKernelFunction<<>>( x_data, config.output_data, reducer, TransformOp(config.reduce_num), init, config.reduce_num, config.left_num, config.blocking_size, @@ -760,10 +752,11 @@ static void LaunchReduceKernel(const Tx* x_data, Ty* y_data, grid = dim3(config.grid.x, 1, config.grid.z); } - ReduceKernelFunction><<>>( + ReduceKernelFunction< + Ty, Ty, MPType, ReduceOp, + detail::IdentityFunctor><<>>( config.output_data, y_data, reducer, - detail::IdentityFunctor(config.grid.y), init, config.grid.y, + detail::IdentityFunctor(config.grid.y), init, config.grid.y, config.left_num, config.grid.y, ReduceType::kReduceHigherDim, config.reduce_lastdim, reduce_index_calculator, 
left_index_calculator); } @@ -794,11 +787,12 @@ void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y, } config.SetOutputData(y_data, x.place(), &tmp); - - using TransformOp = typename ReduceOp::Transformer; - auto reducer = ReduceOp(); - // launch CUB::Reduce - if (config.reduce_type == static_cast(ReduceType::kReduceAll)) { + bool use_cub_reduce = (config.left_num == 1) && + (!std::is_same::value); + if (use_cub_reduce) { + // launch CUB::Reduce + using TransformOp = typename ReduceOp::Transformer; + auto reducer = ReduceOp(); cub::TransformInputIterator trans_x( x_data, TransformOp(config.reduce_num)); size_t temp_storage_bytes = 0; @@ -816,7 +810,9 @@ void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y, return; } - LaunchReduceKernel>( + using MPType = typename details::MPTypeTrait::Type; + auto reducer = ReduceOp(); + LaunchReduceKernel>( x_data, y_data, reducer, reducer.initial(), stream, config); } diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cu b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cu index efbafe4aa8c3e0f538b972c5f1b2f8f83e11d4a6..27a29a5b09505648a0716f43d12285497cf91678 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cu @@ -11,72 +11,18 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - -#include "paddle/fluid/operators/reduce_ops/cub_reduce.h" +#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h" - -namespace paddle { -namespace operators { - -template -struct IdentityFunctor { - HOSTDEVICE explicit inline IdentityFunctor() {} - - template - HOSTDEVICE inline Tout operator()(const U& x) const { - return static_cast(x); - } -}; - -template -class ReduceSumKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - bool reduce_all = context.Attr("reduce_all"); - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - auto out_dtype = context.Attr("out_dtype"); - - auto dims = context.Attr>("dim"); - bool keep_dim = context.Attr("keep_dim"); - - std::vector reduce_dims; - if (reduce_all) { - reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; - } else { - for (auto e : dims) { - reduce_dims.push_back(e >= 0 ? 
e : e + input->dims().size()); - } - } - - int reduce_num = 1; - for (int i = 0; i < reduce_dims.size(); ++i) { - reduce_num *= input->dims()[reduce_dims[i]]; - } - - auto stream = context.cuda_device_context().stream(); - if (out_dtype >= 0) { - framework::VisitDataTypeSmall( - static_cast(out_dtype), - TensorReduceFunctor( - *input, output, reduce_dims, static_cast(0.0), cub::Sum(), - stream)); - } else { - TensorReduce>( - *input, output, reduce_dims, static_cast(0), cub::Sum(), - IdentityFunctor(), stream); - } - } -}; - -} // namespace operators -} // namespace paddle - REGISTER_OP_CUDA_KERNEL( - reduce_sum, ops::ReduceSumKernel, ops::ReduceSumKernel, - ops::ReduceSumKernel, - ops::ReduceSumKernel, ops::ReduceSumKernel, - ops::ReduceSumKernel, - ops::ReduceSumKernel>, - ops::ReduceSumKernel>); + reduce_sum, ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + paddle::operators::CustomSum>, + ops::ReduceCudaKernel, + paddle::operators::CustomSum>);
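
The registrations in reduce_mean_op.cu and reduce_sum_op.cu now hand a reduce functor (e.g. paddle::operators::CustomSum) to ops::ReduceCudaKernel. reduce_functor_op.h is not part of this diff, so the following is only a sketch of the interface the new code in reduce_op.cu.h relies on (a nested Transformer constructed from reduce_num, an initial() value, and a binary combine operator); the names and bodies below are assumptions, not the real Paddle definitions.

    #include <cuda_runtime.h>

    namespace sketch {

    // Transformer applied to each input element before reduction; the real
    // kernels construct it with reduce_num (see TransformOp(config.reduce_num)).
    template <typename Tx, typename Ty = Tx>
    struct IdentityFunctor {
      __host__ __device__ explicit IdentityFunctor(int /*reduce_num*/) {}
      __host__ __device__ Ty operator()(const Tx& x) const {
        return static_cast<Ty>(x);
      }
    };

    // Shape of a reduce functor as used by LaunchReduceKernel: a nested
    // Transformer, an initial() value, and a binary combine operator.
    template <typename Tx, typename Ty = Tx>
    struct CustomSum {
      using Transformer = IdentityFunctor<Tx, Ty>;
      __host__ __device__ Ty initial() const { return static_cast<Ty>(0); }
      __host__ __device__ Ty operator()(const Ty& a, const Ty& b) const {
        return a + b;
      }
    };

    }  // namespace sketch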
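
The deleted ReduceMeanKernel computed the mean as a sum of pre-scaled elements: DivideFunctor multiplies each input by 1/n before the cub::Sum reduction, so no separate divide pass is needed afterwards. A minimal host-side analogue of that trick (plain C++, not Paddle code):

    #include <cstdio>
    #include <vector>

    // Mirrors the deleted DivideFunctor: pre-scales each element by 1/n so a
    // plain sum reduction yields the mean directly.
    template <typename T>
    struct DivideFunctor {
      explicit DivideFunctor(int n) : n_inv_(static_cast<T>(1.0 / n)) {}
      T operator()(const T& x) const { return x * n_inv_; }

     private:
      T n_inv_;
    };

    int main() {
      std::vector<float> x = {1.f, 2.f, 3.f, 4.f};
      DivideFunctor<float> transform(static_cast<int>(x.size()));
      float mean = 0.f;
      for (float v : x) mean += transform(v);  // sum of x[i] / n == mean
      std::printf("mean = %f\n", mean);        // 2.500000
      return 0;
    }

Presumably CustomMean folds the same 1/reduce_num scaling into its Transformer, which is why the hand-written mean kernel could be dropped; that functor's definition is not shown in this diff.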
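
The reduce_op.cu.h changes thread an MPType (the intermediate-precision type from amp/fp16_type_traits.h) through ReduceHigherDim, ReduceAny and LaunchReduceKernel: the accumulator lives in MPType and is cast back to Ty only on the final store, so float16 inputs are accumulated in float. A standalone CUDA sketch of the idea; the local trait and the one-thread-per-row layout are illustrative assumptions, not the Paddle implementation:

    #include <cuda_fp16.h>

    // Local stand-in for details::MPTypeTrait: half accumulates in float,
    // everything else in its own type.
    template <typename T>
    struct MPTypeTrait {
      using Type = T;
    };
    template <>
    struct MPTypeTrait<__half> {
      using Type = float;
    };

    // One thread per output row: promote on load, accumulate in MPType,
    // demote once on the final store.
    template <typename T>
    __global__ void RowSum(const T* x, T* y, int rows, int reduce_num) {
      using MPType = typename MPTypeTrait<T>::Type;
      int row = blockIdx.x * blockDim.x + threadIdx.x;
      if (row >= rows) return;
      MPType acc = static_cast<MPType>(0);
      for (int i = 0; i < reduce_num; ++i) {
        acc += static_cast<MPType>(x[row * reduce_num + i]);
      }
      y[row] = static_cast<T>(acc);
    }

The real ReduceAny/ReduceHigherDim additionally vectorize loads (REDUCE_VEC_SIZE) and combine per-thread partials with BlockXReduce/BlockYReduce; the promotion and demotion points are the same as above.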
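
With kReduceAll removed, SetReduceType only distinguishes three cases, and the full-reduce case is routed either to cub::DeviceReduce (when every axis is reduced and the input type is not float16, presumably because cub would then also accumulate in half) or to the kReduceLastDim path. The same decision logic, restated as plain host code for readability:

    #include <vector>

    enum ReduceType {
      kReduceLastDim = 0x01,
      kReduceHigherDim = 0x02,
      kReduceAny = 0x03,
    };

    // Same decision tree as the new SetReduceType(): a full reduce
    // (rank == reduce_rank) and a 2-D reduce over the trailing axis share the
    // kReduceLastDim path; any other single-axis reduce is kReduceHigherDim.
    inline ReduceType ChooseReduceType(const std::vector<int>& x_dim,
                                       const std::vector<int>& reduce_dim) {
      int rank = static_cast<int>(x_dim.size());
      int reduce_rank = static_cast<int>(reduce_dim.size());
      bool is_last_dim =
          (rank == 2) && (reduce_rank == 1) && (reduce_dim[0] == 1);
      if (rank == reduce_rank || is_last_dim) return kReduceLastDim;
      if (reduce_rank == 1) return kReduceHigherDim;
      return kReduceAny;
    }

    // cub::DeviceReduce now fires on the data layout rather than on a
    // dedicated kReduceAll type: all axes reduced (left_num == 1) and the
    // element type is not float16.
    inline bool UseCubReduce(int left_num, bool is_float16) {
      return left_num == 1 && !is_float16;
    }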