From e161979ed72736b57f72ddd344c9ff583342fe39 Mon Sep 17 00:00:00 2001
From: niuliling123 <51102941+niuliling123@users.noreply.github.com>
Date: Wed, 8 Jun 2022 17:02:23 +0800
Subject: [PATCH] Replace ReduceAmax/Amax.part.cu with KP (#43202) (#43263)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The original implementations of the reduce amax/amin and frobenius_norm kernels
were Eigen-based and their files were slow to compile, so this PR replaces them
with KP implementations.
Also remove the duplicated functionality support from DefaultElementwiseOperator
to reduce the compile time of the elementwise_double_grad OPs.
---
 .../reduce_ops/reduce_amax_op.part.cu         | 18 ++--
 .../reduce_ops/reduce_amin_op.part.cu         | 18 ++--
 paddle/fluid/operators/reduce_ops/reduce_op.h | 96 ++++++++++++++++++-
 paddle/phi/kernels/funcs/broadcast_function.h | 19 +++-
 .../phi/kernels/gpu/frobenius_norm_kernel.cu  | 22 ++++-
 5 files changed, 150 insertions(+), 23 deletions(-)

diff --git a/paddle/fluid/operators/reduce_ops/reduce_amax_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_amax_op.part.cu
index 27f2e2b70c6..ed6df1e558b 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_amax_op.part.cu
+++ b/paddle/fluid/operators/reduce_ops/reduce_amax_op.part.cu
@@ -12,14 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
+#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
 
-REGISTER_OP_CUDA_KERNEL(
-    reduce_amax_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
-                                            float, ops::AMaxOrAMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
-                          ops::AMaxOrAMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
-                          ops::AMaxOrAMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
-                          ops::AMaxOrAMinGradFunctor>);
+template <typename T>
+using CUDAReduceMaxGradKernel =
+    ops::ReduceCudaAMaxAMinGradKernel<T, kps::IdentityFunctor>;
+REGISTER_OP_CUDA_KERNEL(reduce_amax_grad, CUDAReduceMaxGradKernel<int>,
+                        CUDAReduceMaxGradKernel<int64_t>,
+                        CUDAReduceMaxGradKernel<float>,
+                        CUDAReduceMaxGradKernel<double>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_amin_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_amin_op.part.cu
index a296c4c5d6f..69854da3c4f 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_amin_op.part.cu
+++ b/paddle/fluid/operators/reduce_ops/reduce_amin_op.part.cu
@@ -12,14 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
+#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
 
-REGISTER_OP_CUDA_KERNEL(
-    reduce_amin_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
-                                            float, ops::AMaxOrAMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
-                          ops::AMaxOrAMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
-                          ops::AMaxOrAMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
-                          ops::AMaxOrAMinGradFunctor>);
+template <typename T>
+using CUDAReduceMinGradKernel =
+    ops::ReduceCudaAMaxAMinGradKernel<T, kps::IdentityFunctor>;
+REGISTER_OP_CUDA_KERNEL(reduce_amin_grad, CUDAReduceMinGradKernel<int>,
+                        CUDAReduceMinGradKernel<int64_t>,
+                        CUDAReduceMinGradKernel<float>,
+                        CUDAReduceMinGradKernel<double>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h
index ff1ddb4175f..6fdbd95f03a 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_op.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_op.h
@@ -23,7 +23,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/cast_op.h"
 #include "paddle/fluid/operators/reduce_ops/reduce_op_function.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
-
 // only can include the headers in paddle/phi/api dirs
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/phi/api/lib/utils/tensor_utils.h"
@@ -649,6 +648,7 @@ class ReduceCudaGradKernel : public framework::OpKernel<T> {
     bool reduce_all = context.Attr<bool>("reduce_all");
     std::vector<int> dims = context.Attr<std::vector<int>>("dim");
     auto* in_x = context.Input<Tensor>("X");
+
     auto* d_out =
         context.Input<framework::Tensor>(framework::GradVarName("Out"));
     auto* d_x = context.Output<framework::Tensor>(framework::GradVarName("X"));
@@ -679,12 +679,106 @@ class ReduceCudaGradKernel : public framework::OpKernel<T> {
     if (out_dtype <= 0) {
       pt_out_dtype = d_out->dtype();
     }
+
     using MPType = typename kps::details::MPTypeTrait<T>::Type;
     phi::ReduceGrad<T, TransformOp<T, MPType>>(
         dev_ctx, pt_d_out.get(), pt_d_x.get(), pt_out_dtype,
         TransformOp<T, MPType>(reduce_num));
   }
 };
+
+template <typename T>
+struct EqualFunctor {
+  inline T initial() { return static_cast<T>(0.0f); }
+
+  inline HOSTDEVICE T operator()(const T a, const T b) const {
+    return static_cast<T>(a == b);
+  }
+};
+
+template <typename T>
+struct DivideFunctor {
+  inline T initial() { return static_cast<T>(1.0f); }
+
+  inline HOSTDEVICE T operator()(const T a, const T b) const { return a / b; }
+};
+
+template <typename T, template <typename, typename> class TransformOp>
+class ReduceCudaAMaxAMinGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    bool reduce_all = context.Attr<bool>("reduce_all");
+    std::vector<int> dims = context.Attr<std::vector<int>>("dim");
+    auto* in_x = context.Input<Tensor>("X");
+    auto* out_y = context.Input<Tensor>("Out");
+    auto* d_out =
+        context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* d_x = context.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto out_dtype = context.Attr<int>("in_dtype");
+    auto pt_out_dtype = framework::TransToPhiDataType(
+        static_cast<framework::proto::VarType::Type>(out_dtype));
+    // get reduce_dim and reduce_num for reduce_mean_grad
+    int dim_size = in_x->dims().size();
+    std::vector<int> reduce_dims = GetReduceDim(dims, dim_size, reduce_all);
+    auto update_dims = vectorize(d_x->dims());
+    int reduce_num = 1;
+    for (auto i : reduce_dims) {
+      reduce_num *= (in_x->dims())[i];
+      update_dims[i] = 1;
+    }
+    auto& dev_ctx = context.cuda_device_context();
+
+    // make new tensor reduce_out
+    phi::DenseTensor new_y(out_y->type());
+    new_y.ShareDataWith(*out_y);
+    new_y.Resize(phi::make_ddim(update_dims));
+
+    // make new tensor d_out
+    phi::DenseTensor new_dout(d_out->type());
+    new_dout.ShareDataWith(*d_out);
+    new_dout.Resize(phi::make_ddim(update_dims));
+    d_x->mutable_data(dev_ctx.GetPlace(), d_out->dtype());
+
+    auto new_in = paddle::experimental::MakePhiDenseTensor(*in_x);
+    auto new_in_tensor = new_in.get();
+
+    auto new_dx = paddle::experimental::MakePhiDenseTensor(*d_x);
+    auto new_dx_tensor = new_dx.get();
+
+    // make equal_out
+    phi::DenseTensor* equal_out = new phi::DenseTensor();
+    equal_out->Resize(in_x->dims());
+    dev_ctx.template Alloc<T>(equal_out);
+    auto equal_out_tensor = *equal_out;
+
+    // make new tensor equal_count
+    phi::DenseTensor* equal_count = new phi::DenseTensor();
+    equal_count->Resize(phi::make_ddim(update_dims));
+    dev_ctx.template Alloc<T>(equal_count);
+
+    // compute
+    // 1. equal_out = Equal(x, y)
+    std::vector<const phi::DenseTensor*> equal_inputs = {&new_y, new_in_tensor};
+    std::vector<phi::DenseTensor*> equal_outputs = {&equal_out_tensor};
+    phi::funcs::BroadcastKernel<phi::ElementwiseType::kBinary, T, T>(
+        dev_ctx, equal_inputs, &equal_outputs, 0, EqualFunctor<T>());
+    // 2. equal_count = reduceSum(equal_out)
+    using MPType = typename kps::details::MPTypeTrait<T>::Type;
+    phi::funcs::ReduceKernel<T, T, kps::AddFunctor,
+                             kps::IdentityFunctor<T, MPType>>(
+        dev_ctx, equal_out_tensor, equal_count,
+        kps::IdentityFunctor<T, MPType>(), reduce_dims, false);
+
+    // 3. dx = Div(dout, equal_out)
+    std::vector<const phi::DenseTensor*> grad_inputs = {&equal_out_tensor,
+                                                        equal_count};
+    std::vector<phi::DenseTensor*> grad_outputs = {new_dx_tensor};
+    phi::funcs::BroadcastKernel<phi::ElementwiseType::kBinary, T, T>(
+        dev_ctx, grad_inputs, &grad_outputs, 0, DivideFunctor<T>());
+    delete equal_out;
+    delete equal_count;
+  }
+};
 #endif
 
 }  // namespace operators
diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h
index 514ecddfe24..a330baf9f61 100644
--- a/paddle/phi/kernels/funcs/broadcast_function.h
+++ b/paddle/phi/kernels/funcs/broadcast_function.h
@@ -522,7 +522,22 @@ void ElementwiseCompute(const GPUContext &dev_ctx,
       dev_ctx, ins, &outs, axis, func);
 }
 
-#endif
+template <typename DeviceContext,
+          typename T,
+          typename Functor,
+          typename InverseFunctor>
+void DefaultElementwiseOperator(const DeviceContext &dev_ctx,
+                                const DenseTensor &x,
+                                const DenseTensor &y,
+                                DenseTensor *z,
+                                int axis = -1) {
+  auto x_dims = x.dims();
+  auto y_dims = y.dims();
+  dev_ctx.template Alloc<T>(z);
+  funcs::ElementwiseCompute<Functor, T>(dev_ctx, x, y, axis, Functor(), z);
+}
+
+#else
 
 template <typename DeviceContext,
diff --git a/paddle/phi/kernels/gpu/frobenius_norm_kernel.cu b/paddle/phi/kernels/gpu/frobenius_norm_kernel.cu
--- a/paddle/phi/kernels/gpu/frobenius_norm_kernel.cu
+++ b/paddle/phi/kernels/gpu/frobenius_norm_kernel.cu
+template <typename T, typename Context>
+void FrobeniusNormKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const std::vector<int64_t>& dims,
+                         bool keep_dim,
+                         bool reduce_all,
+                         DenseTensor* out) {
+  auto out_dtype = x.dtype();
+  phi::Reduce<T, kps::AddFunctor, kps::SquareFunctor>(
+      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
+  std::vector<const DenseTensor*> ins = {out};
+  std::vector<DenseTensor*> outs = {out};
+  auto functor = funcs::CudaSqrtFunctor<T>();
+  funcs::ElementwiseKernel<T>(dev_ctx, ins, &outs, functor);
+}
+
+}  // namespace phi
 
 #include "paddle/phi/core/kernel_registry.h"
-- 
GitLab
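
The gradient scheme spelled out in the step comments of ReduceCudaAMaxAMinGradKernel
(equal mask, reduce_sum of the mask, divide), and the reduce-then-sqrt structure of
FrobeniusNormKernel, can be summarised by two small host-side sketches. These are
minimal standalone illustrations for a full reduction over a plain array, not Paddle
code; every name in them (amax_grad_full_reduce, frobenius_norm_full_reduce) is
hypothetical.

// Standalone sketches of the two schemes in this patch; illustrative names only.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// amax backward for a full reduction (x assumed non-empty):
//   dx[i] = dout * (x[i] == y) / count, with count = #{j : x[j] == y}.
std::vector<float> amax_grad_full_reduce(const std::vector<float>& x,
                                         float dout) {
  const float y = *std::max_element(x.begin(), x.end());  // forward amax result

  // 1. equal mask: 1 where x[i] equals the reduced value, else 0.
  std::vector<float> equal(x.size());
  for (std::size_t i = 0; i < x.size(); ++i)
    equal[i] = (x[i] == y) ? 1.0f : 0.0f;

  // 2. count of tied elements (the reduce_sum of the mask).
  float count = 0.0f;
  for (float e : equal) count += e;

  // 3. split the incoming gradient evenly over the tied positions.
  std::vector<float> dx(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) dx[i] = dout * equal[i] / count;
  return dx;
}

// Frobenius norm for a full reduction: sqrt of the reduced sum of squares.
float frobenius_norm_full_reduce(const std::vector<float>& x) {
  float sum_sq = 0.0f;             // reduce_sum(x * x)
  for (float v : x) sum_sq += v * v;
  return std::sqrt(sum_sq);        // elementwise sqrt applied to the result
}

For x = {1, 3, 3} and dout = 1, the amax sketch returns {0, 0.5, 0.5}: the incoming
gradient reaches only the inputs equal to the reduced result and is shared among
ties, which is the behaviour the kernel's Equal/reduceSum/Div pipeline implements
with broadcast and reduce primitives.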