From 6b84688ba2f5a68d286ef4132d76895b545655fa Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Wed, 8 May 2019 19:53:30 +0800 Subject: [PATCH] Optimize the cuda implementation of sum_op (#17283) * Optimize the cuda implementation of sum_op, which adds two lod_tensors in place. test=develop * Use Eigen to add two tensors. test=develop --- paddle/fluid/operators/sum_op.cu | 42 +++++++++++++++----------------- 1 file changed, 19 insertions(+), 23 deletions(-) diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu index 43427a4af20..5cecb7e09e7 100644 --- a/paddle/fluid/operators/sum_op.cu +++ b/paddle/fluid/operators/sum_op.cu @@ -87,7 +87,7 @@ __global__ void SumAlign4CUDAKernel(const T *in_0, const T *in_1, T *out, } template -void FuseLodTensorSumCompute(const framework::ExecutionContext &context) { +void SumToLoDTensor(const framework::ExecutionContext &context) { auto in_vars = context.MultiInputVar("X"); const size_t in_num = in_vars.size(); @@ -114,30 +114,29 @@ void FuseLodTensorSumCompute(const framework::ExecutionContext &context) { }; auto *out = context.Output("Out"); - - auto out_var = context.OutputVar("Out"); - bool in_place = in_vars[0] == out_var; - + bool in_place = in_vars[0] == context.OutputVar("Out"); if (!in_place) { out->mutable_data(context.GetPlace()); } - int start = in_place ? 
1 : 0; - if (!in_place) { - // seperate path for a+b,maybe not fast than eigen - if (in_num == 2 && in_vars[0]->IsType() && - in_vars[1]->IsType()) { - auto &in_0 = in_vars[0]->Get(); - auto &in_1 = in_vars[1]->Get(); - auto length = in_0.numel(); - if (length) { - ComputeKernelParameter(length); - Sum2CUDAKernel<<>>( - in_0.data(), in_1.data(), out->data(), length); - } - return; + // Sum of two tensors + if (in_num == 2 && in_vars[0]->IsType() && + in_vars[1]->IsType()) { + auto &in_0 = in_vars[0]->Get(); + auto &in_1 = in_vars[1]->Get(); + + auto length = in_0.numel(); + if (length) { + auto result = EigenVector::Flatten(*out); + auto &place = *dev_ctx.eigen_device(); + auto in_0_e = EigenVector::Flatten(in_0); + auto in_1_e = EigenVector::Flatten(in_1); + result.device(place) = in_0_e + in_1_e; } + return; } + + int start = in_place ? 1 : 0; if (!in_place) { math::SetConstant constant_functor; constant_functor( @@ -228,13 +227,10 @@ class SumKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - auto in_vars = context.MultiInputVar("X"); - const size_t in_num = in_vars.size(); auto out_var = context.OutputVar("Out"); - bool in_place = out_var == in_vars[0]; if (out_var->IsType()) { - FuseLodTensorSumCompute(context); + SumToLoDTensor(context); } else if (out_var->IsType()) { SelectedRowsCompute(context); } else if (out_var->IsType()) { -- GitLab