From 51c97d9f14048c60fa901f397e3ba540ec353226 Mon Sep 17 00:00:00 2001
From: Weilong Wu <87417304+veyron95@users.noreply.github.com>
Date: Tue, 19 Oct 2021 19:37:06 +0800
Subject: [PATCH] Support elementwise_add triple grad Kernel (#36508)

* Support elementwise_add triple grad Kernel

* Change code-format to follow CI std
---
 .../elementwise/elementwise_add_op.cc         | 47 ++++++++++++--
 .../elementwise/elementwise_add_op.cu         | 11 ++++
 .../elementwise/elementwise_add_op.h          | 39 ++++++++++++
 .../operators/elementwise/elementwise_op.h    | 61 +++++++++++++++++++
 .../fluid/tests/unittests/gradient_checker.py | 12 +++-
 .../unittests/test_elementwise_nn_grad.py     | 54 ++++++++++++++++
 6 files changed, 217 insertions(+), 7 deletions(-)

diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc
index 67e2e3a1e9..d66d6b66a0 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc
@@ -110,6 +110,25 @@ class ElementwiseAddDoubleGradMaker : public framework::SingleGradOpMaker<T> {
   }
 };
 
+template <typename T>
+class ElementwiseAddTripleGradMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetType("elementwise_add_triple_grad");
+    op->SetInput("DDX", this->Input("DDX"));
+    op->SetInput("DDY", this->Input("DDY"));
+    op->SetInput("D_DDOut", this->OutputGrad("DDOut"));
+
+    op->SetAttrMap(this->Attrs());
+
+    op->SetOutput("D_DDX", this->InputGrad("DDX"));
+    op->SetOutput("D_DDY", this->InputGrad("DDY"));
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
@@ -123,10 +142,16 @@ REGISTER_OPERATOR(
     ops::ElementwiseAddDoubleGradMaker<paddle::framework::OpDesc>,
     ops::ElementwiseAddDoubleGradMaker<paddle::imperative::OpBase>);
 
-REGISTER_OPERATOR(elementwise_add_grad_grad,
-                  ops::ElementwiseOpDoubleGradWithoutDXDY,
-                  ops::ElementwiseDoubleGradOpInplaceInferer,
-                  ops::ElementwiseDoubleGradNoBufVarsInferer);
+REGISTER_OPERATOR(
+    elementwise_add_grad_grad, ops::ElementwiseOpDoubleGradWithoutDXDY,
+    ops::ElementwiseDoubleGradOpInplaceInferer,
+    ops::ElementwiseDoubleGradNoBufVarsInferer,
+    ops::ElementwiseAddTripleGradMaker<paddle::framework::OpDesc>,
+    ops::ElementwiseAddTripleGradMaker<paddle::imperative::OpBase>);
+
+REGISTER_OPERATOR(elementwise_add_triple_grad, ops::ElementwiseOpTripleGrad,
+                  ops::ElementwiseTripleGradOpInplaceInferer,
+                  ops::ElementwiseTripleGradNoBufVarsInferer);
 
 REGISTER_OP_CPU_KERNEL(
     elementwise_add,
@@ -162,6 +187,20 @@ REGISTER_OP_CPU_KERNEL(
                                         paddle::platform::complex<float>>,
     ops::ElementwiseAddDoubleGradKernel<paddle::platform::CPUDeviceContext,
                                         paddle::platform::complex<double>>);
+REGISTER_OP_CPU_KERNEL(
+    elementwise_add_triple_grad,
+    ops::ElementwiseAddTripleGradKernel<paddle::platform::CPUDeviceContext,
+                                        float>,
+    ops::ElementwiseAddTripleGradKernel<paddle::platform::CPUDeviceContext,
+                                        double>,
+    ops::ElementwiseAddTripleGradKernel<paddle::platform::CPUDeviceContext,
+                                        int>,
+    ops::ElementwiseAddTripleGradKernel<paddle::platform::CPUDeviceContext,
+                                        int64_t>,
+    ops::ElementwiseAddTripleGradKernel<paddle::platform::CPUDeviceContext,
+                                        paddle::platform::complex<float>>,
+    ops::ElementwiseAddTripleGradKernel<paddle::platform::CPUDeviceContext,
+                                        paddle::platform::complex<double>>);
 
 // A specialization elementwise_add operator, used in gradient accumulation with
 // inplace addto.
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu
index 331867617b..0b78aa4a01 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu
@@ -196,6 +196,17 @@ REGISTER_OP_CUDA_KERNEL(
                                         plat::complex<float>>,
     ops::ElementwiseAddDoubleGradKernel<plat::CUDADeviceContext,
                                         plat::complex<double>>);
+REGISTER_OP_CUDA_KERNEL(
+    elementwise_add_triple_grad,
+    ops::ElementwiseAddTripleGradKernel<plat::CUDADeviceContext, float>,
+    ops::ElementwiseAddTripleGradKernel<plat::CUDADeviceContext, double>,
+    ops::ElementwiseAddTripleGradKernel<plat::CUDADeviceContext, int>,
+    ops::ElementwiseAddTripleGradKernel<plat::CUDADeviceContext, int64_t>,
+    ops::ElementwiseAddTripleGradKernel<plat::CUDADeviceContext, plat::float16>,
+    ops::ElementwiseAddTripleGradKernel<plat::CUDADeviceContext,
+                                        plat::complex<float>>,
+    ops::ElementwiseAddTripleGradKernel<plat::CUDADeviceContext,
+                                        plat::complex<double>>);
 
 REGISTER_OP_CUDA_KERNEL(
     grad_add, ops::ElementwiseAddKernel<plat::CUDADeviceContext, float>,
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h
index 6c61ce61ee..0ce4ca665d 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h
@@ -205,5 +205,44 @@ class ElementwiseAddDoubleGradKernel : public framework::OpKernel<T> {
   }
 };
 
+template <typename DeviceContext, typename T>
+class ElementwiseAddTripleGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    using Tensor = framework::Tensor;
+    auto *ddx = ctx.Input<Tensor>("DDX");
+    auto *ddy = ctx.Input<Tensor>("DDY");
+    auto *d_ddout = ctx.Input<Tensor>("D_DDOut");
+    auto *d_ddx = ctx.Output<Tensor>("D_DDX");
+    auto *d_ddy = ctx.Output<Tensor>("D_DDY");
+    // skip out
+    auto *out = d_ddout;
+
+    // Special case when d_ddy is not needed and d_ddx doesn't reduce
+    if (d_ddx != nullptr && d_ddy == nullptr &&
+        d_ddx->dims() == d_ddout->dims()) {
+      VLOG(4) << "Special case when d_ddy is not needed and d_ddx doesn't "
+                 "reduce";
+      framework::TensorCopy(
+          *d_ddout, ctx.GetPlace(),
+          ctx.template device_context<platform::DeviceContext>(), d_ddx);
+    } else if (d_ddx == nullptr && d_ddy != nullptr &&
+               d_ddy->dims() == d_ddout->dims()) {
+      VLOG(4) << "Special case when d_ddx is not needed and d_ddy doesn't "
+                 "reduce";
+      framework::TensorCopy(
+          *d_ddout, ctx.GetPlace(),
+          ctx.template device_context<platform::DeviceContext>(), d_ddy);
+    } else if (d_ddx != nullptr && d_ddy != nullptr &&
+               (d_ddx->dims() == d_ddy->dims())) {
+      elementwise_add_grad<DeviceContext, T>(ctx, ddx, ddy, out, d_ddout,
+                                             d_ddx, d_ddy);
+    } else {
+      default_elementwise_add_grad<DeviceContext, T>(ctx, ddx, ddy, out,
+                                                     d_ddout, d_ddx, d_ddy);
+    }
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h
index 3614602156..5703e904c2 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op.h
@@ -426,6 +426,62 @@ class ElementwiseOpDoubleGradWithoutDXDY
   }
 };
 
+class ElementwiseOpTripleGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  using Tensor = framework::Tensor;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    if (ctx->HasOutput("D_DDX")) {
+      ctx->ShareDim("DDX", "D_DDX");
+      ctx->ShareLoD("DDX", "D_DDX");
+    }
+    if (ctx->HasOutput("D_DDY")) {
+      ctx->ShareDim("DDY", "D_DDY");
+      ctx->ShareLoD("DDY", "D_DDY");
+    }
+  }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    framework::proto::VarType::Type input_data_type;
+    if (ctx.HasInput("DDX") == false) {
+      OP_INOUT_CHECK(ctx.HasInput("DDY"), "Input", "DDY",
"ElementwiseOpTripleGrad"); + input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "DDY"); + } else if (ctx.HasInput("DDY") == false) { + OP_INOUT_CHECK(ctx.HasInput("DDX"), "Input", "DDX", + "ElementwiseOpTripleGrad"); + input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "DDX"); + } else { + input_data_type = + OperatorWithKernel::IndicateOrPromoteVarDataTypes(ctx, "DDX", "DDY"); + } + +#ifdef PADDLE_WITH_MKLDNN + if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string &var_name, const framework::Tensor &tensor, + const framework::OpKernelType &expected_kernel_type) const { + if (framework::IsComplexType(expected_kernel_type.data_type_)) { + // only promote inputs’s types when contains complex input + return framework::OpKernelType(tensor.type(), tensor.place(), + tensor.layout()); + } else { + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } + } +}; + template class ElemwiseGradKernel : public framework::OpKernel { public: @@ -447,9 +503,14 @@ DECLARE_INPLACE_OP_INFERER(ElementwiseGradOpInplaceInferer, DECLARE_INPLACE_OP_INFERER(ElementwiseDoubleGradOpInplaceInferer, {"DDX", "DDOut"}); +DECLARE_INPLACE_OP_INFERER(ElementwiseTripleGradOpInplaceInferer, + {"D_DDOut", "D_DDX"}); + DECLARE_NO_NEED_BUFFER_VARS_INFERER(ElementwiseGradNoBufVarsInferer, "X", "Y"); DECLARE_NO_NEED_BUFFER_VARS_INFERER(ElementwiseDoubleGradNoBufVarsInferer, "Y", "DOut"); +DECLARE_NO_NEED_BUFFER_VARS_INFERER(ElementwiseTripleGradNoBufVarsInferer, + "DDX", "DDY"); } // namespace operators } // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/gradient_checker.py b/python/paddle/fluid/tests/unittests/gradient_checker.py index 01aa2fd9ef..b56bbc07a7 100644 --- a/python/paddle/fluid/tests/unittests/gradient_checker.py +++ b/python/paddle/fluid/tests/unittests/gradient_checker.py @@ -486,20 +486,26 @@ def triple_grad_check(x, var_to_np_array_in_scope(scope, place, v.name) for v in x_grads_grads ] - # append second order grads - target_grads_grads = fluid.gradients(target_grads, x, x_grads_grads) x += y_grads x_init = _as_list(x_init) x_init += y_grads_init + # append second order grads + target_grads_grads = fluid.gradients(target_grads, x, x_grads_grads) + + # filter None in target_grads_grads for Dy/Dx may be None in kernel + filted = [(i, dyi) for i, dyi in enumerate(target_grads_grads) + if dyi is not None] + filted_idx, filted_target_grads_grads = zip(*filted) + x += x_grads_grads x_init += x_grads_grads_init # x <=> [x, dout, ddx] grad_check( x=x, - y=target_grads_grads, + y=filted_target_grads_grads, x_init=x_init, place=place, program=program, diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py b/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py index 12b75c8bf7..0dba2b1924 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py @@ -243,5 +243,59 @@ class TestElementwiseDivBroadcastDoubleGradCheck(unittest.TestCase): self.func(p) +class TestElementwiseAddTripleGradCheck(unittest.TestCase): + @prog_scope() + def func(self, place): + # the shape of input variable should be clearly specified, not inlcude -1. 
+        shape = [2, 3, 4, 5]
+        eps = 0.005
+        dtype = np.float64
+
+        x = layers.data('x', shape, False, dtype)
+        y = layers.data('y', shape, False, dtype)
+        x.persistable = True
+        y.persistable = True
+        out = layers.elementwise_add(x, y)
+        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
+        y_arr = np.random.uniform(-1, 1, shape).astype(dtype)
+
+        gradient_checker.triple_grad_check(
+            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+
+    def test_grad(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            self.func(p)
+
+
+class TestElementwiseAddBroadcastTripleGradCheck(unittest.TestCase):
+    @prog_scope()
+    def func(self, place):
+        # the shape of the input variable should be specified explicitly and must not include -1.
+        shape = [2, 3, 4, 5]
+        eps = 0.005
+        dtype = np.float64
+
+        x = layers.data('x', shape, False, dtype)
+        y = layers.data('y', shape[:-1], False, dtype)
+        x.persistable = True
+        y.persistable = True
+        out = layers.elementwise_add(x, y, axis=0)
+        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
+        y_arr = np.random.uniform(-1, 1, shape[:-1]).astype(dtype)
+
+        gradient_checker.triple_grad_check(
+            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+
+    def test_grad(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            self.func(p)
+
+
 if __name__ == "__main__":
     unittest.main()
-- 
GitLab
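
Note on the kernel's special cases (an illustrative sketch, not part of the patch): the double-grad op computes DDOut = DDX + DDY, so the incoming gradient D_DDOut flows through to D_DDX and D_DDY unchanged, except that an input that was broadcast must have its broadcast axes summed out. That is why ElementwiseAddTripleGradKernel copies D_DDOut when the output shape matches and otherwise falls back to elementwise_add_grad / default_elementwise_add_grad. A minimal NumPy sketch of that relation, assuming the broadcast pattern used in TestElementwiseAddBroadcastTripleGradCheck; the function and variable names below are hypothetical and are not Paddle APIs:

    import numpy as np

    def triple_grad_of_add(d_ddout, ddx_shape, ddy_shape, axis=0):
        """Given d_ddout, the gradient flowing into DDOut = DDX + DDY,
        return (d_ddx, d_ddy).  DDX has the full shape, so its gradient is a
        plain copy (the TensorCopy special case); DDY was broadcast over the
        trailing axes, so its gradient sums over those axes (the
        elementwise_add_grad reduction path)."""
        d_ddx = d_ddout.copy()
        # Paddle aligns y with x starting at `axis`; the trailing axes of x
        # not covered by y are the broadcast axes that must be reduced.
        reduce_axes = tuple(range(axis + len(ddy_shape), len(ddx_shape)))
        d_ddy = d_ddout.sum(axis=reduce_axes) if reduce_axes else d_ddout.copy()
        return d_ddx, d_ddy

    # Shapes from TestElementwiseAddBroadcastTripleGradCheck:
    # x is [2, 3, 4, 5], y is [2, 3, 4], axis=0.
    d_ddout = np.random.uniform(-1, 1, (2, 3, 4, 5))
    d_ddx, d_ddy = triple_grad_of_add(d_ddout, (2, 3, 4, 5), (2, 3, 4), axis=0)
    assert d_ddx.shape == (2, 3, 4, 5)
    assert d_ddy.shape == (2, 3, 4)

The non-broadcast test (TestElementwiseAddTripleGradCheck) exercises the equal-shape branch, where both D_DDX and D_DDY are plain copies of D_DDOut.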