From 808be6574a46e552688acdd3066e271598c4f132 Mon Sep 17 00:00:00 2001
From: Jiabin Yang
Date: Fri, 15 Oct 2021 11:59:29 +0800
Subject: [PATCH] [New Feature] Support tanh triple grad (#36225)

* native commit for triple grad of sigmoid
* Updated unittests files
* init functional jacobian api
* Updated trible_test func
* Updated gradient_checker & test_script
* finish test with dtype float32
* add float64 test case
* polish code
* use atol=1e-5 with dtype float64
* fix for ci
* set timeout for test_jacobian
* fix dygraph grad to support high differential
* polish API docstring
* Updated gradient checker and some related files
* fix double grad strip error for high differential
* fix double grad strip error for high differential
* Add Sigmoid triple grad tests
* fix dygraph double grad dtype error when calling for high differential scenario
* Updated triple grad tests func
* Use np.random to initialize ddx
* Updated triple_grad_check func
* add todo for gradient checker and refine some comments
* remove additional code
* add test for warning in backward.py
* add tanh triple grad
* format python code
* refine code

Co-authored-by: veyron95
Co-authored-by: levi131
---
 paddle/fluid/operators/activation_op.cc        |  46 ++++++-
 paddle/fluid/operators/activation_op.cu        |   9 ++
 paddle/fluid/operators/activation_op.h         | 112 ++++++++++++++++++
 .../unittests/test_activation_nn_grad.py       |  22 ++++
 4 files changed, 188 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index 3cdcfd79235..5e5cd0ea1c5 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -940,6 +940,34 @@ class TanhDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker<T> {
   }
 };
 
+template <typename T>
+class TanhTripleGradMaker : public ::paddle::framework::SingleGradOpMaker<T> {
+ public:
+  using ::paddle::framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetType("tanh_triple_grad");
+    // Out, DDX, DOut, D_DDOut, D_DOut_New   // input
+    // D_OutNew, D_DOut, D_DDx               // output
+    // input1: Out
+    op->SetInput("Out", this->Input("Out"));
+    // input2: ddx
+    op->SetInput("DDX", this->Input("DDX"));
+    // input3: dout
+    op->SetInput("DOut", this->Input("DOut"));
+    // input4: d_ddout
+    op->SetInput("D_DDOut", this->OutputGrad("DDOut"));
+    // input5: d_dout_new
+    op->SetInput("D_DOut_New", this->OutputGrad("DOutNew"));
+    op->SetAttrMap(this->Attrs());
+
+    // output: d_dOut, d_OutNew, d_ddx
+    op->SetOutput("D_OutNew", this->InputGrad("Out"));
+    op->SetOutput("D_DOut", this->InputGrad("DOut"));
+    op->SetOutput("D_DDx", this->InputGrad("DDX"));
+  }
+};
+
 // ReluGrad: dx = dy if y >= 0 else 0
 // ReluGradGrad: ddy = ddx if y >= 0 else 0
 template <typename T>
@@ -1299,7 +1327,14 @@ REGISTER_OPERATOR(tanh_grad, ops::ActivationOpGrad,
 REGISTER_OPERATOR(
     tanh_grad_grad,
     ops::ActivationOpDoubleGrad<ops::TanhGradFunctor<float>::FwdDeps()>,
-    ops::ActivationDoubleGradOpInplaceInferer);
+    ops::ActivationDoubleGradOpInplaceInferer,
+    ops::TanhTripleGradMaker<paddle::framework::OpDesc>,
+    ops::TanhTripleGradMaker<paddle::imperative::OpBase>);
+
+REGISTER_OPERATOR(
+    tanh_triple_grad,
+    ops::ActivationOpTripleGrad<ops::TanhTripleGradFunctor<float>::FwdDeps()>,
+    ops::ActivationTripleGradOpInplaceInferer);
 
 REGISTER_ACTIVATION_CPU_KERNEL(tanh, Tanh, TanhFunctor, TanhGradFunctor);
 REGISTER_OP_CPU_KERNEL(
@@ -1309,6 +1344,15 @@ REGISTER_OP_CPU_KERNEL(
     ops::TanhDoubleGradKernel<plat::CPUDeviceContext,
                               ops::TanhGradGradFunctor<double>>,
     ops::TanhDoubleGradKernel<plat::CPUDeviceContext,
                               ops::TanhGradGradFunctor<plat::float16>>);
+// Register TripleGrad Kernel
+REGISTER_OP_CPU_KERNEL(
+    tanh_triple_grad,
+    ops::TanhTripeGradKernel<plat::CPUDeviceContext,
+                             ops::TanhTripleGradFunctor<float>>,
+    ops::TanhTripeGradKernel<plat::CPUDeviceContext,
+                             ops::TanhTripleGradFunctor<double>>,
+    ops::TanhTripeGradKernel<plat::CPUDeviceContext,
+                             ops::TanhTripleGradFunctor<plat::float16>>);
 /* ========================================================================== */
 
 /* ==========================   relu register  ============================= */
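What the maker and registrations above enable, end to end, is third-order differentiation through tanh in dygraph mode. The snippet below is only an illustrative sketch and is not part of the patch: it assumes a paddle build that includes this change, the input values are arbitrary, and the closed form d^3/dx^3 tanh(x) = -2 * (1 - t^2) * (1 - 3 * t^2) with t = tanh(x) is used purely as a reference to compare against.

import paddle

x = paddle.to_tensor([0.5, -0.3, 1.2], dtype='float64', stop_gradient=False)
y = paddle.tanh(x)

# chain paddle.grad three times; the last call should reach tanh_triple_grad
(dy_dx,) = paddle.grad(y, x, create_graph=True)
(d2y_dx2,) = paddle.grad(dy_dx, x, create_graph=True)
(d3y_dx3,) = paddle.grad(d2y_dx2, x)

# analytic reference for the third derivative of tanh
t = paddle.tanh(x)
expected = -2.0 * (1.0 - t * t) * (1.0 - 3.0 * t * t)
print(d3y_dx3.numpy(), expected.numpy())
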
diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu
index d83a63015cf..cde8e9a4507 100644
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@@ -1487,6 +1487,15 @@ REGISTER_OP_CUDA_KERNEL(
     ops::TanhDoubleGradKernel<plat::CUDADeviceContext,
                               ops::TanhGradGradFunctor<double>>,
     ops::TanhDoubleGradKernel<plat::CUDADeviceContext,
                               ops::TanhGradGradFunctor<plat::float16>>);
+
+REGISTER_OP_CUDA_KERNEL(
+    tanh_triple_grad,
+    ops::TanhTripeGradKernel<plat::CUDADeviceContext,
+                             ops::TanhTripleGradFunctor<float>>,
+    ops::TanhTripeGradKernel<plat::CUDADeviceContext,
+                             ops::TanhTripleGradFunctor<double>>,
+    ops::TanhTripeGradKernel<plat::CUDADeviceContext,
+                             ops::TanhTripleGradFunctor<plat::float16>>);
 /* ========================================================================== */
 
 /* ===========================   sqrt register  ============================= */
diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h
index a6240c038b1..627522e1da0 100644
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -536,6 +536,61 @@ struct TanhGradGradFunctor : public BaseActivationFunctor<T> {
   }
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
 };
+/*
+    Out
+    DOut                            D_Dout
+    DDx       -> TanhTripleGrad ->  D_DDx
+    D_DDout                         d_OutNew
+    D_Dout_new
+
+    D_Dout = (-2) * Out * DDx * D_Dout_new
+    D_DDx = (1 - Out^2) * D_DDout + (-2) * Out * DOut * D_Dout_new
+    D_OutNew = (-2) * Out * DDx * D_DDout + (-2) * DOut * DDx * D_Dout_new
+
+    Out, DDX, DOut, D_DDOut, D_DOut_New   // input
+    D_OutNew, D_DOut, D_DDx               // output
+*/
+template <typename T>
+struct TanhTripleGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device>
+  void operator()(const Device& dev, const framework::Tensor* Out,
+                  const framework::Tensor* ddX, const framework::Tensor* dOut,
+                  const framework::Tensor* d_DDOut,
+                  const framework::Tensor* d_dOut_New,
+                  framework::Tensor* d_d_Out, framework::Tensor* d_Out_New,
+                  framework::Tensor* d_DDx) const {
+    auto* d = dev.eigen_device();
+    auto ddx = framework::EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhTripleGrad"));
+    auto out = framework::EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(Out, "Input", "Out", "TanhTripleGrad"));
+    auto dout = framework::EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhTripleGrad"));
+    auto d_ddOut = framework::EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "TanhTripleGrad"));
+    auto d_dOutNew = framework::EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(d_dOut_New, "Input", "D_DOut_New", "TanhTripleGrad"));
+
+    if (d_Out_New) {
+      auto d_OutNew = framework::EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(d_Out_New, "Output", "D_OutNew", "TanhTripleGrad"));
+      d_OutNew.device(*d) = (static_cast<T>(-2) * out * ddx * d_ddOut) -
+                            (static_cast<T>(2) * dout * ddx * d_dOutNew);
+    }
+    if (d_d_Out) {
+      auto d_dOut = framework::EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "TanhTripleGrad"));
+      d_dOut.device(*d) = static_cast<T>(-2) * out * ddx * d_dOutNew;
+    }
+    if (d_DDx) {
+      auto d_ddx = framework::EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "TanhTripleGrad"));
+      d_ddx.device(*d) = (static_cast<T>(1) - (out * out)) * d_ddOut -
+                         static_cast<T>(2) * out * dout * d_dOutNew;
+    }
+  }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+};
 
 // tanhshrink(x) = x - tanh(x)
 // where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
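The three expressions in the functor comment above are the vector-Jacobian product of the double-grad outputs with respect to the double-grad inputs: per TanhGradGradFunctor, DDOut = (1 - Out^2) * DDX and DOutNew = -2 * Out * DOut * DDX, and tanh_triple_grad propagates the incoming grads D_DDOut and D_DOut_New back onto Out, DOut and DDX. The sympy sketch below is not part of the patch and its symbol names are chosen only for this illustration; it reproduces the same formulas.

import sympy as sp

out, dout, ddx, d_ddout, d_dout_new = sp.symbols('out dout ddx d_ddout d_dout_new')

# outputs of tanh_grad_grad written in terms of its inputs (Out, DOut, DDX)
ddout = (1 - out ** 2) * ddx        # DDOut
dout_new = -2 * out * dout * ddx    # DOutNew

# triple grad: propagate d_ddout and d_dout_new back onto each input
d_out_new = sp.diff(ddout, out) * d_ddout + sp.diff(dout_new, out) * d_dout_new
d_dout = sp.diff(dout_new, dout) * d_dout_new
d_ddx = sp.diff(ddout, ddx) * d_ddout + sp.diff(dout_new, ddx) * d_dout_new

print(d_out_new)  # matches D_OutNew: (-2)*Out*DDx*D_DDOut + (-2)*DOut*DDx*D_Dout_new
print(d_dout)     # matches D_Dout:   (-2)*Out*DDx*D_Dout_new
print(d_ddx)      # matches D_DDx:    (1 - Out^2)*D_DDOut + (-2)*Out*DOut*D_Dout_new
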
@@ -2137,6 +2192,63 @@ class TanhDoubleGradKernel
     functor(place, Out, ddX, dOut, dOutNew, ddOut);
   }
 };
+
+template <typename DeviceContext, typename Functor>
+class TanhTripeGradKernel
+    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
+ public:
+  using T = typename Functor::ELEMENT_TYPE;
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const framework::Tensor *Out, *ddX, *dOut, *d_ddOut, *d_dOutNew;
+    framework::Tensor *d_OutNew, *d_dOut, *d_ddx;
+    Out = ddX = dOut = d_ddOut = d_dOutNew = nullptr;
+    d_OutNew = d_dOut = d_ddx = nullptr;
+
+    // extract ddx(input), out(input), dOut(input), d_ddOut(input),
+    // d_dOutNew(input)
+    ddX = ctx.Input<framework::Tensor>("DDX");
+    Out = ctx.Input<framework::Tensor>("Out");
+    dOut = ctx.Input<framework::Tensor>("DOut");
+    d_ddOut = ctx.Input<framework::Tensor>("D_DDOut");
+    d_dOutNew = ctx.Input<framework::Tensor>("D_DOut_New");
+
+    PADDLE_ENFORCE_NOT_NULL(
+        ddX, platform::errors::NotFound(
+                 "Cannot get input Variable ddX, variable name = %s",
+                 ctx.InputName("DDX")));
+    PADDLE_ENFORCE_NOT_NULL(
+        Out, platform::errors::NotFound(
+                 "Cannot get input Variable Out, variable name = %s",
+                 ctx.InputName("Out")));
+    PADDLE_ENFORCE_NOT_NULL(
+        dOut, platform::errors::NotFound(
+                  "Cannot get input Variable dOut, variable name = %s",
+                  ctx.InputName("DOut")));
+    PADDLE_ENFORCE_NOT_NULL(
+        d_ddOut, platform::errors::NotFound(
+                     "Cannot get input Variable d_ddOut, variable name = %s",
+                     ctx.InputName("D_DDOut")));
+    PADDLE_ENFORCE_NOT_NULL(
+        d_dOutNew,
+        platform::errors::NotFound(
+            "Cannot get input Variable d_dOutNew, variable name = %s",
+            ctx.InputName("D_DOut_New")));
+
+    // set output d_OutNew, d_dOut, d_ddx
+    d_dOut = ctx.Output<framework::Tensor>("D_DOut");
+    d_OutNew = ctx.Output<framework::Tensor>("D_OutNew");
+    d_ddx = ctx.Output<framework::Tensor>("D_DDx");
+
+    if (d_dOut) d_dOut->mutable_data<T>(Out->dims(), ctx.GetPlace());
+    if (d_OutNew) d_OutNew->mutable_data<T>(Out->dims(), ctx.GetPlace());
+    if (d_ddx) d_ddx->mutable_data<T>(ddX->dims(), ctx.GetPlace());
+    auto& place = ctx.template device_context<DeviceContext>();
+    Functor functor;
+    functor(place, Out, ddX, dOut, d_ddOut, d_dOutNew,  // input
+            d_dOut, d_OutNew, d_ddx);                   // output
+  }
+};
+
 template <typename DeviceContext, typename Functor>
 class SquareDoubleGradKernel
     : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
diff --git a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py
index c54f711c7ce..825d74388bc 100644
--- a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py
@@ -71,6 +71,28 @@ class TestSigmoidDoubleGradCheck(unittest.TestCase):
             self.func(p)
 
 
+class TestTanhTripleGradCheck(unittest.TestCase):
+    @prog_scope()
+    def func(self, place):
+        shape = [2, 3, 7, 9]
+        eps = 0.0005
+        dtype = np.float64
+        x = layers.data('x', shape, False, dtype=dtype)
+        x.persistable = True
+        y = layers.tanh(x)
+        x_arr = np.random.random(shape).astype(dtype)
+        x_arr[np.abs(x_arr) < 0.005] = 0.002
+        gradient_checker.triple_grad_check(
+            [x], y, x_init=x_arr, place=place, eps=eps)
+
+    def test_grad(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            self.func(p)
+
+
 class TestTanhDoubleGradCheck(unittest.TestCase):
     @prog_scope()
     def func(self, place):
--
GitLab
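For intuition about what triple_grad_check asserts in the tanh case, the standalone numpy sketch below compares the analytic third derivative of tanh against a central finite-difference estimate. It is not part of the patch and not how the checker is actually implemented; the helper names here are invented for illustration only.

import numpy as np

def tanh_third_derivative(x):
    # closed form: d^3/dx^3 tanh(x) = -2 * (1 - t^2) * (1 - 3 * t^2), t = tanh(x)
    t = np.tanh(x)
    return -2.0 * (1.0 - t ** 2) * (1.0 - 3.0 * t ** 2)

def finite_difference_third(x, h=1e-3):
    # central difference stencil for a third derivative
    f = np.tanh
    return (f(x + 2 * h) - 2 * f(x + h) + 2 * f(x - h) - f(x - 2 * h)) / (2 * h ** 3)

x = np.linspace(-2.0, 2.0, 9)
np.testing.assert_allclose(
    tanh_third_derivative(x), finite_difference_third(x), atol=1e-4)
print("analytic third derivative of tanh matches the finite-difference estimate")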