Unverified commit c711e913 authored by Zhanlue Yang, committed by GitHub

Add double grad op for sigmoid activation, test=develop (#32971)

Sigmoid: Out = Sigmoid(X)
SigmoidGrad: DX = DOut*(1-Out)*Out

[This Patch]
Out
DOut -> SigmoidGradGrad -> DOutNew
DDX                        DDOut

DDOut = (1-Out)*Out*DDX
DOutNew = (1-2*Out)*DOut*DDX
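
The two expressions follow from differentiating the first-order gradient DX = DOut*(1-Out)*Out with respect to each of its inputs and contracting with DDX: d(DX)/d(DOut) = (1-Out)*Out yields DDOut, and d(DX)/d(Out) = (1-2*Out)*DOut yields DOutNew. Below is a minimal NumPy sketch (illustrative only, not part of the patch; all names are local to the sketch) that checks both closed forms against directional finite differences of DX:

import numpy as np

rng = np.random.default_rng(0)
out = rng.uniform(0.1, 0.9, 5)   # stands in for the forward output Out
dout = rng.normal(size=5)        # first-order upstream gradient DOut
ddx = rng.normal(size=5)         # gradient of DX fed into the second backward pass
eps = 1e-6

def first_grad(out, dout):
    # SigmoidGrad: DX = DOut * (1 - Out) * Out
    return dout * (1.0 - out) * out

# closed forms from this patch
ddout = (1.0 - out) * out * ddx
dout_new = (1.0 - 2.0 * out) * dout * ddx

# directional finite differences of DX along DDX
fd_ddout = (first_grad(out, dout + eps * ddx) - first_grad(out, dout)) / eps
fd_dout_new = (first_grad(out + eps * ddx, dout) - first_grad(out, dout)) / eps

assert np.allclose(ddout, fd_ddout, atol=1e-5)
assert np.allclose(dout_new, fd_dout_new, atol=1e-5)
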
Parent 20b9be65
......@@ -789,6 +789,27 @@ class ActivationOpDoubleGrad2 : public framework::OperatorWithKernel {
}
};
template <typename T>
class SigmoidDoubleGradMaker
: public ::paddle::framework::SingleGradOpMaker<T> {
public:
using ::paddle::framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType("sigmoid_grad_grad");
// input 1: Out (forward output of sigmoid)
op->SetInput("Out", this->Input("Out"));
// input 2: DDX (gradient of the sigmoid_grad output DX)
op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X")));
// input 3: DOut (first-order gradient w.r.t. Out)
op->SetInput("DOut", this->Input(framework::GradVarName("Out")));
op->SetAttrMap(this->Attrs());
// output 1: DOutNew (new gradient w.r.t. Out)
op->SetOutput("DOutNew", this->InputGrad("Out"));
// output 2: DDOut (gradient w.r.t. DOut)
op->SetOutput("DDOut", this->InputGrad(framework::GradVarName("Out")));
}
};
template <typename T>
class TanhDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker<T> {
public:
......@@ -1068,6 +1089,47 @@ namespace plat = paddle::platform;
FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_OP);
FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CPU_KERNEL);
/* ========================== sigmoid register =============================
*/
// 1. Register Sigmoid Operator
REGISTER_OPERATOR(
sigmoid, ops::ActivationOp, ops::SigmoidOpMaker,
ops::ActivationOpInferVarType,
ops::ActivationGradOpMaker<ops::SigmoidGradFunctor<float>::FwdDeps(),
paddle::framework::OpDesc>,
ops::ActivationGradOpMaker<ops::SigmoidGradFunctor<float>::FwdDeps(),
paddle::imperative::OpBase>,
std::conditional<ops::CanInplaceAct<ops::SigmoidGradFunctor<float>>(),
ops::ActFwdInplaceInferer, void>::type);
// 2. Register Sigmoid Grad Operator
REGISTER_OPERATOR(sigmoid_grad, ops::ActivationOpGrad,
ops::ActivationGradOpInplaceInferer,
ops::SigmoidDoubleGradMaker<paddle::framework::OpDesc>,
ops::SigmoidDoubleGradMaker<paddle::imperative::OpBase>);
// 3. Register Sigmoid DoubleGrad Operator
REGISTER_OPERATOR(
sigmoid_grad_grad,
ops::ActivationOpDoubleGrad<ops::SigmoidGradFunctor<float>::FwdDeps()>,
ops::ActivationDoubleGradOpInplaceInferer);
// Register Sigmoid/GradSigmoid Kernels
REGISTER_ACTIVATION_CPU_KERNEL(sigmoid, Sigmoid, SigmoidFunctor,
SigmoidGradFunctor);
// Register DoubleGrad Kernel
REGISTER_OP_CPU_KERNEL(
sigmoid_grad_grad,
ops::SigmoidDoubleGradKernel<plat::CPUDeviceContext,
ops::SigmoidGradGradFunctor<float>>,
ops::SigmoidDoubleGradKernel<plat::CPUDeviceContext,
ops::SigmoidGradGradFunctor<double>>,
ops::SigmoidDoubleGradKernel<plat::CPUDeviceContext,
ops::SigmoidGradGradFunctor<plat::float16>>);
/* ========================================================================== */
/* ========================== tanh register ============================= */
REGISTER_OPERATOR(
tanh, ops::ActivationOp, ops::TanhOpMaker, ops::ActivationOpInferVarType,
......
......@@ -1481,6 +1481,21 @@ REGISTER_OP_CUDA_KERNEL(
#endif
/* ========================================================================== */
/* =========================== sigmoid register ============================
*/
REGISTER_ACTIVATION_CUDA_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor,
CudaSigmoidGradFunctor);
REGISTER_OP_CUDA_KERNEL(
sigmoid_grad_grad,
ops::SigmoidDoubleGradKernel<paddle::platform::CUDADeviceContext,
ops::SigmoidGradGradFunctor<float>>,
ops::SigmoidDoubleGradKernel<paddle::platform::CUDADeviceContext,
ops::SigmoidGradGradFunctor<double>>,
ops::SigmoidDoubleGradKernel<plat::CUDADeviceContext,
ops::SigmoidGradGradFunctor<plat::float16>>);
/* ========================================================================== */
/* =========================== tanh register ============================ */
REGISTER_ACTIVATION_CUDA_KERNEL(tanh, Tanh, CudaTanhFunctor,
CudaTanhGradFunctor);
......@@ -1595,7 +1610,6 @@ REGISTER_OP_CUDA_KERNEL(
/* ========================================================================== */
#define FOR_EACH_ACTIVATION_CUDA_OP(__macro) \
__macro(sigmoid, Sigmoid, CudaSigmoidFunctor, CudaSigmoidGradFunctor); \
__macro(silu, Silu, CudaSiluFunctor, CudaSiluGradFunctor); \
__macro(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor, \
CudaLogSigmoidGradFunctor); \
......
......@@ -258,6 +258,43 @@ struct SigmoidGradFunctor : public BaseActivationFunctor<T> {
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
};
/*
Out
DOut -> SigmoidGradGrad -> DOutNew
DDX DDOut
DDOut = (1-Out)*Out*DDX
DOutNew = (1-2*Out)*DOut*DDX
*/
template <typename T>
struct SigmoidGradGradFunctor : public BaseActivationFunctor<T> {
template <typename Device>
void operator()(const Device& dev, const framework::Tensor* Out,
const framework::Tensor* ddX, const framework::Tensor* dOut,
framework::Tensor* dOutNew, framework::Tensor* ddOut) const {
auto* d = dev.eigen_device();
auto ddx = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidGradGrad"));
auto out = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidGradGrad"));
if (dOutNew) {
auto dout = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidGradGrad"));
auto dout_new = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "SigmoidGradGrad"));
dout_new.device(*d) =
(static_cast<T>(1) - static_cast<T>(2) * out) * dout * ddx;
}
if (ddOut) {
auto ddout = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SigmoidGradGrad"));
ddout.device(*d) = (static_cast<T>(1) - out) * out * ddx;
}
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
};
// silu(x) = x / (1 + exp(-x))
template <typename T>
struct SiluFunctor : public BaseActivationFunctor<T> {
......@@ -1789,6 +1826,50 @@ inline void ExtractDoubleGradTensorWithInputDOut(
}
}
template <typename DeviceContext, typename Functor>
class SigmoidDoubleGradKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
using T = typename Functor::ELEMENT_TYPE;
void Compute(const framework::ExecutionContext& ctx) const override {
const framework::Tensor *Out, *ddX, *dOut;
framework::Tensor *dOutNew, *ddOut;
Out = ddX = dOut = nullptr;
dOutNew = ddOut = nullptr;
// extract ddx(input) and out(input)
ddX = ctx.Input<framework::Tensor>("DDX");
Out = ctx.Input<framework::Tensor>("Out");
PADDLE_ENFORCE_NOT_NULL(
ddX, platform::errors::NotFound(
"Cannot get input Variable ddX, variable name = %s",
ctx.InputName("DDX")));
PADDLE_ENFORCE_NOT_NULL(
Out, platform::errors::NotFound(
"Cannot get input Variable Out, variable name = %s",
ctx.InputName("Out")));
// set output ddout
ddOut = ctx.Output<framework::Tensor>("DDOut");
// extract dOut (input)
dOut = ctx.Input<framework::Tensor>("DOut");
PADDLE_ENFORCE_NOT_NULL(
dOut, platform::errors::NotFound(
"Cannot get input Variable dOut, variable name = %s",
ctx.InputName("DOut")));
// set output dout_new
dOutNew = ctx.Output<framework::Tensor>("DOutNew");
if (dOutNew) dOutNew->mutable_data<T>(Out->dims(), ctx.GetPlace());
if (ddOut) ddOut->mutable_data<T>(Out->dims(), ctx.GetPlace());
auto& place = ctx.template device_context<DeviceContext>();
Functor functor;
functor(place, Out, ddX, dOut, dOutNew, ddOut);
}
};
template <typename DeviceContext, typename Functor>
class TanhDoubleGradKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
......@@ -2153,7 +2234,6 @@ struct LogGradGradFunctor : public BaseActivationFunctor<T> {
} // namespace paddle
#define FOR_EACH_ACTIVATION_OP(__macro) \
__macro(sigmoid, Sigmoid, SigmoidFunctor, SigmoidGradFunctor); \
__macro(silu, Silu, SiluFunctor, SiluGradFunctor); \
__macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \
__macro(atan, Atan, AtanFunctor, AtanGradFunctor); \
......
......@@ -26,6 +26,28 @@ import gradient_checker
from decorator_helper import prog_scope
class TestSigmoidDoubleGradCheck(unittest.TestCase):
    @prog_scope()
    def func(self, place):
        shape = [2, 3, 7, 9]
        eps = 0.0005
        dtype = np.float64
        x = layers.data('x', shape, False, dtype=dtype)
        x.persistable = True
        y = layers.sigmoid(x)
        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
        x_arr[np.abs(x_arr) < 0.005] = 0.002
        gradient_checker.double_grad_check(
            [x], y, x_init=x_arr, place=place, eps=eps)

    def test_grad(self):
        places = [fluid.CPUPlace()]
        if core.is_compiled_with_cuda():
            places.append(fluid.CUDAPlace(0))
        for p in places:
            self.func(p)
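
Besides the gradient checker above, the new kernel is also what gets dispatched when user code takes the gradient of a gradient. A minimal dygraph sketch, assuming the Paddle 2.x imperative API (paddle.to_tensor, paddle.nn.functional.sigmoid, and paddle.grad with create_graph=True); it is illustrative only and not part of the test file:

import numpy as np
import paddle

x = paddle.to_tensor(np.random.uniform(-1, 1, [2, 3, 7, 9]).astype('float64'))
x.stop_gradient = False
y = paddle.nn.functional.sigmoid(x)            # forward: sigmoid
(dx,) = paddle.grad(y, x, create_graph=True)   # first backward: sigmoid_grad
(ddx,) = paddle.grad(dx.sum(), x)              # second backward: sigmoid_grad_grad
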
class TestTanhDoubleGradCheck(unittest.TestCase):
    @prog_scope()
    def func(self, place):
......