diff --git a/paddle/operators/activation_op.cc b/paddle/operators/activation_op.cc
index 154c618e8e7c4650b7f22684d3357de9c52a416c..83262f950e1606cc367ea30b089a983db9848988 100644
--- a/paddle/operators/activation_op.cc
+++ b/paddle/operators/activation_op.cc
@@ -44,9 +44,9 @@ class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Sigmoid operator");
     AddOutput("Y", "Output of Sigmoid operator");
     AddComment(R"DOC(
-Sigmoid Activation Operator.
+Sigmoid Activation Operator

-$y = 1 / (1 + e^{-x})$
+$$y = \frac{1}{1 + e^{-x}}$$

 )DOC");
   }
@@ -60,9 +60,9 @@ class LogSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of LogSigmoid operator");
     AddOutput("Y", "Output of LogSigmoid operator");
     AddComment(R"DOC(
-Logsigmoid Activation Operator.
+Logsigmoid Activation Operator

-$y = \log(1 / (1 + e^{-x}))$
+$$y = \log \frac{1}{1 + e^{-x}}$$

 )DOC");
   }
diff --git a/paddle/operators/adamax_op.cc b/paddle/operators/adamax_op.cc
index d5bbc672e18f392d6a91383b919fefc4b2d8ff0e..867ddd9790722be479ab2cd7c14425ea399c18f9 100644
--- a/paddle/operators/adamax_op.cc
+++ b/paddle/operators/adamax_op.cc
@@ -107,10 +107,12 @@ Adam algorithm based on the infinity norm.

 Adamax updates:

-$$momentOut = \beta_1 * moment + (1 - \beta_1) * grad \break
-infNormOut = max(\beta_2 * infNorm + \epsilon, |grad|) \break
-learningRate = learningRate /(1 - \beta_1_{pow}) \break
-paramOut = param - learningRate * momentPut / infNormOut$$
+$$
+  momentOut = \beta_{1} * moment + (1 - \beta_{1}) * grad \\
+  infNormOut = max(\beta_{2} * infNorm + \epsilon, |grad|) \\
+  learningRate = \frac{learningRate}{1 - \beta_{1}^{Beta1Pow}} \\
+  paramOut = param - learningRate * \frac{momentOut}{infNormOut}
+$$

 The original paper does not have an epsilon attribute. However, it is added
 here for numerical stability to prevent the
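
For readers of this patch, a minimal C++ sketch of the Adamax step that the rewritten adamax_op.cc comment describes is given below. It is illustrative only, not Paddle's actual kernel code; the function name AdamaxUpdate and the parameter beta1_pow (standing for the accumulated beta1^t supplied via the op's Beta1Pow input) are made up for this sketch.

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Illustrative Adamax step following the equations in the revised doc
    // string; all names here are hypothetical, not Paddle's real API.
    void AdamaxUpdate(std::vector<float>* param, std::vector<float>* moment,
                      std::vector<float>* inf_norm,
                      const std::vector<float>& grad, float learning_rate,
                      float beta1, float beta2, float epsilon,
                      float beta1_pow /* accumulated beta1^t */) {
      // learningRate = learningRate / (1 - beta1^t)
      const float lr_t = learning_rate / (1.0f - beta1_pow);
      for (std::size_t i = 0; i < grad.size(); ++i) {
        // momentOut = beta1 * moment + (1 - beta1) * grad
        (*moment)[i] = beta1 * (*moment)[i] + (1.0f - beta1) * grad[i];
        // infNormOut = max(beta2 * infNorm + epsilon, |grad|)
        (*inf_norm)[i] =
            std::max(beta2 * (*inf_norm)[i] + epsilon, std::fabs(grad[i]));
        // paramOut = param - learningRate * momentOut / infNormOut
        (*param)[i] -= lr_t * (*moment)[i] / (*inf_norm)[i];
      }
    }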