diff --git a/paddle/operators/activation_op.cc b/paddle/operators/activation_op.cc index 483f9888973edc9db6317723c136778d40cc7878..83d35a450d0e8ebf5311cdfd948b066642ccec8c 100644 --- a/paddle/operators/activation_op.cc +++ b/paddle/operators/activation_op.cc @@ -44,7 +44,7 @@ class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "Input of Sigmoid operator"); AddOutput("Y", "Output of Sigmoid operator"); AddComment(R"DOC( -Sigmoid activation operator. +Sigmoid Activation Operator. $y = 1 / (1 + e^{-x})$ @@ -60,7 +60,7 @@ class LogSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "Input of LogSigmoid operator"); AddOutput("Y", "Output of LogSigmoid operator"); AddComment(R"DOC( -Logsigmoid activation operator. +Logsigmoid Activation Operator. $y = \log(1 / (1 + e^{-x}))$ @@ -75,7 +75,7 @@ class ExpOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "Input of Exp operator"); AddOutput("Y", "Output of Exp operator"); AddComment(R"DOC( -Exp activation operator. +Exp Activation Operator. $y = e^x$ @@ -90,7 +90,7 @@ class ReluOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "Input of Relu operator"); AddOutput("Y", "Output of Relu operator"); AddComment(R"DOC( -Relu activation operator. +Relu Activation Operator. $y = \max(x, 0)$ @@ -109,7 +109,7 @@ class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("alpha", "The small negative slope") .SetDefault(static_cast(0.02f)); AddComment(R"DOC( -LeakyRelu activation operator. +LeakyRelu Activation Operator. $y = \max(x, \alpha * x)$ @@ -128,7 +128,7 @@ class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("lambda", "non-negative offset") .SetDefault(static_cast(0.5f)); AddComment(R"DOC( -Softshrink activation operator. +Softshrink Activation Operator. $$ y = \begin{cases} @@ -149,7 +149,7 @@ class TanhOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "Input of Tanh operator"); AddOutput("Y", "Output of Tanh operator"); AddComment(R"DOC( -Tanh activation operator. +Tanh Activation Operator. $$y = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ @@ -165,7 +165,7 @@ class TanhShrinkOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "Input of TanhShrink operator"); AddOutput("Y", "Output of TanhShrink operator"); AddComment(R"DOC( -TanhShrink activation operator. +TanhShrink Activation Operator. $$y = x - \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ @@ -184,7 +184,7 @@ class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("threshold", "The value of threshold for HardShrink") .SetDefault(static_cast(0.5)); AddComment(R"DOC( -HardShrink activation operator. +HardShrink Activation Operator. $$ y = \begin{cases} @@ -205,7 +205,7 @@ class SqrtOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "Input of Sqrt operator"); AddOutput("Y", "Output of Sqrt operator"); AddComment(R"DOC( -Sqrt activation operator. +Sqrt Activation Operator. $y = \sqrt{x}$ @@ -220,7 +220,7 @@ class AbsOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "Input of Abs operator"); AddOutput("Y", "Output of Abs operator"); AddComment(R"DOC( -Abs activation operator. +Abs Activation Operator. $y = |x|$ @@ -236,7 +236,7 @@ class ReciprocalOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "Input of Reciprocal operator"); AddOutput("Y", "Output of Reciprocal operator"); AddComment(R"DOC( -Reciprocal activation operator. +Reciprocal Activation Operator. $$y = \frac{1}{x}$$ @@ -251,7 +251,7 @@ class LogOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "Input of Log operator"); AddOutput("Y", "Output of Log operator"); AddComment(R"DOC( -Log activation operator. +Log Activation Operator. $y = \ln(x)$ @@ -268,7 +268,7 @@ class SquareOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "Input of Square operator"); AddOutput("Y", "Output of Square operator"); AddComment(R"DOC( -Square activation operator. +Square Activation Operator. $y = x^2$ @@ -284,7 +284,7 @@ class SoftplusOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "Input of Softplus operator"); AddOutput("Y", "Output of Softplus operator"); AddComment(R"DOC( -Softplus activation operator. +Softplus Activation Operator. $y = \ln(1 + e^{x})$ @@ -300,7 +300,7 @@ class SoftsignOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "Input of Softsign operator"); AddOutput("Y", "Output of Softsign operator"); AddComment(R"DOC( -Softsign activation operator. +Softsign Activation Operator. $$y = \frac{x}{1 + |x|}$$ @@ -320,7 +320,7 @@ class BReluOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("t_max", "The max marginal value of BRelu") .SetDefault(static_cast(24)); AddComment(R"DOC( -BRelu activation operator. +BRelu Activation Operator. $y = \max(\min(x, t_{min}), t_{max})$ @@ -339,7 +339,7 @@ class SoftReluOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("threshold", "The threshold value of SoftRelu") .SetDefault(static_cast(40)); AddComment(R"DOC( -SoftRelu activation operator. +SoftRelu Activation Operator. $y = \ln(1 + \exp(\max(\min(x, threshold), threshold))$ @@ -357,7 +357,7 @@ class ELUOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("alpha", "The alpha value of ELU") .SetDefault(static_cast(1.0f)); AddComment(R"DOC( -ELU activation operator. +ELU Activation Operator. Applies the following element-wise computation on the input according to https://arxiv.org/abs/1511.07289. @@ -378,7 +378,7 @@ class Relu6OpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("threshold", "The threshold value of Relu6") .SetDefault(static_cast(6)); AddComment(R"DOC( -Relu6 activation operator. +Relu6 Activation Operator. $y = \min(\max(0, x), 6)$ @@ -396,7 +396,7 @@ class PowOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("factor", "The exponential factor of Pow") .SetDefault(static_cast(1)); AddComment(R"DOC( -Pow activation operator. +Pow Activation Operator. $y = x^{factor}$ @@ -416,7 +416,7 @@ class STanhOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("scale_b", "The scale parameter of b for the input") .SetDefault(static_cast(1.7159)); AddComment(R"DOC( -STanh activation operator. +STanh Activation Operator. $$y = b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$ @@ -435,7 +435,7 @@ class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("threshold", "The threshold location of activation") .SetDefault(static_cast(1.0)); AddComment(R"DOC( -ThresholdedRelu activation operator. +ThresholdedRelu Activation Operator. $$ y = \begin{cases} @@ -461,7 +461,7 @@ class HardSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("offset", "Offset for linear approximation of sigmoid") .SetDefault(static_cast(0.5)); AddComment(R"DOC( -HardSigmoid activation operator. +HardSigmoid Activation Operator. Segment-wise linear approximation of sigmoid(https://arxiv.org/abs/1603.00391), which is much faster than sigmoid. diff --git a/paddle/operators/margin_rank_loss_op.cc b/paddle/operators/margin_rank_loss_op.cc index 638a99addc2119e8f44648cc54b97bd8a892d2bc..d7e8a0ea7632650203106b01531d724cf0b8e085 100644 --- a/paddle/operators/margin_rank_loss_op.cc +++ b/paddle/operators/margin_rank_loss_op.cc @@ -55,8 +55,6 @@ class MarginRankLossOpMaker : public framework::OpProtoAndCheckerMaker { "(2-D tensor with shape [batch_size x 1]) " "The label indicating X1 ranked higher than X2 or not, " "can only be +1 or -1."); - AddAttr("margin", "(scalar, default 0) Margin for MarginRankLossOp.") - .SetDefault(static_cast(0)); AddOutput("Activated", "(2-D tensor with shape [batch_size x 1]) Intermediate tensor " "to indicate whether each element of Output(Out) is activated.") @@ -64,23 +62,26 @@ class MarginRankLossOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "(2-D tensor with shape [batch_size x 1]) " "The output loss of MarginRankLoss operator."); + AddAttr("margin", "(scalar, default 0) Margin for MarginRankLossOp.") + .SetDefault(static_cast(0)); AddComment(R"DOC( +MarginRankLoss Operator. -MarginRankLoss operator measures the loss given a pair of training sample +This operator measures the loss given a pair of training sample {`X1`, `X2`} and the `Label` with attribute `margin`, where `Label = +1` -indicating X1 is ranked higher than `X2`, otherwise `Label = -1`. The loss -turns out +indicating X1 is ranked higher than `X2` and `Label = -1` otherwise. The loss +is calculated as: -loss(X1, X2, Label) = max(0, -Label * (X1 - X2) + margin). +$loss(X1, X2, Label) = \max(0, -Label * (X1 - X2) + margin)$ -The attribute `margin` involved here helps make the predictions more robust. +The attribute `margin` here helps make the predictions more robust. Denote the item ranked higher as the positive sample, otherwise the negative sample. If the score of the two samples satisfies -positive sample - negative sample < margin, +$positive sample - negative sample < margin$ -the pair of samples will contribute to the final loss, which will backpropogate -and train the ranking model to enlarge the difference of the two score. +the pair of samples will contribute to the final loss, which will backpropagate +and train the ranking model to enlarge the difference between the two scores. For batch input with size `batch_size`, `X1`, `X2` and `Label` all have the same shape [batch_size x 1]. diff --git a/paddle/operators/matmul_op.cc b/paddle/operators/matmul_op.cc index 5ecbee3b413617e3a5523d9a32e72bc08bd316c5..5a1a6154203d40186f1e41491194b19612931b1f 100644 --- a/paddle/operators/matmul_op.cc +++ b/paddle/operators/matmul_op.cc @@ -144,7 +144,10 @@ class MatMulOpMaker : public framework::OpProtoAndCheckerMaker { )DOC") .SetDefault(false); AddComment(R"DOC( -The MatMul operator is used to perform (batched) matrix multiplication +MatMul Operator. + + +This operator is used to perform (batched) matrix multiplication over the last two dimensions of the input tensors `X` and `Y`. If a transpose flag is specified, the last two dimensions of the @@ -166,7 +169,8 @@ The differences are: - We add `transpose_X` and `transpose_Y` flags. Both the input `X` and `Y` can carry the LoD (Level of Details) information, -or not. But the output only shares the LoD with input `X`. +or not. But the output only shares the LoD information with input `X`. + )DOC"); } }; diff --git a/paddle/operators/mean_op.cc b/paddle/operators/mean_op.cc index 7caa1c9d0cf4dba33a206c85bcbed1fb1cb4e010..78b4bbca84d4670aba73222f1d679604d7516b02 100644 --- a/paddle/operators/mean_op.cc +++ b/paddle/operators/mean_op.cc @@ -36,7 +36,11 @@ class MeanOpMaker : public framework::OpProtoAndCheckerMaker { : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input of mean op"); AddOutput("Out", "The output of mean op"); - AddComment(R"DOC( Mean Operator + AddComment(R"DOC( +Mean Operator. + +Out is a scalar which is the mean of all elements in X. + )DOC"); } }; diff --git a/paddle/operators/minus_op.cc b/paddle/operators/minus_op.cc index f7943e99acc5975d077f2319b6f678cfc693c1f3..4684c20208501a3239fd57b35428946bb52af4a0 100644 --- a/paddle/operators/minus_op.cc +++ b/paddle/operators/minus_op.cc @@ -52,14 +52,16 @@ class MinusOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Y", "The right tensor of minus operator."); AddOutput("Out", "The output tensor of minus operator."); - AddComment(R"DOC(Minus Operator + AddComment(R"DOC( +Minus Operator. Equation: - Out = X - Y + $Out = X - Y$ Both the input `X` and `Y` can carry the LoD (Level of Details) information, -or not. But the output only shares the LoD with input `X`. +or not. But the output only shares the LoD information with input `X`. + )DOC"); } }; diff --git a/paddle/operators/modified_huber_loss_op.cc b/paddle/operators/modified_huber_loss_op.cc index 7b9e9528952d552a69ffe6a628672901c5c1a7fd..28528848af1f467bf38be53f9d05fee6ca3f93cc 100644 --- a/paddle/operators/modified_huber_loss_op.cc +++ b/paddle/operators/modified_huber_loss_op.cc @@ -43,27 +43,35 @@ class ModifiedHuberLossOpMaker : public framework::OpProtoAndCheckerMaker { framework::OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", - "The input tensor of modified huber loss op." + "The input tensor of modified huber loss op. " "X is 2-D tensor with shape [batch_size, 1]."); AddInput("Y", - "The target labels of modified huber loss op." - "The shape of Y is same as X. Values of Y must be 0 or 1."); + "The target labels of modified huber loss op. " + "The shape of Y is the same as X. Values of Y must be 0 or 1."); AddOutput("IntermediateVal", "Variable to save intermediate result which will be reused in " "backward processing.") .AsIntermediate(); AddOutput("Out", "Classification loss for X."); AddComment(R"DOC( -Modified huber loss is used in binary classification problem. The shape of -input X and target Y are both [N, 1] and so is the shape of output loss. -Since target Y is not differentiable, cacluating gradient for Y is illegal. -The formulation of modified huber loss is: - -L(y, f(x)) = max(0, 1 - yf(x))^2 for yf(x) >= -1, - -4yf(x) otherwise. - -Make sure the values of target label Y are in {0, 1} here. The operator will +Modified Huber Loss Operator. + +This operator is used in binary classification problem. The shape of +input X and target Y are both [N, 1] and so is the shape of the output loss. +Since target Y is not differentiable, calculating gradient for Y is illegal. +The formula of modified huber loss is: + +$$ +L(y, f(x)) = +\begin{cases} +(\max(0, 1 - yf(x)))^2, \text{if} \ yf(x) >= -1 \\ + -4yf(x), \quad \text{otherwise} +\end{cases} +$$ + +Make sure the values of target label Y are in {0, 1} here. This operator will scale values of Y to {-1, +1} when computing losses and gradients. + )DOC"); } }; diff --git a/paddle/operators/momentum_op.cc b/paddle/operators/momentum_op.cc index 2d4d6f13720f0e6888edbddcb3243116506227ba..e8ce16f4cfcf83fd13e4d3a5318a4ae0c8c8449c 100644 --- a/paddle/operators/momentum_op.cc +++ b/paddle/operators/momentum_op.cc @@ -75,17 +75,23 @@ class MomentumOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("VelocityOut", "(Tensor) Output updated velocity"); AddAttr("mu", "(float) Momentum coefficient"); - AddAttr("useNesterov", "(bool) Use Nesterov Momentum") + AddAttr("useNesterov", + "(bool, default false) " + "Use Nesterov Momentum") .SetDefault(false); AddComment(R"DOC( - -Momentum Algorithm with a flag for Nestrov Moemntum (momentum). - -velocity = mu * velocity + gradient -if (use_nesterov): - param = param - gradient * learning_rate + mu * velocity * learning_rate -else: - param = param - learning_rate * velocity +Momentum Optimizer. + +This optimizer has a flag for Nestrov Momentum. +The update equations are as follows: + +$$ +velocity = mu * velocity + gradient \\ +if (use\_nesterov): \\ + param = param - gradient * learning\_rate + mu * velocity * learning\_rate \\ +else: \\ + param = param - learning\_rate * velocity. \\ +$$ )DOC"); } diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc index 90acf034d905e6ab3ba7bf8c3d29e1ef1161ed0c..3c39ae10dc50084cff284c307167c33c9208a3ce 100644 --- a/paddle/operators/mul_op.cc +++ b/paddle/operators/mul_op.cc @@ -78,6 +78,7 @@ class MulOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "The output of mul op"); AddAttr( "x_num_col_dims", + "(int, default 1) " R"DOC(mul_op can take tensors with more than two dimensions as input `X`, in that case, tensors will be reshaped to a matrix. The matrix's first dimension(column length) will be the product of tensor's last @@ -88,20 +89,24 @@ class MulOpMaker : public framework::OpProtoAndCheckerMaker { .EqualGreaterThan(1); AddAttr( "y_num_col_dims", + "(int, default 1) " R"DOC(mul_op can take tensors with more than two dimensions as input `Y`, in that case, tensors will be reshaped to a matrix. Just like input `X`. )DOC") .SetDefault(1) .EqualGreaterThan(1); AddComment(R"DOC( -Mul operator is used to perform matrix multiplication for input X and Y. +Mul Operator. + +This operator is used to perform matrix multiplication for input X and Y. The equation is: - Out = X * Y + $$Out = X * Y$$ Both the input `X` and `Y` can carry the LoD (Level of Details) information, -or not. But the output only shares the LoD with input `X`. +or not. But the output only shares the LoD information with input `X`. + )DOC"); } }; diff --git a/paddle/operators/multiplex_op.cc b/paddle/operators/multiplex_op.cc index 4d86769026e4b3e3040bdcb3bc6dc2edea58b4b0..234fddcfd55ccc66f6378689dbc426499474b11f 100644 --- a/paddle/operators/multiplex_op.cc +++ b/paddle/operators/multiplex_op.cc @@ -66,7 +66,8 @@ class MultiplexOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "The candidate tensors of multiplex operator.") .AsDuplicable(); AddOutput("Out", "The output tensor of multiplex operator."); - AddComment(R"DOC(Multiplex operator + AddComment(R"DOC( +Multiplex Operator. Multiplex multiple tensors according to the index provided by the index tensor. @@ -77,10 +78,11 @@ the (Ids[i])-th tensor. For i-th row of the output tensor: -y[i] = x_{k}[i] +$$y[i] = x_{k}[i]$$ -where y is the output tensor. `x_{k}` is the k-th input tensor +where `y` is the output tensor, `x_{k}` is the k-th input tensor, and `k = Ids[i]`. + )DOC"); } };