提交 30a85204 编写于 作者: K kavyasrinet 提交者: Yi Wang

Adding the doc format for AdaDelta, AdaMax, Adam, AdaGrad, BatchNorm, Clip, Cast and AUC (#5317)

* Adding the doc format for AdaDelta

* Updating the documentation for Adagrad, Adam and Adamax

* Updating the auc op

* Fix review comments

* Updating doc for Batch Norm

* Updating the cast op

* Updating the clip op

* Fixing review comment

* Fixing review comment:

* Small change to restart PR_CI
上级 2ac5d7d0
......@@ -64,16 +64,15 @@ class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker {
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Param", "(Tensor) Input parameter");
AddInput("Grad", "(Tensor) Input gradient");
AddInput("AvgSquaredGrad",
"(Tensor) Input expectation of squared gradient");
AddInput("AvgSquaredGrad", "(Tensor) Input average of squared gradient");
AddInput("AvgSquaredUpdate",
"(Tensor) Input expectation of squared parameter updates");
"(Tensor) Input average of squared parameter updates");
AddOutput("ParamOut", "(Tensor) Output parameter");
AddOutput("AvgSquaredGradOut",
"(Tensor) Output expectation of squared gradient");
"(Tensor) Output average of squared gradient");
AddOutput("AvgSquaredUpdateOut",
"(Tensor) Output expectation of squared parameter updates");
"(Tensor) Output average of squared parameter updates");
AddAttr<float>("rho",
"(float, default 0.95) Exponential decay rate "
......@@ -84,22 +83,21 @@ class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker {
"numerical stability")
.SetDefault(1.0e-6f);
AddComment(R"DOC(
Adadelta Updates Operator.
Adadelta Optimizer.
This implements the Adadelta optimizer[1]. Adadelta is a per-dimension
adaptive learning rate method for gradient descent.
Adadelta optimizer is implemented as explained in:
https://arxiv.org/abs/1212.5701
Adadelta is a per-dimension adaptive learning rate method used
for gradient descent.
Adadelta updates:
Adadelta updates are as follows:
avg_squared_grad_out = rho * avg_squared_grad + (1 - rho) * grad * grad
param_update = - sqrt((avg_squared_update + epsilon) /
(avg_squared_grad_out + epsilon)) * grad
avg_squared_update_out = rho * avg_squared_update + (1 - rho) * param_update**2
param_out = param + param_update
References:
[1] ADADELTA: An Adaptive Learning Rate Method
https://arxiv.org/abs/1212.5701
$$avgSquaredGradOut = \rho * avgSquaredGrad + (1 - \rho) * grad * grad \break
paramUpdate = - $\sqrt{((avgSquaredUpdate + \epsilon) /
(avgSquaredGrad_out + \epsilon))}$ * grad \break
avgSquaredUpdateOut = \rho * avgSquaredUpdate + (1 - \rho) *
{(paramUpdate)}^2 \break
paramOut = param + paramUpdate$$
)DOC");
}
......
......@@ -73,12 +73,16 @@ class AdagradOpMaker : public framework::OpProtoAndCheckerMaker {
Adaptive Gradient Algorithm (Adagrad).
moment_out = moment + grad * grad
param_out = param - learning_rate * grad / (sqrt(moment_out) + epsilon)
The update is done as follows:
$$momentOut = moment + grad * grad \break
paramOut = param - learningRate * grad / ($\sqrt{momentOut}$ + \epsilon) \break
$$
The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
does not have the epsilon attribute. It is added here for numerical stability
by avoiding division by zero.
does not have the epsilon attribute. It is added here in our implementation
as also proposed here: http://cs231n.github.io/neural-networks-3/#ada
for numerical stability to avoid the division by zero error.
)DOC");
}
......
......@@ -51,8 +51,8 @@ class AdamOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1,
"Beta1 power accumulator should have 1 dimension");
auto beta2_pow_dims = ctx->GetInputDim("Beta2Pow");
PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1,
"Beta1 power accumulator should have 1 dimension");
PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1,
"Beta2 power accumulator should have 1 dimension");
auto param_dims = ctx->GetInputDim("Param");
PADDLE_ENFORCE_EQ(
......@@ -60,10 +60,10 @@ class AdamOp : public framework::OperatorWithKernel {
"Param and Grad input of AdamOp should have same dimension");
PADDLE_ENFORCE_EQ(
param_dims, ctx->GetInputDim("Moment1"),
"Param and Moment input of AdamOp should have same dimension");
"Param and Moment1 input of AdamOp should have same dimension");
PADDLE_ENFORCE_EQ(
param_dims, ctx->GetInputDim("Moment2"),
"Param and InfNorm input of AdamOp should have same dimension");
"Param and Moment2 input of AdamOp should have same dimension");
ctx->SetOutputDim("ParamOut", param_dims);
ctx->SetOutputDim("Moment1Out", param_dims);
......@@ -103,23 +103,20 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
.SetDefault(1.0e-8f);
AddComment(R"DOC(
Adam Updates Operator.
Adam Optimizer.
This implements the Adam optimizer from Section 2 of the Adam
paper[1]. Adam is a first-order gradient-based optimization
method based on adaptive estimates of lower-order moments.
paper : https://arxiv.org/abs/1412.6980.
Adam is a first-order gradient-based optimization method based on
adaptive estimates of lower-order moments.
Adam updates:
moment1_out = beta1 * moment1 + (1 − beta1) * grad
moment2_out = beta2 * moment2 + (1 − beta2) * grad * grad
learning_rate_t = learning_rate_t *
sqrt(1 - beta2_pow) / (1 - beta1_pow)
param_out = param - learning_rate_t * moment1/ (sqrt(moment2) + epsilon)
References:
[1] Adam: A Method for Stochastic Optimization
(https://arxiv.org/abs/1412.6980)
$$moment_1_{out} = \beta_1 * moment_1 + (1 - \beta_1) * grad \break
moment_2_{out} = \beta_2 * moment_2 + (1 - \beta_2) * grad * grad \break
learningRate = learningRate *
$\sqrt{(1 - \beta_2_{pow})}$ / (1 - \beta_1_{pow}) \break
paramOut = param - learningRate * moment_1/ ($\sqrt{(moment_2)} + \epsilon)$$
)DOC");
}
......
......@@ -99,26 +99,22 @@ class AdamaxOpMaker : public framework::OpProtoAndCheckerMaker {
"Constant for numerical stability")
.SetDefault(1.0e-8f);
AddComment(R"DOC(
Adamax Updates Operator.
Adamax Optimizer.
This implements the Adamax optimizer from Section 7 of the Adam
paper[1]. Adamax is a variant of the
We implement the Adamax optimizer from Section 7 of the Adam
paper: https://arxiv.org/abs/1412.6980. Adamax is a variant of the
Adam algorithm based on the infinity norm.
Adamax updates:
moment_out = beta1 * moment + (1 - beta1) * grad
inf_norm_out = max(beta2 * inf_norm + epsilon, abs(grad))
learning_rate_t = learning_rate/(1 - beta1_pow)
param_out = param - learning_rate_t * moment_out/inf_norm_out
$$momentOut = \beta_1 * moment + (1 - \beta_1) * grad \break
infNormOut = max(\beta_2 * infNorm + \epsilon, |grad|) \break
learningRate = learningRate /(1 - \beta_1_{pow}) \break
paramOut = param - learningRate * momentPut / infNormOut$$
The original paper does not have an epsilon attribute.
However, it is added here for numerical stability
by preventing divide by 0.
References:
[1] Adam: A Method for Stochastic Optimization
(https://arxiv.org/abs/1412.6980)
However, it is added here for numerical stability to prevent the
division by 0 error.
)DOC");
}
......
......@@ -23,11 +23,11 @@ class AucOp : public framework::OperatorWithKernel {
protected:
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Out"), "Input of Out must be initialized.");
PADDLE_ENFORCE(ctx->HasInput("Out"), "Input of Out should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Indices"),
"Input of Indices must be initialized.");
"Input of Indices should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Label"),
"Input of Label must be initialized.");
"Input of Label should not be null.");
auto inference_height = ctx->GetInputDim("Out")[0];
auto label_height = ctx->GetInputDim("Label")[0];
......@@ -52,20 +52,20 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Out",
"A floating point 2D tensor, values are in the range [0, 1]."
"Each row is descend sorted. This input should be the"
"Each row is sorted in descending order. This input should be the"
"output of topk."
"Typically, this tensor indicates the probability of each label");
AddInput("Indices",
"An int 2D tensor, indicating the indices of original"
"tensor before sort. Typically, this tensor indicates which label"
"the probability stands for.");
"tensor before sorting. Typically, this tensor indicates which "
"label the probability stands for.");
AddInput("Label",
"A 2D int tensor indicating the label of the training data."
"The height is batch size and width is always 1.");
// TODO(typhoonzero): support weight input
AddOutput("AUC",
"A scalar representing the "
"current area-under-curve.");
"current area-under-the-curve.");
AddAttr<std::string>("curve", "Curve type, can be 'ROC' or 'PR'.")
.SetDefault("ROC");
......@@ -74,19 +74,18 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
" roc curve.")
.SetDefault(200);
AddComment(
R"DOC(Computes the AUC according forward output and label.
Best to use for binary classification evaluations.
AddComment(R"DOC(
Area Under The Curve (AUC) Operator.
This implementation computes the AUC according to forward output and label.
It is used very widely in binary classification evaluation. As a note:
If input label contains values other than 0 and 1, it will be cast
to bool.
You can find the definations here:
to bool. You can find the relevant definitions here:
https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve
Possible curves are:
- ROC: Receiver operating characteristic
- PR: Precision Recall
There are two types of possible curves:
1. ROC: Receiver operating characteristic
2. PR: Precision Recall
)DOC");
}
};
......
......@@ -70,7 +70,7 @@ class BatchNormOp : public framework::OperatorWithKernel {
: x_dims[x_dims.size() - 1]);
PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
"Input x must have 3 to 5 dimensions.");
"Input X must have 3 to 5 dimensions.");
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C);
......@@ -97,16 +97,16 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("X", "The input tensor");
AddInput("Scale",
"Scale is a 1-dimensional tensor of size C "
"to be applied to the output");
"that is applied to the output");
AddInput("Bias",
"Bias is a 1-dimensional tensor of size C "
"to be applied to the output");
"that is applied to the output");
AddInput("Mean",
"The global mean (for training) or the "
"The global mean (for training) or "
"estimated mean (for testing)");
AddInput("Variance",
"The global variance (for training) "
"or the estimated Variance (for testing)");
"or estimated Variance (for testing)");
AddOutput("Y", "result after normalization");
AddOutput("MeanOut",
"Share memory with Mean. "
......@@ -123,10 +123,14 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
"will apply to output when training")
.AsIntermediate();
AddComment(R"DOC(
https://arxiv.org/pdf/1502.03167.pdf
Batch Normalization.
NHWC `[batch, in_height, in_width, in_channels]`
NCHW `[batch, in_channels, in_height, in_width]`
Batch Norm has been implemented as discussed in the paper:
https://arxiv.org/pdf/1502.03167.pdf
Can be used as a normalizer function for conv2d and fully_connected operations.
The required data format for this layer is one of the following:
1. NHWC `[batch, in_height, in_width, in_channels]`
2. NCHW `[batch, in_channels, in_height, in_width]`
)DOC");
}
......
......@@ -23,13 +23,17 @@ class CastOpProtoMaker : public framework::OpProtoAndCheckerMaker {
CastOpProtoMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "the input tensor of cast op");
AddOutput("Out", "the output tensor of cast op");
AddComment(R"DOC(Cast operator.
cast the input tensor to other data type.
)DOC");
AddInput("X", "The input tensor of cast op");
AddOutput("Out", "The output tensor of cast op");
AddAttr<int>("out_data_type", "output data type");
AddAttr<int>("in_data_type", "input data type");
AddComment(R"DOC(
Cast Operator.
This Operator casts the input tensor to another data type and
returns tha Output Tensor.
)DOC");
}
};
......
......@@ -49,8 +49,11 @@ class ClipOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<AttrType>(
"max", "(float)Maximum value, above which element is replaced by max");
AddComment(R"DOC(
Clip operator limits the given input within an interval. The interval is
Clip Operator.
The clip operator limits the value of given input within an interval. The interval is
specified with arguments 'min' and 'max'.
)DOC");
}
};
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册