Commit 77cac5cd authored by Abhinav Arora, committed by GitHub

Removing updates of Beta1 power accumulators outside the op (#4931)

Parent 11bebeb2
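In short: the op now only reads Beta1Pow as an input, and advancing the accumulator (beta1_pow *= beta1) becomes the caller's responsibility between steps. Below is a minimal NumPy sketch of that division of work, assuming the dictionary-style inputs/attrs layout used by the tests in this diff; shapes, hyperparameter values, and the driver loop are illustrative only, not part of this commit.

import numpy as np


def adamax_step(inputs, attributes):
    # Reference update mirroring the revised adamax_step helper in the test
    # below: Beta1Pow is consumed but no longer produced by the step itself.
    param, grad = inputs['Param'], inputs['Grad']
    moment, inf_norm = inputs['Moment'], inputs['InfNorm']
    lr, beta1_pow = inputs['LearningRate'], inputs['Beta1Pow']
    beta1, beta2 = attributes['beta1'], attributes['beta2']
    epsilon = attributes['epsilon']

    moment_out = beta1 * moment + (1 - beta1) * grad
    inf_norm_out = np.maximum(beta2 * inf_norm + epsilon, np.abs(grad))
    lr_t = lr / (1 - beta1_pow)
    param_out = param - lr_t * np.divide(moment_out, inf_norm_out)
    return param_out, moment_out, inf_norm_out


# Illustrative driver loop (shapes and hyperparameters are made up):
inputs = {
    'Param': np.random.uniform(-1, 1, (4, 8)).astype("float32"),
    'Grad': np.random.uniform(-1, 1, (4, 8)).astype("float32"),
    'Moment': np.zeros((4, 8)).astype("float32"),
    'InfNorm': np.zeros((4, 8)).astype("float32"),
    'LearningRate': np.array([0.002]).astype("float32"),
    'Beta1Pow': np.array([0.9]).astype("float32"),
}
attrs = {'beta1': 0.9, 'beta2': 0.999, 'epsilon': 1e-8}

for _ in range(10):
    param_out, moment_out, inf_norm_out = adamax_step(inputs, attrs)
    inputs['Param'] = param_out
    inputs['Moment'] = moment_out
    inputs['InfNorm'] = inf_norm_out
    # The beta1 power accumulator is now advanced outside the step.
    inputs['Beta1Pow'] *= attrs['beta1']
    inputs['Grad'] = np.random.uniform(-1, 1, (4, 8)).astype("float32")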
@@ -41,8 +41,6 @@ class AdamaxOp : public framework::OperatorWithKernel {
                    "Output(MomentOut) of AdamaxOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("InfNormOut"),
                    "Output(InfNormOut) of AdamaxOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Beta1PowOut"),
-                   "Output(Beta1PowOut) of AdamaxOp should not be null.");
     auto lr_dims = ctx->GetInputDim("LearningRate");
     PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
@@ -64,7 +62,6 @@ class AdamaxOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("ParamOut", param_dims);
     ctx->SetOutputDim("MomentOut", param_dims);
     ctx->SetOutputDim("InfNormOut", param_dims);
-    ctx->SetOutputDim("Beta1PowOut", beta1_pow_dims);
   }
 };
@@ -86,7 +83,6 @@ class AdamaxOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("InfNormOut",
               "(Tensor) "
               "Output exponentially weighted infinity norm");
-    AddOutput("Beta1PowOut", "(Tensor) Output beta1 power accumulator");
     AddAttr<float>("beta1",
                    "(float, default 0.9) "
@@ -113,8 +109,7 @@ Adamax updates:
 moment_out = beta1 * moment + (1 - beta1) * grad
 inf_norm_out = max(beta2 * inf_norm + epsilon, abs(grad))
-beta1_pow_out = beta1_pow * beta1
-learning_rate_t = learning_rate/(1 - beta1_pow_out)
+learning_rate_t = learning_rate/(1 - beta1_pow)
 param_out = param - learning_rate_t * moment_out/inf_norm_out
 The original paper does not have an epsilon attribute.
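For intuition on the bias correction above (numbers chosen purely for illustration, not from this commit): the learning-rate correction now uses whatever accumulator value the caller supplies, so if Beta1Pow currently holds beta1^3 = 0.9^3 = 0.729, then learning_rate_t = learning_rate / (1 - 0.729) ≈ 3.69 * learning_rate. The only change is that this value is read from the input instead of being advanced inside the op.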
@@ -26,12 +26,10 @@ class AdamaxOpKernel : public framework::OpKernel<T> {
     auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
     auto moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
     auto inf_norm_out_tensor = ctx.Output<framework::Tensor>("InfNormOut");
-    auto beta1_pow_out_tensor = ctx.Output<framework::Tensor>("Beta1PowOut");
     param_out_tensor->mutable_data<T>(ctx.GetPlace());
     moment_out_tensor->mutable_data<T>(ctx.GetPlace());
     inf_norm_out_tensor->mutable_data<T>(ctx.GetPlace());
-    beta1_pow_out_tensor->mutable_data<T>(ctx.GetPlace());
     float beta1 = ctx.Attr<float>("beta1");
     float beta2 = ctx.Attr<float>("beta2");
@@ -53,15 +51,12 @@ class AdamaxOpKernel : public framework::OpKernel<T> {
     auto moment_out = framework::EigenVector<T>::Flatten(*moment_out_tensor);
     auto inf_norm_out =
         framework::EigenVector<T>::Flatten(*inf_norm_out_tensor);
-    auto beta1_pow_out =
-        framework::EigenVector<T>::Flatten(*beta1_pow_out_tensor);
     auto place = ctx.GetEigenDevice<Place>();
     moment_out.device(place) = beta1 * moment + (1 - beta1) * grad;
     inf_norm_out.device(place) =
         grad.abs().cwiseMax((beta2 * inf_norm) + epsilon);
-    beta1_pow_out.device(place) = beta1_pow * beta1;
-    auto lr_t = lr / (1 - beta1_pow_out);
+    auto lr_t = lr / (1 - beta1_pow);
     Eigen::DSizes<int, 1> m_dsize(moment_out_tensor->numel());
     param_out.device(place) =
         param - lr_t.broadcast(m_dsize) * (moment_out / inf_norm_out);
@@ -31,14 +31,13 @@ class TestAdamaxOp1(OpTest):
         self.attrs = {'beta1': beta1, 'beta2': beta2, 'epsilon': epsilon}
-        param_out, moment_out, inf_norm_out, beta1_pow_out = adamax_step(
-            self.inputs, self.attrs)
+        param_out, moment_out, inf_norm_out = adamax_step(self.inputs,
+                                                          self.attrs)
         self.outputs = {
             'ParamOut': param_out,
             'MomentOut': moment_out,
-            'InfNormOut': inf_norm_out,
-            'Beta1PowOut': beta1_pow_out
+            'InfNormOut': inf_norm_out
         }
     def test_check_output(self):
@@ -73,14 +72,12 @@ class TestAdamaxOp2(OpTest):
         }
         attrs = {'beta1': beta1, 'beta2': beta2, 'epsilon': epsilon}
-        param_out, moment_out, inf_norm_out, beta1_pow_out = adamax_step(
-            self.inputs, attrs)
+        param_out, moment_out, inf_norm_out = adamax_step(self.inputs, attrs)
         self.outputs = {
             'ParamOut': param_out,
             'MomentOut': moment_out,
-            'InfNormOut': inf_norm_out,
-            'Beta1PowOut': beta1_pow_out
+            'InfNormOut': inf_norm_out
         }
     def test_check_output(self):
@@ -117,19 +114,15 @@ class TestAdamaxOpMultipleSteps(OpTest):
         self.attrs = {'beta1': beta1, 'beta2': beta2, 'epsilon': epsilon}
-        param_out, moment_out, inf_norm_out, beta1_pow_out = adamax_step(
-            self.inputs, self.attrs)
     def test_check_output(self):
         for _ in range(self.num_steps):
-            param_out, moment_out, inf_norm_out, beta1_pow_out = adamax_step(
-                self.inputs, self.attrs)
+            param_out, moment_out, inf_norm_out = adamax_step(self.inputs,
+                                                              self.attrs)
             self.outputs = {
                 'ParamOut': param_out,
                 'MomentOut': moment_out,
-                'InfNormOut': inf_norm_out,
-                'Beta1PowOut': beta1_pow_out
+                'InfNormOut': inf_norm_out
             }
             # Verify output for this step
@@ -139,7 +132,9 @@ class TestAdamaxOpMultipleSteps(OpTest):
             self.inputs['Param'] = param_out
             self.inputs['Moment'] = moment_out
             self.inputs['InfNorm'] = inf_norm_out
-            self.inputs['Beta1Pow'] = beta1_pow_out
+            # Update Beta1 Power accumulator for next step
+            self.inputs['Beta1Pow'] *= self.attrs['beta1']
             # Randomize gradient for next step
             self.inputs['Grad'] = np.random.uniform(
@@ -167,11 +162,10 @@ def adamax_step(inputs, attributes):
     moment_out = beta1 * moment + (1 - beta1) * grad
     inf_norm_out = np.maximum(beta2 * inf_norm + epsilon, np.abs(grad))
-    beta1_pow_out = beta1_pow * beta1
-    lr_t = (lr / (1 - beta1_pow_out))
+    lr_t = (lr / (1 - beta1_pow))
     param_out = param - lr_t * np.divide(moment_out, inf_norm_out)
-    return param_out, moment_out, inf_norm_out, beta1_pow_out
+    return param_out, moment_out, inf_norm_out
 if __name__ == "__main__":