Commit 77cac5cd authored by Abhinav Arora, committed by GitHub

Removing updates of Beta1 power accumulators outside the op (#4931)

Parent 11bebeb2
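This change turns Beta1Pow into a read-only input of the Adamax op: the kernel no longer writes a Beta1PowOut tensor, so the code driving the op must advance the accumulator between steps (as the updated multiple-steps test below now does). One plausible motivation, not spelled out in the commit message, is that an accumulator shared across many parameter updates should be advanced exactly once per step rather than once per op invocation. A minimal sketch of the caller-side contract, assuming the accumulator is seeded with beta1 so that step t sees beta1**t; run_adamax_op is a hypothetical placeholder, not a framework API:

```python
import numpy as np

beta1 = 0.9
inputs = {'Beta1Pow': np.array([beta1])}  # beta1**1 before the first step

for step in range(10):
    # run_adamax_op(inputs)  # hypothetical: the op reads Beta1Pow,
    #                        # but no longer produces Beta1PowOut
    inputs['Beta1Pow'] *= beta1  # caller-side update, previously done in-op
```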
...
@@ -41,8 +41,6 @@ class AdamaxOp : public framework::OperatorWithKernel {
                    "Output(MomentOut) of AdamaxOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("InfNormOut"),
                    "Output(InfNormOut) of AdamaxOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Beta1PowOut"),
-                   "Output(Beta1PowOut) of AdamaxOp should not be null.");
 
     auto lr_dims = ctx->GetInputDim("LearningRate");
     PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
@@ -64,7 +62,6 @@ class AdamaxOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("ParamOut", param_dims);
     ctx->SetOutputDim("MomentOut", param_dims);
     ctx->SetOutputDim("InfNormOut", param_dims);
-    ctx->SetOutputDim("Beta1PowOut", beta1_pow_dims);
   }
 };
@@ -86,7 +83,6 @@ class AdamaxOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("InfNormOut",
               "(Tensor) "
               "Output exponentially weighted infinity norm");
-    AddOutput("Beta1PowOut", "(Tensor) Output beta1 power accumulator");
 
     AddAttr<float>("beta1",
                    "(float, default 0.9) "
@@ -113,8 +109,7 @@ Adamax updates:
 
 moment_out = beta1 * moment + (1 - beta1) * grad
 inf_norm_out = max(beta2 * inf_norm + epsilon, abs(grad))
-beta1_pow_out = beta1_pow * beta1
-learning_rate_t = learning_rate/(1 - beta1_pow_out)
+learning_rate_t = learning_rate/(1 - beta1_pow)
 param_out = param - learning_rate_t * moment_out/inf_norm_out
 
 The original paper does not have an epsilon attribute.
...
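The revised op documentation divides by (1 - beta1_pow) directly instead of first advancing the accumulator in-op. A quick NumPy check, my own sketch rather than code from this patch, that the externally advanced accumulator reproduces the old schedule, assuming it is seeded to beta1:

```python
import numpy as np

beta1 = 0.9
beta1_pow = beta1                       # seeded so step t sees beta1**t
for t in range(1, 6):
    lr_t_new = 1.0 / (1 - beta1_pow)    # new: lr / (1 - beta1_pow)
    lr_t_old = 1.0 / (1 - beta1 ** t)   # old: lr / (1 - beta1_pow_out)
    assert np.isclose(lr_t_new, lr_t_old)
    beta1_pow *= beta1                  # advanced outside the op, after the step
```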
...
@@ -26,12 +26,10 @@ class AdamaxOpKernel : public framework::OpKernel<T> {
     auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
     auto moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
     auto inf_norm_out_tensor = ctx.Output<framework::Tensor>("InfNormOut");
-    auto beta1_pow_out_tensor = ctx.Output<framework::Tensor>("Beta1PowOut");
 
     param_out_tensor->mutable_data<T>(ctx.GetPlace());
     moment_out_tensor->mutable_data<T>(ctx.GetPlace());
     inf_norm_out_tensor->mutable_data<T>(ctx.GetPlace());
-    beta1_pow_out_tensor->mutable_data<T>(ctx.GetPlace());
 
     float beta1 = ctx.Attr<float>("beta1");
     float beta2 = ctx.Attr<float>("beta2");
@@ -53,15 +51,12 @@ class AdamaxOpKernel : public framework::OpKernel<T> {
     auto moment_out = framework::EigenVector<T>::Flatten(*moment_out_tensor);
     auto inf_norm_out =
         framework::EigenVector<T>::Flatten(*inf_norm_out_tensor);
-    auto beta1_pow_out =
-        framework::EigenVector<T>::Flatten(*beta1_pow_out_tensor);
     auto place = ctx.GetEigenDevice<Place>();
 
     moment_out.device(place) = beta1 * moment + (1 - beta1) * grad;
     inf_norm_out.device(place) =
         grad.abs().cwiseMax((beta2 * inf_norm) + epsilon);
-    beta1_pow_out.device(place) = beta1_pow * beta1;
-    auto lr_t = lr / (1 - beta1_pow_out);
+    auto lr_t = lr / (1 - beta1_pow);
     Eigen::DSizes<int, 1> m_dsize(moment_out_tensor->numel());
     param_out.device(place) =
         param - lr_t.broadcast(m_dsize) * (moment_out / inf_norm_out);
...
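In the kernel, LearningRate and Beta1Pow are one-element tensors, so lr_t remains a one-element Eigen expression that gets broadcast across the flattened parameter via lr_t.broadcast(m_dsize). A NumPy analogue of the surviving kernel math, with illustrative shapes and values that are not taken from the patch:

```python
import numpy as np

beta1, beta2, epsilon = 0.9, 0.999, 1e-8
param = np.random.uniform(-1.0, 1.0, 102)
grad = np.random.uniform(-1.0, 1.0, 102)
moment = np.zeros(102)
inf_norm = np.zeros(102)
lr = np.array([0.002])            # one element, like the LearningRate tensor
beta1_pow = np.array([beta1])     # read-only input after this change

moment_out = beta1 * moment + (1 - beta1) * grad
inf_norm_out = np.maximum(beta2 * inf_norm + epsilon, np.abs(grad))
lr_t = lr / (1 - beta1_pow)       # shape (1,), broadcasts over the parameter
param_out = param - lr_t * (moment_out / inf_norm_out)
```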
...
@@ -31,14 +31,13 @@ class TestAdamaxOp1(OpTest):
         self.attrs = {'beta1': beta1, 'beta2': beta2, 'epsilon': epsilon}
-        param_out, moment_out, inf_norm_out, beta1_pow_out = adamax_step(
-            self.inputs, self.attrs)
+        param_out, moment_out, inf_norm_out = adamax_step(self.inputs,
+                                                          self.attrs)
 
         self.outputs = {
             'ParamOut': param_out,
             'MomentOut': moment_out,
-            'InfNormOut': inf_norm_out,
-            'Beta1PowOut': beta1_pow_out
+            'InfNormOut': inf_norm_out
         }
 
     def test_check_output(self):
@@ -73,14 +72,12 @@ class TestAdamaxOp2(OpTest):
         }
 
         attrs = {'beta1': beta1, 'beta2': beta2, 'epsilon': epsilon}
-        param_out, moment_out, inf_norm_out, beta1_pow_out = adamax_step(
-            self.inputs, attrs)
+        param_out, moment_out, inf_norm_out = adamax_step(self.inputs, attrs)
 
         self.outputs = {
             'ParamOut': param_out,
             'MomentOut': moment_out,
-            'InfNormOut': inf_norm_out,
-            'Beta1PowOut': beta1_pow_out
+            'InfNormOut': inf_norm_out
         }
 
     def test_check_output(self):
@@ -117,19 +114,15 @@ class TestAdamaxOpMultipleSteps(OpTest):
         self.attrs = {'beta1': beta1, 'beta2': beta2, 'epsilon': epsilon}
 
-        param_out, moment_out, inf_norm_out, beta1_pow_out = adamax_step(
-            self.inputs, self.attrs)
-
     def test_check_output(self):
         for _ in range(self.num_steps):
-            param_out, moment_out, inf_norm_out, beta1_pow_out = adamax_step(
-                self.inputs, self.attrs)
+            param_out, moment_out, inf_norm_out = adamax_step(self.inputs,
+                                                              self.attrs)
 
             self.outputs = {
                 'ParamOut': param_out,
                 'MomentOut': moment_out,
-                'InfNormOut': inf_norm_out,
-                'Beta1PowOut': beta1_pow_out
+                'InfNormOut': inf_norm_out
             }
 
             # Verify output for this step
@@ -139,7 +132,9 @@ class TestAdamaxOpMultipleSteps(OpTest):
             self.inputs['Param'] = param_out
             self.inputs['Moment'] = moment_out
             self.inputs['InfNorm'] = inf_norm_out
-            self.inputs['Beta1Pow'] = beta1_pow_out
+
+            # Update Beta1 Power accumulator for next step
+            self.inputs['Beta1Pow'] *= self.attrs['beta1']
 
             # Randomize gradient for next step
             self.inputs['Grad'] = np.random.uniform(
@@ -167,11 +162,10 @@ def adamax_step(inputs, attributes):
 
     moment_out = beta1 * moment + (1 - beta1) * grad
     inf_norm_out = np.maximum(beta2 * inf_norm + epsilon, np.abs(grad))
-    beta1_pow_out = beta1_pow * beta1
-    lr_t = (lr / (1 - beta1_pow_out))
+    lr_t = (lr / (1 - beta1_pow))
     param_out = param - lr_t * np.divide(moment_out, inf_norm_out)
 
-    return param_out, moment_out, inf_norm_out, beta1_pow_out
+    return param_out, moment_out, inf_norm_out
 
 
 if __name__ == "__main__":
...
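The multiple-steps test now owns the accumulator end to end: each step's outputs feed back as the next step's state, and the test multiplies Beta1Pow by beta1 itself instead of reading a Beta1PowOut result. A condensed sketch of that loop, assuming the adamax_step helper above is in scope (OpTest plumbing elided):

```python
import numpy as np

def run_steps(inputs, attrs, num_steps=10):
    # Drives adamax_step the way TestAdamaxOpMultipleSteps now does:
    # outputs become the next step's state, and the caller advances Beta1Pow.
    for _ in range(num_steps):
        param_out, moment_out, inf_norm_out = adamax_step(inputs, attrs)
        inputs['Param'] = param_out
        inputs['Moment'] = moment_out
        inputs['InfNorm'] = inf_norm_out
        inputs['Beta1Pow'] *= attrs['beta1']  # previously returned by the op
        inputs['Grad'] = np.random.uniform(-1.0, 1.0, param_out.shape)
    return inputs
```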