From 77cac5cdb882bc390fa854b22b1365e941b99731 Mon Sep 17 00:00:00 2001
From: Abhinav Arora
Date: Thu, 19 Oct 2017 10:53:14 -0700
Subject: [PATCH] Removing updates of Beta1 power accumulators outside the op
 (#4931)

---
 paddle/operators/adamax_op.cc                 |  7 +---
 paddle/operators/adamax_op.h                  |  7 +---
 .../v2/framework/tests/test_adamax_op.py      | 32 ++++++++-----------
 3 files changed, 15 insertions(+), 31 deletions(-)

diff --git a/paddle/operators/adamax_op.cc b/paddle/operators/adamax_op.cc
index e848333ef8a..ff256577411 100644
--- a/paddle/operators/adamax_op.cc
+++ b/paddle/operators/adamax_op.cc
@@ -41,8 +41,6 @@ class AdamaxOp : public framework::OperatorWithKernel {
                    "Output(MomentOut) of AdamaxOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("InfNormOut"),
                    "Output(InfNormOut) of AdamaxOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Beta1PowOut"),
-                   "Output(Beta1PowOut) of AdamaxOp should not be null.");
 
     auto lr_dims = ctx->GetInputDim("LearningRate");
     PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
@@ -64,7 +62,6 @@ class AdamaxOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("ParamOut", param_dims);
     ctx->SetOutputDim("MomentOut", param_dims);
     ctx->SetOutputDim("InfNormOut", param_dims);
-    ctx->SetOutputDim("Beta1PowOut", beta1_pow_dims);
   }
 };
 
@@ -86,7 +83,6 @@ class AdamaxOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("InfNormOut",
               "(Tensor) "
               "Output exponentially weighted infinity norm");
-    AddOutput("Beta1PowOut", "(Tensor) Output beta1 power accumulator");
 
     AddAttr<float>("beta1",
                    "(float, default 0.9) "
@@ -113,8 +109,7 @@ Adamax updates:
 
 moment_out = beta1 * moment + (1 - beta1) * grad
 inf_norm_out = max(beta2 * inf_norm + epsilon, abs(grad))
-beta1_pow_out = beta1_pow * beta1
-learning_rate_t = learning_rate/(1 - beta1_pow_out)
+learning_rate_t = learning_rate/(1 - beta1_pow)
 param_out = param - learning_rate_t * moment_out/inf_norm_out
 
 The original paper does not have an epsilon attribute.
diff --git a/paddle/operators/adamax_op.h b/paddle/operators/adamax_op.h
index 9677b1bb786..2c99832ec08 100644
--- a/paddle/operators/adamax_op.h
+++ b/paddle/operators/adamax_op.h
@@ -26,12 +26,10 @@ class AdamaxOpKernel : public framework::OpKernel<T> {
     auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
     auto moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
     auto inf_norm_out_tensor = ctx.Output<framework::Tensor>("InfNormOut");
-    auto beta1_pow_out_tensor = ctx.Output<framework::Tensor>("Beta1PowOut");
 
     param_out_tensor->mutable_data<T>(ctx.GetPlace());
     moment_out_tensor->mutable_data<T>(ctx.GetPlace());
     inf_norm_out_tensor->mutable_data<T>(ctx.GetPlace());
-    beta1_pow_out_tensor->mutable_data<T>(ctx.GetPlace());
 
     float beta1 = ctx.Attr<float>("beta1");
     float beta2 = ctx.Attr<float>("beta2");
@@ -53,15 +51,12 @@ class AdamaxOpKernel : public framework::OpKernel<T> {
     auto moment_out = framework::EigenVector<T>::Flatten(*moment_out_tensor);
     auto inf_norm_out =
         framework::EigenVector<T>::Flatten(*inf_norm_out_tensor);
-    auto beta1_pow_out =
-        framework::EigenVector<T>::Flatten(*beta1_pow_out_tensor);
     auto place = ctx.GetEigenDevice<Place>();
 
     moment_out.device(place) = beta1 * moment + (1 - beta1) * grad;
     inf_norm_out.device(place) =
         grad.abs().cwiseMax((beta2 * inf_norm) + epsilon);
-    beta1_pow_out.device(place) = beta1_pow * beta1;
-    auto lr_t = lr / (1 - beta1_pow_out);
+    auto lr_t = lr / (1 - beta1_pow);
     Eigen::DSizes<int, 1> m_dsize(moment_out_tensor->numel());
     param_out.device(place) =
         param - lr_t.broadcast(m_dsize) * (moment_out / inf_norm_out);
diff --git a/python/paddle/v2/framework/tests/test_adamax_op.py b/python/paddle/v2/framework/tests/test_adamax_op.py
index af81075d6ad..8e5a15aa3d1 100644
--- a/python/paddle/v2/framework/tests/test_adamax_op.py
+++ b/python/paddle/v2/framework/tests/test_adamax_op.py
@@ -31,14 +31,13 @@ class TestAdamaxOp1(OpTest):
 
         self.attrs = {'beta1': beta1, 'beta2': beta2, 'epsilon': epsilon}
 
-        param_out, moment_out, inf_norm_out, beta1_pow_out = adamax_step(
-            self.inputs, self.attrs)
+        param_out, moment_out, inf_norm_out = adamax_step(self.inputs,
+                                                          self.attrs)
 
         self.outputs = {
             'ParamOut': param_out,
             'MomentOut': moment_out,
-            'InfNormOut': inf_norm_out,
-            'Beta1PowOut': beta1_pow_out
+            'InfNormOut': inf_norm_out
         }
 
     def test_check_output(self):
@@ -73,14 +72,12 @@ class TestAdamaxOp2(OpTest):
         }
 
         attrs = {'beta1': beta1, 'beta2': beta2, 'epsilon': epsilon}
-        param_out, moment_out, inf_norm_out, beta1_pow_out = adamax_step(
-            self.inputs, attrs)
+        param_out, moment_out, inf_norm_out = adamax_step(self.inputs, attrs)
 
         self.outputs = {
             'ParamOut': param_out,
             'MomentOut': moment_out,
-            'InfNormOut': inf_norm_out,
-            'Beta1PowOut': beta1_pow_out
+            'InfNormOut': inf_norm_out
         }
 
     def test_check_output(self):
@@ -117,19 +114,15 @@ class TestAdamaxOpMultipleSteps(OpTest):
 
         self.attrs = {'beta1': beta1, 'beta2': beta2, 'epsilon': epsilon}
 
-        param_out, moment_out, inf_norm_out, beta1_pow_out = adamax_step(
-            self.inputs, self.attrs)
-
     def test_check_output(self):
         for _ in range(self.num_steps):
-            param_out, moment_out, inf_norm_out, beta1_pow_out = adamax_step(
-                self.inputs, self.attrs)
+            param_out, moment_out, inf_norm_out = adamax_step(self.inputs,
+                                                              self.attrs)
 
             self.outputs = {
                 'ParamOut': param_out,
                 'MomentOut': moment_out,
-                'InfNormOut': inf_norm_out,
-                'Beta1PowOut': beta1_pow_out
+                'InfNormOut': inf_norm_out
             }
 
             # Verify output for this step
@@ -139,7 +132,9 @@ class TestAdamaxOpMultipleSteps(OpTest):
             self.inputs['Param'] = param_out
             self.inputs['Moment'] = moment_out
             self.inputs['InfNorm'] = inf_norm_out
-            self.inputs['Beta1Pow'] = beta1_pow_out
+
+            # Update Beta1 Power accumulator for next step
+            self.inputs['Beta1Pow'] *= self.attrs['beta1']
 
             # Randomize gradient for next step
             self.inputs['Grad'] = np.random.uniform(
@@ -167,11 +162,10 @@ def adamax_step(inputs, attributes):
 
     moment_out = beta1 * moment + (1 - beta1) * grad
     inf_norm_out = np.maximum(beta2 * inf_norm + epsilon, np.abs(grad))
-    beta1_pow_out = beta1_pow * beta1
-    lr_t = (lr / (1 - beta1_pow_out))
+    lr_t = (lr / (1 - beta1_pow))
    param_out = param - lr_t * np.divide(moment_out, inf_norm_out)
 
-    return param_out, moment_out, inf_norm_out, beta1_pow_out
+    return param_out, moment_out, inf_norm_out
 
 
 if __name__ == "__main__":
--
GitLab
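
For reference, the behavior after this change can be exercised with a short, self-contained NumPy sketch. It mirrors the adamax_step helper and the multiple-steps test above, but the driving loop, shapes, and hyperparameter values below are illustrative assumptions rather than part of the patch: the kernel now only reads Beta1Pow, so the caller is expected to advance the accumulator between steps.

import numpy as np


def adamax_step(inputs, attrs):
    # Same update rule as the patched test helper: Beta1Pow is read but
    # never advanced inside the step.
    param, grad = inputs['Param'], inputs['Grad']
    moment, inf_norm = inputs['Moment'], inputs['InfNorm']
    lr, beta1_pow = inputs['LearningRate'], inputs['Beta1Pow']
    beta1, beta2, epsilon = attrs['beta1'], attrs['beta2'], attrs['epsilon']

    moment_out = beta1 * moment + (1 - beta1) * grad
    inf_norm_out = np.maximum(beta2 * inf_norm + epsilon, np.abs(grad))
    lr_t = lr / (1 - beta1_pow)
    param_out = param - lr_t * np.divide(moment_out, inf_norm_out)
    return param_out, moment_out, inf_norm_out


# Illustrative driver (hypothetical names and values): the caller owns the
# Beta1 power accumulator and multiplies beta1 into it once per step.
inputs = {
    'Param': np.random.uniform(-1, 1, 4).astype('float32'),
    'Grad': np.random.uniform(-1, 1, 4).astype('float32'),
    'Moment': np.zeros(4).astype('float32'),
    'InfNorm': np.zeros(4).astype('float32'),
    'LearningRate': np.array([0.002]).astype('float32'),
    'Beta1Pow': np.array([0.9]).astype('float32'),
}
attrs = {'beta1': 0.9, 'beta2': 0.999, 'epsilon': 1e-8}

for _ in range(3):
    inputs['Param'], inputs['Moment'], inputs['InfNorm'] = adamax_step(
        inputs, attrs)
    inputs['Beta1Pow'] *= attrs['beta1']  # accumulator update lives with the caller
    inputs['Grad'] = np.random.uniform(-1, 1, 4).astype('float32')

Keeping the accumulator update with the caller avoids an extra op output and matches how the updated test folds beta1 into Beta1Pow once per step.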