diff --git a/paddle/fluid/operators/dgc_op.cc b/paddle/fluid/operators/dgc_op.cc index 5b45c42f837382353b7252f696bd882916cdd3fa..5657349d02d05b38fe0d84c724e6f63754cec52c 100644 --- a/paddle/fluid/operators/dgc_op.cc +++ b/paddle/fluid/operators/dgc_op.cc @@ -29,6 +29,9 @@ class DGCOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasInput("V"), "Input(V) of DGCop should not be null."); PADDLE_ENFORCE(ctx->HasInput("Grad"), "Input(Grad) of DGCop should not be null."); + PADDLE_ENFORCE_EQ( + ctx->HasInput("Param"), true, + platform::errors::NotFound("Input(Param) of DGCop is not found.")); PADDLE_ENFORCE(ctx->HasInput("current_step"), "Input(current_step) of DGCop should not be null."); PADDLE_ENFORCE_EQ(ctx->HasInput("nranks"), true, @@ -66,6 +69,7 @@ class DGCOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("U", "(Tensor) U velocity tensor of DGC"); AddInput("V", "(Tensor) V velocity tensor of DGC"); AddInput("Grad", "(Tensor) Input gradient"); + AddInput("Param", "(Tensor) Input parameter"); AddInput("current_step", "(Tensor) Current step."); AddInput("nranks", "(Tensor) nranks."); @@ -99,6 +103,16 @@ class DGCOpMaker : public framework::OpProtoAndCheckerMaker { "(float, 0.0)" "The period when begin k_select."); + AddAttr("regular_coeff", + "(float, 0.0)" + "The coeff of regularization, weight decay parameter") + .SetDefault(0.0); + + AddAttr("regular_type", + "(int, 0)" + "The type of regularization, {0:None, 1:L1Decay, 2:L2Decay") + .SetDefault(0); + AddComment(R"DOC( Original paper is https://arxiv.org/abs/1712.01887 diff --git a/paddle/fluid/operators/dgc_op.h b/paddle/fluid/operators/dgc_op.h index 32dffe65161f19f2afbc8a92e9834ca6dd34e2b0..65aaf47472dd2a3d4c3d65eab6b4d269f7182286 100644 --- a/paddle/fluid/operators/dgc_op.h +++ b/paddle/fluid/operators/dgc_op.h @@ -43,6 +43,8 @@ class DGCOpKernel : public framework::OpKernel { auto v = ctx.Input("V"); auto g = ctx.Input("Grad"); + auto grad_out = ctx.Output("Grad_out"); + // attrs float m = ctx.Attr("m"); bool use_nesterov = ctx.Attr("use_nesterov"); @@ -55,6 +57,39 @@ class DGCOpKernel : public framework::OpKernel { const int nranks = static_cast(*nranks_tensor->data()); PADDLE_ENFORCE_GT(nranks, 1, "DGC is not useful when num_trainers <= 1"); + // regularization + auto p = ctx.Input("Param"); + float regular_coeff = ctx.Attr("regular_coeff"); + int regular_type = ctx.Attr("regular_type"); + + auto p_e = framework::EigenVector::Flatten(*p); + auto g_e = framework::EigenVector::Flatten(*g); + auto grad_out_e = framework::EigenVector::Flatten(*grad_out); + + auto& dev_ctx = ctx.template device_context(); + auto& eigen_ctx = *dev_ctx.eigen_device(); + + // NOTE. In paddle, loss has divided by nranks. Because dgc_op is before + // allreduce, so local regular_coeff need div nranks too. But now we + // multi grad with nranks in dgc_op, in that case regular_coeff don't + // need to /nranks, can prevent precision loss. For coeff often equal + // with 1e-4, if nranks=32, coeff/nranks will be 3.125e-6, the numerical + // accuracy of coeff/nranks will be too low. + PADDLE_ENFORCE_EQ(regular_type >= 0 && regular_type <= 2, true, + platform::errors::InvalidArgument( + "DGC only support one of None|L1Decay|L2Decay " + "Regularization for now.")); + if (regular_type == 0) { + grad_out_e.device(eigen_ctx) = (1.0 * nranks) * g_e; + } else if (regular_type == 1) { + // L1Decay. grad = grad + coeff * sign(param) + grad_out_e.device(eigen_ctx) = + (1.0 * nranks) * g_e + regular_coeff * p_e.sign(); + } else if (regular_type == 2) { + // L2Decay. grad = grad + coeff * param + grad_out_e.device(eigen_ctx) = (1.0 * nranks) * g_e + regular_coeff * p_e; + } + // current step auto current_step_tensor = ctx.Input("current_step"); const float* current_step = current_step_tensor->data(); @@ -91,19 +126,17 @@ class DGCOpKernel : public framework::OpKernel { // FIXME(gongwb): use cublas. auto u_out_e = framework::EigenVector::Flatten(*u_out); auto u_e = framework::EigenVector::Flatten(*u); - auto g_e = framework::EigenVector::Flatten(*g); - auto& dev_ctx = ctx.template device_context(); - auto& eigen_ctx = *dev_ctx.eigen_device(); - if (static_cast(*current_step) == - static_cast(rampup_begin_step)) { - // calc local momentum from global momentum - u_out_e.device(eigen_ctx) = (1.0 / nranks) * u_e; - } + // calc local momentum from global momentum + // NOTE. If grad not multi nranks, need add below code. + // if (static_cast(*current_step) == + // static_cast(rampup_begin_step)) { + // u_out_e.device(eigen_ctx) = (1.0 / nranks) * u_e; + // } if (use_nesterov) { // u = m * (u + g) - u_out_e.device(eigen_ctx) = m * (u_e + g_e); + u_out_e.device(eigen_ctx) = m * (u_e + grad_out_e); // v = u + v + g ElementwiseComputeEx, DeviceContext, T>( @@ -113,7 +146,7 @@ class DGCOpKernel : public framework::OpKernel { ctx, g, v, 0, AddFunctor(), v_out); } else { // u = m * u + g - u_out_e.device(eigen_ctx) = m * u_e + g_e; + u_out_e.device(eigen_ctx) = m * u_e + grad_out_e; // v = u + v ElementwiseComputeEx, DeviceContext, T>( @@ -138,7 +171,6 @@ class DGCOpKernel : public framework::OpKernel { LOG(FATAL) << "v_out numel:" << v_out->numel(); } - auto grad_out = ctx.Output("Grad_out"); math::SetConstant tset; tset(dev_ctx, grad_out, static_cast(0)); } diff --git a/paddle/fluid/operators/optimizers/dgc_momentum_op.cc b/paddle/fluid/operators/optimizers/dgc_momentum_op.cc index 6e0e2ffba4843e8c4b70af45a5f32274fdbff2dc..92ce600f22b64f82a053233dbd130adefca964fa 100644 --- a/paddle/fluid/operators/optimizers/dgc_momentum_op.cc +++ b/paddle/fluid/operators/optimizers/dgc_momentum_op.cc @@ -27,13 +27,20 @@ class DGCMomentumOp : public MomentumOp { void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE_EQ(ctx->HasInput("current_step"), true, "current_step should be set."); + PADDLE_ENFORCE_EQ(ctx->HasInput("nranks"), true, + platform::errors::NotFound( + "Input(nranks) of DGCMomentumOp is not found.")); + + PADDLE_ENFORCE_EQ(ctx->HasOutput("Grad_out"), true, + platform::errors::NotFound( + "Output(Grad_out) of DGCMomentumOp is not found.")); return MomentumOp::InferShape(ctx); } framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, const framework::Tensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { - if (var_name == "current_step") { + if (var_name == "current_step" || var_name == "nranks") { VLOG(10) << "var_name:" << var_name << " need not to transform"; return expected_kernel_type; } @@ -47,6 +54,10 @@ class DGCMomentumOpMaker : public MomentumOpMaker { public: void Make() override { AddInput("current_step", "(Tensor) Current step."); + AddInput("nranks", "(Tensor) The number of trainers."); + + AddOutput("Grad_out", "(Tensor) Output grad gradient"); + AddAttr("rampup_begin_step", "(float, -1.0)" "The period when begin DGC.") diff --git a/paddle/fluid/operators/optimizers/dgc_momentum_op.h b/paddle/fluid/operators/optimizers/dgc_momentum_op.h index 76db842f614fbd81f46b04ffec355ca91645bd8b..bea019f1f36e2ea21890f23b753b4df1d62c0e3b 100644 --- a/paddle/fluid/operators/optimizers/dgc_momentum_op.h +++ b/paddle/fluid/operators/optimizers/dgc_momentum_op.h @@ -38,6 +38,26 @@ class DGCMomentumKernel : public framework::OpKernel { auto current_step_tensor = context.Input("current_step"); auto* current_step = current_step_tensor->data(); + // nranks + auto nranks_tensor = context.Input("nranks"); + const int nranks = static_cast(*nranks_tensor->data()); + PADDLE_ENFORCE_GT( + nranks, 1, + platform::errors::InvalidArgument( + "DGC is not useful when num_trainers <= 1, but now nranks=%d", + nranks)); + + const framework::Tensor* g = context.Input("Grad"); + framework::Tensor* g_out = context.Output("Grad_out"); + auto g_e = framework::EigenVector::Flatten(*g); + auto g_out_e = framework::EigenVector::Flatten(*g_out); + + auto& dev_ctx = context.template device_context(); + auto& eigen_ctx = *dev_ctx.eigen_device(); + + // NOTE. In dgc_op we multi grad with nranks, so we need /nranks here. + g_out_e.device(eigen_ctx) = (1.0 / nranks) * g_e; + VLOG(10) << "current_step:" << *current_step << ", rampup_begin_step:" << rampup_begin_step; diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index aa1b713534aab75238cc789771f551708027cea0..907d511ff3667734b4711bd8e8e51bfeb2f5ef72 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -966,6 +966,22 @@ class DGCMomentumOptimizer(Optimizer): self._clip_norm = local_grad_clip_norm / (num_trainers * num_trainers) + self._get_dgc_regularization_param() + + def _get_dgc_regularization_param(self): + self.regular_coeff = 0.0 + self.regular_type = 0 + + if self.regularization is not None: + self.regular_coeff = self.regularization._regularization_coeff + from .regularizer import L1Decay, L2Decay + if isinstance(self.regularization, L1Decay): + self.regular_type = 1 + elif isinstance(self.regularization, L2Decay): + self.regular_type = 2 + else: + assert False, 'regularization must be None|L1Decay|L2Deacy' + def _is_use_dgc(self, param_var, grad_var): var_numel = abs(reduce(lambda x, y: x * y, param_var.shape)) if var_numel < 16384 or \ @@ -997,7 +1013,11 @@ class DGCMomentumOptimizer(Optimizer): type = "momentum" else: type = "dgc_momentum" - inputs.update({"current_step": self._global_step_var}) + inputs.update({ + "current_step": self._global_step_var, + "nranks": self._nranks_var + }) + outputs.update({'Grad_out': param_and_grad[1]}) attrs.update({"rampup_begin_step": float(self._rampup_begin_step)}) # create the dgc momentum optimize op @@ -1160,12 +1180,14 @@ class DGCMomentumOptimizer(Optimizer): encoded_var, gather_var): block = framework.default_main_program().global_block() op_maker = core.op_proto_and_checker_maker + dgc_op = block.append_op( type="dgc", inputs={ "U": u_var, "V": v_var, "Grad": clip_var, + "Param": param_var, "current_step": self._global_step_var, "nranks": self._nranks_var, }, @@ -1183,6 +1205,8 @@ class DGCMomentumOptimizer(Optimizer): "use_nesterov": self._use_nesterov, "rampup_begin_step": float(self._rampup_begin_step), "rampup_step": float(self._rampup_step), + "regular_coeff": float(self.regular_coeff), + "regular_type": int(self.regular_type), }, stop_gradient=True) @@ -1191,6 +1215,37 @@ class DGCMomentumOptimizer(Optimizer): dgc_op._set_attr(op_maker.kOpRoleVarAttrName(), [param_var.name, grad_var.name]) + def apply_gradients(self, params_grads): + params_grads = sorted(params_grads, key=lambda x: x[0].name) + + params_grads, table_param_and_grad, table_optimize_op = \ + self._process_distribute_lookuptable(params_grads) + + not_dgc_params_grads = [] + dgc_params_grads = [] + for param, grad in params_grads: + if not self._is_use_dgc(param, grad): + not_dgc_params_grads.append((param, grad)) + else: + dgc_params_grads.append((param, grad)) + + # DGC clip and regularization in local + not_dgc_params_grads = append_gradient_clip_ops(not_dgc_params_grads) + + # Add regularization if any + not_dgc_params_grads = append_regularization_ops(not_dgc_params_grads, + self.regularization) + + params_grads = not_dgc_params_grads + dgc_params_grads + params_grads = sorted(params_grads, key=lambda x: x[0].name) + + optimize_ops = self._create_optimization_pass(params_grads) + if table_optimize_op is not None: + optimize_ops.append(table_optimize_op) + params_grads.append(table_param_and_grad) + + return optimize_ops + class LarsMomentumOptimizer(Optimizer): """ diff --git a/python/paddle/fluid/tests/unittests/test_dgc_momentum_op.py b/python/paddle/fluid/tests/unittests/test_dgc_momentum_op.py index 33f3c6e941ac937e521f2811a2bbf23a0f7fe63f..39558d95a6e0cff8cd2c14c8b017f2955cd18d49 100644 --- a/python/paddle/fluid/tests/unittests/test_dgc_momentum_op.py +++ b/python/paddle/fluid/tests/unittests/test_dgc_momentum_op.py @@ -34,16 +34,19 @@ class TestDGCMomentumOp1(unittest.TestCase): self.op_type = "dgc_momentum" self.dtype = np.float32 + nranks_val = 2 param = np.random.random((123, 321)).astype(self.dtype) grad = np.random.random((123, 321)).astype(self.dtype) velocity = np.zeros((123, 321)).astype(self.dtype) learning_rate = np.array([0.001]).astype(self.dtype) current_step = np.full((1), step).astype("float32") + nranks = np.full((1), nranks_val).astype("float32") mu = 0.0001 use_nesterov = False rampup_begin_step = 10.0 + # get tensor self.param_name, self.param_tensor = self.get_tensor('Param', param) self.grad_name, self.grad_tensor = self.get_tensor('Grad', grad) self.velocity_name, self.velocity_tensor = self.get_tensor('Velocity', @@ -52,6 +55,8 @@ class TestDGCMomentumOp1(unittest.TestCase): 'LearningRate', learning_rate) self.current_step_name, self.current_step_tensor = self.get_tensor( 'current_step', current_step, core.CPUPlace()) + self.nranks_name, self.nranks_tensor = self.get_tensor('nranks', nranks, + core.CPUPlace()) self.kwargs = { # inputs @@ -60,6 +65,7 @@ class TestDGCMomentumOp1(unittest.TestCase): 'Velocity': self.velocity_name, 'LearningRate': self.learning_rate_name, 'current_step': self.current_step_name, + 'nranks': self.nranks_name, # attrs 'mu': mu, @@ -68,17 +74,18 @@ class TestDGCMomentumOp1(unittest.TestCase): # outputs 'ParamOut': self.param_name, - 'VelocityOut': self.velocity_name + 'VelocityOut': self.velocity_name, + 'Grad_out': self.grad_name, } - velocity_out = mu * velocity + grad + velocity_out = mu * velocity + grad / nranks if use_nesterov: param_out = param - grad * learning_rate - \ velocity_out * mu * learning_rate else: param_out = param - learning_rate * velocity_out - sgd_out = param - learning_rate * grad + sgd_out = param - learning_rate * grad / nranks self.outputs = { 'ParamOut': param_out, diff --git a/python/paddle/fluid/tests/unittests/test_dgc_op.py b/python/paddle/fluid/tests/unittests/test_dgc_op.py index 31898b03424f7692a11d9273f8f0730199c94082..634fd64bc72c6e12d1f24bc97f5f154eab7d9d6d 100644 --- a/python/paddle/fluid/tests/unittests/test_dgc_op.py +++ b/python/paddle/fluid/tests/unittests/test_dgc_op.py @@ -44,6 +44,9 @@ class TestDGCOp(unittest.TestCase): self.grad_name = "Grad" self.grad = np.random.random(size).astype("float32") + self.param_name = "Param" + self.param = np.random.random(size).astype("float32") + self.current_step_name = "current_step" self.current_step = np.full((1), 0.0).astype("float32") @@ -66,6 +69,9 @@ class TestDGCOp(unittest.TestCase): self.grad_tensor = self.scope.var(self.grad_name).get_tensor() self.grad_tensor.set(self.grad, place) + self.param_tensor = self.scope.var(self.param_name).get_tensor() + self.param_tensor.set(self.param, place) + self.current_step_tensor = self.scope.var( self.current_step_name).get_tensor() self.current_step_tensor.set(self.current_step, core.CPUPlace()) @@ -96,6 +102,7 @@ class TestDGCOp(unittest.TestCase): 'U': self.u_name, 'V': self.v_name, 'Grad': self.grad_name, + 'Param': self.param_name, 'current_step': self.current_step_name, 'nranks': self.nranks_name, @@ -113,6 +120,8 @@ class TestDGCOp(unittest.TestCase): 'use_nesterov': True, 'rampup_begin_step': float(0.0), 'rampup_step': float(10.0), + 'regular_coeff': float(1e-4), + 'regular_type': int(2), } dgc_op = Operator('dgc', **kwargs) diff --git a/python/paddle/fluid/tests/unittests/test_dgc_optimizer.py b/python/paddle/fluid/tests/unittests/test_dgc_optimizer.py index 3ddd6fdb6e9136714617f58851d430432cd3a623..996b6ae6ea4561922ae5c2acab4a9760dd73e88b 100644 --- a/python/paddle/fluid/tests/unittests/test_dgc_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_dgc_optimizer.py @@ -18,6 +18,7 @@ import unittest import paddle.fluid.framework as framework import paddle.fluid.optimizer as optimizer +import paddle.fluid.regularizer as regularizer import paddle.compat as cpt from paddle.fluid.backward import append_backward from paddle.fluid.transpiler.details import program_to_code @@ -31,7 +32,10 @@ class TestDGCMomentumOptimizer(unittest.TestCase): def get_velocity_str(self): return self._u_velocity_acc_str - def check_dgc_momentum_optimizer(self, dims=[5, 10, 8], name="momentum"): + def check_dgc_momentum_optimizer(self, + dims=[5, 10, 8], + name="momentum", + regularization=None): init_program = framework.Program() program = framework.Program() block = program.global_block() @@ -58,8 +62,12 @@ class TestDGCMomentumOptimizer(unittest.TestCase): outputs={"Out": mul_out}, attrs={"x_num_col_dims": 1}) learning_rate = 0.01 + dgc_momentum_optimizer = self.MockDGCMomentum( - learning_rate=learning_rate, momentum=0.2, rampup_begin_step=0) + learning_rate=learning_rate, + momentum=0.2, + rampup_begin_step=0, + regularization=regularization) mean_out = block.create_var( dtype="float32", shape=[1], lod_level=0, name="mean.out") block.append_op( @@ -96,12 +104,15 @@ class TestDGCMomentumOptimizer(unittest.TestCase): program_to_code(program, fout=f) def test_momentum_without_dgc(self): - self.check_dgc_momentum_optimizer() + self.check_dgc_momentum_optimizer( + regularization=regularizer.L1Decay(1e-4)) def test_momentum_with_dgc(self): # 16 * 1024 = 16384, use dgc momentum self.check_dgc_momentum_optimizer( - dims=[16, 1024, 8], name="dgc_momentum") + dims=[16, 1024, 8], + name="dgc_momentum", + regularization=regularizer.L2Decay(1e-4)) if __name__ == '__main__':