From 4d805e6a290ecfd93b25e34414d8e536d3ec486f Mon Sep 17 00:00:00 2001
From: Yuang Liu
Date: Thu, 3 Jun 2021 12:03:17 +0800
Subject: [PATCH] multi precision for lars op and lars optimizer (#33280)

---
 .../operators/optimizers/lars_momentum_op.cc  |  14 +++
 .../operators/optimizers/lars_momentum_op.cu  | 119 +++++++++++++-----
 .../fluid/operators/optimizers/momentum_op.h  |   3 +
 .../tests/test_multi_precision_fp16_train.py  |  22 ++--
 python/paddle/fluid/optimizer.py              | 110 +++++++++++++---
 .../fluid/tests/unittests/test_momentum_op.py |  58 +++++++++
 6 files changed, 271 insertions(+), 55 deletions(-)
 mode change 100755 => 100644 paddle/fluid/operators/optimizers/lars_momentum_op.cc

diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cc b/paddle/fluid/operators/optimizers/lars_momentum_op.cc
old mode 100755
new mode 100644
index 479f9643749..8f30dd5b2e6
--- a/paddle/fluid/operators/optimizers/lars_momentum_op.cc
+++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cc
@@ -34,6 +34,7 @@ class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("LearningRate",
              "(LoDTensor, default LoDTensor<float>) "
              "Input learning rate");
+    AddInput("MasterParam", "FP32 master weight for AMP.").AsDispensable();
 
     AddOutput("ParamOut",
               "(LoDTensor) This output is updated parameter. "
@@ -41,6 +42,10 @@ class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("VelocityOut",
               "(LoDTensor) This output is updated velocity. "
               "It shared memory with Input(Velocity).");
+    AddOutput("MasterParamOut",
+              "The updated FP32 master weight for AMP. "
+              "It shared memory with Input(MasterParam).")
+        .AsDispensable();
 
     AddAttr<float>("mu", "(float) Momentum coefficient");
     AddAttr<float>("lars_coeff", "(float, default 0.001) LARS coefficient.")
@@ -51,6 +56,15 @@ class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<float>("epsilon",
                    "(float, default 0.0) epsilon to avoid Division by Zero.")
         .SetDefault(0.0);
+    AddAttr<bool>("multi_precision",
+                  "(bool, default false) "
+                  "Whether to use multi-precision during weight updating.")
+        .SetDefault(false);
+    AddAttr<float>(
+        "rescale_grad",
+        "(float, default 1.0) Multiply the gradient with `rescale_grad`"
+        "before updating. Often choose to be `1.0/batch_size`.")
+        .SetDefault(1.0f);
 
     AddComment(R"DOC(
 Lars Momentum Optimizer.
diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cu b/paddle/fluid/operators/optimizers/lars_momentum_op.cu
index eb0111ae4de..42477232e7c 100644
--- a/paddle/fluid/operators/optimizers/lars_momentum_op.cu
+++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cu
@@ -13,36 +13,64 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/amp/fp16_type_traits.h"
 #include "paddle/fluid/operators/optimizers/lars_momentum_op.h"

 namespace paddle {
 namespace operators {

 template <typename T>
-__global__ void MomentumLarsKernel(const T* p, const T* g, const T* v,
-                                   const T* learning_rate, const T mu,
-                                   const int64_t num, const T lars_coeff,
-                                   const T lars_weight_decay, const T* p_norm,
-                                   const T* g_norm, T* p_out, T* v_out,
-                                   const T epsilon) {
-  T lr = learning_rate[0];
-  T local_lr = learning_rate[0];
+using MultiPrecisionType = typename details::MPTypeTrait<T>::Type;
+
+template <typename T, typename MT>
+__global__ void MomentumLarsKernel(
+    const T* p, const T* g, const MT* v,
+    const MultiPrecisionType<T>* learning_rate, const MT mu, const int64_t num,
+    const MT lars_coeff, const MT lars_weight_decay,
+    const MultiPrecisionType<T>* p_norm, const MultiPrecisionType<T>* g_norm,
+    T* p_out, MT* v_out, const MT epsilon, const MT* master_p, MT* master_p_out,
+    const MultiPrecisionType<T> rescale_grad) {
+  const MT lr = static_cast<MT>(learning_rate[0]);
+  MT local_lr = lr;
+  const MT p_n = static_cast<MT>(p_norm[0]);
+  const MT g_n = static_cast<MT>(g_norm[0]);
+
+  if (lars_weight_decay > static_cast<MT>(0) && p_n > static_cast<MT>(0) &&
+      g_n > static_cast<MT>(0)) {
+    local_lr =
+        lr * lars_coeff * p_n / (g_n + lars_weight_decay * p_n + epsilon);
+  }
   CUDA_KERNEL_LOOP(i, num) {
-    if (lars_weight_decay > 0 && p_norm[0] > 0 && g_norm[0] > 0) {
-      local_lr = lr * lars_coeff * p_norm[0] /
-                 (g_norm[0] + lars_weight_decay * p_norm[0] + epsilon);
-    }
+    MT grad = static_cast<MT>(g[i]) * static_cast<MT>(rescale_grad);
+    MT param = master_p ? master_p[i] : static_cast<MT>(p[i]);
+
+    MT v_new = v[i] * mu + local_lr * (grad + lars_weight_decay * param);
+    MT p_new = param - v_new;

-    T v_new = v[i] * mu + local_lr * (g[i] + lars_weight_decay * p[i]);
     v_out[i] = v_new;
-    p_out[i] = p[i] - v_new;
+    p_out[i] = static_cast<T>(p_new);
+    if (master_p_out) master_p_out[i] = p_new;
   }
 }

 template <typename T>
 class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
+  using MPDType = MultiPrecisionType<T>;
+
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    const bool multi_precision = ctx.Attr<bool>("multi_precision");
+    if (multi_precision) {
+      InnerCompute<MPDType>(ctx, multi_precision);
+    } else {
+      InnerCompute<T>(ctx, multi_precision);
+    }
+  }
+
+ private:
+  template <typename MT>
+  void InnerCompute(const framework::ExecutionContext& ctx,
+                    const bool multi_precision) const {
     auto param_out = ctx.Output<framework::LoDTensor>("ParamOut");
     auto velocity_out = ctx.Output<framework::LoDTensor>("VelocityOut");
     auto param = ctx.Input<framework::LoDTensor>("Param");
@@ -50,18 +78,40 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
     auto grad = ctx.Input<framework::LoDTensor>("Grad");
     auto learning_rate = ctx.Input<framework::LoDTensor>("LearningRate");

+    const framework::Tensor* master_param = nullptr;
+    framework::Tensor* master_param_out = nullptr;
+    if (multi_precision) {
+      bool has_master =
+          ctx.HasInput("MasterParam") && ctx.HasOutput("MasterParamOut");
+      PADDLE_ENFORCE_EQ(has_master, true,
+                        platform::errors::InvalidArgument(
+                            "The Input(MasterParam) and Output(MasterParamOut) "
+                            "should not be null when "
+                            "the attr `multi_precision` is true"));
+      master_param = ctx.Input<framework::Tensor>("MasterParam");
+      master_param_out = ctx.Output<framework::Tensor>("MasterParamOut");
+    }
+
+    const MT* master_p = multi_precision ? master_param->data<MT>() : nullptr;
+    MT* master_p_out = multi_precision
+                           ? master_param_out->mutable_data<MT>(ctx.GetPlace())
+                           : nullptr;
+
     T* p_out = param_out->mutable_data<T>(ctx.GetPlace());
-    T* v_out = velocity_out->mutable_data<T>(ctx.GetPlace());
+    MT* v_out = velocity_out->mutable_data<MT>(ctx.GetPlace());

-    T mu = static_cast<T>(ctx.Attr<float>("mu"));
-    T lars_coeff = ctx.Attr<float>("lars_coeff");
-    T lars_weight_decay = ctx.Attr<float>("lars_weight_decay");
-    T epsilon = ctx.Attr<float>("epsilon");
+    MT mu = static_cast<MT>(ctx.Attr<float>("mu"));
+    MT lars_coeff = static_cast<MT>(ctx.Attr<float>("lars_coeff"));
+    MT lars_weight_decay =
+        static_cast<MT>(ctx.Attr<float>("lars_weight_decay"));
+    MT epsilon = static_cast<MT>(ctx.Attr<float>("epsilon"));
+    MPDType rescale_grad =
+        static_cast<MPDType>(ctx.Attr<float>("rescale_grad"));

     auto* p = param->data<T>();
-    auto* v = velocity->data<T>();
     auto* g = grad->data<T>();
-    auto* lr = learning_rate->data<T>();
+    auto* v = velocity->data<MT>();
+    auto* lr = learning_rate->data<MPDType>();

     int block = 512;
     int grid = (param->numel() + block - 1) / block;
@@ -72,17 +122,24 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
     framework::Tensor p_norm_t, g_norm_t;
     p_norm_t.Resize({1});
     g_norm_t.Resize({1});
-    auto* p_norm_data = p_norm_t.mutable_data<T>(ctx.GetPlace());
-    auto* g_norm_data = g_norm_t.mutable_data<T>(ctx.GetPlace());
-    auto ep_norm = framework::EigenScalar<T>::From(p_norm_t);
-    auto eg_norm = framework::EigenScalar<T>::From(g_norm_t);
+    auto* p_norm_data = p_norm_t.mutable_data<MPDType>(ctx.GetPlace());
+    auto* g_norm_data = g_norm_t.mutable_data<MPDType>(ctx.GetPlace());
+    auto ep_norm = framework::EigenScalar<MPDType>::From(p_norm_t);
+    auto eg_norm = framework::EigenScalar<MPDType>::From(g_norm_t);

     auto* place = ctx.template device_context<platform::CUDADeviceContext>().eigen_device();
-    ep_norm.device(*place) = eigen_p.square().sum().sqrt();
-    eg_norm.device(*place) = eigen_g.square().sum().sqrt();
-    MomentumLarsKernel<T><<<grid, block, 0, ctx.cuda_device_context().stream()>>>(
+
+    // eigen unsupport fp16 l2-norm
+    ep_norm.device(*place) =
+        eigen_p.template cast<MPDType>().square().sum().sqrt();
+    eg_norm.device(*place) =
+        (eigen_g.template cast<MPDType>() * rescale_grad).square().sum().sqrt();
+
+    MomentumLarsKernel<
+        T, MT><<<grid, block, 0, ctx.cuda_device_context().stream()>>>(
         p, g, v, lr, mu, param->numel(), lars_coeff, lars_weight_decay,
-        p_norm_data, g_norm_data, p_out, v_out, epsilon);
+        p_norm_data, g_norm_data, p_out, v_out, epsilon, master_p, master_p_out,
+        rescale_grad);
   }
 };

@@ -93,4 +150,6 @@ namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
     lars_momentum, ops::LarsMomentumOpCUDAKernel<float>,
-    ops::LarsMomentumOpCUDAKernel<double>);
+    ops::LarsMomentumOpCUDAKernel<double>,
+    ops::LarsMomentumOpCUDAKernel<paddle::platform::float16>);
diff --git a/paddle/fluid/operators/optimizers/momentum_op.h b/paddle/fluid/operators/optimizers/momentum_op.h
index cbb0704fa85..f461dec66c0 100644
--- a/paddle/fluid/operators/optimizers/momentum_op.h
+++ b/paddle/fluid/operators/optimizers/momentum_op.h
@@ -135,6 +135,9 @@ class MomentumOp : public framework::OperatorWithKernel {

     ctx->SetOutputDim("ParamOut", param_dim);
     ctx->SetOutputDim("VelocityOut", param_dim);
+    if (ctx->HasOutput("MasterParamOut")) {
+      ctx->SetOutputDim("MasterParamOut", param_dim);
+    }
   }

   framework::OpKernelType GetExpectedKernelType(
diff --git a/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py b/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py
index 850b267411e..f43b45553f5 100644
--- a/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py
+++ b/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py
@@ -73,7 +73,7 @@ def resnet_cifar10(input, depth=32):
     return pool


-def train(use_pure_fp16=True, use_nesterov=False, use_adam=False):
+def train(use_pure_fp16=True, use_nesterov=False, optimizer=""):
     classdim = 10
     data_shape = [3, 32, 32]
     BATCH_SIZE = 32
@@ -96,12 +96,17 @@ def train(use_pure_fp16=True, use_nesterov=False, use_adam=False):
     # Test program
     test_program = train_program.clone(for_test=True)

-    if use_adam:
+    if optimizer == "Adam":
         optimizer = paddle.optimizer.AdamW(
             learning_rate=0.001,
             epsilon=1e-8,
             weight_decay=0.0,
             multi_precision=True)
+    elif optimizer == "Lars":
+        optimizer = paddle.fluid.optimizer.LarsMomentumOptimizer(
+            learning_rate=0.001,
+            momentum=0.9,
+            multi_precision=use_pure_fp16)
     else:
         optimizer = paddle.optimizer.Momentum(
             learning_rate=0.001,
@@ -169,9 +174,11 @@ class TestImageMultiPrecision(unittest.TestCase):
         if not fluid.core.is_compiled_with_cuda():
             return

-        def do_test(use_nesterov=False, use_adam=False):
-            if use_adam:
+        def do_test(use_nesterov=False, optimizer=""):
+            if optimizer == "Adam":
                 suffix = "use Adam"
+            elif optimizer == "Lars":
+                suffix = "use Lars"
             else:
                 suffix = "with Nesterov" if use_nesterov else "without Nesterov"
             with self.scope_prog_guard():
@@ -180,14 +187,14 @@ class TestImageMultiPrecision(unittest.TestCase):
                 train_loss_fp16, test_loss_fp16 = train(
                     use_pure_fp16=True,
                     use_nesterov=use_nesterov,
-                    use_adam=use_adam)
+                    optimizer=optimizer)
             with self.scope_prog_guard():
                 print("-----------------FP32 Train {}-----------------".format(
                     suffix))
                 train_loss_fp32, test_loss_fp32 = train(
                     use_pure_fp16=False,
                     use_nesterov=use_nesterov,
-                    use_adam=use_adam)
+                    optimizer=optimizer)

             self.assertTrue(
                 np.allclose(
@@ -208,7 +215,8 @@ class TestImageMultiPrecision(unittest.TestCase):

         do_test(use_nesterov=False)
         do_test(use_nesterov=True)
-        do_test(use_adam=True)
+        do_test(optimizer="Adam")
+        do_test(optimizer="Lars")

     @contextlib.contextmanager
     def scope_prog_guard(self):
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index c0b93c83f78..60d25a77c58 100755
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -1725,6 +1725,9 @@ class LarsMomentumOptimizer(Optimizer):
             For details, please refer to :ref:`api_guide_Name`. Default is None.
         exclude_from_weight_decay (list[str], optional): Name string of layers which will be exclude from lars weight decay. Default is None.
         epsilon (float, optional): Epsilon to avoid Division by Zero when calculate local lr. Default is 0.
+        multi_precision (bool, optional): Whether to use multi-precision during weight updating.
+        rescale_grad (float, optional): Multiply the gradient with `rescale_grad` \
+            before updating. Often choose to be `1.0/batch_size`.

     Examples:
         .. code-block:: python
@@ -1758,7 +1761,9 @@ class LarsMomentumOptimizer(Optimizer):
                  grad_clip=None,
                  name=None,
                  exclude_from_weight_decay=None,
-                 epsilon=0):
+                 epsilon=0,
+                 multi_precision=False,
+                 rescale_grad=1.0):
         assert learning_rate is not None
         assert momentum is not None
         super(LarsMomentumOptimizer, self).__init__(
@@ -1776,16 +1781,70 @@ class LarsMomentumOptimizer(Optimizer):
             self._exclude_from_weight_decay = []
         else:
             self._exclude_from_weight_decay = exclude_from_weight_decay
+        self._multi_precision = multi_precision
+        self._rescale_grad = float(rescale_grad)
+        self._master_weights = {}
+
+    def _create_master_weight(self, param):
+        assert isinstance(self.helper, LayerHelper)
+
+        var_name = param.name + '_fp32_master'
+        var_name = unique_name.generate(var_name)
+        var = layers.create_global_var(
+            name=var_name,
+            shape=param.shape,
+            value=0,
+            dtype='float32',
+            persistable=True)
+        block = self.helper.startup_program.global_block()
+        block.append_op(
+            type="cast",
+            inputs={"X": [param]},
+            outputs={"Out": [var]},
+            attrs={
+                "in_dtype": param.dtype,
+                "out_dtype": core.VarDesc.VarType.FP32
+            })
+        self._master_weights[param.name] = var
+        return var
+
+    def _get_accumulator(self, name, param):
+        """Utility function to fetch an accumulator for a parameter
+        Args:
+            name: name of the accumulator
+            param: parameter variable for which accumulator is to be fetched
+        Returns:
+            accumulator variable for the parameter
+        """
+        if self._name is not None:
+            name = self._name + "_" + name
+        find_master = self._multi_precision and param.dtype == core.VarDesc.VarType.FP16
+        target_param = self._master_weights[
+            param.name] if find_master else param
+        target_name = target_param.name
+        if (name not in self._accumulators or
+                target_name not in self._accumulators[name]):
+            raise Exception("Accumulator {} does not exist for parameter {}".
+                            format(name, target_name))
+        return self._accumulators[name][target_name]

     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)

         for p in parameters:
+            if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16:
+                master_p = self._create_master_weight(p)
+                self._add_accumulator(self._velocity_acc_str, master_p)
+                continue
+            if p.dtype == core.VarDesc.VarType.FP16 and not self._multi_precision:
+                warnings.warn(
+                    "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence."
+                    "Consider using multi_precision=True option of the Lars optimizer."
+                )
             self._add_accumulator(self._velocity_acc_str, p)

     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
-        _lars_weight_decay = self._lars_weight_decay
         param_name = param_and_grad[0].name
         if len(self._exclude_from_weight_decay) > 0:
@@ -1796,25 +1855,40 @@ class LarsMomentumOptimizer(Optimizer):
         velocity_acc = self._get_accumulator(self._velocity_acc_str,
                                              param_and_grad[0])
+        lr = self._create_param_lr(param_and_grad)
+
+        find_master = self._multi_precision and param_and_grad[
+            0].dtype == core.VarDesc.VarType.FP16
+        master_weight = (self._master_weights[param_and_grad[0].name]
+                         if find_master else None)
+
+        attrs = {
+            "mu": self._momentum,
+            "lars_coeff": self._lars_coeff,
+            "lars_weight_decay": _lars_weight_decay,
+            "multi_precision": find_master,
+            "rescale_grad": self._rescale_grad
+        }
+
+        inputs = {
+            "Param": param_and_grad[0],
+            "Grad": param_and_grad[1],
+            "Velocity": velocity_acc,
+            "LearningRate": lr
+        }
+
+        outputs = {"ParamOut": param_and_grad[0], "VelocityOut": velocity_acc}
+
+        if find_master:
+            inputs["MasterParam"] = master_weight
+            outputs["MasterParamOut"] = master_weight
+
         # create the momentum optimize op
         momentum_op = block.append_op(
             type=self.type,
-            inputs={
-                "Param": param_and_grad[0],
-                "Grad": param_and_grad[1],
-                "Velocity": velocity_acc,
-                "LearningRate": self._create_param_lr(param_and_grad)
-            },
-            outputs={
-                "ParamOut": param_and_grad[0],
-                "VelocityOut": velocity_acc
-            },
-            attrs={
-                "mu": self._momentum,
-                "lars_coeff": self._lars_coeff,
-                "lars_weight_decay": _lars_weight_decay,
-                "epsilon": self._epsilon
-            },
+            inputs=inputs,
+            outputs=outputs,
+            attrs=attrs,
             stop_gradient=True)

         return momentum_op
diff --git a/python/paddle/fluid/tests/unittests/test_momentum_op.py b/python/paddle/fluid/tests/unittests/test_momentum_op.py
index ba4c1458c77..e31587b225e 100644
--- a/python/paddle/fluid/tests/unittests/test_momentum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_momentum_op.py
@@ -134,6 +134,64 @@ class TestMomentumOp2(OpTest):
         self.check_output()


+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestLarsMomentumOpWithMP(OpTest):
+    def setUp(self):
+        self.op_type = "lars_momentum"
+
+        master_param = np.random.random((123, 321)).astype("float32")
+        param = master_param.astype("float16")
+        grad = np.random.random((123, 321)).astype("float16")
+        velocity = np.zeros((123, 321)).astype("float32")
+        learning_rate = np.array([0.001]).astype("float32")
+        mu = 0.0001
+        lars_coeff = 0.001
+        lars_weight_decay = 0.0005
+        rescale_grad = 1.0
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Velocity': velocity,
+            'LearningRate': learning_rate,
+            'MasterParam': master_param,
+        }
+
+        self.attrs = {
+            'mu': mu,
+            'lars_coeff': lars_coeff,
+            'lars_weight_decay': lars_weight_decay,
+            'multi_precision': True,
+            'rescale_grad': rescale_grad
+        }
+
+        fp32_grad = grad.astype("float32")
+        pnorm = np.sqrt(np.square(master_param).sum())
+        gnorm = np.sqrt(np.square(fp32_grad).sum())
+        local_lr = learning_rate * lars_coeff * pnorm / (
+            gnorm + lars_weight_decay * pnorm)
+        fp32_grad = fp32_grad * rescale_grad
+        velocity_out = mu * velocity + local_lr * (fp32_grad + lars_weight_decay
+                                                   * master_param)
+        p_new = master_param - velocity_out
+        param_out = p_new.astype("float16")
+        master_param_out = p_new
+
+        self.outputs = {
+            'ParamOut': param_out,
+            'VelocityOut': velocity_out,
+            'MasterParamOut': master_param_out
+        }
+
+    def test_check_output(self):
+        paddle.enable_static()
+        if core.is_compiled_with_cuda():
+            place = fluid.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place)
+
+
 class TestLarsMomentumOp(OpTest):
     def setUp(self):
         self.op_type = "lars_momentum"
--
GitLab
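
Editor's note: for readers of this patch, the multi-precision update performed by the new MomentumLarsKernel can be summarized with the NumPy sketch below. It is illustrative only and not part of the patch; the helper name and argument list are hypothetical, and the norms are taken the way the kernel takes them (FP16 parameter cast up to FP32, gradient rescaled before the norm).

import numpy as np

def lars_momentum_mp_step(param_fp16, master_param, grad_fp16, velocity,
                          lr, mu, lars_coeff, lars_weight_decay,
                          epsilon=0.0, rescale_grad=1.0):
    # Gradient is read in FP16, rescaled, and accumulated in FP32.
    grad = grad_fp16.astype("float32") * rescale_grad
    # The kernel computes the parameter norm from the FP16 weights cast to
    # FP32; this agrees with the FP32 master copy up to FP16 rounding.
    p_norm = np.sqrt(np.square(param_fp16.astype("float32")).sum())
    g_norm = np.sqrt(np.square(grad).sum())
    local_lr = lr
    if lars_weight_decay > 0 and p_norm > 0 and g_norm > 0:
        # LARS local learning rate, matching the kernel's formula.
        local_lr = lr * lars_coeff * p_norm / (
            g_norm + lars_weight_decay * p_norm + epsilon)
    velocity_out = mu * velocity + local_lr * (
        grad + lars_weight_decay * master_param)
    master_param_out = master_param - velocity_out   # FP32 master update
    param_out = master_param_out.astype("float16")   # FP16 copy written back
    return param_out, velocity_out, master_param_out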