From a8a2b7f47d618351a38014cd2a2014044e1cba1c Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Mon, 6 Mar 2023 11:25:08 +0800 Subject: [PATCH] Add multiprecision for adadelta op (#50131) --- .../fluid/operators/optimizers/adadelta_op.cc | 9 + paddle/fluid/pybind/eager_generator.h | 13 +- paddle/phi/api/yaml/legacy_ops.yaml | 8 +- paddle/phi/infermeta/multiary.cc | 5 +- paddle/phi/infermeta/multiary.h | 5 +- paddle/phi/kernels/adadelta_kernel.h | 5 +- paddle/phi/kernels/gpu/adadelta_kernel.cu | 9 +- .../phi/kernels/impl/adadelta_kernel_impl.h | 43 ++- paddle/phi/kernels/xpu/adadelta_kernel.cc | 5 +- paddle/phi/ops/compat/adadelta_sig.cc | 36 +++ python/paddle/fluid/optimizer.py | 117 +++++++- .../fluid/tests/unittests/test_adadelta_op.py | 276 ++++++++++++++++++ python/paddle/optimizer/adadelta.py | 121 +++++++- 13 files changed, 607 insertions(+), 45 deletions(-) create mode 100644 paddle/phi/ops/compat/adadelta_sig.cc diff --git a/paddle/fluid/operators/optimizers/adadelta_op.cc b/paddle/fluid/operators/optimizers/adadelta_op.cc index aa78843724d..2df8ff971ce 100644 --- a/paddle/fluid/operators/optimizers/adadelta_op.cc +++ b/paddle/fluid/operators/optimizers/adadelta_op.cc @@ -39,12 +39,17 @@ class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("AvgSquaredGrad", "(Tensor) Input average of squared gradient"); AddInput("AvgSquaredUpdate", "(Tensor) Input average of squared parameter updates"); + AddInput("MasterParam", "FP32 master weight for AMP.").AsDispensable(); AddOutput("ParamOut", "(Tensor) Output parameter"); AddOutput("AvgSquaredGradOut", "(Tensor) Output average of squared gradient"); AddOutput("AvgSquaredUpdateOut", "(Tensor) Output average of squared parameter updates"); + AddOutput("MasterParamOut", + "The updated FP32 master weight for AMP. " + "It shared memory with Input(MasterParam).") + .AsDispensable(); AddAttr("rho", "(float, default 0.95) Exponential decay rate " @@ -54,6 +59,10 @@ class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker { "(float, default 1.0e-6) Constant for " "numerical stability") .SetDefault(1.0e-6f); + AddAttr("multi_precision", + "(bool, default false) " + "Whether to use multi-precision during weight updating.") + .SetDefault(false); AddComment(R"DOC( Adadelta Optimizer. 
diff --git a/paddle/fluid/pybind/eager_generator.h b/paddle/fluid/pybind/eager_generator.h index 05d62debc3d..0096e179529 100644 --- a/paddle/fluid/pybind/eager_generator.h +++ b/paddle/fluid/pybind/eager_generator.h @@ -206,6 +206,8 @@ std::map> op_ins_map = { {"Q", "K", "V", "Offset", "Columns", "KeyPaddingMask", "AttnMask"}}, {"sgd", {"Param", "LearningRate", "Grad", "MasterParam"}}, {"adagrad", {"Param", "Grad", "Moment", "LearningRate", "MasterParam"}}, + {"adadelta", + {"Param", "Grad", "AvgSquaredGrad", "AvgSquaredUpdate", "MasterParam"}}, {"graph_khop_sampler", {"Row", "Eids", "Col_Ptr", "X"}}, {"nce", {"Input", @@ -311,6 +313,11 @@ std::map> op_outs_map = { "SavedMean", "SavedVariance", "ReserveSpace"}}, + {"adadelta", + {"ParamOut", + "AvgSquaredGradOut", + "AvgSquaredUpdateOut", + "MasterParamOut"}}, {"unique", {"Out", "Index", "Indices", "Counts"}}, {"unique_consecutive", {"Out", "Index", "Counts"}}, {"generate_proposals", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}}, @@ -400,7 +407,11 @@ std::map> op_passing_outs_map = { "MeanGradOut", "MasterParamOut"}}, {"ftrl", {"ParamOut", "SquaredAccumOut", "LinearAccumOut"}}, - {"adadelta", {"ParamOut", "AvgSquaredGradOut", "AvgSquaredUpdateOut"}}, + {"adadelta", + {"ParamOut", + "AvgSquaredGradOut", + "AvgSquaredUpdateOut", + "MasterParamOut"}}, {"adagrad", {"ParamOut", "MomentOut", "MasterParamOut"}}, {"adamax", {"ParamOut", "MomentOut", "InfNormOut"}}, {"dpsgd", {"ParamOut"}}, diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index db0b930a8fe..0e47a467cec 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -20,13 +20,15 @@ data_type : x - op : adadelta_ - args : (Tensor param, Tensor grad, Tensor avg_squared_grad, Tensor avg_squared_update, float rho, float epsilon) - output : Tensor(param_out), Tensor(moment_out), Tensor(inf_norm_out) + args : (Tensor param, Tensor grad, Tensor avg_squared_grad, Tensor avg_squared_update, Tensor master_param, float rho, float epsilon, bool multi_precision) + output : Tensor(param_out), Tensor(moment_out), Tensor(inf_norm_out), Tensor(master_param_out) infer_meta : func : AdadeltaInferMeta kernel : func : adadelta - inplace : (param -> param_out), (avg_squared_grad -> moment_out), (avg_squared_update -> inf_norm_out) + data_type : param + optional : master_param + inplace : (param -> param_out), (avg_squared_grad -> moment_out), (avg_squared_update -> inf_norm_out), (master_param -> master_param_out) - op : adagrad_ args : (Tensor param, Tensor grad, Tensor moment, Tensor learning_rate, Tensor master_param, float epsilon, bool multi_precision) diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index eac84e86256..05e4f16e646 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -38,11 +38,14 @@ void AdadeltaInferMeta(const MetaTensor& param, const MetaTensor& grad, const MetaTensor& avg_squared_grad, const MetaTensor& avg_squared_update, + const MetaTensor& master_param, float rho, float epsilon, + bool multi_precision, MetaTensor* param_out, MetaTensor* avg_squared_grad_out, - MetaTensor* avg_squared_update_out) { + MetaTensor* avg_squared_update_out, + MetaTensor* master_param_out) { auto param_dims = param.dims(); PADDLE_ENFORCE_EQ( param_dims, diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index b954fdb90aa..3d161ed604c 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -43,11 +43,14 @@ void 
AdadeltaInferMeta(const MetaTensor& param, const MetaTensor& grad, const MetaTensor& avg_squared_grad, const MetaTensor& avg_squared_update, + const MetaTensor& master_param, float rho, float epsilon, + bool multi_precision, MetaTensor* param_out, MetaTensor* avg_squared_grad_out, - MetaTensor* avg_squared_update_out); + MetaTensor* avg_squared_update_out, + MetaTensor* master_param_outs); void AdagradInferMeta(const MetaTensor& param, const MetaTensor& grad, diff --git a/paddle/phi/kernels/adadelta_kernel.h b/paddle/phi/kernels/adadelta_kernel.h index 65a6aad4151..15c07b3e6f9 100644 --- a/paddle/phi/kernels/adadelta_kernel.h +++ b/paddle/phi/kernels/adadelta_kernel.h @@ -24,10 +24,13 @@ void AdadeltaKernel(const Context& dev_ctx, const DenseTensor& grad, const DenseTensor& avg_squared_grad, const DenseTensor& avg_squared_update, + const paddle::optional& master_param, float rho, float epsilon, + bool multi_precision, DenseTensor* param_out, DenseTensor* avg_squared_grad_out, - DenseTensor* avg_squared_update_out); + DenseTensor* avg_squared_update_out, + DenseTensor* master_param_outs); } // namespace phi diff --git a/paddle/phi/kernels/gpu/adadelta_kernel.cu b/paddle/phi/kernels/gpu/adadelta_kernel.cu index 7516a277a74..9270609d034 100644 --- a/paddle/phi/kernels/gpu/adadelta_kernel.cu +++ b/paddle/phi/kernels/gpu/adadelta_kernel.cu @@ -18,5 +18,10 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/adadelta_kernel_impl.h" -PD_REGISTER_KERNEL( - adadelta, GPU, ALL_LAYOUT, phi::AdadeltaKernel, float, double) {} +PD_REGISTER_KERNEL(adadelta, + GPU, + ALL_LAYOUT, + phi::AdadeltaKernel, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/impl/adadelta_kernel_impl.h b/paddle/phi/kernels/impl/adadelta_kernel_impl.h index 3fbdf435bab..b0c0a072acd 100644 --- a/paddle/phi/kernels/impl/adadelta_kernel_impl.h +++ b/paddle/phi/kernels/impl/adadelta_kernel_impl.h @@ -14,6 +14,7 @@ #pragma once +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/adadelta_kernel.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" @@ -26,40 +27,58 @@ void AdadeltaKernel(const Context& dev_ctx, const DenseTensor& grad, const DenseTensor& avg_squared_grad, const DenseTensor& avg_squared_update, + const paddle::optional& master_param, float rho, float epsilon, + bool multi_precision, DenseTensor* param_out, DenseTensor* avg_squared_grad_out, - DenseTensor* avg_squared_update_out) { + DenseTensor* avg_squared_update_out, + DenseTensor* master_param_outs) { + using MPDType = typename phi::dtype::template MPTypeTrait::Type; dev_ctx.template Alloc(param_out); - dev_ctx.template Alloc(avg_squared_grad_out); - dev_ctx.template Alloc(avg_squared_update_out); + dev_ctx.template Alloc(avg_squared_grad_out); + dev_ctx.template Alloc(avg_squared_update_out); - T rho_ = static_cast(rho); - T epsilon_ = static_cast(epsilon); + MPDType rho_ = static_cast(rho); + MPDType epsilon_ = static_cast(epsilon); auto eigen_param = EigenVector::Flatten(param); auto eigen_grad = EigenVector::Flatten(grad); // Squared gradient accumulator - auto eigen_avg_squared_grad = EigenVector::Flatten(avg_squared_grad); + auto eigen_avg_squared_grad = EigenVector::Flatten(avg_squared_grad); // Squared updates accumulator - auto eigen_avg_squared_update = EigenVector::Flatten(avg_squared_update); + auto eigen_avg_squared_update = + EigenVector::Flatten(avg_squared_update); auto eigen_param_out = 
EigenVector::Flatten(*param_out); auto eigen_avg_squared_grad_out = - EigenVector::Flatten(*avg_squared_grad_out); + EigenVector::Flatten(*avg_squared_grad_out); auto eigen_avg_squared_update_out = - EigenVector::Flatten(*avg_squared_update_out); + EigenVector::Flatten(*avg_squared_update_out); auto& place = *dev_ctx.eigen_device(); + auto eigen_grad_cast = eigen_grad.template cast(); + eigen_avg_squared_grad_out.device(place) = - rho_ * eigen_avg_squared_grad + (1 - rho_) * eigen_grad.square(); + rho_ * eigen_avg_squared_grad + (1 - rho_) * eigen_grad_cast.square(); auto update = -((eigen_avg_squared_update + epsilon_) / (eigen_avg_squared_grad_out + epsilon_)) .sqrt() * - eigen_grad; + eigen_grad_cast; eigen_avg_squared_update_out.device(place) = rho_ * eigen_avg_squared_update + (1 - rho_) * update.square(); - eigen_param_out.device(place) = eigen_param + update; + + if (multi_precision) { + auto eigen_master_param_out = + EigenVector::Flatten(*master_param_outs); + auto eigen_master_param = EigenVector::Flatten(*master_param); + + eigen_master_param_out.device(place) = eigen_master_param + update; + eigen_param_out.device(place) = + (eigen_param.template cast() + update).template cast(); + } else { + eigen_param_out.device(place) = eigen_param + update.template cast(); + } } } // namespace phi diff --git a/paddle/phi/kernels/xpu/adadelta_kernel.cc b/paddle/phi/kernels/xpu/adadelta_kernel.cc index 153f27f54c9..e02a5aeabad 100644 --- a/paddle/phi/kernels/xpu/adadelta_kernel.cc +++ b/paddle/phi/kernels/xpu/adadelta_kernel.cc @@ -25,11 +25,14 @@ void AdadeltaKernel(const Context& dev_ctx, const DenseTensor& grad, const DenseTensor& avg_squared_grad, const DenseTensor& avg_squared_update, + const paddle::optional& master_param, float rho, float epsilon, + bool multi_precision, DenseTensor* param_out, DenseTensor* avg_squared_grad_out, - DenseTensor* avg_squared_update_out) { + DenseTensor* avg_squared_update_out, + DenseTensor* master_param_outs) { dev_ctx.template Alloc(param_out); dev_ctx.template Alloc(avg_squared_grad_out); dev_ctx.template Alloc(avg_squared_update_out); diff --git a/paddle/phi/ops/compat/adadelta_sig.cc b/paddle/phi/ops/compat/adadelta_sig.cc new file mode 100644 index 00000000000..fd285e7e5d0 --- /dev/null +++ b/paddle/phi/ops/compat/adadelta_sig.cc @@ -0,0 +1,36 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature AdadeltaOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.IsDenseTensorInput("Grad")) { + return KernelSignature( + "adadelta", + {"Param", "Grad", "AvgSquaredGrad", "AvgSquaredUpdate", "MasterParam"}, + {"rho", "epsilon", "multi_precision"}, + {"ParamOut", + "AvgSquaredGradOut", + "AvgSquaredUpdateOut", + "MasterParamOut"}); + } + + return KernelSignature("unregistered", {}, {}, {}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(adadelta, phi::AdadeltaOpArgumentMapping); diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index e8f32da4cae..9a63efea402 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -3181,14 +3181,87 @@ class AdadeltaOptimizer(Optimizer): name=name, ) self.type = "adadelta" + self._multi_precision = False + self._master_weights = {} self._epsilon = epsilon self._rho = rho + def _create_master_weight(self, param): + if param.name in self._master_weights: + var = self._master_weights[param.name] + else: + assert isinstance(self.helper, LayerHelper) + + var_name = param.name + '_fp32_master' + var_name = unique_name.generate(var_name) + var = paddle.static.create_global_var( + name=var_name, + shape=param.shape, + value=0, + dtype='float32', + persistable=True, + ) + block = self.helper.startup_program.global_block() + block.append_op( + type="cast", + inputs={"X": [param]}, + outputs={"Out": [var]}, + attrs={ + "in_dtype": param.dtype, + "out_dtype": core.VarDesc.VarType.FP32, + }, + ) + self._master_weights[param.name] = var + return var + + def _get_accumulator(self, name, param): + """Utility function to fetch an accumulator for a parameter + Args: + name: name of the accumulator + param: parameter variable for which accumulator is to be fetched + Returns: + accumulator variable for the parameter + """ + if self._name is not None: + name = self._name + "_" + name + find_master = ( + self._multi_precision and param.dtype == core.VarDesc.VarType.FP16 + ) + target_param = ( + self._master_weights[param.name] if find_master else param + ) + target_name = target_param.name + if ( + name not in self._accumulators + or target_name not in self._accumulators[name] + ): + raise Exception( + "Accumulator {} does not exist for parameter {}".format( + name, target_name + ) + ) + return self._accumulators[name][target_name] + def _create_accumulators(self, block, parameters): if not isinstance(block, framework.Block): raise TypeError("block is not instance of framework.Block.") for p in parameters: + if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16: + master_p = self._create_master_weight(p) + self._add_accumulator(self._avg_squared_grad_acc_str, master_p) + self._add_accumulator( + self._avg_squared_update_acc_str, master_p + ) + continue + if ( + p.dtype == core.VarDesc.VarType.FP16 + and not self._multi_precision + ): + warnings.warn( + "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence." + "Consider using multi_precision=True option of the Lars optimizer." 
+ ) self._add_accumulator(self._avg_squared_grad_acc_str, p) self._add_accumulator(self._avg_squared_update_acc_str, p) @@ -3202,6 +3275,15 @@ class AdadeltaOptimizer(Optimizer): avg_squared_update_acc = self._get_accumulator( self._avg_squared_update_acc_str, param_and_grad[0] ) + find_master = ( + self._multi_precision + and param_and_grad[0].dtype == core.VarDesc.VarType.FP16 + ) + master_weight = ( + self._master_weights[param_and_grad[0].name] + if find_master + else None + ) if in_dygraph_mode(): _C_ops.adadelta_( @@ -3209,25 +3291,38 @@ class AdadeltaOptimizer(Optimizer): param_and_grad[1], avg_squared_grad_acc, avg_squared_update_acc, + master_weight, self._rho, self._epsilon, + find_master, ) else: # Create the adadelta optimizer op + inputs = { + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "AvgSquaredGrad": avg_squared_grad_acc, + "AvgSquaredUpdate": avg_squared_update_acc, + } + outputs = { + "ParamOut": param_and_grad[0], + "AvgSquaredGradOut": avg_squared_grad_acc, + "AvgSquaredUpdateOut": avg_squared_update_acc, + } + + if find_master: + inputs["MasterParam"] = master_weight + outputs["MasterParamOut"] = master_weight + adadelta_op = block.append_op( type=self.type, - inputs={ - "Param": param_and_grad[0], - "Grad": param_and_grad[1], - "AvgSquaredGrad": avg_squared_grad_acc, - "AvgSquaredUpdate": avg_squared_update_acc, - }, - outputs={ - "ParamOut": param_and_grad[0], - "AvgSquaredGradOut": avg_squared_grad_acc, - "AvgSquaredUpdateOut": avg_squared_update_acc, + inputs=inputs, + outputs=outputs, + attrs={ + "epsilon": self._epsilon, + "rho": self._rho, + "multi_precision": find_master, }, - attrs={"epsilon": self._epsilon, "rho": self._rho}, stop_gradient=True, ) diff --git a/python/paddle/fluid/tests/unittests/test_adadelta_op.py b/python/paddle/fluid/tests/unittests/test_adadelta_op.py index 51435ccb95f..699bf5df564 100644 --- a/python/paddle/fluid/tests/unittests/test_adadelta_op.py +++ b/python/paddle/fluid/tests/unittests/test_adadelta_op.py @@ -203,5 +203,281 @@ class TestAdadeltaV2Group(TestAdadeltaV2): adam.clear_gradients() +class TestAdadeltaOpMultiPrecison(unittest.TestCase): + def _test_adadelta_op_dygraph_place_amp(self, place, use_amp=False): + import paddle + + paddle.disable_static() + paddle.seed(10) + paddle.set_device(place) + input = paddle.randn((5, 5)) + + model = paddle.nn.Linear(5, 5) + optimizer = paddle.optimizer.Adadelta( + learning_rate=0.01, + parameters=model.parameters(), + weight_decay=0.1, + ) + + optimizer._multi_precision = use_amp + + for idx in range(2): + if place == 'gpu' and use_amp: + model = paddle.amp.decorate(models=model, level='O2') + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + + if place == 'gpu' and use_amp: + with paddle.amp.auto_cast(level='O2'): + output = model(input) + loss = paddle.mean(output) + scaled = scaler.scale(loss) + scaled.backward() + scaler.step(optimizer) + optimizer.clear_grad() + else: + output = model(input) + loss = paddle.mean(output) + loss.backward() + optimizer.step() + optimizer.clear_grad() + paddle.enable_static() + + def _get_places(self): + import paddle + + places = ['cpu'] + if paddle.is_compiled_with_cuda(): + places.append('gpu') + return places + + def test_main(self): + for place in self._get_places(): + use_amp_list = [True, False] + for use_amp in use_amp_list: + self._test_adadelta_op_dygraph_place_amp(place, use_amp) + + +class TestAdadeltaMultiPrecision2_0(unittest.TestCase): + def dygraph_adadelta_mp(self, mp, use_amp): + paddle.disable_static() + 
paddle.seed(100) + paddle.set_device('gpu') + input = paddle.randn((2, 2)) + model = paddle.nn.Linear(2, 2) + optimizer = paddle.optimizer.Adadelta( + 0.5, parameters=model.parameters() + ) + optimizer._multi_precision = mp + if use_amp: + model = paddle.amp.decorate(models=model, level='O2') + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + + for idx in range(5): + if use_amp: + with paddle.amp.auto_cast(level='O2'): + output = model(input) + loss = paddle.mean(output) + scaled = scaler.scale(loss) + scaled.backward() + scaler.minimize(optimizer, scaled) + optimizer.clear_grad() + else: + output = model(input) + loss = paddle.mean(output) + loss.backward() + optimizer.step() + optimizer.clear_grad() + + return output, model.parameters() + + def static_adadelta_mp(self, mp, use_amp): + paddle.enable_static() + paddle.seed(100) + np.random.seed(100) + exe = paddle.static.Executor('gpu') + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + optimizer = paddle.optimizer.Adadelta(0.1) + optimizer._multi_precision = mp + + if use_amp: + optimizer = paddle.static.amp.decorate( + optimizer, + init_loss_scaling=128.0, + use_dynamic_loss_scaling=True, + use_pure_fp16=True, + use_fp16_guard=False, + ) + with paddle.static.program_guard(train_program, startup_program): + if use_amp: + data = paddle.static.data( + shape=[2, 2], name='X', dtype='float16' + ) + else: + data = paddle.static.data( + shape=[2, 2], name='X', dtype='float32' + ) + hidden = paddle.static.nn.fc(x=data, size=10) + loss = paddle.mean(hidden) + optimizer.minimize(loss) + exe.run(startup_program) + + if use_amp: + optimizer.amp_init(place='gpu', scope=paddle.static.global_scope()) + x = np.random.random(size=(2, 2)).astype('float16') + else: + x = np.random.random(size=(2, 2)).astype('float32') + out = [] + for idx in range(5): + (loss_data,) = exe.run( + train_program, feed={"X": x}, fetch_list=[loss.name] + ) + out.append(loss_data) + return out + + def test_main(self): + if not paddle.is_compiled_with_cuda(): + return + "Test dygraph mode" + output1_dy, params1_dy = self.dygraph_adadelta_mp(use_amp=True, mp=True) + output2_dy, params2_dy = self.dygraph_adadelta_mp( + use_amp=False, mp=False + ) + np.testing.assert_allclose( + output1_dy.astype('float32').numpy(), + output2_dy.astype('float32').numpy(), + rtol=1e-05, + atol=0.1, + ) + for idx in range(len(params1_dy)): + np.testing.assert_allclose( + params1_dy[idx].astype('float32').numpy(), + params2_dy[idx].astype('float32').numpy(), + rtol=1e-05, + atol=0.1, + ) + "Test static mode" + output1_st = self.static_adadelta_mp(use_amp=True, mp=True) + output2_st = self.static_adadelta_mp(use_amp=False, mp=False) + for idx in range(len(output1_st)): + np.testing.assert_allclose( + output1_st[idx].astype('float32'), + output2_st[idx].astype('float32'), + rtol=1e-05, + atol=0.1, + ) + + +class TestAdadeltaMultiPrecision1_0(unittest.TestCase): + def dygraph_adadelta_mp(self, use_amp, mp): + paddle.disable_static() + paddle.seed(10) + paddle.set_device('gpu') + input = paddle.randn((2, 2)) + model = paddle.nn.Linear(2, 2) + optimizer = paddle.fluid.optimizer.Adadelta( + learning_rate=0.001, + parameter_list=model.parameters(), + ) + optimizer._multi_precision = mp + if use_amp: + model = paddle.amp.decorate(models=model, level='O2') + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + + for idx in range(5): + if use_amp: + with paddle.amp.auto_cast(level='O2'): + output = model(input) + loss = paddle.mean(output) + scaled = 
scaler.scale(loss) + scaled.backward() + scaler.minimize(optimizer, scaled) + optimizer.clear_gradients() + else: + output = model(input) + loss = paddle.mean(output) + optimizer.minimize(loss) + optimizer.clear_gradients() + + return output, model.parameters() + + def static_adadelta_mp(self, use_amp, mp): + paddle.enable_static() + paddle.seed(100) + np.random.seed(100) + exe = paddle.static.Executor('gpu') + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + optimizer = paddle.fluid.optimizer.Adadelta(learning_rate=0.001) + optimizer._multi_precision = mp + + if use_amp: + optimizer = paddle.static.amp.decorate( + optimizer, + init_loss_scaling=128.0, + use_dynamic_loss_scaling=True, + use_pure_fp16=True, + use_fp16_guard=False, + ) + with paddle.static.program_guard(train_program, startup_program): + if use_amp: + data = paddle.static.data( + shape=[2, 2], name='X', dtype='float16' + ) + else: + data = paddle.static.data( + shape=[2, 2], name='X', dtype='float32' + ) + hidden = paddle.static.nn.fc(x=data, size=10) + loss = paddle.mean(hidden) + optimizer.minimize(loss) + exe.run(startup_program) + + if use_amp: + optimizer.amp_init(place='gpu', scope=paddle.static.global_scope()) + x = np.random.random(size=(2, 2)).astype('float16') + else: + x = np.random.random(size=(2, 2)).astype('float32') + out = [] + for idx in range(5): + (loss_data,) = exe.run( + train_program, feed={"X": x}, fetch_list=[loss.name] + ) + out.append(loss_data) + return out + + def test_main(self): + if not paddle.is_compiled_with_cuda(): + return + "Test dygraph mode" + output1_dy, params1_dy = self.dygraph_adadelta_mp(use_amp=True, mp=True) + output2_dy, params2_dy = self.dygraph_adadelta_mp( + use_amp=False, mp=False + ) + np.testing.assert_allclose( + output1_dy.astype('float32').numpy(), + output2_dy.astype('float32').numpy(), + rtol=1e-05, + atol=0.1, + ) + for idx in range(len(params1_dy)): + np.testing.assert_allclose( + params1_dy[idx].astype('float32').numpy(), + params2_dy[idx].astype('float32').numpy(), + rtol=1e-05, + atol=0.1, + ) + "Test static mode" + output1_st = self.static_adadelta_mp(use_amp=True, mp=True) + output2_st = self.static_adadelta_mp(use_amp=False, mp=False) + for idx in range(len(output1_st)): + np.testing.assert_allclose( + output1_st[idx].astype('float32'), + output2_st[idx].astype('float32'), + rtol=1e-05, + atol=0.1, + ) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/optimizer/adadelta.py b/python/paddle/optimizer/adadelta.py index ff0f0a13fed..5afc3aef03d 100644 --- a/python/paddle/optimizer/adadelta.py +++ b/python/paddle/optimizer/adadelta.py @@ -12,10 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import warnings + +import paddle from paddle import _C_ops -from ..fluid import framework +from ..fluid import core, framework, unique_name from ..fluid.dygraph import no_grad +from ..fluid.layer_helper import LayerHelper from ..framework import in_dygraph_mode from .optimizer import Optimizer @@ -130,6 +134,8 @@ class Adadelta(Optimizer): grad_clip=grad_clip, name=name, ) + self._multi_precision = False + self._master_weights = {} self.type = "adadelta" self._epsilon = epsilon self._rho = rho @@ -138,6 +144,62 @@ class Adadelta(Optimizer): 'rho': rho, } + def _create_master_weight(self, param): + if param.name in self._master_weights: + var = self._master_weights[param.name] + else: + assert isinstance(self.helper, LayerHelper) + + var_name = param.name + "_fp32_master" + var_name = unique_name.generate(var_name) + var = paddle.static.create_global_var( + name=var_name, + shape=param.shape, + value=0, + dtype='float32', + persistable=True, + ) + block = self.helper.startup_program.global_block() + block.append_op( + type="cast", + inputs={"X": [param]}, + outputs={"Out": [var]}, + attrs={ + "in_dtype": param.dtype, + "out_dtype": core.VarDesc.VarType.FP32, + }, + ) + self._master_weights[param.name] = var + return var + + def _get_accumulator(self, name, param): + """Utility function to fetch an accumulator for a parameter + Args: + name: name of the accumulator + param: parameter variable for which accumulator is to be fetched + Returns: + accumulator variable for the parameter + """ + if self._name is not None: + name = self._name + "_" + name + find_master = ( + self._multi_precision and param.dtype == core.VarDesc.VarType.FP16 + ) + target_param = ( + self._master_weights[param.name] if find_master else param + ) + target_name = target_param.name + if ( + name not in self._accumulators + or target_name not in self._accumulators[name] + ): + raise Exception( + "Accumulator {} does not exist for parameter {}".format( + name, target_name + ) + ) + return self._accumulators[name][target_name] + def _create_accumulators(self, block, parameters): if not isinstance(block, framework.Block): raise TypeError("block is not instance of framework.Block.") @@ -145,6 +207,21 @@ class Adadelta(Optimizer): parameters = parameters.get('params') for p in parameters: + if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16: + master_p = self._create_master_weight(p) + self._add_accumulator(self._avg_squared_grad_acc_str, master_p) + self._add_accumulator( + self._avg_squared_update_acc_str, master_p + ) + continue + if ( + p.dtype == core.VarDesc.VarType.FP16 + and not self._multi_precision + ): + warnings.warn( + "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence." + "Consider using multi_precision=True option of the Lars optimizer." 
+ ) self._add_accumulator(self._avg_squared_grad_acc_str, p) self._add_accumulator(self._avg_squared_update_acc_str, p) @@ -158,6 +235,15 @@ class Adadelta(Optimizer): avg_squared_update_acc = self._get_accumulator( self._avg_squared_update_acc_str, param_and_grad[0] ) + find_master = ( + self._multi_precision + and param_and_grad[0].dtype == core.VarDesc.VarType.FP16 + ) + master_weight = ( + self._master_weights[param_and_grad[0].name] + if find_master + else None + ) if in_dygraph_mode(): with no_grad(): @@ -166,8 +252,10 @@ class Adadelta(Optimizer): param_and_grad[1], avg_squared_grad_acc, avg_squared_update_acc, + master_weight, self._rho, self._epsilon, + find_master, ) return None else: @@ -175,20 +263,29 @@ class Adadelta(Optimizer): raise TypeError("block is not instance of framework.Block.") # Create the adadelta optimizer op + inputs = { + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "AvgSquaredGrad": avg_squared_grad_acc, + "AvgSquaredUpdate": avg_squared_update_acc, + } + outputs = { + "ParamOut": param_and_grad[0], + "AvgSquaredGradOut": avg_squared_grad_acc, + "AvgSquaredUpdateOut": avg_squared_update_acc, + } + if find_master: + inputs["MasterParam"] = master_weight + outputs["MasterParamOut"] = master_weight adadelta_op = block.append_op( type=self.type, - inputs={ - "Param": param_and_grad[0], - "Grad": param_and_grad[1], - "AvgSquaredGrad": avg_squared_grad_acc, - "AvgSquaredUpdate": avg_squared_update_acc, - }, - outputs={ - "ParamOut": param_and_grad[0], - "AvgSquaredGradOut": avg_squared_grad_acc, - "AvgSquaredUpdateOut": avg_squared_update_acc, + inputs=inputs, + outputs=outputs, + attrs={ + "epsilon": self._epsilon, + "rho": self._rho, + "multi_precision": find_master, }, - attrs={"epsilon": self._epsilon, "rho": self._rho}, stop_gradient=True, ) -- GitLab
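For reference, the update rule that the revised paddle/phi/kernels/impl/adadelta_kernel_impl.h now implements can be summarized in a few lines. The sketch below is a minimal NumPy rendering of that arithmetic, not the kernel itself; the function name and argument names are illustrative. Accumulators (and the optional master weight) stay in float32, while param may be float16 when multi_precision is enabled.

    import numpy as np

    def adadelta_step(param, grad, avg_sq_grad, avg_sq_update,
                      master_param=None, rho=0.95, epsilon=1.0e-6,
                      multi_precision=False):
        # One Adadelta step mirroring adadelta_kernel_impl.h (illustrative only).
        grad32 = grad.astype(np.float32)                      # eigen_grad_cast
        avg_sq_grad = rho * avg_sq_grad + (1 - rho) * np.square(grad32)
        update = -np.sqrt((avg_sq_update + epsilon) /
                          (avg_sq_grad + epsilon)) * grad32
        avg_sq_update = rho * avg_sq_update + (1 - rho) * np.square(update)

        if multi_precision:
            # The FP32 master weight accumulates the update; the (possibly FP16)
            # parameter is refreshed from an FP32 add and cast back down.
            master_param = master_param + update
            param = (param.astype(np.float32) + update).astype(param.dtype)
        else:
            param = param + update.astype(param.dtype)
        return param, avg_sq_grad, avg_sq_update, master_param

Because the accumulators are always float32, _create_accumulators in both Python optimizers registers them against the FP32 master weight rather than the FP16 parameter when multi_precision is on.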
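The new dygraph path can be exercised end to end in the same way as the tests added in test_adadelta_op.py. The snippet below is a condensed usage sketch under the assumption of a CUDA-enabled Paddle build; this diff adds no public constructor argument, so, like the new tests, it toggles the private _multi_precision attribute directly.

    import paddle

    paddle.set_device('gpu')                       # assumes a CUDA build of Paddle
    model = paddle.nn.Linear(5, 5)
    opt = paddle.optimizer.Adadelta(learning_rate=0.01,
                                    parameters=model.parameters())
    opt._multi_precision = True                    # private flag, as in the new tests

    # O2 AMP: parameters run in FP16, the optimizer keeps FP32 master weights.
    model = paddle.amp.decorate(models=model, level='O2')
    scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

    x = paddle.randn((5, 5))
    for _ in range(2):
        with paddle.amp.auto_cast(level='O2'):
            loss = paddle.mean(model(x))
        scaler.scale(loss).backward()
        scaler.step(opt)    # dispatches to adadelta_ with MasterParam, multi_precision=True
        scaler.update()
        opt.clear_grad()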