diff --git a/paddle/fluid/operators/optimizers/adamax_op.cc b/paddle/fluid/operators/optimizers/adamax_op.cc index 51397e210a264019e9dab5d5ca775a65d43a94b7..881b2eee6afed385960500f1211163965772455b 100644 --- a/paddle/fluid/operators/optimizers/adamax_op.cc +++ b/paddle/fluid/operators/optimizers/adamax_op.cc @@ -42,12 +42,16 @@ class AdamaxOpMaker : public framework::OpProtoAndCheckerMaker { "(Tensor) " "Input exponentially weighted infinity norm"); AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator"); - + AddInput("MasterParam", "FP32 master weight for AMP.").AsDispensable(); AddOutput("ParamOut", "(Tensor) Output parameter"); AddOutput("MomentOut", "(Tensor) Output first moment"); AddOutput("InfNormOut", "(Tensor) " "Output exponentially weighted infinity norm"); + AddOutput("MasterParamOut", + "The updated FP32 master weight for AMP. " + "It shares memory with Input(MasterParam).") + .AsDispensable(); AddAttr<float>("beta1", "(float, default 0.9) " @@ -63,6 +67,10 @@ class AdamaxOpMaker : public framework::OpProtoAndCheckerMaker { "(float, default 1.0e-8) " "Constant for numerical stability") .SetDefault(1.0e-8f); + AddAttr<bool>("multi_precision", + "(bool, default false) " + "Whether to use multi-precision during weight updating.") + .SetDefault(false); AddComment(R"DOC( Adamax Optimizer. diff --git a/paddle/fluid/pybind/eager_generator.h b/paddle/fluid/pybind/eager_generator.h index 0096e1795293d298399ace9b7015e7d20cf7d640..6793ce50a607bb737ff0f93982fd19d57a4a3da3 100644 --- a/paddle/fluid/pybind/eager_generator.h +++ b/paddle/fluid/pybind/eager_generator.h @@ -193,6 +193,14 @@ std::map<std::string, std::set<std::string>> op_ins_map = { "Beta1Pow", "Beta2Pow", "MasterParam"}}, + {"adamax", + {"Param", + "Grad", + "LearningRate", + "Moment", + "InfNorm", + "Beta1Pow", + "MasterParam"}}, {"lamb", {"Param", "Grad", @@ -368,6 +376,8 @@ std::map<std::string, std::set<std::string>> op_outs_map = { "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}}, + {"adamax", + {"ParamOut", "MomentOut", "InfNormOut", "Beta1Pow", "MasterParamOut"}}, {"sgd", {"ParamOut", "MasterParamOut"}}, {"adagrad", {"ParamOut", "MomentOut", "MasterParamOut"}}, {"lamb", @@ -413,7 +423,7 @@ std::map<std::string, std::set<std::string>> op_passing_outs_map = { "AvgSquaredUpdateOut", "MasterParamOut"}}, {"adagrad", {"ParamOut", "MomentOut", "MasterParamOut"}}, - {"adamax", {"ParamOut", "MomentOut", "InfNormOut"}}, + {"adamax", {"ParamOut", "MomentOut", "InfNormOut", "MasterParamOut"}}, {"dpsgd", {"ParamOut"}}, {"decayed_adagrad", {"ParamOut", "MomentOut"}}, {"lars_momentum", {"ParamOut", "VelocityOut"}}, diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index 0e47a467cec40b65a209c1a22093623cf5f56a46..529b14d4de622d85b07075feec6192ba92e1d865 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -55,13 +55,15 @@ inplace : (param -> param_out), (moment1 -> moment1_out), (moment2 -> moment2_out), (beta1_pow -> beta1_pow_out), (beta2_pow -> beta2_pow_out), (master_param -> master_param_outs) - op : adamax_ - args : (Tensor param, Tensor grad, Tensor learning_rate, Tensor moment, Tensor inf_norm, Tensor beta1_pow, float beta1, float beta2, float epsilon) - output : Tensor(param_out), Tensor(avg_squared_grad_out), Tensor(avg_squared_update_out) + args : (Tensor param, Tensor grad, Tensor learning_rate, Tensor moment, Tensor inf_norm, Tensor beta1_pow, Tensor master_param, float beta1, float beta2, float epsilon, bool multi_precision) + output : Tensor(param_out), Tensor(avg_squared_grad_out), Tensor(avg_squared_update_out), Tensor(master_param_outs) infer_meta :
func : AdamaxInferMeta kernel : func : adamax - inplace : (param -> param_out), (moment -> avg_squared_grad_out), (inf_norm -> avg_squared_update_out) + data_type : param + optional : master_param + inplace : (param -> param_out), (moment -> avg_squared_grad_out), (inf_norm -> avg_squared_update_out), (master_param -> master_param_outs) - op : adamw_ args : (Tensor param, Tensor grad, Tensor learning_rate, Tensor moment1, Tensor moment2, Tensor beta1_pow, Tensor beta2_pow, Tensor master_param, Tensor skip_update, Scalar beta1, Scalar beta2, Scalar epsilon, float lr_ratio, float coeff, bool with_decay, bool lazy_mode, int64_t min_row_size_to_use_multithread, bool multi_precision, bool use_global_beta_pow) diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 05e4f16e646c217e61dd98cdc051677be1a466e9..9a4f233ce8f93d35d78038896f0d0535031c4bc3 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -190,12 +190,15 @@ void AdamaxInferMeta(const MetaTensor& param, const MetaTensor& moment, const MetaTensor& inf_norm, const MetaTensor& beta1_pow, + const MetaTensor& master_param, float beta1, float beta2, float epsilon, + bool multi_precision, MetaTensor* param_out, MetaTensor* moment_out, - MetaTensor* inf_norm_out) { + MetaTensor* inf_norm_out, + MetaTensor* master_param_outs) { auto lr_dims = learning_rate.dims(); PADDLE_ENFORCE_NE( product(lr_dims), diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 3d161ed604c0c9b12949749403fd2c9d308d5780..4c5ea032a9bc8036200e70b998637e7fc9cc174e 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -69,12 +69,15 @@ void AdamaxInferMeta(const MetaTensor& param, const MetaTensor& moment, const MetaTensor& inf_norm, const MetaTensor& beta1_pow, + const MetaTensor& master_param, float beta1, float beta2, float epsilon, + bool multi_precision, MetaTensor* param_out, MetaTensor* moment_out, - MetaTensor* inf_norm_out); + MetaTensor* inf_norm_out, + MetaTensor* master_param_outs); void AdamInferMeta(const MetaTensor& param, const MetaTensor& grad, diff --git a/paddle/phi/kernels/adamax_kernel.h b/paddle/phi/kernels/adamax_kernel.h index feaf996f16266abbada655907fee68f4ab25bad3..f930901176f32ddc9e3259669a8985abe18c7dfc 100644 --- a/paddle/phi/kernels/adamax_kernel.h +++ b/paddle/phi/kernels/adamax_kernel.h @@ -26,11 +26,14 @@ void AdamaxKernel(const Context& dev_ctx, const DenseTensor& moment, const DenseTensor& inf_norm, const DenseTensor& beta1_pow, + const paddle::optional<DenseTensor>& master_param, float beta1, float beta2, float epsilon, + bool multi_precision, DenseTensor* param_out, DenseTensor* moment_out, - DenseTensor* inf_norm_out); + DenseTensor* inf_norm_out, + DenseTensor* master_param_outs); } // namespace phi diff --git a/paddle/phi/kernels/gpu/adamax_kernel.cu b/paddle/phi/kernels/gpu/adamax_kernel.cu index 0817c531318c390ead201b1fda18da511ea5569f..eca39cb1a97a061d128a2e4403a9db5ba09b5de7 100644 --- a/paddle/phi/kernels/gpu/adamax_kernel.cu +++ b/paddle/phi/kernels/gpu/adamax_kernel.cu @@ -15,7 +15,121 @@ #include "paddle/phi/kernels/adamax_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/adamax_kernel_impl.h" +namespace phi { +template <typename T, typename MT> +__global__ void AdamaxGPUKernel(const T* param, + const T* grad,
+ const MT* learning_rate, + const MT* moment, + const MT* inf_norm, + const MT* beta1_pow, + const MT* master_param, + MT d_beta1, + MT d_beta2, + MT d_epsilon, + int num, + T* param_out, + MT* moment_out, + MT* inf_norm_out, + MT* master_param_out) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; -PD_REGISTER_KERNEL(adamax, GPU, ALL_LAYOUT, phi::AdamaxKernel, float, double) {} + MT lr = static_cast<MT>(learning_rate[0]); + MT d_pow = static_cast<MT>(beta1_pow[0]); + MT one = static_cast<MT>(1.0f); + auto l_r = lr / (one - d_pow); + + for (int index = idx; index < num; index += gridDim.x * blockDim.x) { + // load and cast input to MT + MT d_param = + master_param ? master_param[index] : static_cast<MT>(param[index]); + MT d_grad = static_cast<MT>(grad[index]); + MT d_moment = static_cast<MT>(moment[index]); + MT d_inf = static_cast<MT>(inf_norm[index]); + // compute + auto mom_out = d_beta1 * d_moment + (one - d_beta1) * d_grad; + auto norm_out = std::max(std::abs(d_grad), d_beta2 * d_inf + d_epsilon); + auto out_data = d_param - l_r * (mom_out / norm_out); + // store + param_out[index] = static_cast<T>(out_data); + moment_out[index] = static_cast<MT>(mom_out); + inf_norm_out[index] = static_cast<MT>(norm_out); + + if (master_param_out) { + master_param_out[index] = out_data; + } + } +} + +template <typename T, typename Context> +void AdamaxKernel(const Context& dev_ctx, + const DenseTensor& param, + const DenseTensor& grad, + const DenseTensor& learning_rate, + const DenseTensor& moment, + const DenseTensor& inf_norm, + const DenseTensor& beta1_pow, + const paddle::optional<DenseTensor>& master_param, + float beta1, + float beta2, + float epsilon, + bool multi_precision, + DenseTensor* param_out, + DenseTensor* moment_out, + DenseTensor* inf_norm_out, + DenseTensor* master_param_outs) { + using MPDType = typename phi::dtype::template MPTypeTrait<T>::Type; + T* param_out_data = dev_ctx.template Alloc<T>(param_out); + MPDType* moment_out_data = dev_ctx.template Alloc<MPDType>(moment_out); + MPDType* inf_norm_out_data = dev_ctx.template Alloc<MPDType>(inf_norm_out); + const MPDType* master_in_data = + multi_precision ? master_param->data<MPDType>() : nullptr; + MPDType* master_out_data = + multi_precision ?
dev_ctx.template Alloc<MPDType>(master_param_outs) + : nullptr; + PADDLE_ENFORCE_EQ( + beta1_pow.numel(), + 1, + errors::InvalidArgument("beta1 pow's size should be 1, but received " + "value is:%d.", + beta1_pow.numel())); + + MPDType beta1_ = static_cast<MPDType>(beta1); + MPDType beta2_ = static_cast<MPDType>(beta2); + MPDType epsilon_ = static_cast<MPDType>(epsilon); + + int numel = param.numel(); + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, 1); + int grid = config.block_per_grid.x; + int block = config.thread_per_block.x; + auto stream = dev_ctx.stream(); + + AdamaxGPUKernel<T, MPDType> + <<<grid, block, 0, stream>>>(param.data<T>(), + grad.data<T>(), + learning_rate.data<MPDType>(), + moment.data<MPDType>(), + inf_norm.data<MPDType>(), + beta1_pow.data<MPDType>(), + master_in_data, + beta1_, + beta2_, + epsilon_, + numel, + param_out_data, + moment_out_data, + inf_norm_out_data, + master_out_data); +} +} // namespace phi +PD_REGISTER_KERNEL(adamax, + GPU, + ALL_LAYOUT, + phi::AdamaxKernel, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/impl/adamax_kernel_impl.h b/paddle/phi/kernels/impl/adamax_kernel_impl.h index bff553319a2b98814d31a3f992b98541498de149..b738e53eb0b1692e1ab2a86bca93ed0ee358321f 100644 --- a/paddle/phi/kernels/impl/adamax_kernel_impl.h +++ b/paddle/phi/kernels/impl/adamax_kernel_impl.h @@ -28,12 +28,15 @@ void AdamaxKernel(const Context& dev_ctx, const DenseTensor& moment, const DenseTensor& inf_norm, const DenseTensor& beta1_pow, + const paddle::optional<DenseTensor>& master_param, float beta1, float beta2, float epsilon, + bool multi_precision, DenseTensor* param_out, DenseTensor* moment_out, - DenseTensor* inf_norm_out) { + DenseTensor* inf_norm_out, + DenseTensor* master_param_outs) { dev_ctx.template Alloc<T>(param_out); dev_ctx.template Alloc<T>(moment_out); dev_ctx.template Alloc<T>(inf_norm_out); @@ -56,10 +59,10 @@ void AdamaxKernel(const Context& dev_ctx, auto& place = *dev_ctx.eigen_device(); eigen_moment_out.device(place) = - beta1_ * eigen_moment + (1 - beta1_) * eigen_grad; + beta1_ * eigen_moment + (static_cast<T>(1) - beta1_) * eigen_grad; eigen_inf_norm_out.device(place) = eigen_grad.abs().cwiseMax((beta2_ * eigen_inf_norm) + epsilon_); - auto lr_t = eigen_lr / (1 - eigen_beta1_pow); + auto lr_t = eigen_lr / (static_cast<T>(1) - eigen_beta1_pow); Eigen::DSizes<int, 1> m_dsize(moment_out->numel()); eigen_param_out.device(place) = eigen_param - diff --git a/paddle/phi/ops/compat/adamax_sig.cc b/paddle/phi/ops/compat/adamax_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..9c012de3771fbadabee121987f55427144f215c4 --- /dev/null +++ b/paddle/phi/ops/compat/adamax_sig.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+#include <string> + +#include "paddle/phi/core/compat/op_utils.h" +#include "paddle/utils/small_vector.h" + +namespace phi { + +KernelSignature AdamaxOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector<const char*> in_names = {"Param", + "Grad", + "LearningRate", + "Moment", + "InfNorm", + "Beta1Pow", + "MasterParam"}; + paddle::small_vector<const char*> out_names = { + "ParamOut", "MomentOut", "InfNormOut", "MasterParamOut"}; + paddle::small_vector<const char*> attr_names; + attr_names.emplace_back("beta1"); + attr_names.emplace_back("beta2"); + attr_names.emplace_back("epsilon"); + attr_names.emplace_back("multi_precision"); + + if (ctx.IsDenseTensorInput("Grad")) { + return KernelSignature("adamax", + std::move(in_names), + std::move(attr_names), + std::move(out_names)); + } else { + return KernelSignature("unregistered", {}, {}, {}); + } +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(adamax, phi::AdamaxOpArgumentMapping); diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 9a63efea402420cc3b5c185ccb5875a130b7da82..9b15bd1f455b8e2d0daa442b34e85bf9b3190f38 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -2761,10 +2761,59 @@ class AdamaxOptimizer(Optimizer): self._beta1 = beta1 self._beta2 = beta2 self._epsilon = epsilon + self._multi_precision = False + self._master_weights = {} + + def _create_master_weight(self, param): + if param.name in self._master_weights: + var = self._master_weights[param.name] + else: + assert isinstance(self.helper, LayerHelper) + + var_name = param.name + '_fp32_master' + var_name = unique_name.generate(var_name) + var = paddle.static.create_global_var( + name=var_name, + shape=param.shape, + value=0, + dtype='float32', + persistable=True, + ) + block = self.helper.startup_program.global_block() + block.append_op( + type="cast", + inputs={"X": [param]}, + outputs={"Out": [var]}, + attrs={ + "in_dtype": param.dtype, + "out_dtype": core.VarDesc.VarType.FP32, + }, + ) + self._master_weights[param.name] = var + return var def _create_accumulators(self, block, parameters): # Create accumulator tensors for first moment and infinity norm for p in parameters: + if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16: + master_p = self._create_master_weight(p) + self._add_accumulator(self._moment_acc_str, master_p) + self._add_accumulator(self._inf_norm_acc_str, master_p) + self._add_accumulator( + name=self._beta1_pow_acc_str, + param=master_p, + fill_value=self._beta1, + shape=[1], + ) + continue + if ( + p.dtype == core.VarDesc.VarType.FP16 + and not self._multi_precision + ): + warnings.warn( + "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence." + "Consider using multi_precision=True option of the Lars optimizer."
+ ) self._add_accumulator(self._moment_acc_str, p) self._add_accumulator(self._inf_norm_acc_str, p) self._add_accumulator( @@ -2774,6 +2823,34 @@ class AdamaxOptimizer(Optimizer): shape=[1], ) + def _get_accumulator(self, name, param): + """Utility function to fetch an accumulator for a parameter + Args: + name: name of the accumulator + param: parameter variable for which accumulator is to be fetched + Returns: + accumulator variable for the parameter + """ + if self._name is not None: + name = self._name + "_" + name + find_master = ( + self._multi_precision and core.VarDesc.VarType.FP16 == param.dtype + ) + target_param = ( + self._master_weights[param.name] if find_master else param + ) + target_name = target_param.name + if ( + name not in self._accumulators + or target_name not in self._accumulators[name] + ): + raise Exception( + "Accumulator {} does not exist for parameter {}".format( + name, target_name + ) + ) + return self._accumulators[name][target_name] + def _append_optimize_op(self, block, param_and_grad): assert isinstance(block, framework.Block) @@ -2785,6 +2862,15 @@ class AdamaxOptimizer(Optimizer): self._beta1_pow_acc_str, param_and_grad[0] ) + find_master = ( + self._multi_precision + and param_and_grad[0].dtype == core.VarDesc.VarType.FP16 + ) + master_weight = ( + self._master_weights[param_and_grad[0].name] + if find_master + else None + ) if in_dygraph_mode(): _C_ops.adamax_( param_and_grad[0], @@ -2793,32 +2879,43 @@ class AdamaxOptimizer(Optimizer): moment, inf_norm, beta1_pow_acc, + master_weight, self._beta1, self._beta2, self._epsilon, + find_master, ) else: # create the adamax optimize op + inputs = { + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "LearningRate": self._create_param_lr(param_and_grad), + "Moment": moment, + "InfNorm": inf_norm, + "Beta1Pow": beta1_pow_acc, + } + outputs = { + "ParamOut": param_and_grad[0], + "MomentOut": moment, + "InfNormOut": inf_norm, + } + if find_master: + inputs["MasterParam"] = master_weight + outputs["MasterParamOut"] = master_weight + + attrs = { + "beta1": self._beta1, + "beta2": self._beta2, + "epsilon": self._epsilon, + "multi_precision": find_master, + } + adamax_op = block.append_op( type=self.type, - inputs={ - "Param": param_and_grad[0], - "Grad": param_and_grad[1], - "LearningRate": self._create_param_lr(param_and_grad), - "Moment": moment, - "InfNorm": inf_norm, - "Beta1Pow": beta1_pow_acc, - }, - outputs={ - "ParamOut": param_and_grad[0], - "MomentOut": moment, - "InfNormOut": inf_norm, - }, - attrs={ - "beta1": self._beta1, - "beta2": self._beta2, - "epsilon": self._epsilon, - }, + inputs=inputs, + outputs=outputs, + attrs=attrs, stop_gradient=True, ) diff --git a/python/paddle/fluid/tests/unittests/test_adamax_op.py b/python/paddle/fluid/tests/unittests/test_adamax_op.py index d2a80073e13bd68ce2d49fb152405f70b521afcb..b3e93bdd7e3fef43ffd3285aea281558b7c69ca2 100644 --- a/python/paddle/fluid/tests/unittests/test_adamax_op.py +++ b/python/paddle/fluid/tests/unittests/test_adamax_op.py @@ -17,6 +17,8 @@ import unittest import numpy as np from op_test import OpTest +import paddle + class TestAdamaxOp1(OpTest): def setUp(self): @@ -203,5 +205,268 @@ class TestAdamaxOpV2(unittest.TestCase): ) +class TestAdamaxOpMultiPrecison(unittest.TestCase): + def _test_adamax_op_dygraph_place_amp(self, place, use_amp=False): + import paddle + + paddle.disable_static() + paddle.seed(10) + paddle.set_device(place) + input = paddle.randn((5, 5)) + + model = paddle.nn.Linear(5, 5) + optimizer = 
paddle.optimizer.Adamax( + 0.1, beta1=0.1, parameters=model.parameters() + ) + optimizer._multi_precision = use_amp + for idx in range(2): + if place == 'gpu' and use_amp: + model = paddle.amp.decorate(models=model, level='O2') + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + + if place == 'gpu' and use_amp: + with paddle.amp.auto_cast(level='O2'): + output = model(input) + loss = paddle.mean(output) + scaled = scaler.scale(loss) + scaled.backward() + scaler.step(optimizer) + optimizer.clear_grad() + else: + output = model(input) + loss = paddle.mean(output) + loss.backward() + optimizer.step() + optimizer.clear_grad() + paddle.enable_static() + + def _get_places(self): + import paddle + + places = ['cpu'] + if paddle.is_compiled_with_cuda(): + places.append('gpu') + return places + + def test_main(self): + for place in self._get_places(): + use_amp_list = [True, False] + for use_amp in use_amp_list: + self._test_adamax_op_dygraph_place_amp(place, use_amp) + + +class TestAdamaxMultiPrecision2_0(unittest.TestCase): + def dygraph_adamax_mp(self, mp, use_amp): + paddle.disable_static() + paddle.seed(100) + paddle.set_device('gpu') + input = paddle.randn((2, 2)) + model = paddle.nn.Linear(2, 2) + optimizer = paddle.optimizer.Adamax(0.5, parameters=model.parameters()) + optimizer._multi_precision = mp + if use_amp: + model = paddle.amp.decorate(models=model, level='O2') + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + + for idx in range(5): + if use_amp: + with paddle.amp.auto_cast(level='O2'): + output = model(input) + loss = paddle.mean(output) + scaled = scaler.scale(loss) + scaled.backward() + scaler.minimize(optimizer, scaled) + optimizer.clear_grad() + else: + output = model(input) + loss = paddle.mean(output) + loss.backward() + optimizer.step() + optimizer.clear_grad() + + return output, model.parameters() + + def static_adamax_mp(self, mp, use_amp): + paddle.enable_static() + paddle.seed(100) + np.random.seed(100) + exe = paddle.static.Executor('gpu') + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + optimizer = paddle.optimizer.Adamax(0.1) + optimizer._multi_precision = mp + if use_amp: + optimizer = paddle.static.amp.decorate( + optimizer, + init_loss_scaling=128.0, + use_dynamic_loss_scaling=True, + use_pure_fp16=True, + use_fp16_guard=False, + ) + with paddle.static.program_guard(train_program, startup_program): + if use_amp: + data = paddle.static.data( + shape=[2, 2], name='X', dtype='float16' + ) + else: + data = paddle.static.data( + shape=[2, 2], name='X', dtype='float32' + ) + hidden = paddle.static.nn.fc(x=data, size=10) + loss = paddle.mean(hidden) + optimizer.minimize(loss) + exe.run(startup_program) + + if use_amp: + optimizer.amp_init(place='gpu', scope=paddle.static.global_scope()) + x = np.random.random(size=(2, 2)).astype('float16') + else: + x = np.random.random(size=(2, 2)).astype('float32') + out = [] + for idx in range(5): + (loss_data,) = exe.run( + train_program, feed={"X": x}, fetch_list=[loss.name] + ) + out.append(loss_data) + return out + + def test_main(self): + if not paddle.is_compiled_with_cuda(): + return + "Test dygraph mode" + output1_dy, params1_dy = self.dygraph_adamax_mp(use_amp=True, mp=True) + output2_dy, params2_dy = self.dygraph_adamax_mp(use_amp=False, mp=False) + np.testing.assert_allclose( + output1_dy.astype('float32').numpy(), + output2_dy.astype('float32').numpy(), + rtol=1e-05, + atol=0.1, + ) + for idx in range(len(params1_dy)): + np.testing.assert_allclose( + 
params1_dy[idx].astype('float32').numpy(), + params2_dy[idx].astype('float32').numpy(), + rtol=1e-05, + atol=0.1, + ) + "Test static mode" + output1_st = self.static_adamax_mp(use_amp=True, mp=True) + output2_st = self.static_adamax_mp(use_amp=False, mp=False) + for idx in range(len(output1_st)): + np.testing.assert_allclose( + output1_st[idx].astype('float32'), + output2_st[idx].astype('float32'), + rtol=1e-05, + atol=0.1, + ) + + +class TestAdamaxMultiPrecision1_0(unittest.TestCase): + def dygraph_adamax_mp(self, use_amp, mp): + paddle.disable_static() + paddle.seed(10) + paddle.set_device('gpu') + input = paddle.randn((2, 2)) + model = paddle.nn.Linear(2, 2) + optimizer = paddle.fluid.optimizer.Adamax( + learning_rate=0.001, parameter_list=model.parameters() + ) + optimizer._multi_precision = mp + if use_amp: + model = paddle.amp.decorate(models=model, level='O2') + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + + for idx in range(5): + if use_amp: + with paddle.amp.auto_cast(level='O2'): + output = model(input) + loss = paddle.mean(output) + scaled = scaler.scale(loss) + scaled.backward() + scaler.minimize(optimizer, scaled) + optimizer.clear_gradients() + else: + output = model(input) + loss = paddle.mean(output) + optimizer.minimize(loss) + optimizer.clear_gradients() + + return output, model.parameters() + + def static_adamax_mp(self, use_amp, mp): + paddle.enable_static() + paddle.seed(100) + np.random.seed(100) + exe = paddle.static.Executor('gpu') + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + optimizer = paddle.fluid.optimizer.Adamax(learning_rate=0.001) + optimizer._multi_precision = mp + if use_amp: + optimizer = paddle.static.amp.decorate( + optimizer, + init_loss_scaling=128.0, + use_dynamic_loss_scaling=True, + use_pure_fp16=True, + use_fp16_guard=False, + ) + with paddle.static.program_guard(train_program, startup_program): + if use_amp: + data = paddle.static.data( + shape=[2, 2], name='X', dtype='float16' + ) + else: + data = paddle.static.data( + shape=[2, 2], name='X', dtype='float32' + ) + hidden = paddle.static.nn.fc(x=data, size=10) + loss = paddle.mean(hidden) + optimizer.minimize(loss) + exe.run(startup_program) + + if use_amp: + optimizer.amp_init(place='gpu', scope=paddle.static.global_scope()) + x = np.random.random(size=(2, 2)).astype('float16') + else: + x = np.random.random(size=(2, 2)).astype('float32') + out = [] + for idx in range(5): + (loss_data,) = exe.run( + train_program, feed={"X": x}, fetch_list=[loss.name] + ) + out.append(loss_data) + return out + + def test_main(self): + if not paddle.is_compiled_with_cuda(): + return + "Test dygraph mode" + output1_dy, params1_dy = self.dygraph_adamax_mp(use_amp=True, mp=True) + output2_dy, params2_dy = self.dygraph_adamax_mp(use_amp=False, mp=False) + np.testing.assert_allclose( + output1_dy.astype('float32').numpy(), + output2_dy.astype('float32').numpy(), + rtol=1e-05, + atol=0.1, + ) + for idx in range(len(params1_dy)): + np.testing.assert_allclose( + params1_dy[idx].astype('float32').numpy(), + params2_dy[idx].astype('float32').numpy(), + rtol=1e-05, + atol=0.1, + ) + "Test static mode" + output1_st = self.static_adamax_mp(use_amp=True, mp=True) + output2_st = self.static_adamax_mp(use_amp=False, mp=False) + for idx in range(len(output1_st)): + np.testing.assert_allclose( + output1_st[idx].astype('float32'), + output2_st[idx].astype('float32'), + rtol=1e-05, + atol=0.1, + ) + + if __name__ == "__main__": unittest.main() diff --git 
a/python/paddle/optimizer/adamax.py b/python/paddle/optimizer/adamax.py index c460ab6be032dd4ad52ef76ac517ae32f589a73c..18d3aefec0f0b7354abdf1164a1fc1ab82d0b282 100644 --- a/python/paddle/optimizer/adamax.py +++ b/python/paddle/optimizer/adamax.py @@ -12,11 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +import warnings + +import paddle from paddle import _C_ops -from ..fluid import framework +from ..fluid import core, framework, unique_name from ..fluid.dygraph import no_grad from ..fluid.framework import name_scope +from ..fluid.layer_helper import LayerHelper from .optimizer import Optimizer __all__ = [] @@ -164,26 +168,104 @@ class Adamax(Optimizer): self._beta1 = beta1 self._beta2 = beta2 self._epsilon = epsilon + self._multi_precision = False + self._master_weights = {} + self._default_dict = { 'beta1': beta1, 'beta2': beta2, 'epsilon': epsilon, } + def _add_moments_pows(self, p): + acc_dtype = p.dtype + if self._is_dtype_fp16_or_bf16(acc_dtype): + acc_dtype = core.VarDesc.VarType.FP32 + + self._add_accumulator(self._moment_acc_str, p, dtype=acc_dtype) + self._add_accumulator(self._inf_norm_acc_str, p, dtype=acc_dtype) + self._add_accumulator( + name=self._beta1_pow_acc_str, + param=p, + fill_value=self._beta1, + shape=[1], + ) + + def _create_master_weight(self, param): + if param.name in self._master_weights: + var = self._master_weights[param.name] + else: + assert isinstance(self.helper, LayerHelper) + + var_name = param.name + "_fp32_master" + var_name = unique_name.generate(var_name) + var = paddle.static.create_global_var( + name=var_name, + shape=param.shape, + value=0, + dtype='float32', + persistable=True, + ) + block = self.helper.startup_program.global_block() + block.append_op( + type="cast", + inputs={"X": [param]}, + outputs={"Out": [var]}, + attrs={ + "in_dtype": param.dtype, + "out_dtype": core.VarDesc.VarType.FP32, + }, + ) + self._master_weights[param.name] = var + return var + def _create_accumulators(self, block, parameters): if isinstance(parameters, dict): parameters = self._update_param_group(parameters) # Create accumulator tensors for first moment and infinity norm for p in parameters: - self._add_accumulator(self._moment_acc_str, p) - self._add_accumulator(self._inf_norm_acc_str, p) - self._add_accumulator( - name=self._beta1_pow_acc_str, - param=p, - fill_value=self._beta1, - shape=[1], + if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16: + master_p = self._create_master_weight(p) + self._add_moments_pows(master_p) + continue + if ( + p.dtype == core.VarDesc.VarType.FP16 + and not self._multi_precision + ): + warnings.warn( + "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence." + "Consider using multi_precision=True option of the Adam optimizer." 
+ ) + self._add_moments_pows(p) + + def _get_accumulator(self, name, param): + """Utility function to fetch an accumulator for a parameter + Args: + name: name of the accumulator + param: parameter variable for which accumulator is to be fetched + Returns: + accumulator variable for the parameter + """ + if self._name is not None: + name = self._name + "_" + name + find_master = self._multi_precision and self._is_dtype_fp16_or_bf16( + param.dtype + ) + target_param = ( + self._master_weights[param.name] if find_master else param + ) + target_name = target_param.name + if ( + name not in self._accumulators + or target_name not in self._accumulators[name] + ): + raise Exception( + "Accumulator {} does not exist for parameter {}".format( + name, target_name + ) ) + return self._accumulators[name][target_name] def _append_optimize_op(self, block, param_and_grad): assert isinstance(block, framework.Block) @@ -194,10 +276,20 @@ class Adamax(Optimizer): inf_norm = self._get_accumulator( self._inf_norm_acc_str, param_and_grad[0] ) + + find_master = ( + self._multi_precision + and param_and_grad[0].dtype == core.VarDesc.VarType.FP16 + ) + master_weight = ( + self._master_weights[param_and_grad[0].name] + if find_master + else None + ) + beta1_pow_acc = self._get_accumulator( self._beta1_pow_acc_str, param_and_grad[0] ) - if framework.in_dygraph_mode(): _C_ops.adamax_( param_and_grad[0], @@ -206,32 +298,42 @@ class Adamax(Optimizer): moment, inf_norm, beta1_pow_acc, + master_weight, self._beta1, self._beta2, self._epsilon, + find_master, ) + else: # create the adamax optimize op + inputs = { + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "LearningRate": self._create_param_lr(param_and_grad), + "Moment": moment, + "InfNorm": inf_norm, + "Beta1Pow": beta1_pow_acc, + } + outputs = { + "ParamOut": param_and_grad[0], + "MomentOut": moment, + "InfNormOut": inf_norm, + } + if find_master: + inputs["MasterParam"] = master_weight + outputs["MasterParamOut"] = master_weight + attrs = { + "beta1": self._beta1, + "beta2": self._beta2, + "epsilon": self._epsilon, + "multi_precision": find_master, + } adamax_op = block.append_op( type=self.type, - inputs={ - "Param": param_and_grad[0], - "Grad": param_and_grad[1], - "LearningRate": self._create_param_lr(param_and_grad), - "Moment": moment, - "InfNorm": inf_norm, - "Beta1Pow": beta1_pow_acc, - }, - outputs={ - "ParamOut": param_and_grad[0], - "MomentOut": moment, - "InfNormOut": inf_norm, - }, - attrs={ - "beta1": self._beta1, - "beta2": self._beta2, - "epsilon": self._epsilon, - }, + inputs=inputs, + outputs=outputs, + attrs=attrs, stop_gradient=True, )
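Reviewer note (not part of the patch): as a reading aid, below is a minimal NumPy sketch of the multi-precision Adamax step that the new AdamaxGPUKernel performs when multi_precision is enabled. The function and argument names are illustrative only, not Paddle APIs; default hyperparameters are assumed to match the op attributes (beta1=0.9, beta2=0.999, epsilon=1e-8).

import numpy as np

def adamax_update_mp(param_fp16, grad_fp16, master_fp32, moment, inf_norm,
                     beta1_pow, lr, beta1=0.9, beta2=0.999, epsilon=1e-8):
    # All arithmetic is carried out in float32, mirroring MPDType in the CUDA kernel.
    g = grad_fp16.astype(np.float32)
    # First moment: exponential moving average of the gradient.
    moment_out = beta1 * moment + (1.0 - beta1) * g
    # Exponentially weighted infinity norm, with epsilon folded in as in the kernel.
    inf_norm_out = np.maximum(np.abs(g), beta2 * inf_norm + epsilon)
    # Bias-corrected step size: lr / (1 - beta1^t).
    lr_t = lr / (1.0 - beta1_pow)
    # Update the FP32 master copy, then cast back to FP16 for ParamOut;
    # MasterParamOut keeps the full-precision value.
    master_out = master_fp32 - lr_t * (moment_out / inf_norm_out)
    return master_out.astype(np.float16), moment_out, inf_norm_out, master_out

When multi_precision is false there is no master copy, so the same update is computed in the parameter's own dtype, which is why the patch warns about accumulating directly in FP16.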