From 48060b2e249ccfcc85c73366cb12308852eb0c18 Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Wed, 1 Mar 2023 10:49:48 +0800 Subject: [PATCH] Add multiprecision for rms op (#50132) --- .../fluid/operators/optimizers/rmsprop_op.cc | 10 + paddle/fluid/pybind/eager_generator.h | 21 +- paddle/phi/api/yaml/legacy_ops.yaml | 13 +- paddle/phi/infermeta/multiary.cc | 5 +- paddle/phi/infermeta/multiary.h | 5 +- paddle/phi/kernels/cpu/rmsprop_kernel.cc | 92 ++++++ paddle/phi/kernels/gpu/rmsprop_kernel.cu | 93 +++++- paddle/phi/kernels/impl/rmsprop_kernel_impl.h | 285 ++++++++---------- paddle/phi/kernels/rmsprop_kernel.h | 10 +- paddle/phi/kernels/xpu/rmsprop_kernel.cc | 5 +- paddle/phi/ops/compat/rmsprop_sig.cc | 32 +- python/paddle/fluid/optimizer.py | 120 +++++++- .../fluid/tests/unittests/test_rmsprop_op.py | 274 +++++++++++++++++ python/paddle/optimizer/rmsprop.py | 124 +++++++- 14 files changed, 879 insertions(+), 210 deletions(-) diff --git a/paddle/fluid/operators/optimizers/rmsprop_op.cc b/paddle/fluid/operators/optimizers/rmsprop_op.cc index 3e923d34a06..64be18ddee8 100644 --- a/paddle/fluid/operators/optimizers/rmsprop_op.cc +++ b/paddle/fluid/operators/optimizers/rmsprop_op.cc @@ -38,6 +38,7 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker { "(Tensor, default Tensor)" " The moving average of gradient") .AsDispensable(); + AddInput("LearningRate", "(Tensor, default Tensor) " "The learning rate should be a tensor of size 1."); @@ -46,12 +47,17 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker { "Input gradient of the parameter."); AddInput("Moment", "(Tensor, default Tensor) The moment that gets updated."); + AddInput("MasterParam", "FP32 master weight for AMP.").AsDispensable(); AddOutput("ParamOut", "(Tensor) Output updated parameter value."); AddOutput("MomentOut", "(Tensor) Output updated moment."); AddOutput("MeanSquareOut", "(Tensor) Output Mean squared updated value."); AddOutput("MeanGradOut", "(Tensor) Output moving average of gradient updated value."); + AddOutput("MasterParamOut", + "The updated FP32 master weight for AMP. " + "It shared memory with Input(MasterParam).") + .AsDispensable(); AddAttr("epsilon", "(float, default 1e-10) Constant " @@ -65,6 +71,10 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(0.0f); AddAttr("centered", "(bool, default false) use centered rmsprop.") .SetDefault(false); + AddAttr("multi_precision", + "(bool, default false) " + "Whether to use multi-precision during weight updating.") + .SetDefault(false); AddComment(R"DOC( Rmsprop Optimizer. 
diff --git a/paddle/fluid/pybind/eager_generator.h b/paddle/fluid/pybind/eager_generator.h index 8101d506555..8cbba11c7b2 100644 --- a/paddle/fluid/pybind/eager_generator.h +++ b/paddle/fluid/pybind/eager_generator.h @@ -148,6 +148,14 @@ std::map> op_ins_map = { "Ln2Bias"}}, {"faster_tokenizer", {"Text", "Vocab", "TextPair"}}, {"matrix_rank", {"X", "TolTensor"}}, + {"rmsprop", + {"Param", + "MeanSquare", + "Grad", + "Moment", + "LearningRate", + "MeanGrad", + "MasterParam"}}, {"adam", {"Param", "Grad", @@ -311,6 +319,12 @@ std::map> op_outs_map = { {"MultiFpnRois", "RestoreIndex", "MultiLevelRoIsNum"}}, {"moving_average_abs_max_scale", {"Out", "OutScale", "OutAccum", "OutState"}}, + {"rmsprop", + {"ParamOut", + "MomentOut", + "MeanSquareOut", + "MeanGradOut", + "MasterParamOut"}}, {"multiclass_nms3", {"Out", "NmsRoisNum"}}, {"generate_proposals_v2", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}}, {"momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, @@ -377,7 +391,12 @@ std::map> op_outs_map = { // For those OPs, we need to manually specify the outs need to pass in this map. std::map> op_passing_outs_map = { {"sgd", {"ParamOut", "MasterParamOut"}}, - {"rmsprop", {"ParamOut", "MomentOut", "MeanSquareOut", "MeanGradOut"}}, + {"rmsprop", + {"ParamOut", + "MomentOut", + "MeanSquareOut", + "MeanGradOut", + "MasterParamOut"}}, {"ftrl", {"ParamOut", "SquaredAccumOut", "LinearAccumOut"}}, {"adadelta", {"ParamOut", "AvgSquaredGradOut", "AvgSquaredUpdateOut"}}, {"adagrad", {"ParamOut", "MomentOut"}}, diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index af77c6f903d..ed664385131 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -1459,15 +1459,16 @@ backward : reverse_grad - op : rmsprop_ - args : (Tensor param, Tensor mean_square, Tensor grad, Tensor moment, Tensor learning_rate, Tensor mean_grad, float epsilon, float decay, float momentum, bool centered) - output : Tensor(param_out), Tensor(moment_out), Tensor(mean_square_out), Tensor(mean_grad_out) + args : (Tensor param, Tensor mean_square, Tensor grad, Tensor moment, Tensor learning_rate, Tensor mean_grad, Tensor master_param, float epsilon, float decay, float momentum, bool centered, bool multi_precision) + output : Tensor(param_out), Tensor(moment_out), Tensor(mean_square_out), Tensor(mean_grad_out), Tensor(master_param_out) infer_meta : func : RmspropInferMeta kernel : - func : rmsprop {dense, dense, dense, dense, dense, dense -> dense, dense, dense, dense} - rmsprop_dense_param_sparse_grad {dense, dense, selected_rows, dense, dense, dense -> dense, dense, dense, dense} - optional : mean_grad - inplace : (param -> param_out), (moment -> moment_out), (mean_square -> mean_square_out), (mean_grad -> mean_grad_out) + func : rmsprop {dense, dense, dense, dense, dense, dense, dense-> dense, dense, dense, dense, dense} + rmsprop_dense_param_sparse_grad {dense, dense, selected_rows, dense, dense, dense, dense-> dense, dense, dense, dense, dense} + data_type : param + optional : mean_grad, master_param + inplace : (param -> param_out), (moment -> moment_out), (mean_square -> mean_square_out), (mean_grad -> mean_grad_out), (master_param->master_param_out) - op : rnn args: (Tensor x, Tensor[] pre_state, Tensor[] weight_list, Tensor sequence_length, Tensor dropout_state_in, float dropout_prob=0.0, bool is_bidirec=false, int input_size=10, int hidden_size=100, int num_layers=1, str mode="RNN_TANH", int seed=0, bool is_test=false) diff --git 
a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 670f2b6cc15..61dc9e31acd 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -2313,14 +2313,17 @@ void RmspropInferMeta(const MetaTensor& param, const MetaTensor& moment, const MetaTensor& learning_rate, const MetaTensor& mean_grad, + const MetaTensor& master_param, float epsilon, float decay, float momentum, bool centered, + bool multi_precision, MetaTensor* param_out, MetaTensor* moment_out, MetaTensor* mean_square_out, - MetaTensor* mean_grad_out) { + MetaTensor* mean_grad_out, + MetaTensor* master_param_outs) { if (centered) { PADDLE_ENFORCE_NOT_NULL( mean_grad_out, diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index d6f997eb378..1118604275b 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -421,14 +421,17 @@ void RmspropInferMeta(const MetaTensor& param, const MetaTensor& moment, const MetaTensor& learning_rate, const MetaTensor& mean_grad, + const MetaTensor& master_param, float epsilon, float decay, float momentum, bool centered, + bool multi_precision, MetaTensor* param_out, MetaTensor* moment_out, MetaTensor* mean_square_out, - MetaTensor* mean_grad_out); + MetaTensor* mean_grad_out, + MetaTensor* master_param_outs); void RnnInferMeta(const MetaTensor& x, const std::vector& pre_state, diff --git a/paddle/phi/kernels/cpu/rmsprop_kernel.cc b/paddle/phi/kernels/cpu/rmsprop_kernel.cc index 1d60823d759..a0fe164ca12 100644 --- a/paddle/phi/kernels/cpu/rmsprop_kernel.cc +++ b/paddle/phi/kernels/cpu/rmsprop_kernel.cc @@ -17,7 +17,99 @@ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/rmsprop_kernel_impl.h" +namespace phi { +template +struct RmsFunctor { + RmsFunctor(const phi::CPUContext &ctx, + const DenseTensor ¶m, + const DenseTensor &mean_square, + const DenseTensor &grad, + const DenseTensor &moment, + const DenseTensor &learning_rate, + const paddle::optional &mean_grad_opt, + const paddle::optional &master_param, + float epsilon_t, + float decay_t, + float momentum_t, + bool centered, + bool multi_precision, + DenseTensor *param_out, + DenseTensor *moment_out, + DenseTensor *mean_square_out, + DenseTensor *mean_grad_out, + DenseTensor *master_param_outs) { + auto epsilon = static_cast(epsilon_t); + auto rho = static_cast(decay_t); + auto momentum = static_cast(momentum_t); + auto &p_tensor = param; + auto &ms_tensor = mean_square; + auto &lr_tensor = learning_rate; + auto &mom_tensor = moment; + + PADDLE_ENFORCE_EQ(p_tensor.IsSharedBufferWith(*param_out), + true, + phi::errors::InvalidArgument( + "Param and ParamOut must be the same Tensor")); + PADDLE_ENFORCE_EQ(mom_tensor.IsSharedBufferWith(*moment_out), + true, + phi::errors::InvalidArgument( + "Moment and MomentOut must be the same Tensor")); + PADDLE_ENFORCE_EQ( + ms_tensor.IsSharedBufferWith(*mean_square_out), + true, + phi::errors::InvalidArgument( + "MeanSquare and MeanSquareOut must be the same Tensor")); + + auto &grad_tensor = grad; + auto &place = *ctx.eigen_device(); + auto lr_value = lr_tensor.data()[0]; + + auto p = EigenVector::Flatten(p_tensor); + auto ms = EigenVector::Flatten(ms_tensor); + auto g = EigenVector::Flatten(grad_tensor); + auto mom = EigenVector::Flatten(mom_tensor); + + auto p_out = EigenVector::Flatten(*param_out); + auto mom_out = EigenVector::Flatten(*moment_out); + auto ms_out = EigenVector::Flatten(*mean_square_out); + + 
ms_out.device(place) = rho * ms + (1 - rho) * g * g; + if (centered) { + auto mg_tensor = mean_grad_opt.get_ptr(); + if (mg_tensor) { + PADDLE_ENFORCE_EQ( + mg_tensor->Holder(), + mean_grad_out->Holder(), + phi::errors::InvalidArgument( + "MeanGrad and MeanGradOut must be the same Tensor")); + } else { + PADDLE_ENFORCE_EQ( + mg_tensor, + mean_grad_out, + phi::errors::InvalidArgument( + "MeanGrad and MeanGradOut must be the same Tensor")); + } + auto mg = EigenVector::Flatten(*mg_tensor); + auto mg_out = EigenVector::Flatten(*mean_grad_out); + + mg_out.device(place) = rho * mg + (1 - rho) * g; + mom_out.device(place) = + momentum * mom + + lr_value * g / (ms_out - mg_out.square() + epsilon).sqrt(); + } else { + mom_out.device(place) = + momentum * mom + lr_value * g / (ms_out + epsilon).sqrt(); + } + p_out.device(place) = p - mom_out; + } +}; + +template struct RmsFunctor; +template struct RmsFunctor; +template struct RmsFunctor; + +} // namespace phi PD_REGISTER_KERNEL( rmsprop, CPU, ALL_LAYOUT, phi::RmspropDenseKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/rmsprop_kernel.cu b/paddle/phi/kernels/gpu/rmsprop_kernel.cu index 071c09ea675..a38794d1533 100644 --- a/paddle/phi/kernels/gpu/rmsprop_kernel.cu +++ b/paddle/phi/kernels/gpu/rmsprop_kernel.cu @@ -18,12 +18,99 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/rmsprop_kernel_impl.h" -PD_REGISTER_KERNEL( - rmsprop, GPU, ALL_LAYOUT, phi::RmspropDenseKernel, float, double) {} +namespace phi { +template +struct RmsFunctor { + RmsFunctor(const phi::GPUContext &ctx, + const DenseTensor ¶m, + const DenseTensor &mean_square, + const DenseTensor &grad, + const DenseTensor &moment, + const DenseTensor &learning_rate, + const paddle::optional &mean_grad_opt, + const paddle::optional &master_param, + float epsilon_t, + float decay_t, + float momentum_t, + bool centered, + bool multi_precision, + DenseTensor *param_out, + DenseTensor *moment_out, + DenseTensor *mean_square_out, + DenseTensor *mean_grad_out, + DenseTensor *master_param_outs) { + auto &p_tensor = param; + auto &ms_tensor = mean_square; + auto &lr_tensor = learning_rate; + auto &mom_tensor = moment; + auto &grad_tensor = grad; + size_t limit = static_cast(ms_tensor.numel()); + DenseRmspropGradFunctor grad_func(grad_tensor.data()); + funcs::ForRange for_range(ctx, limit); + using MPDType = typename phi::dtype::MPTypeTrait::Type; + MPDType *master_out_data = + multi_precision ? 
ctx.template Alloc(master_param_outs) + : nullptr; + + if (centered) { + auto mg_tensor = mean_grad_opt.get_ptr(); + if (mg_tensor) { + PADDLE_ENFORCE_EQ( + mg_tensor->Holder(), + mean_grad_out->Holder(), + phi::errors::InvalidArgument( + "MeanGrad and MeanGradOut must be the same Tensor")); + } else { + PADDLE_ENFORCE_EQ( + mg_tensor, + mean_grad_out, + phi::errors::InvalidArgument( + "MeanGrad and MeanGradOut must be the same Tensor")); + } + + for_range(CenteredRmspropFunctor>( + ctx.template Alloc(param_out), + ctx.template Alloc(mean_square_out), + ctx.template Alloc(moment_out), + ctx.template Alloc(mean_grad_out), + lr_tensor.data(), + master_out_data, + static_cast(decay_t), + static_cast(epsilon_t), + static_cast(momentum_t), + grad_func)); + } else { + for_range( + UncenteredRmspropFunctor>( + ctx.template Alloc(param_out), + ctx.template Alloc(mean_square_out), + ctx.template Alloc(moment_out), + lr_tensor.data(), + master_out_data, + static_cast(decay_t), + static_cast(epsilon_t), + static_cast(momentum_t), + grad_func)); + } + } +}; +template struct RmsFunctor; +template struct RmsFunctor; +template struct RmsFunctor; +} // namespace phi + +PD_REGISTER_KERNEL(rmsprop, + GPU, + ALL_LAYOUT, + phi::RmspropDenseKernel, + float, + double, + phi::dtype::float16) {} PD_REGISTER_KERNEL(rmsprop_dense_param_sparse_grad, GPU, ALL_LAYOUT, phi::RmspropSparseKernel, float, - double) {} + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/impl/rmsprop_kernel_impl.h b/paddle/phi/kernels/impl/rmsprop_kernel_impl.h index a0cb0a887b6..c01fbaf4191 100644 --- a/paddle/phi/kernels/impl/rmsprop_kernel_impl.h +++ b/paddle/phi/kernels/impl/rmsprop_kernel_impl.h @@ -16,14 +16,36 @@ #include +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/funcs/algorithm.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/selected_rows_functor.h" #include "paddle/phi/kernels/rmsprop_kernel.h" - namespace phi { +template +struct RmsFunctor { + RmsFunctor(const Context &ctx, + const DenseTensor ¶m, + const DenseTensor &mean_square, + const DenseTensor &grad, + const DenseTensor &moment, + const DenseTensor &learning_rate, + const paddle::optional &mean_grad_opt, + const paddle::optional &master_param, + float epsilon_t, + float decay_t, + float momentum_t, + bool centered, + bool multi_precision, + DenseTensor *param_out, + DenseTensor *moment_out, + DenseTensor *mean_square_out, + DenseTensor *mean_grad_out, + DenseTensor *master_param_outs); +}; + template struct DenseRmspropGradFunctor { inline explicit DenseRmspropGradFunctor(const T *grad) : grad_(grad) {} @@ -47,7 +69,8 @@ struct SparseRmspropGradFunctor { HOSTDEVICE inline T operator()(int64_t idx) const { auto row_idx = phi::funcs::BinarySearch(rows_, row_count_, idx / row_numel_); - return row_idx >= 0 ? grad_[row_idx * row_numel_ + idx % row_numel_] : 0; + return row_idx >= 0 ? 
grad_[row_idx * row_numel_ + idx % row_numel_] + : static_cast(0); } const T *grad_; @@ -56,19 +79,21 @@ struct SparseRmspropGradFunctor { int64_t row_count_; }; -template +template struct UncenteredRmspropFunctor { UncenteredRmspropFunctor(T *param, - T *ms, - T *mom, - const T *lr, - T rho, - T epsilon, - T momentum, + MT *ms, + MT *mom, + const MT *lr, + MT *master_p, + MT rho, + MT epsilon, + MT momentum, const GradFunctor &grad_functor) : param_(param), ms_(ms), mom_(mom), + master_p_(master_p), lr_(lr), rho_(rho), epsilon_(epsilon), @@ -76,38 +101,46 @@ struct UncenteredRmspropFunctor { grad_functor_(grad_functor) {} HOSTDEVICE inline void operator()(int64_t idx) const { - T g = grad_functor_(idx); - T ms_out = rho_ * ms_[idx] + (1 - rho_) * g * g; - T mom_out = momentum_ * mom_[idx] + lr_[0] * g / sqrt(ms_out + epsilon_); - param_[idx] -= mom_out; + MT g = static_cast(grad_functor_(idx)); + MT l_rho = static_cast(1) - rho_; + MT ms_out = rho_ * ms_[idx] + l_rho * g * g; + MT mom_out = momentum_ * mom_[idx] + + static_cast(lr_[0]) * g / sqrt(ms_out + epsilon_); + MT p = master_p_ ? master_p_[idx] : static_cast(param_[idx]); + MT p_m = p - mom_out; + param_[idx] = static_cast(p_m); ms_[idx] = ms_out; mom_[idx] = mom_out; + if (master_p_) master_p_[idx] = p_m; } T *param_; - T *ms_; - T *mom_; - const T *lr_; - T rho_; - T epsilon_; - T momentum_; + MT *ms_; + MT *mom_; + MT *master_p_; + const MT *lr_; + MT rho_; + MT epsilon_; + MT momentum_; GradFunctor grad_functor_; }; -template +template struct CenteredRmspropFunctor { CenteredRmspropFunctor(T *param, - T *ms, - T *mom, - T *mean_grad, - const T *lr, - T rho, - T epsilon, - T momentum, + MT *ms, + MT *mom, + MT *mean_grad, + const MT *lr, + MT *master_param, + MT rho, + MT epsilon, + MT momentum, const GradFunctor &grad_functor) : param_(param), ms_(ms), mom_(mom), + master_p_(master_param), mean_grad_(mean_grad), lr_(lr), rho_(rho), @@ -116,25 +149,32 @@ struct CenteredRmspropFunctor { grad_functor_(grad_functor) {} HOSTDEVICE inline void operator()(int64_t idx) const { - T g = grad_functor_(idx); - T ms_out = rho_ * ms_[idx] + (1 - rho_) * g * g; - T mg_out = rho_ * mean_grad_[idx] + (1 - rho_) * g; - T mom_out = momentum_ * mom_[idx] + - lr_[0] * g / sqrt(ms_out - mg_out * mg_out + epsilon_); - param_[idx] -= mom_out; + MT g = static_cast(grad_functor_(idx)); + MT l_rho = static_cast(1) - rho_; + MT ms_out = rho_ * ms_[idx] + l_rho * g * g; + MT mg_out = rho_ * mean_grad_[idx] + l_rho * g; + MT mom_out = + momentum_ * mom_[idx] + + static_cast(lr_[0]) * g / sqrt(ms_out - mg_out * mg_out + epsilon_); + + MT p = master_p_ ? 
master_p_[idx] : static_cast(param_[idx]); + MT p_m = p - mom_out; + param_[idx] = static_cast(p_m); ms_[idx] = ms_out; mom_[idx] = mom_out; mean_grad_[idx] = mg_out; + if (master_p_) master_p_[idx] = p_m; } T *param_; - T *ms_; - T *mom_; - T *mean_grad_; - const T *lr_; - T rho_; - T epsilon_; - T momentum_; + MT *ms_; + MT *mom_; + MT *master_p_; + MT *mean_grad_; + const MT *lr_; + MT rho_; + MT epsilon_; + MT momentum_; GradFunctor grad_functor_; }; @@ -146,120 +186,35 @@ void RmspropDenseKernel(const Context &ctx, const DenseTensor &moment, const DenseTensor &learning_rate, const paddle::optional &mean_grad_opt, + const paddle::optional &master_param, float epsilon_t, float decay_t, float momentum_t, bool centered, + bool multi_precision, DenseTensor *param_out, DenseTensor *moment_out, DenseTensor *mean_square_out, - DenseTensor *mean_grad_out) { - auto epsilon = static_cast(epsilon_t); - auto rho = static_cast(decay_t); - auto momentum = static_cast(momentum_t); - - auto &p_tensor = param; - auto &ms_tensor = mean_square; - auto &lr_tensor = learning_rate; - auto &mom_tensor = moment; - - PADDLE_ENFORCE_EQ(p_tensor.IsSharedBufferWith(*param_out), - true, - phi::errors::InvalidArgument( - "Param and ParamOut must be the same Tensor")); - PADDLE_ENFORCE_EQ(mom_tensor.IsSharedBufferWith(*moment_out), - true, - phi::errors::InvalidArgument( - "Moment and MomentOut must be the same Tensor")); - PADDLE_ENFORCE_EQ( - ms_tensor.IsSharedBufferWith(*mean_square_out), - true, - phi::errors::InvalidArgument( - "MeanSquare and MeanSquareOut must be the same Tensor")); - size_t limit = static_cast(ms_tensor.numel()); - auto &grad_tensor = grad; - if (paddle::platform::is_cpu_place(ctx.GetPlace())) { - auto &place = *ctx.eigen_device(); - auto lr_value = lr_tensor.data()[0]; - - auto p = EigenVector::Flatten(p_tensor); - auto ms = EigenVector::Flatten(ms_tensor); - auto g = EigenVector::Flatten(grad_tensor); - auto mom = EigenVector::Flatten(mom_tensor); - - auto p_out = EigenVector::Flatten(*param_out); - auto mom_out = EigenVector::Flatten(*moment_out); - auto ms_out = EigenVector::Flatten(*mean_square_out); - - ms_out.device(place) = rho * ms + (1 - rho) * g * g; - if (centered) { - auto mg_tensor = mean_grad_opt.get_ptr(); - auto mg = EigenVector::Flatten(*mg_tensor); - if (mg_tensor) { - PADDLE_ENFORCE_EQ( - mg_tensor->Holder(), - mean_grad_out->Holder(), - phi::errors::InvalidArgument( - "MeanGrad and MeanGradOut must be the same Tensor")); - } else { - PADDLE_ENFORCE_EQ( - mg_tensor, - mean_grad_out, - phi::errors::InvalidArgument( - "MeanGrad and MeanGradOut must be the same Tensor")); - } - auto mg_out = EigenVector::Flatten(*mean_grad_out); - - mg_out.device(place) = rho * mg + (1 - rho) * g; - mom_out.device(place) = - momentum * mom + - lr_value * g / (ms_out - mg_out.square() + epsilon).sqrt(); - } else { - mom_out.device(place) = - momentum * mom + lr_value * g / (ms_out + epsilon).sqrt(); - } - p_out.device(place) = p - mom_out; - } else { - DenseRmspropGradFunctor grad_func(grad_tensor.data()); - funcs::ForRange for_range(ctx, limit); - if (centered) { - auto mg_tensor = mean_grad_opt.get_ptr(); - if (mg_tensor) { - PADDLE_ENFORCE_EQ( - mg_tensor->Holder(), - mean_grad_out->Holder(), - phi::errors::InvalidArgument( - "MeanGrad and MeanGradOut must be the same Tensor")); - } else { - PADDLE_ENFORCE_EQ( - mg_tensor, - mean_grad_out, - phi::errors::InvalidArgument( - "MeanGrad and MeanGradOut must be the same Tensor")); - } - - for_range(CenteredRmspropFunctor>( - ctx.template 
Alloc(param_out), - ctx.template Alloc(mean_square_out), - ctx.template Alloc(moment_out), - ctx.template Alloc(mean_grad_out), - lr_tensor.data(), - rho, - epsilon, - momentum, - grad_func)); - } else { - for_range(UncenteredRmspropFunctor>( - ctx.template Alloc(param_out), - ctx.template Alloc(mean_square_out), - ctx.template Alloc(moment_out), - lr_tensor.data(), - rho, - epsilon, - momentum, - grad_func)); - } - } + DenseTensor *mean_grad_out, + DenseTensor *master_param_outs) { + RmsFunctor functor(ctx, + param, + mean_square, + grad, + moment, + learning_rate, + mean_grad_opt, + master_param, + epsilon_t, + decay_t, + momentum_t, + centered, + multi_precision, + param_out, + moment_out, + mean_square_out, + mean_grad_out, + master_param_outs); } template @@ -270,17 +225,21 @@ void RmspropSparseKernel(const Context &ctx, const DenseTensor &moment, const DenseTensor &learning_rate, const paddle::optional &mean_grad_opt, + const paddle::optional &master_param, float epsilon_t, float decay_t, float momentum_t, bool centered, + bool multi_precision, DenseTensor *param_out, DenseTensor *moment_out, DenseTensor *mean_square_out, - DenseTensor *mean_grad_out) { - auto epsilon = static_cast(epsilon_t); - auto rho = static_cast(decay_t); - auto momentum = static_cast(momentum_t); + DenseTensor *mean_grad_out, + DenseTensor *master_param_outs) { + using MPDType = typename phi::dtype::MPTypeTrait::Type; + auto epsilon = static_cast(epsilon_t); + auto rho = static_cast(decay_t); + auto momentum = static_cast(momentum_t); auto &p_tensor = param; auto &ms_tensor = mean_square; @@ -318,6 +277,10 @@ void RmspropSparseKernel(const Context &ctx, SparseRmspropGradFunctor grad_func( merged_tensor.data(), rows, row_numel, row_count); + MPDType *master_out_data = + multi_precision ? 
ctx.template Alloc(master_param_outs) + : nullptr; + if (centered) { auto mg_tensor = mean_grad_opt.get_ptr(); if (mg_tensor) { @@ -334,22 +297,24 @@ void RmspropSparseKernel(const Context &ctx, "MeanGrad and MeanGradOut must be the same Tensor")); } - for_range(CenteredRmspropFunctor>( + for_range(CenteredRmspropFunctor>( ctx.template Alloc(param_out), - ctx.template Alloc(mean_square_out), - ctx.template Alloc(moment_out), - ctx.template Alloc(mean_grad_out), - lr_tensor.data(), + ctx.template Alloc(mean_square_out), + ctx.template Alloc(moment_out), + ctx.template Alloc(mean_grad_out), + lr_tensor.data(), + master_out_data, rho, epsilon, momentum, grad_func)); } else { - for_range(UncenteredRmspropFunctor>( + for_range(UncenteredRmspropFunctor>( ctx.template Alloc(param_out), - ctx.template Alloc(mean_square_out), - ctx.template Alloc(moment_out), - lr_tensor.data(), + ctx.template Alloc(mean_square_out), + ctx.template Alloc(moment_out), + lr_tensor.data(), + master_out_data, rho, epsilon, momentum, diff --git a/paddle/phi/kernels/rmsprop_kernel.h b/paddle/phi/kernels/rmsprop_kernel.h index fba2095cc8b..524c4c904a6 100644 --- a/paddle/phi/kernels/rmsprop_kernel.h +++ b/paddle/phi/kernels/rmsprop_kernel.h @@ -27,14 +27,17 @@ void RmspropDenseKernel(const Context& dev_ctx, const DenseTensor& moment, const DenseTensor& learning_rate, const paddle::optional& mean_grad, + const paddle::optional& master_param, float epsilon, float decay, float momentum, bool centered, + bool multi_precision, DenseTensor* param_out, DenseTensor* moment_out, DenseTensor* mean_square_out, - DenseTensor* mean_grad_out); + DenseTensor* mean_grad_out, + DenseTensor* master_param_outs); template void RmspropSparseKernel(const Context& dev_ctx, @@ -44,13 +47,16 @@ void RmspropSparseKernel(const Context& dev_ctx, const DenseTensor& moment, const DenseTensor& learning_rate, const paddle::optional& mean_grad, + const paddle::optional& master_param, float epsilon, float decay, float momentum, bool centered, + bool multi_precision, DenseTensor* param_out, DenseTensor* moment_out, DenseTensor* mean_square_out, - DenseTensor* mean_grad_out); + DenseTensor* mean_grad_out, + DenseTensor* master_param_outs); } // namespace phi diff --git a/paddle/phi/kernels/xpu/rmsprop_kernel.cc b/paddle/phi/kernels/xpu/rmsprop_kernel.cc index ddf02313701..d913134626c 100644 --- a/paddle/phi/kernels/xpu/rmsprop_kernel.cc +++ b/paddle/phi/kernels/xpu/rmsprop_kernel.cc @@ -29,14 +29,17 @@ void RmspropDenseKernel(const Context& dev_ctx, const DenseTensor& moment, const DenseTensor& learning_rate, const paddle::optional& mean_grad, + const paddle::optional& master_param, float epsilon, float decay, float momentum, bool centered, + bool multi_precision, DenseTensor* param_out, DenseTensor* moment_out, DenseTensor* mean_square_out, - DenseTensor* mean_grad_out) { + DenseTensor* mean_grad_out, + DenseTensor* master_param_outs) { // copy learning_rate to cpu PADDLE_ENFORCE_EQ( learning_rate.dims().size(), diff --git a/paddle/phi/ops/compat/rmsprop_sig.cc b/paddle/phi/ops/compat/rmsprop_sig.cc index 74def7d0b6a..b0027279fe6 100644 --- a/paddle/phi/ops/compat/rmsprop_sig.cc +++ b/paddle/phi/ops/compat/rmsprop_sig.cc @@ -20,15 +20,35 @@ KernelSignature RmspropOpArgumentMapping(const ArgumentMappingContext& ctx) { if (ctx.IsDenseTensorInput("Grad")) { return KernelSignature( "rmsprop", - {"Param", "MeanSquare", "Grad", "Moment", "LearningRate", "MeanGrad"}, - {"epsilon", "decay", "momentum", "centered"}, - {"ParamOut", "MomentOut", "MeanSquareOut", 
"MeanGradOut"}); + {"Param", + "MeanSquare", + "Grad", + "Moment", + "LearningRate", + "MeanGrad", + "MasterParam"}, + {"epsilon", "decay", "momentum", "centered", "multi_precision"}, + {"ParamOut", + "MomentOut", + "MeanSquareOut", + "MeanGradOut", + "MasterParamOut"}); } else if (ctx.IsSelectedRowsInput("Grad")) { return KernelSignature( "rmsprop_dense_param_sparse_grad", - {"Param", "MeanSquare", "Grad", "Moment", "LearningRate", "MeanGrad"}, - {"epsilon", "decay", "momentum", "centered"}, - {"ParamOut", "MomentOut", "MeanSquareOut", "MeanGradOut"}); + {"Param", + "MeanSquare", + "Grad", + "Moment", + "LearningRate", + "MeanGrad", + "MasterParam"}, + {"epsilon", "decay", "momentum", "centered", "multi_precision"}, + {"ParamOut", + "MomentOut", + "MeanSquareOut", + "MeanGradOut", + "MasterParamOut"}); } return KernelSignature("unregistered", {}, {}, {}); diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 02f66b377b7..e8864e23198 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -3287,12 +3287,84 @@ class RMSPropOptimizer(Optimizer): self._epsilon = epsilon self._momentum = momentum self._centered = centered + self._multi_precision = False + self._master_weights = {} + + def _create_master_weight(self, param): + if param.name in self._master_weights: + var = self._master_weights[param.name] + else: + assert isinstance(self.helper, LayerHelper) + + var_name = param.name + '_fp32_master' + var_name = unique_name.generate(var_name) + var = paddle.static.create_global_var( + name=var_name, + shape=param.shape, + value=0, + dtype='float32', + persistable=True, + ) + block = self.helper.startup_program.global_block() + block.append_op( + type="cast", + inputs={"X": [param]}, + outputs={"Out": [var]}, + attrs={ + "in_dtype": param.dtype, + "out_dtype": core.VarDesc.VarType.FP32, + }, + ) + self._master_weights[param.name] = var + return var + + def _get_accumulator(self, name, param): + """Utility function to fetch an accumulator for a parameter + Args: + name: name of the accumulator + param: parameter variable for which accumulator is to be fetched + Returns: + accumulator variable for the parameter + """ + if self._name is not None: + name = self._name + "_" + name + find_master = ( + self._multi_precision and param.dtype == core.VarDesc.VarType.FP16 + ) + target_param = ( + self._master_weights[param.name] if find_master else param + ) + target_name = target_param.name + if ( + name not in self._accumulators + or target_name not in self._accumulators[name] + ): + raise Exception( + "Accumulator {} does not exist for parameter {}".format( + name, target_name + ) + ) + return self._accumulators[name][target_name] def _create_accumulators(self, block, parameters): if not isinstance(block, framework.Block): raise TypeError("block is not instance of framework.Block.") for p in parameters: + if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16: + master_p = self._create_master_weight(p) + self._add_accumulator(self._momentum_acc_str, master_p) + self._add_accumulator(self._mean_square_acc_str, master_p) + self._add_accumulator(self._mean_grad_acc_str, master_p) + continue + if ( + p.dtype == core.VarDesc.VarType.FP16 + and not self._multi_precision + ): + warnings.warn( + "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence." + "Consider using multi_precision=True option of the Lars optimizer." 
+ ) self._add_accumulator(self._momentum_acc_str, p) self._add_accumulator(self._mean_square_acc_str, p) self._add_accumulator(self._mean_grad_acc_str, p) @@ -3310,6 +3382,15 @@ class RMSPropOptimizer(Optimizer): mean_grad_acc = self._get_accumulator( self._mean_grad_acc_str, param_and_grad[0] ) + find_master = ( + self._multi_precision + and param_and_grad[0].dtype == core.VarDesc.VarType.FP16 + ) + master_weight = ( + self._master_weights[param_and_grad[0].name] + if find_master + else None + ) if in_dygraph_mode(): _C_ops.rmsprop_( param_and_grad[0], @@ -3318,34 +3399,45 @@ class RMSPropOptimizer(Optimizer): momentum_acc, self._create_param_lr(param_and_grad), mean_grad_acc, + master_weight, self._epsilon, self._rho, self._momentum, self._centered, + find_master, ) return None else: + inputs = { + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "Moment": momentum_acc, + "MeanSquare": mean_square_acc, + "MeanGrad": mean_grad_acc, + "LearningRate": self._create_param_lr(param_and_grad), + } + + outputs = { + "ParamOut": param_and_grad[0], + "MomentOut": momentum_acc, + "MeanSquareOut": mean_square_acc, + "MeanGradOut": mean_grad_acc, + } + + if find_master: + inputs["MasterParam"] = master_weight + outputs["MasterParamOut"] = master_weight + rmsprop_op = block.append_op( type=self.type, - inputs={ - "Param": param_and_grad[0], - "Grad": param_and_grad[1], - "Moment": momentum_acc, - "MeanSquare": mean_square_acc, - "MeanGrad": mean_grad_acc, - "LearningRate": self._create_param_lr(param_and_grad), - }, - outputs={ - "ParamOut": param_and_grad[0], - "MomentOut": momentum_acc, - "MeanSquareOut": mean_square_acc, - "MeanGradOut": mean_grad_acc, - }, + inputs=inputs, + outputs=outputs, attrs={ "epsilon": self._epsilon, "decay": self._rho, "momentum": self._momentum, "centered": self._centered, + "multi_precision": find_master, }, stop_gradient=True, ) diff --git a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py index 63b22fb9fc9..8363f3b934b 100644 --- a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py +++ b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py @@ -356,6 +356,280 @@ class TestRMSPropV2Group(TestRMSPropV2): adam.clear_gradients() +class TestRMSOpMultiPrecison(unittest.TestCase): + def _test_rms_op_dygraph_place_amp(self, place, use_amp=False): + import paddle + + paddle.disable_static() + paddle.seed(10) + paddle.set_device(place) + + input = paddle.randn((5, 5)) + + model = paddle.nn.Linear(5, 5) + + optimizer = paddle.optimizer.RMSProp( + learning_rate=0.01, + parameters=model.parameters(), + weight_decay=0.01, + ) + optimizer._multi_precision = use_amp + for idx in range(2): + if place == 'gpu' and use_amp: + model = paddle.amp.decorate(models=model, level='O2') + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + + if place == 'gpu' and use_amp: + with paddle.amp.auto_cast(level='O2'): + output = model(input) + loss = paddle.mean(output) + scaled = scaler.scale(loss) + scaled.backward() + scaler.step(optimizer) + optimizer.clear_grad() + else: + output = model(input) + loss = paddle.mean(output) + loss.backward() + optimizer.step() + optimizer.clear_grad() + paddle.enable_static() + + def _get_places(self): + import paddle + + places = ['cpu'] + if paddle.is_compiled_with_cuda(): + places.append('gpu') + return places + + def test_main(self): + for place in self._get_places(): + use_amp_list = [True, False] + for use_amp in use_amp_list: + self._test_rms_op_dygraph_place_amp(place, 
use_amp) + + +class TestRMSPropMultiPrecision2_0(unittest.TestCase): + def dygraph_rmsprop_mp(self, mp, use_amp): + paddle.disable_static() + paddle.seed(100) + paddle.set_device('gpu') + input = paddle.randn((2, 2)) + model = paddle.nn.Linear(2, 2) + optimizer = paddle.optimizer.RMSProp(0.5, parameters=model.parameters()) + optimizer._multi_precision = mp + if use_amp: + model = paddle.amp.decorate(models=model, level='O2') + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + + for idx in range(5): + if use_amp: + with paddle.amp.auto_cast(level='O2'): + output = model(input) + loss = paddle.mean(output) + scaled = scaler.scale(loss) + scaled.backward() + scaler.minimize(optimizer, scaled) + optimizer.clear_grad() + else: + output = model(input) + loss = paddle.mean(output) + loss.backward() + optimizer.step() + optimizer.clear_grad() + + return output, model.parameters() + + def static_rmsprop_mp(self, mp, use_amp): + paddle.enable_static() + paddle.seed(100) + np.random.seed(100) + exe = paddle.static.Executor('gpu') + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + optimizer = paddle.optimizer.RMSProp(0.1) + optimizer._multi_precision = mp + + if use_amp: + optimizer = paddle.static.amp.decorate( + optimizer, + init_loss_scaling=128.0, + use_dynamic_loss_scaling=True, + use_pure_fp16=True, + use_fp16_guard=False, + ) + with paddle.static.program_guard(train_program, startup_program): + if use_amp: + data = paddle.static.data( + shape=[2, 2], name='X', dtype='float16' + ) + else: + data = paddle.static.data( + shape=[2, 2], name='X', dtype='float32' + ) + hidden = paddle.static.nn.fc(x=data, size=10) + loss = paddle.mean(hidden) + optimizer.minimize(loss) + exe.run(startup_program) + + if use_amp: + optimizer.amp_init(place='gpu', scope=paddle.static.global_scope()) + x = np.random.random(size=(2, 2)).astype('float16') + else: + x = np.random.random(size=(2, 2)).astype('float32') + out = [] + for idx in range(5): + (loss_data,) = exe.run( + train_program, feed={"X": x}, fetch_list=[loss.name] + ) + out.append(loss_data) + return out + + def test_main(self): + if not paddle.is_compiled_with_cuda(): + return + "Test dygraph mode" + output1_dy, params1_dy = self.dygraph_rmsprop_mp(use_amp=True, mp=True) + output2_dy, params2_dy = self.dygraph_rmsprop_mp( + use_amp=False, mp=False + ) + np.testing.assert_allclose( + output1_dy.astype('float32').numpy(), + output2_dy.astype('float32').numpy(), + rtol=1e-05, + atol=0.1, + ) + for idx in range(len(params1_dy)): + np.testing.assert_allclose( + params1_dy[idx].astype('float32').numpy(), + params2_dy[idx].astype('float32').numpy(), + rtol=1e-05, + atol=0.1, + ) + "Test static mode" + output1_st = self.static_rmsprop_mp(use_amp=True, mp=True) + output2_st = self.static_rmsprop_mp(use_amp=False, mp=False) + for idx in range(len(output1_st)): + np.testing.assert_allclose( + output1_st[idx].astype('float32'), + output2_st[idx].astype('float32'), + rtol=1e-05, + atol=0.1, + ) + + +class TestRMSPropMultiPrecision1_0(unittest.TestCase): + def dygraph_rmsprop_mp(self, use_amp, mp): + paddle.disable_static() + paddle.seed(10) + paddle.set_device('gpu') + input = paddle.randn((2, 2)) + model = paddle.nn.Linear(2, 2) + optimizer = paddle.fluid.optimizer.RMSProp( + learning_rate=0.001, + parameter_list=model.parameters(), + ) + optimizer._multi_precision = mp + if use_amp: + model = paddle.amp.decorate(models=model, level='O2') + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + + for idx in range(5): + if 
use_amp: + with paddle.amp.auto_cast(level='O2'): + output = model(input) + loss = paddle.mean(output) + scaled = scaler.scale(loss) + scaled.backward() + scaler.minimize(optimizer, scaled) + optimizer.clear_gradients() + else: + output = model(input) + loss = paddle.mean(output) + optimizer.minimize(loss) + optimizer.clear_gradients() + + return output, model.parameters() + + def static_rmsprop_mp(self, use_amp, mp): + paddle.enable_static() + paddle.seed(100) + np.random.seed(100) + exe = paddle.static.Executor('gpu') + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + optimizer = paddle.fluid.optimizer.RMSProp(learning_rate=0.001) + optimizer._multi_precision = mp + + if use_amp: + optimizer = paddle.static.amp.decorate( + optimizer, + init_loss_scaling=128.0, + use_dynamic_loss_scaling=True, + use_pure_fp16=True, + use_fp16_guard=False, + ) + with paddle.static.program_guard(train_program, startup_program): + if use_amp: + data = paddle.static.data( + shape=[2, 2], name='X', dtype='float16' + ) + else: + data = paddle.static.data( + shape=[2, 2], name='X', dtype='float32' + ) + hidden = paddle.static.nn.fc(x=data, size=10) + loss = paddle.mean(hidden) + optimizer.minimize(loss) + exe.run(startup_program) + + if use_amp: + optimizer.amp_init(place='gpu', scope=paddle.static.global_scope()) + x = np.random.random(size=(2, 2)).astype('float16') + else: + x = np.random.random(size=(2, 2)).astype('float32') + out = [] + for idx in range(5): + (loss_data,) = exe.run( + train_program, feed={"X": x}, fetch_list=[loss.name] + ) + out.append(loss_data) + return out + + def test_main(self): + if not paddle.is_compiled_with_cuda(): + return + "Test dygraph mode" + output1_dy, params1_dy = self.dygraph_rmsprop_mp(use_amp=True, mp=True) + output2_dy, params2_dy = self.dygraph_rmsprop_mp( + use_amp=False, mp=False + ) + np.testing.assert_allclose( + output1_dy.astype('float32').numpy(), + output2_dy.astype('float32').numpy(), + rtol=1e-05, + atol=0.1, + ) + for idx in range(len(params1_dy)): + np.testing.assert_allclose( + params1_dy[idx].astype('float32').numpy(), + params2_dy[idx].astype('float32').numpy(), + rtol=1e-05, + atol=0.1, + ) + "Test static mode" + output1_st = self.static_rmsprop_mp(use_amp=True, mp=True) + output2_st = self.static_rmsprop_mp(use_amp=False, mp=False) + for idx in range(len(output1_st)): + np.testing.assert_allclose( + output1_st[idx].astype('float32'), + output2_st[idx].astype('float32'), + rtol=1e-05, + atol=0.1, + ) + + if __name__ == "__main__": paddle.enable_static() unittest.main() diff --git a/python/paddle/optimizer/rmsprop.py b/python/paddle/optimizer/rmsprop.py index 855082eae5f..65a827631d4 100644 --- a/python/paddle/optimizer/rmsprop.py +++ b/python/paddle/optimizer/rmsprop.py @@ -12,10 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import warnings + +import paddle from paddle import _C_ops -from ..fluid import framework +from ..fluid import core, framework, unique_name from ..fluid.framework import in_dygraph_mode +from ..fluid.layer_helper import LayerHelper from .optimizer import Optimizer __all__ = [] @@ -184,6 +188,8 @@ class RMSProp(Optimizer): self._epsilon = epsilon self._momentum = momentum self._centered = centered + self._multi_precision = False + self._master_weights = {} self._default_dict = { 'rho': rho, 'epsilon': epsilon, @@ -191,6 +197,62 @@ class RMSProp(Optimizer): 'centered': centered, } + def _create_master_weight(self, param): + if param.name in self._master_weights: + var = self._master_weights[param.name] + else: + assert isinstance(self.helper, LayerHelper) + + var_name = param.name + "_fp32_master" + var_name = unique_name.generate(var_name) + var = paddle.static.create_global_var( + name=var_name, + shape=param.shape, + value=0, + dtype='float32', + persistable=True, + ) + block = self.helper.startup_program.global_block() + block.append_op( + type="cast", + inputs={"X": [param]}, + outputs={"Out": [var]}, + attrs={ + "in_dtype": param.dtype, + "out_dtype": core.VarDesc.VarType.FP32, + }, + ) + self._master_weights[param.name] = var + return var + + def _get_accumulator(self, name, param): + """Utility function to fetch an accumulator for a parameter + Args: + name: name of the accumulator + param: parameter variable for which accumulator is to be fetched + Returns: + accumulator variable for the parameter + """ + if self._name is not None: + name = self._name + "_" + name + find_master = ( + self._multi_precision and param.dtype == core.VarDesc.VarType.FP16 + ) + target_param = ( + self._master_weights[param.name] if find_master else param + ) + target_name = target_param.name + if ( + name not in self._accumulators + or target_name not in self._accumulators[name] + ): + raise Exception( + "Accumulator {} does not exist for parameter {}".format( + name, target_name + ) + ) + return self._accumulators[name][target_name] + def _create_accumulators(self, block, parameters): if not isinstance(block, framework.Block): raise TypeError("block is not instance of framework.Block.") @@ -199,6 +261,20 @@ class RMSProp(Optimizer): parameters = parameters.get('params') for p in parameters: + if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16: + master_p = self._create_master_weight(p) + self._add_accumulator(self._momentum_acc_str, master_p) + self._add_accumulator(self._mean_square_acc_str, master_p) + self._add_accumulator(self._mean_grad_acc_str, master_p) + continue + if ( + p.dtype == core.VarDesc.VarType.FP16 + and not self._multi_precision + ): + warnings.warn( + "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence." + "Consider using multi_precision=True option of the Lars optimizer." 
+ ) self._add_accumulator(self._momentum_acc_str, p) self._add_accumulator(self._mean_square_acc_str, p) self._add_accumulator(self._mean_grad_acc_str, p) @@ -219,6 +295,15 @@ class RMSProp(Optimizer): mean_grad_acc = self._get_accumulator( self._mean_grad_acc_str, param_and_grad[0] ) + find_master = ( + self._multi_precision + and param_and_grad[0].dtype == core.VarDesc.VarType.FP16 + ) + master_weight = ( + self._master_weights[param_and_grad[0].name] + if find_master + else None + ) if in_dygraph_mode(): _C_ops.rmsprop_( @@ -228,29 +313,38 @@ class RMSProp(Optimizer): momentum_acc, self._create_param_lr(param_and_grad), mean_grad_acc, + master_weight, self._epsilon, self._rho, self._momentum, self._centered, + find_master, ) return None else: + inputs = { + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "Moment": momentum_acc, + "MeanSquare": mean_square_acc, + "MeanGrad": mean_grad_acc, + "LearningRate": self._create_param_lr(param_and_grad), + } + + outputs = { + "ParamOut": param_and_grad[0], + "MomentOut": momentum_acc, + "MeanSquareOut": mean_square_acc, + "MeanGradOut": mean_grad_acc, + } + + if find_master: + inputs["MasterParam"] = master_weight + outputs["MasterParamOut"] = master_weight rmsprop_op = block.append_op( type=self.type, - inputs={ - "Param": param_and_grad[0], - "Grad": param_and_grad[1], - "Moment": momentum_acc, - "MeanSquare": mean_square_acc, - "MeanGrad": mean_grad_acc, - "LearningRate": self._create_param_lr(param_and_grad), - }, - outputs={ - "ParamOut": param_and_grad[0], - "MomentOut": momentum_acc, - "MeanSquareOut": mean_square_acc, - "MeanGradOut": mean_grad_acc, - }, + inputs=inputs, + outputs=outputs, attrs={ "epsilon": self._epsilon, "decay": self._rho, -- GitLab
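The GPU and sparse functors in this patch (`UncenteredRmspropFunctor` / `CenteredRmspropFunctor`) now carry two types: the stored parameter type `T` and the compute type `MT` (`phi::dtype::MPTypeTrait<T>::Type`). The gradient is cast up to `MT`, the moment, mean-square, mean-grad accumulators and the master weight stay in `MT`, and only the final parameter value is cast back down to `T`. The following NumPy sketch restates that per-element step for illustration only; the function name, signature, and defaults are not part of the patch.

```python
import numpy as np

def rmsprop_step_mp(param_fp16, grad_fp16, ms, mom, master_param,
                    lr, rho=0.95, epsilon=1e-6, momentum=0.0,
                    centered=False, mean_grad=None):
    """Sketch of one multi-precision RMSProp step.

    param_fp16 / grad_fp16 are float16; ms, mom, mean_grad and
    master_param are float32, mirroring MPDType in the kernels.
    """
    # Gradient is promoted to the compute type before any arithmetic.
    g = grad_fp16.astype(np.float32)
    ms_out = rho * ms + (1.0 - rho) * g * g
    if centered:
        mg_out = rho * mean_grad + (1.0 - rho) * g
        denom = np.sqrt(ms_out - mg_out * mg_out + epsilon)
    else:
        mg_out = None
        denom = np.sqrt(ms_out + epsilon)
    mom_out = momentum * mom + lr * g / denom
    # The update is applied to the float32 master copy, then cast back
    # to float16 for the low-precision parameter, as the functors do.
    master_out = master_param - mom_out
    param_out = master_out.astype(np.float16)
    return param_out, ms_out, mom_out, mg_out, master_out
```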
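On the Python side, the new path is exercised the same way the added unit tests do: decorate the model with AMP level 'O2' so its parameters become float16, and enable multi-precision on the optimizer so `_create_accumulators` builds FP32 master weights and the appended op receives `MasterParam` / `MasterParamOut`. A hedged usage sketch follows (GPU assumed; the patch exposes the switch only through the private `_multi_precision` attribute, exactly as the tests set it, and adds no public constructor argument for it).

```python
import paddle

paddle.set_device('gpu')
model = paddle.nn.Linear(5, 5)
opt = paddle.optimizer.RMSProp(learning_rate=0.01,
                               parameters=model.parameters())
opt._multi_precision = True  # private flag, as used in the new tests

# O2 decoration casts the model parameters to float16.
model = paddle.amp.decorate(models=model, level='O2')
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

x = paddle.randn((5, 5))
with paddle.amp.auto_cast(level='O2'):
    loss = paddle.mean(model(x))
scaler.scale(loss).backward()
scaler.step(opt)      # rmsprop_ is called with master_param / multi_precision
scaler.update()
opt.clear_grad()
```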