Unverified · Commit 48060b2e authored by niuliling123 and committed by GitHub

Add multiprecision for rms op (#50132)

Parent 798b527c
@@ -38,6 +38,7 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker {
             "(Tensor, default Tensor<float>)"
             " The moving average of gradient")
        .AsDispensable();
    AddInput("LearningRate",
             "(Tensor, default Tensor<float>) "
             "The learning rate should be a tensor of size 1.");
@@ -46,12 +47,17 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker {
             "Input gradient of the parameter.");
    AddInput("Moment",
             "(Tensor, default Tensor<float>) The moment that gets updated.");
    AddInput("MasterParam", "FP32 master weight for AMP.").AsDispensable();
    AddOutput("ParamOut", "(Tensor) Output updated parameter value.");
    AddOutput("MomentOut", "(Tensor) Output updated moment.");
    AddOutput("MeanSquareOut", "(Tensor) Output Mean squared updated value.");
    AddOutput("MeanGradOut",
              "(Tensor) Output moving average of gradient updated value.");
    AddOutput("MasterParamOut",
              "The updated FP32 master weight for AMP. "
              "It shares memory with Input(MasterParam).")
        .AsDispensable();
    AddAttr<float>("epsilon",
                   "(float, default 1e-10) Constant "
@@ -65,6 +71,10 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker {
        .SetDefault(0.0f);
    AddAttr<bool>("centered", "(bool, default false) use centered rmsprop.")
        .SetDefault(false);
    AddAttr<bool>("multi_precision",
                  "(bool, default false) "
                  "Whether to use multi-precision during weight updating.")
        .SetDefault(false);
    AddComment(R"DOC(
Rmsprop Optimizer.
......
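For readers unfamiliar with the AMP master-weight pattern that the new MasterParam / MasterParamOut pair and the multi_precision attribute enable, here is a minimal NumPy sketch of the idea. It is illustrative only (the function name and default hyperparameters are made up); the real update is performed by the phi kernels further down. The FP16 parameter keeps an FP32 shadow copy, the statistics and the update are computed in FP32, and the result is cast back to FP16 for the forward pass.

import numpy as np

def rmsprop_step_with_master_weight(param_fp16, master_fp32, mean_square,
                                    moment, grad_fp16, lr=0.01, rho=0.95,
                                    epsilon=1e-10, momentum=0.0):
    # Hypothetical illustration of the master-weight idea, not the real kernel.
    g = grad_fp16.astype(np.float32)                    # compute in FP32
    mean_square[:] = rho * mean_square + (1 - rho) * g * g
    moment[:] = momentum * moment + lr * g / np.sqrt(mean_square + epsilon)
    master_fp32[:] = master_fp32 - moment               # update the FP32 master copy
    param_fp16[:] = master_fp32.astype(np.float16)      # write back the FP16 view
    return param_fp16, master_fp32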
@@ -148,6 +148,14 @@ std::map<std::string, std::set<std::string>> op_ins_map = {
      "Ln2Bias"}},
    {"faster_tokenizer", {"Text", "Vocab", "TextPair"}},
    {"matrix_rank", {"X", "TolTensor"}},
    {"rmsprop",
     {"Param",
      "MeanSquare",
      "Grad",
      "Moment",
      "LearningRate",
      "MeanGrad",
      "MasterParam"}},
    {"adam",
     {"Param",
      "Grad",
@@ -311,6 +319,12 @@ std::map<std::string, std::set<std::string>> op_outs_map = {
     {"MultiFpnRois", "RestoreIndex", "MultiLevelRoIsNum"}},
    {"moving_average_abs_max_scale",
     {"Out", "OutScale", "OutAccum", "OutState"}},
    {"rmsprop",
     {"ParamOut",
      "MomentOut",
      "MeanSquareOut",
      "MeanGradOut",
      "MasterParamOut"}},
    {"multiclass_nms3", {"Out", "NmsRoisNum"}},
    {"generate_proposals_v2", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}},
    {"momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}},
@@ -377,7 +391,12 @@ std::map<std::string, std::set<std::string>> op_outs_map = {
// For those OPs, we need to manually specify the outs need to pass in this map.
std::map<std::string, std::set<std::string>> op_passing_outs_map = {
    {"sgd", {"ParamOut", "MasterParamOut"}},
-   {"rmsprop", {"ParamOut", "MomentOut", "MeanSquareOut", "MeanGradOut"}},
    {"rmsprop",
     {"ParamOut",
      "MomentOut",
      "MeanSquareOut",
      "MeanGradOut",
      "MasterParamOut"}},
    {"ftrl", {"ParamOut", "SquaredAccumOut", "LinearAccumOut"}},
    {"adadelta", {"ParamOut", "AvgSquaredGradOut", "AvgSquaredUpdateOut"}},
    {"adagrad", {"ParamOut", "MomentOut"}},
......
@@ -1459,15 +1459,16 @@
  backward : reverse_grad

- op : rmsprop_
-  args : (Tensor param, Tensor mean_square, Tensor grad, Tensor moment, Tensor learning_rate, Tensor mean_grad, float epsilon, float decay, float momentum, bool centered)
  args : (Tensor param, Tensor mean_square, Tensor grad, Tensor moment, Tensor learning_rate, Tensor mean_grad, Tensor master_param, float epsilon, float decay, float momentum, bool centered, bool multi_precision)
-  output : Tensor(param_out), Tensor(moment_out), Tensor(mean_square_out), Tensor(mean_grad_out)
  output : Tensor(param_out), Tensor(moment_out), Tensor(mean_square_out), Tensor(mean_grad_out), Tensor(master_param_out)
  infer_meta :
    func : RmspropInferMeta
  kernel :
-    func : rmsprop {dense, dense, dense, dense, dense, dense -> dense, dense, dense, dense}
-           rmsprop_dense_param_sparse_grad {dense, dense, selected_rows, dense, dense, dense -> dense, dense, dense, dense}
    func : rmsprop {dense, dense, dense, dense, dense, dense, dense -> dense, dense, dense, dense, dense}
           rmsprop_dense_param_sparse_grad {dense, dense, selected_rows, dense, dense, dense, dense -> dense, dense, dense, dense, dense}
  data_type : param
-  optional : mean_grad
  optional : mean_grad, master_param
-  inplace : (param -> param_out), (moment -> moment_out), (mean_square -> mean_square_out), (mean_grad -> mean_grad_out)
  inplace : (param -> param_out), (moment -> moment_out), (mean_square -> mean_square_out), (mean_grad -> mean_grad_out), (master_param -> master_param_out)

- op : rnn
  args: (Tensor x, Tensor[] pre_state, Tensor[] weight_list, Tensor sequence_length, Tensor dropout_state_in, float dropout_prob=0.0, bool is_bidirec=false, int input_size=10, int hidden_size=100, int num_layers=1, str mode="RNN_TANH", int seed=0, bool is_test=false)
......
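Given the extended args list above, the eager API now takes the master weight and the multi_precision flag positionally. A hedged sketch of how a caller is expected to invoke it in dygraph (it mirrors the optimizer changes later in this diff; the wrapper function itself is illustrative):

from paddle import _C_ops

def apply_rmsprop_inplace(param, mean_square, grad, moment, lr, mean_grad,
                          master_weight=None, epsilon=1e-6, rho=0.95,
                          momentum=0.0, centered=False):
    # master_weight may be None; multi_precision is derived from its presence.
    _C_ops.rmsprop_(param, mean_square, grad, moment, lr, mean_grad,
                    master_weight, epsilon, rho, momentum, centered,
                    master_weight is not None)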
@@ -2313,14 +2313,17 @@ void RmspropInferMeta(const MetaTensor& param,
                      const MetaTensor& moment,
                      const MetaTensor& learning_rate,
                      const MetaTensor& mean_grad,
                      const MetaTensor& master_param,
                      float epsilon,
                      float decay,
                      float momentum,
                      bool centered,
                      bool multi_precision,
                      MetaTensor* param_out,
                      MetaTensor* moment_out,
                      MetaTensor* mean_square_out,
-                     MetaTensor* mean_grad_out) {
                      MetaTensor* mean_grad_out,
                      MetaTensor* master_param_outs) {
  if (centered) {
    PADDLE_ENFORCE_NOT_NULL(
        mean_grad_out,
......
@@ -421,14 +421,17 @@ void RmspropInferMeta(const MetaTensor& param,
                      const MetaTensor& moment,
                      const MetaTensor& learning_rate,
                      const MetaTensor& mean_grad,
                      const MetaTensor& master_param,
                      float epsilon,
                      float decay,
                      float momentum,
                      bool centered,
                      bool multi_precision,
                      MetaTensor* param_out,
                      MetaTensor* moment_out,
                      MetaTensor* mean_square_out,
-                     MetaTensor* mean_grad_out);
                      MetaTensor* mean_grad_out,
                      MetaTensor* master_param_outs);

void RnnInferMeta(const MetaTensor& x,
                  const std::vector<const MetaTensor*>& pre_state,
......
@@ -17,7 +17,99 @@
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/rmsprop_kernel_impl.h"
namespace phi {
template <typename T>
struct RmsFunctor<T, phi::CPUContext> {
RmsFunctor(const phi::CPUContext &ctx,
const DenseTensor &param,
const DenseTensor &mean_square,
const DenseTensor &grad,
const DenseTensor &moment,
const DenseTensor &learning_rate,
const paddle::optional<DenseTensor> &mean_grad_opt,
const paddle::optional<DenseTensor> &master_param,
float epsilon_t,
float decay_t,
float momentum_t,
bool centered,
bool multi_precision,
DenseTensor *param_out,
DenseTensor *moment_out,
DenseTensor *mean_square_out,
DenseTensor *mean_grad_out,
DenseTensor *master_param_outs) {
auto epsilon = static_cast<T>(epsilon_t);
auto rho = static_cast<T>(decay_t);
auto momentum = static_cast<T>(momentum_t);
auto &p_tensor = param;
auto &ms_tensor = mean_square;
auto &lr_tensor = learning_rate;
auto &mom_tensor = moment;
PADDLE_ENFORCE_EQ(p_tensor.IsSharedBufferWith(*param_out),
true,
phi::errors::InvalidArgument(
"Param and ParamOut must be the same Tensor"));
PADDLE_ENFORCE_EQ(mom_tensor.IsSharedBufferWith(*moment_out),
true,
phi::errors::InvalidArgument(
"Moment and MomentOut must be the same Tensor"));
PADDLE_ENFORCE_EQ(
ms_tensor.IsSharedBufferWith(*mean_square_out),
true,
phi::errors::InvalidArgument(
"MeanSquare and MeanSquareOut must be the same Tensor"));
auto &grad_tensor = grad;
auto &place = *ctx.eigen_device();
auto lr_value = lr_tensor.data<T>()[0];
auto p = EigenVector<T>::Flatten(p_tensor);
auto ms = EigenVector<T>::Flatten(ms_tensor);
auto g = EigenVector<T>::Flatten(grad_tensor);
auto mom = EigenVector<T>::Flatten(mom_tensor);
auto p_out = EigenVector<T>::Flatten(*param_out);
auto mom_out = EigenVector<T>::Flatten(*moment_out);
auto ms_out = EigenVector<T>::Flatten(*mean_square_out);
ms_out.device(place) = rho * ms + (1 - rho) * g * g;
if (centered) {
auto mg_tensor = mean_grad_opt.get_ptr();
if (mg_tensor) {
PADDLE_ENFORCE_EQ(
mg_tensor->Holder(),
mean_grad_out->Holder(),
phi::errors::InvalidArgument(
"MeanGrad and MeanGradOut must be the same Tensor"));
} else {
PADDLE_ENFORCE_EQ(
mg_tensor,
mean_grad_out,
phi::errors::InvalidArgument(
"MeanGrad and MeanGradOut must be the same Tensor"));
}
auto mg = EigenVector<T>::Flatten(*mg_tensor);
auto mg_out = EigenVector<T>::Flatten(*mean_grad_out);
mg_out.device(place) = rho * mg + (1 - rho) * g;
mom_out.device(place) =
momentum * mom +
lr_value * g / (ms_out - mg_out.square() + epsilon).sqrt();
} else {
mom_out.device(place) =
momentum * mom + lr_value * g / (ms_out + epsilon).sqrt();
}
p_out.device(place) = p - mom_out;
}
};
template struct RmsFunctor<float, phi::CPUContext>;
template struct RmsFunctor<double, phi::CPUContext>;
template struct RmsFunctor<phi::dtype::float16, phi::CPUContext>;
} // namespace phi
PD_REGISTER_KERNEL(
    rmsprop, CPU, ALL_LAYOUT, phi::RmspropDenseKernel, float, double) {}
......
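The dense CPU path above is the classic (optionally centered) RMSProp recurrence evaluated with Eigen. A small NumPy transcription of the same formulas, useful as a mental reference (in-place updates stand in for the shared-buffer outputs; this is a sketch, not the shipped code):

import numpy as np

def rmsprop_reference(p, ms, g, mom, lr, mg=None,
                      epsilon=1e-10, rho=0.95, momentum=0.0, centered=False):
    ms[:] = rho * ms + (1 - rho) * g * g
    if centered:
        mg[:] = rho * mg + (1 - rho) * g
        mom[:] = momentum * mom + lr * g / np.sqrt(ms - mg * mg + epsilon)
    else:
        mom[:] = momentum * mom + lr * g / np.sqrt(ms + epsilon)
    p[:] = p - mom
    return p, ms, mom, mg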
@@ -18,12 +18,99 @@
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/rmsprop_kernel_impl.h"

-PD_REGISTER_KERNEL(
-    rmsprop, GPU, ALL_LAYOUT, phi::RmspropDenseKernel, float, double) {}
namespace phi {
template <typename T>
struct RmsFunctor<T, phi::GPUContext> {
RmsFunctor(const phi::GPUContext &ctx,
const DenseTensor &param,
const DenseTensor &mean_square,
const DenseTensor &grad,
const DenseTensor &moment,
const DenseTensor &learning_rate,
const paddle::optional<DenseTensor> &mean_grad_opt,
const paddle::optional<DenseTensor> &master_param,
float epsilon_t,
float decay_t,
float momentum_t,
bool centered,
bool multi_precision,
DenseTensor *param_out,
DenseTensor *moment_out,
DenseTensor *mean_square_out,
DenseTensor *mean_grad_out,
DenseTensor *master_param_outs) {
auto &p_tensor = param;
auto &ms_tensor = mean_square;
auto &lr_tensor = learning_rate;
auto &mom_tensor = moment;
auto &grad_tensor = grad;
size_t limit = static_cast<size_t>(ms_tensor.numel());
DenseRmspropGradFunctor<T> grad_func(grad_tensor.data<T>());
funcs::ForRange<phi::GPUContext> for_range(ctx, limit);
using MPDType = typename phi::dtype::MPTypeTrait<T>::Type;
MPDType *master_out_data =
multi_precision ? ctx.template Alloc<MPDType>(master_param_outs)
: nullptr;
if (centered) {
auto mg_tensor = mean_grad_opt.get_ptr();
if (mg_tensor) {
PADDLE_ENFORCE_EQ(
mg_tensor->Holder(),
mean_grad_out->Holder(),
phi::errors::InvalidArgument(
"MeanGrad and MeanGradOut must be the same Tensor"));
} else {
PADDLE_ENFORCE_EQ(
mg_tensor,
mean_grad_out,
phi::errors::InvalidArgument(
"MeanGrad and MeanGradOut must be the same Tensor"));
}
for_range(CenteredRmspropFunctor<T, MPDType, DenseRmspropGradFunctor<T>>(
ctx.template Alloc<T>(param_out),
ctx.template Alloc<MPDType>(mean_square_out),
ctx.template Alloc<MPDType>(moment_out),
ctx.template Alloc<MPDType>(mean_grad_out),
lr_tensor.data<MPDType>(),
master_out_data,
static_cast<MPDType>(decay_t),
static_cast<MPDType>(epsilon_t),
static_cast<MPDType>(momentum_t),
grad_func));
} else {
for_range(
UncenteredRmspropFunctor<T, MPDType, DenseRmspropGradFunctor<T>>(
ctx.template Alloc<T>(param_out),
ctx.template Alloc<MPDType>(mean_square_out),
ctx.template Alloc<MPDType>(moment_out),
lr_tensor.data<MPDType>(),
master_out_data,
static_cast<MPDType>(decay_t),
static_cast<MPDType>(epsilon_t),
static_cast<MPDType>(momentum_t),
grad_func));
}
}
};
template struct RmsFunctor<float, phi::GPUContext>;
template struct RmsFunctor<double, phi::GPUContext>;
template struct RmsFunctor<phi::dtype::float16, phi::GPUContext>;
} // namespace phi
PD_REGISTER_KERNEL(rmsprop,
GPU,
ALL_LAYOUT,
phi::RmspropDenseKernel,
float,
double,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(rmsprop_dense_param_sparse_grad,
                   GPU,
                   ALL_LAYOUT,
                   phi::RmspropSparseKernel,
                   float,
-                  double) {}
                   double,
                   phi::dtype::float16) {}
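The MPTypeTrait<T>::Type used above maps float16 to float, so the mean-square/moment statistics and the learning-rate arithmetic stay in FP32 even when the parameter tensor is FP16. A quick NumPy illustration of why that matters (toy numbers, not Paddle code):

import numpy as np

rho, g = 0.9, np.float16(1e-4)
ms_fp16, ms_fp32 = np.float16(0.0), np.float32(0.0)
for _ in range(1000):
    # In FP16 the g*g contribution (about 1e-8) underflows and is lost entirely.
    ms_fp16 = np.float16(rho * ms_fp16 + (1 - rho) * g * g)
    ms_fp32 = np.float32(rho * ms_fp32 + (1 - rho) * np.float32(g) ** 2)
print(ms_fp16, ms_fp32)  # 0.0 vs. roughly 1e-8, which changes the effective step size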
@@ -16,14 +16,36 @@
#include <math.h>

#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/kernels/funcs/algorithm.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/for_range.h"
#include "paddle/phi/kernels/funcs/selected_rows_functor.h"
#include "paddle/phi/kernels/rmsprop_kernel.h"

namespace phi {
template <typename T, typename Context>
struct RmsFunctor {
RmsFunctor(const Context &ctx,
const DenseTensor &param,
const DenseTensor &mean_square,
const DenseTensor &grad,
const DenseTensor &moment,
const DenseTensor &learning_rate,
const paddle::optional<DenseTensor> &mean_grad_opt,
const paddle::optional<DenseTensor> &master_param,
float epsilon_t,
float decay_t,
float momentum_t,
bool centered,
bool multi_precision,
DenseTensor *param_out,
DenseTensor *moment_out,
DenseTensor *mean_square_out,
DenseTensor *mean_grad_out,
DenseTensor *master_param_outs);
};
template <typename T>
struct DenseRmspropGradFunctor {
  inline explicit DenseRmspropGradFunctor(const T *grad) : grad_(grad) {}
@@ -47,7 +69,8 @@ struct SparseRmspropGradFunctor {
  HOSTDEVICE inline T operator()(int64_t idx) const {
    auto row_idx =
        phi::funcs::BinarySearch(rows_, row_count_, idx / row_numel_);
-   return row_idx >= 0 ? grad_[row_idx * row_numel_ + idx % row_numel_] : 0;
    return row_idx >= 0 ? grad_[row_idx * row_numel_ + idx % row_numel_]
                        : static_cast<T>(0);
  }

  const T *grad_;
@@ -56,19 +79,21 @@ struct SparseRmspropGradFunctor {
  int64_t row_count_;
};
-template <typename T, typename GradFunctor>
template <typename T, typename MT, typename GradFunctor>
struct UncenteredRmspropFunctor {
  UncenteredRmspropFunctor(T *param,
-                          T *ms,
-                          T *mom,
-                          const T *lr,
-                          T rho,
-                          T epsilon,
-                          T momentum,
                           MT *ms,
                           MT *mom,
                           const MT *lr,
                           MT *master_p,
                           MT rho,
                           MT epsilon,
                           MT momentum,
                           const GradFunctor &grad_functor)
      : param_(param),
        ms_(ms),
        mom_(mom),
        master_p_(master_p),
        lr_(lr),
        rho_(rho),
        epsilon_(epsilon),
@@ -76,38 +101,46 @@ struct UncenteredRmspropFunctor {
        grad_functor_(grad_functor) {}

  HOSTDEVICE inline void operator()(int64_t idx) const {
-   T g = grad_functor_(idx);
-   T ms_out = rho_ * ms_[idx] + (1 - rho_) * g * g;
-   T mom_out = momentum_ * mom_[idx] + lr_[0] * g / sqrt(ms_out + epsilon_);
-   param_[idx] -= mom_out;
    MT g = static_cast<MT>(grad_functor_(idx));
    MT l_rho = static_cast<MT>(1) - rho_;
    MT ms_out = rho_ * ms_[idx] + l_rho * g * g;
    MT mom_out = momentum_ * mom_[idx] +
                 static_cast<MT>(lr_[0]) * g / sqrt(ms_out + epsilon_);
    MT p = master_p_ ? master_p_[idx] : static_cast<MT>(param_[idx]);
    MT p_m = p - mom_out;
    param_[idx] = static_cast<T>(p_m);
    ms_[idx] = ms_out;
    mom_[idx] = mom_out;
    if (master_p_) master_p_[idx] = p_m;
  }

  T *param_;
-  T *ms_;
-  T *mom_;
-  const T *lr_;
-  T rho_;
-  T epsilon_;
-  T momentum_;
  MT *ms_;
  MT *mom_;
  MT *master_p_;
  const MT *lr_;
  MT rho_;
  MT epsilon_;
  MT momentum_;
  GradFunctor grad_functor_;
};
-template <typename T, typename GradFunctor>
template <typename T, typename MT, typename GradFunctor>
struct CenteredRmspropFunctor {
  CenteredRmspropFunctor(T *param,
-                        T *ms,
-                        T *mom,
-                        T *mean_grad,
-                        const T *lr,
-                        T rho,
-                        T epsilon,
-                        T momentum,
                         MT *ms,
                         MT *mom,
                         MT *mean_grad,
                         const MT *lr,
                         MT *master_param,
                         MT rho,
                         MT epsilon,
                         MT momentum,
                         const GradFunctor &grad_functor)
      : param_(param),
        ms_(ms),
        mom_(mom),
        master_p_(master_param),
        mean_grad_(mean_grad),
        lr_(lr),
        rho_(rho),
@@ -116,25 +149,32 @@ struct CenteredRmspropFunctor {
        grad_functor_(grad_functor) {}

  HOSTDEVICE inline void operator()(int64_t idx) const {
-   T g = grad_functor_(idx);
-   T ms_out = rho_ * ms_[idx] + (1 - rho_) * g * g;
-   T mg_out = rho_ * mean_grad_[idx] + (1 - rho_) * g;
-   T mom_out = momentum_ * mom_[idx] +
-               lr_[0] * g / sqrt(ms_out - mg_out * mg_out + epsilon_);
-   param_[idx] -= mom_out;
    MT g = static_cast<MT>(grad_functor_(idx));
    MT l_rho = static_cast<MT>(1) - rho_;
    MT ms_out = rho_ * ms_[idx] + l_rho * g * g;
    MT mg_out = rho_ * mean_grad_[idx] + l_rho * g;
    MT mom_out =
        momentum_ * mom_[idx] +
        static_cast<MT>(lr_[0]) * g / sqrt(ms_out - mg_out * mg_out + epsilon_);
    MT p = master_p_ ? master_p_[idx] : static_cast<MT>(param_[idx]);
    MT p_m = p - mom_out;
    param_[idx] = static_cast<T>(p_m);
    ms_[idx] = ms_out;
    mom_[idx] = mom_out;
    mean_grad_[idx] = mg_out;
    if (master_p_) master_p_[idx] = p_m;
  }

  T *param_;
-  T *ms_;
-  T *mom_;
-  T *mean_grad_;
-  const T *lr_;
-  T rho_;
-  T epsilon_;
-  T momentum_;
  MT *ms_;
  MT *mom_;
  MT *master_p_;
  MT *mean_grad_;
  const MT *lr_;
  MT rho_;
  MT epsilon_;
  MT momentum_;
  GradFunctor grad_functor_;
};
@@ -146,120 +186,35 @@ void RmspropDenseKernel(const Context &ctx,
                        const DenseTensor &moment,
                        const DenseTensor &learning_rate,
                        const paddle::optional<DenseTensor> &mean_grad_opt,
                        const paddle::optional<DenseTensor> &master_param,
                        float epsilon_t,
                        float decay_t,
                        float momentum_t,
                        bool centered,
                        bool multi_precision,
                        DenseTensor *param_out,
                        DenseTensor *moment_out,
                        DenseTensor *mean_square_out,
-                       DenseTensor *mean_grad_out) {
-  auto epsilon = static_cast<T>(epsilon_t);
-  auto rho = static_cast<T>(decay_t);
-  auto momentum = static_cast<T>(momentum_t);
-
-  auto &p_tensor = param;
-  auto &ms_tensor = mean_square;
-  auto &lr_tensor = learning_rate;
-  auto &mom_tensor = moment;
-
-  PADDLE_ENFORCE_EQ(p_tensor.IsSharedBufferWith(*param_out),
-                    true,
-                    phi::errors::InvalidArgument(
-                        "Param and ParamOut must be the same Tensor"));
-  PADDLE_ENFORCE_EQ(mom_tensor.IsSharedBufferWith(*moment_out),
-                    true,
-                    phi::errors::InvalidArgument(
-                        "Moment and MomentOut must be the same Tensor"));
-  PADDLE_ENFORCE_EQ(
-      ms_tensor.IsSharedBufferWith(*mean_square_out),
-      true,
-      phi::errors::InvalidArgument(
-          "MeanSquare and MeanSquareOut must be the same Tensor"));
-
-  size_t limit = static_cast<size_t>(ms_tensor.numel());
-  auto &grad_tensor = grad;
-
-  if (paddle::platform::is_cpu_place(ctx.GetPlace())) {
-    auto &place = *ctx.eigen_device();
-    auto lr_value = lr_tensor.data<T>()[0];
-
-    auto p = EigenVector<T>::Flatten(p_tensor);
-    auto ms = EigenVector<T>::Flatten(ms_tensor);
-    auto g = EigenVector<T>::Flatten(grad_tensor);
-    auto mom = EigenVector<T>::Flatten(mom_tensor);
-
-    auto p_out = EigenVector<T>::Flatten(*param_out);
-    auto mom_out = EigenVector<T>::Flatten(*moment_out);
-    auto ms_out = EigenVector<T>::Flatten(*mean_square_out);
-    ms_out.device(place) = rho * ms + (1 - rho) * g * g;
-
-    if (centered) {
-      auto mg_tensor = mean_grad_opt.get_ptr();
-      auto mg = EigenVector<T>::Flatten(*mg_tensor);
-      if (mg_tensor) {
-        PADDLE_ENFORCE_EQ(
-            mg_tensor->Holder(),
-            mean_grad_out->Holder(),
-            phi::errors::InvalidArgument(
-                "MeanGrad and MeanGradOut must be the same Tensor"));
-      } else {
-        PADDLE_ENFORCE_EQ(
-            mg_tensor,
-            mean_grad_out,
-            phi::errors::InvalidArgument(
-                "MeanGrad and MeanGradOut must be the same Tensor"));
-      }
-      auto mg_out = EigenVector<T>::Flatten(*mean_grad_out);
-      mg_out.device(place) = rho * mg + (1 - rho) * g;
-      mom_out.device(place) =
-          momentum * mom +
-          lr_value * g / (ms_out - mg_out.square() + epsilon).sqrt();
-    } else {
-      mom_out.device(place) =
-          momentum * mom + lr_value * g / (ms_out + epsilon).sqrt();
-    }
-    p_out.device(place) = p - mom_out;
-  } else {
-    DenseRmspropGradFunctor<T> grad_func(grad_tensor.data<T>());
-    funcs::ForRange<Context> for_range(ctx, limit);
-    if (centered) {
-      auto mg_tensor = mean_grad_opt.get_ptr();
-      if (mg_tensor) {
-        PADDLE_ENFORCE_EQ(
-            mg_tensor->Holder(),
-            mean_grad_out->Holder(),
-            phi::errors::InvalidArgument(
-                "MeanGrad and MeanGradOut must be the same Tensor"));
-      } else {
-        PADDLE_ENFORCE_EQ(
-            mg_tensor,
-            mean_grad_out,
-            phi::errors::InvalidArgument(
-                "MeanGrad and MeanGradOut must be the same Tensor"));
-      }
-      for_range(CenteredRmspropFunctor<T, DenseRmspropGradFunctor<T>>(
-          ctx.template Alloc<T>(param_out),
-          ctx.template Alloc<T>(mean_square_out),
-          ctx.template Alloc<T>(moment_out),
-          ctx.template Alloc<T>(mean_grad_out),
-          lr_tensor.data<T>(),
-          rho,
-          epsilon,
-          momentum,
-          grad_func));
-    } else {
-      for_range(UncenteredRmspropFunctor<T, DenseRmspropGradFunctor<T>>(
-          ctx.template Alloc<T>(param_out),
-          ctx.template Alloc<T>(mean_square_out),
-          ctx.template Alloc<T>(moment_out),
-          lr_tensor.data<T>(),
-          rho,
-          epsilon,
-          momentum,
-          grad_func));
-    }
-  }
-}
                        DenseTensor *mean_grad_out,
                        DenseTensor *master_param_outs) {
  RmsFunctor<T, Context> functor(ctx,
                                 param,
                                 mean_square,
                                 grad,
                                 moment,
                                 learning_rate,
                                 mean_grad_opt,
                                 master_param,
                                 epsilon_t,
                                 decay_t,
                                 momentum_t,
                                 centered,
                                 multi_precision,
                                 param_out,
                                 moment_out,
                                 mean_square_out,
                                 mean_grad_out,
                                 master_param_outs);
}
template <typename T, typename Context>
@@ -270,17 +225,21 @@ void RmspropSparseKernel(const Context &ctx,
                         const DenseTensor &moment,
                         const DenseTensor &learning_rate,
                         const paddle::optional<DenseTensor> &mean_grad_opt,
                         const paddle::optional<DenseTensor> &master_param,
                         float epsilon_t,
                         float decay_t,
                         float momentum_t,
                         bool centered,
                         bool multi_precision,
                         DenseTensor *param_out,
                         DenseTensor *moment_out,
                         DenseTensor *mean_square_out,
-                        DenseTensor *mean_grad_out) {
-  auto epsilon = static_cast<T>(epsilon_t);
-  auto rho = static_cast<T>(decay_t);
-  auto momentum = static_cast<T>(momentum_t);
                         DenseTensor *mean_grad_out,
                         DenseTensor *master_param_outs) {
  using MPDType = typename phi::dtype::MPTypeTrait<T>::Type;
  auto epsilon = static_cast<MPDType>(epsilon_t);
  auto rho = static_cast<MPDType>(decay_t);
  auto momentum = static_cast<MPDType>(momentum_t);

  auto &p_tensor = param;
  auto &ms_tensor = mean_square;
@@ -318,6 +277,10 @@ void RmspropSparseKernel(const Context &ctx,
  SparseRmspropGradFunctor<T> grad_func(
      merged_tensor.data<T>(), rows, row_numel, row_count);

  MPDType *master_out_data =
      multi_precision ? ctx.template Alloc<MPDType>(master_param_outs)
                      : nullptr;

  if (centered) {
    auto mg_tensor = mean_grad_opt.get_ptr();
    if (mg_tensor) {
@@ -334,22 +297,24 @@ void RmspropSparseKernel(const Context &ctx,
              "MeanGrad and MeanGradOut must be the same Tensor"));
    }

-   for_range(CenteredRmspropFunctor<T, SparseRmspropGradFunctor<T>>(
    for_range(CenteredRmspropFunctor<T, MPDType, SparseRmspropGradFunctor<T>>(
        ctx.template Alloc<T>(param_out),
-       ctx.template Alloc<T>(mean_square_out),
-       ctx.template Alloc<T>(moment_out),
-       ctx.template Alloc<T>(mean_grad_out),
-       lr_tensor.data<T>(),
        ctx.template Alloc<MPDType>(mean_square_out),
        ctx.template Alloc<MPDType>(moment_out),
        ctx.template Alloc<MPDType>(mean_grad_out),
        lr_tensor.data<MPDType>(),
        master_out_data,
        rho,
        epsilon,
        momentum,
        grad_func));
  } else {
-   for_range(UncenteredRmspropFunctor<T, SparseRmspropGradFunctor<T>>(
    for_range(UncenteredRmspropFunctor<T, MPDType, SparseRmspropGradFunctor<T>>(
        ctx.template Alloc<T>(param_out),
-       ctx.template Alloc<T>(mean_square_out),
-       ctx.template Alloc<T>(moment_out),
-       lr_tensor.data<T>(),
        ctx.template Alloc<MPDType>(mean_square_out),
        ctx.template Alloc<MPDType>(moment_out),
        lr_tensor.data<MPDType>(),
        master_out_data,
        rho,
        epsilon,
        momentum,
......
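For the sparse path, SparseRmspropGradFunctor treats the merged selected-rows gradient as a dense gradient that is zero everywhere except the stored rows, locating each row by binary search. A rough Python equivalent of that lookup (illustrative only; rows and grad_values stand for the merged SelectedRows data):

import numpy as np

def sparse_grad_at(idx, rows, grad_values, row_numel):
    # rows: sorted unique row ids of the merged gradient
    # grad_values: array of shape [len(rows), row_numel]
    row, col = divmod(idx, row_numel)
    pos = np.searchsorted(rows, row)
    if pos < len(rows) and rows[pos] == row:
        return grad_values[pos, col]
    return 0.0  # rows absent from the sparse gradient contribute nothing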
@@ -27,14 +27,17 @@ void RmspropDenseKernel(const Context& dev_ctx,
                        const DenseTensor& moment,
                        const DenseTensor& learning_rate,
                        const paddle::optional<DenseTensor>& mean_grad,
                        const paddle::optional<DenseTensor>& master_param,
                        float epsilon,
                        float decay,
                        float momentum,
                        bool centered,
                        bool multi_precision,
                        DenseTensor* param_out,
                        DenseTensor* moment_out,
                        DenseTensor* mean_square_out,
-                       DenseTensor* mean_grad_out);
                        DenseTensor* mean_grad_out,
                        DenseTensor* master_param_outs);

template <typename T, typename Context>
void RmspropSparseKernel(const Context& dev_ctx,
@@ -44,13 +47,16 @@ void RmspropSparseKernel(const Context& dev_ctx,
                         const DenseTensor& moment,
                         const DenseTensor& learning_rate,
                         const paddle::optional<DenseTensor>& mean_grad,
                         const paddle::optional<DenseTensor>& master_param,
                         float epsilon,
                         float decay,
                         float momentum,
                         bool centered,
                         bool multi_precision,
                         DenseTensor* param_out,
                         DenseTensor* moment_out,
                         DenseTensor* mean_square_out,
-                        DenseTensor* mean_grad_out);
                         DenseTensor* mean_grad_out,
                         DenseTensor* master_param_outs);

}  // namespace phi
@@ -29,14 +29,17 @@ void RmspropDenseKernel(const Context& dev_ctx,
                        const DenseTensor& moment,
                        const DenseTensor& learning_rate,
                        const paddle::optional<DenseTensor>& mean_grad,
                        const paddle::optional<DenseTensor>& master_param,
                        float epsilon,
                        float decay,
                        float momentum,
                        bool centered,
                        bool multi_precision,
                        DenseTensor* param_out,
                        DenseTensor* moment_out,
                        DenseTensor* mean_square_out,
-                       DenseTensor* mean_grad_out) {
                        DenseTensor* mean_grad_out,
                        DenseTensor* master_param_outs) {
  // copy learning_rate to cpu
  PADDLE_ENFORCE_EQ(
      learning_rate.dims().size(),
......
@@ -20,15 +20,35 @@ KernelSignature RmspropOpArgumentMapping(const ArgumentMappingContext& ctx) {
  if (ctx.IsDenseTensorInput("Grad")) {
    return KernelSignature(
        "rmsprop",
-       {"Param", "MeanSquare", "Grad", "Moment", "LearningRate", "MeanGrad"},
-       {"epsilon", "decay", "momentum", "centered"},
-       {"ParamOut", "MomentOut", "MeanSquareOut", "MeanGradOut"});
        {"Param",
         "MeanSquare",
         "Grad",
         "Moment",
         "LearningRate",
         "MeanGrad",
         "MasterParam"},
        {"epsilon", "decay", "momentum", "centered", "multi_precision"},
        {"ParamOut",
         "MomentOut",
         "MeanSquareOut",
         "MeanGradOut",
         "MasterParamOut"});
  } else if (ctx.IsSelectedRowsInput("Grad")) {
    return KernelSignature(
        "rmsprop_dense_param_sparse_grad",
-       {"Param", "MeanSquare", "Grad", "Moment", "LearningRate", "MeanGrad"},
-       {"epsilon", "decay", "momentum", "centered"},
-       {"ParamOut", "MomentOut", "MeanSquareOut", "MeanGradOut"});
        {"Param",
         "MeanSquare",
         "Grad",
         "Moment",
         "LearningRate",
         "MeanGrad",
         "MasterParam"},
        {"epsilon", "decay", "momentum", "centered", "multi_precision"},
        {"ParamOut",
         "MomentOut",
         "MeanSquareOut",
         "MeanGradOut",
         "MasterParamOut"});
  }

  return KernelSignature("unregistered", {}, {}, {});
......
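In other words, the argument mapping keeps a single fluid op name ("rmsprop") and only switches the target phi kernel based on the variable type of Grad, with MasterParam/MasterParamOut and multi_precision now threaded through both branches. A trivial sketch of that selection (illustrative, not Paddle API):

RMSPROP_INPUTS = ["Param", "MeanSquare", "Grad", "Moment",
                  "LearningRate", "MeanGrad", "MasterParam"]
RMSPROP_ATTRS = ["epsilon", "decay", "momentum", "centered", "multi_precision"]
RMSPROP_OUTPUTS = ["ParamOut", "MomentOut", "MeanSquareOut",
                   "MeanGradOut", "MasterParamOut"]

def pick_rmsprop_kernel(grad_is_selected_rows: bool) -> str:
    # Dense parameter with a selected-rows gradient uses the sparse-grad kernel.
    return ("rmsprop_dense_param_sparse_grad" if grad_is_selected_rows
            else "rmsprop")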
@@ -3287,12 +3287,84 @@ class RMSPropOptimizer(Optimizer):
        self._epsilon = epsilon
        self._momentum = momentum
        self._centered = centered
self._multi_precision = False
self._master_weights = {}
def _create_master_weight(self, param):
if param.name in self._master_weights:
var = self._master_weights[param.name]
else:
assert isinstance(self.helper, LayerHelper)
var_name = param.name + '_fp32_master'
var_name = unique_name.generate(var_name)
var = paddle.static.create_global_var(
name=var_name,
shape=param.shape,
value=0,
dtype='float32',
persistable=True,
)
block = self.helper.startup_program.global_block()
block.append_op(
type="cast",
inputs={"X": [param]},
outputs={"Out": [var]},
attrs={
"in_dtype": param.dtype,
"out_dtype": core.VarDesc.VarType.FP32,
},
)
self._master_weights[param.name] = var
return var
def _get_accumulator(self, name, param):
"""Utility function to fetch an accumulator for a parameter
Args:
name: name of the accumulator
param: parameter variable for which accumulator is to be fetched
Returns:
accumulator variable for the parameter
"""
if self._name is not None:
name = self._name + "_" + name
find_master = (
self._multi_precision and param.dtype == core.VarDesc.VarType.FP16
)
target_param = (
self._master_weights[param.name] if find_master else param
)
target_name = target_param.name
if (
name not in self._accumulators
or target_name not in self._accumulators[name]
):
raise Exception(
"Accumulator {} does not exist for parameter {}".format(
name, target_name
)
)
return self._accumulators[name][target_name]
    def _create_accumulators(self, block, parameters):
        if not isinstance(block, framework.Block):
            raise TypeError("block is not instance of framework.Block.")

        for p in parameters:
if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16:
master_p = self._create_master_weight(p)
self._add_accumulator(self._momentum_acc_str, master_p)
self._add_accumulator(self._mean_square_acc_str, master_p)
self._add_accumulator(self._mean_grad_acc_str, master_p)
continue
if (
p.dtype == core.VarDesc.VarType.FP16
and not self._multi_precision
):
                warnings.warn(
                    "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence. "
                    "Consider using the multi_precision=True option of the RMSProp optimizer."
                )
            self._add_accumulator(self._momentum_acc_str, p)
            self._add_accumulator(self._mean_square_acc_str, p)
            self._add_accumulator(self._mean_grad_acc_str, p)
@@ -3310,6 +3382,15 @@ class RMSPropOptimizer(Optimizer):
        mean_grad_acc = self._get_accumulator(
            self._mean_grad_acc_str, param_and_grad[0]
        )
find_master = (
self._multi_precision
and param_and_grad[0].dtype == core.VarDesc.VarType.FP16
)
master_weight = (
self._master_weights[param_and_grad[0].name]
if find_master
else None
)
        if in_dygraph_mode():
            _C_ops.rmsprop_(
                param_and_grad[0],
@@ -3318,34 +3399,45 @@ class RMSPropOptimizer(Optimizer):
                momentum_acc,
                self._create_param_lr(param_and_grad),
                mean_grad_acc,
master_weight,
                self._epsilon,
                self._rho,
                self._momentum,
                self._centered,
                find_master,
            )
            return None
        else:
inputs = {
"Param": param_and_grad[0],
"Grad": param_and_grad[1],
"Moment": momentum_acc,
"MeanSquare": mean_square_acc,
"MeanGrad": mean_grad_acc,
"LearningRate": self._create_param_lr(param_and_grad),
}
outputs = {
"ParamOut": param_and_grad[0],
"MomentOut": momentum_acc,
"MeanSquareOut": mean_square_acc,
"MeanGradOut": mean_grad_acc,
}
if find_master:
inputs["MasterParam"] = master_weight
outputs["MasterParamOut"] = master_weight
            rmsprop_op = block.append_op(
                type=self.type,
-               inputs={
-                   "Param": param_and_grad[0],
-                   "Grad": param_and_grad[1],
-                   "Moment": momentum_acc,
-                   "MeanSquare": mean_square_acc,
-                   "MeanGrad": mean_grad_acc,
-                   "LearningRate": self._create_param_lr(param_and_grad),
-               },
-               outputs={
-                   "ParamOut": param_and_grad[0],
-                   "MomentOut": momentum_acc,
-                   "MeanSquareOut": mean_square_acc,
-                   "MeanGradOut": mean_grad_acc,
-               },
                inputs=inputs,
                outputs=outputs,
                attrs={
                    "epsilon": self._epsilon,
                    "decay": self._rho,
                    "momentum": self._momentum,
                    "centered": self._centered,
                    "multi_precision": find_master,
                },
                stop_gradient=True,
            )
......
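The unit tests below exercise this path end to end. For reference, a condensed dygraph sketch of driving the fluid RMSProp optimizer with the master-weight path enabled (assumes a CUDA build; it mirrors TestRMSPropMultiPrecision1_0 rather than defining new behaviour):

import paddle

paddle.set_device('gpu')
model = paddle.nn.Linear(2, 2)
opt = paddle.fluid.optimizer.RMSProp(
    learning_rate=0.001, parameter_list=model.parameters()
)
opt._multi_precision = True              # FP16 params get FP32 master weights
model = paddle.amp.decorate(models=model, level='O2')
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

with paddle.amp.auto_cast(level='O2'):
    loss = paddle.mean(model(paddle.randn((2, 2))))
scaled = scaler.scale(loss)
scaled.backward()
scaler.minimize(opt, scaled)
opt.clear_gradients()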
@@ -356,6 +356,280 @@ class TestRMSPropV2Group(TestRMSPropV2):
        adam.clear_gradients()
class TestRMSOpMultiPrecison(unittest.TestCase):
def _test_rms_op_dygraph_place_amp(self, place, use_amp=False):
import paddle
paddle.disable_static()
paddle.seed(10)
paddle.set_device(place)
input = paddle.randn((5, 5))
model = paddle.nn.Linear(5, 5)
optimizer = paddle.optimizer.RMSProp(
learning_rate=0.01,
parameters=model.parameters(),
weight_decay=0.01,
)
optimizer._multi_precision = use_amp
for idx in range(2):
if place == 'gpu' and use_amp:
model = paddle.amp.decorate(models=model, level='O2')
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
if place == 'gpu' and use_amp:
with paddle.amp.auto_cast(level='O2'):
output = model(input)
loss = paddle.mean(output)
scaled = scaler.scale(loss)
scaled.backward()
scaler.step(optimizer)
optimizer.clear_grad()
else:
output = model(input)
loss = paddle.mean(output)
loss.backward()
optimizer.step()
optimizer.clear_grad()
paddle.enable_static()
def _get_places(self):
import paddle
places = ['cpu']
if paddle.is_compiled_with_cuda():
places.append('gpu')
return places
def test_main(self):
for place in self._get_places():
use_amp_list = [True, False]
for use_amp in use_amp_list:
self._test_rms_op_dygraph_place_amp(place, use_amp)
class TestRMSPropMultiPrecision2_0(unittest.TestCase):
def dygraph_rmsprop_mp(self, mp, use_amp):
paddle.disable_static()
paddle.seed(100)
paddle.set_device('gpu')
input = paddle.randn((2, 2))
model = paddle.nn.Linear(2, 2)
optimizer = paddle.optimizer.RMSProp(0.5, parameters=model.parameters())
optimizer._multi_precision = mp
if use_amp:
model = paddle.amp.decorate(models=model, level='O2')
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
for idx in range(5):
if use_amp:
with paddle.amp.auto_cast(level='O2'):
output = model(input)
loss = paddle.mean(output)
scaled = scaler.scale(loss)
scaled.backward()
scaler.minimize(optimizer, scaled)
optimizer.clear_grad()
else:
output = model(input)
loss = paddle.mean(output)
loss.backward()
optimizer.step()
optimizer.clear_grad()
return output, model.parameters()
def static_rmsprop_mp(self, mp, use_amp):
paddle.enable_static()
paddle.seed(100)
np.random.seed(100)
exe = paddle.static.Executor('gpu')
train_program = paddle.static.Program()
startup_program = paddle.static.Program()
optimizer = paddle.optimizer.RMSProp(0.1)
optimizer._multi_precision = mp
if use_amp:
optimizer = paddle.static.amp.decorate(
optimizer,
init_loss_scaling=128.0,
use_dynamic_loss_scaling=True,
use_pure_fp16=True,
use_fp16_guard=False,
)
with paddle.static.program_guard(train_program, startup_program):
if use_amp:
data = paddle.static.data(
shape=[2, 2], name='X', dtype='float16'
)
else:
data = paddle.static.data(
shape=[2, 2], name='X', dtype='float32'
)
hidden = paddle.static.nn.fc(x=data, size=10)
loss = paddle.mean(hidden)
optimizer.minimize(loss)
exe.run(startup_program)
if use_amp:
optimizer.amp_init(place='gpu', scope=paddle.static.global_scope())
x = np.random.random(size=(2, 2)).astype('float16')
else:
x = np.random.random(size=(2, 2)).astype('float32')
out = []
for idx in range(5):
(loss_data,) = exe.run(
train_program, feed={"X": x}, fetch_list=[loss.name]
)
out.append(loss_data)
return out
def test_main(self):
if not paddle.is_compiled_with_cuda():
return
"Test dygraph mode"
output1_dy, params1_dy = self.dygraph_rmsprop_mp(use_amp=True, mp=True)
output2_dy, params2_dy = self.dygraph_rmsprop_mp(
use_amp=False, mp=False
)
np.testing.assert_allclose(
output1_dy.astype('float32').numpy(),
output2_dy.astype('float32').numpy(),
rtol=1e-05,
atol=0.1,
)
for idx in range(len(params1_dy)):
np.testing.assert_allclose(
params1_dy[idx].astype('float32').numpy(),
params2_dy[idx].astype('float32').numpy(),
rtol=1e-05,
atol=0.1,
)
"Test static mode"
output1_st = self.static_rmsprop_mp(use_amp=True, mp=True)
output2_st = self.static_rmsprop_mp(use_amp=False, mp=False)
for idx in range(len(output1_st)):
np.testing.assert_allclose(
output1_st[idx].astype('float32'),
output2_st[idx].astype('float32'),
rtol=1e-05,
atol=0.1,
)
class TestRMSPropMultiPrecision1_0(unittest.TestCase):
def dygraph_rmsprop_mp(self, use_amp, mp):
paddle.disable_static()
paddle.seed(10)
paddle.set_device('gpu')
input = paddle.randn((2, 2))
model = paddle.nn.Linear(2, 2)
optimizer = paddle.fluid.optimizer.RMSProp(
learning_rate=0.001,
parameter_list=model.parameters(),
)
optimizer._multi_precision = mp
if use_amp:
model = paddle.amp.decorate(models=model, level='O2')
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
for idx in range(5):
if use_amp:
with paddle.amp.auto_cast(level='O2'):
output = model(input)
loss = paddle.mean(output)
scaled = scaler.scale(loss)
scaled.backward()
scaler.minimize(optimizer, scaled)
optimizer.clear_gradients()
else:
output = model(input)
loss = paddle.mean(output)
optimizer.minimize(loss)
optimizer.clear_gradients()
return output, model.parameters()
def static_rmsprop_mp(self, use_amp, mp):
paddle.enable_static()
paddle.seed(100)
np.random.seed(100)
exe = paddle.static.Executor('gpu')
train_program = paddle.static.Program()
startup_program = paddle.static.Program()
optimizer = paddle.fluid.optimizer.RMSProp(learning_rate=0.001)
optimizer._multi_precision = mp
if use_amp:
optimizer = paddle.static.amp.decorate(
optimizer,
init_loss_scaling=128.0,
use_dynamic_loss_scaling=True,
use_pure_fp16=True,
use_fp16_guard=False,
)
with paddle.static.program_guard(train_program, startup_program):
if use_amp:
data = paddle.static.data(
shape=[2, 2], name='X', dtype='float16'
)
else:
data = paddle.static.data(
shape=[2, 2], name='X', dtype='float32'
)
hidden = paddle.static.nn.fc(x=data, size=10)
loss = paddle.mean(hidden)
optimizer.minimize(loss)
exe.run(startup_program)
if use_amp:
optimizer.amp_init(place='gpu', scope=paddle.static.global_scope())
x = np.random.random(size=(2, 2)).astype('float16')
else:
x = np.random.random(size=(2, 2)).astype('float32')
out = []
for idx in range(5):
(loss_data,) = exe.run(
train_program, feed={"X": x}, fetch_list=[loss.name]
)
out.append(loss_data)
return out
def test_main(self):
if not paddle.is_compiled_with_cuda():
return
"Test dygraph mode"
output1_dy, params1_dy = self.dygraph_rmsprop_mp(use_amp=True, mp=True)
output2_dy, params2_dy = self.dygraph_rmsprop_mp(
use_amp=False, mp=False
)
np.testing.assert_allclose(
output1_dy.astype('float32').numpy(),
output2_dy.astype('float32').numpy(),
rtol=1e-05,
atol=0.1,
)
for idx in range(len(params1_dy)):
np.testing.assert_allclose(
params1_dy[idx].astype('float32').numpy(),
params2_dy[idx].astype('float32').numpy(),
rtol=1e-05,
atol=0.1,
)
"Test static mode"
output1_st = self.static_rmsprop_mp(use_amp=True, mp=True)
output2_st = self.static_rmsprop_mp(use_amp=False, mp=False)
for idx in range(len(output1_st)):
np.testing.assert_allclose(
output1_st[idx].astype('float32'),
output2_st[idx].astype('float32'),
rtol=1e-05,
atol=0.1,
)
if __name__ == "__main__": if __name__ == "__main__":
paddle.enable_static() paddle.enable_static()
unittest.main() unittest.main()
@@ -12,10 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import warnings

import paddle

from paddle import _C_ops

-from ..fluid import framework
from ..fluid import core, framework, unique_name
from ..fluid.framework import in_dygraph_mode
from ..fluid.layer_helper import LayerHelper
from .optimizer import Optimizer

__all__ = []
@@ -184,6 +188,8 @@ class RMSProp(Optimizer):
        self._epsilon = epsilon
        self._momentum = momentum
        self._centered = centered
self._multi_precision = False
self._master_weights = {}
        self._default_dict = {
            'rho': rho,
            'epsilon': epsilon,
@@ -191,6 +197,62 @@ class RMSProp(Optimizer):
            'centered': centered,
        }
def _create_master_weight(self, param):
if param.name in self._master_weights:
var = self._master_weights[param.name]
else:
assert isinstance(self.helper, LayerHelper)
var_name = param.name + "_fp32_master"
var_name = unique_name.generate(var_name)
var = paddle.static.create_global_var(
name=var_name,
shape=param.shape,
value=0,
dtype='float32',
persistable=True,
)
block = self.helper.startup_program.global_block()
block.append_op(
type="cast",
inputs={"X": [param]},
outputs={"Out": [var]},
attrs={
"in_dtype": param.dtype,
"out_dtype": core.VarDesc.VarType.FP32,
},
)
self._master_weights[param.name] = var
return var
def _get_accumulator(self, name, param):
"""Utility function to fetch an accumulator for a parameter
Args:
name: name of the accumulator
param: parameter variable for which accumulator is to be fetched
Returns:
accumulator variable for the parameter
"""
if self._name is not None:
name = self._name + "_" + name
find_master = (
self._multi_precision and param.dtype == core.VarDesc.VarType.FP16
)
target_param = (
self._master_weights[param.name] if find_master else param
)
target_name = target_param.name
if (
name not in self._accumulators
or target_name not in self._accumulators[name]
):
raise Exception(
"Accumulator {} does not exist for parameter {}".format(
name, target_name
)
)
return self._accumulators[name][target_name]
    def _create_accumulators(self, block, parameters):
        if not isinstance(block, framework.Block):
            raise TypeError("block is not instance of framework.Block.")
@@ -199,6 +261,20 @@ class RMSProp(Optimizer):
            parameters = parameters.get('params')

        for p in parameters:
if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16:
master_p = self._create_master_weight(p)
self._add_accumulator(self._momentum_acc_str, master_p)
self._add_accumulator(self._mean_square_acc_str, master_p)
self._add_accumulator(self._mean_grad_acc_str, master_p)
continue
if (
p.dtype == core.VarDesc.VarType.FP16
and not self._multi_precision
):
                warnings.warn(
                    "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence. "
                    "Consider using the multi_precision=True option of the RMSProp optimizer."
                )
            self._add_accumulator(self._momentum_acc_str, p)
            self._add_accumulator(self._mean_square_acc_str, p)
            self._add_accumulator(self._mean_grad_acc_str, p)
@@ -219,6 +295,15 @@ class RMSProp(Optimizer):
        mean_grad_acc = self._get_accumulator(
            self._mean_grad_acc_str, param_and_grad[0]
        )
find_master = (
self._multi_precision
and param_and_grad[0].dtype == core.VarDesc.VarType.FP16
)
master_weight = (
self._master_weights[param_and_grad[0].name]
if find_master
else None
)
        if in_dygraph_mode():
            _C_ops.rmsprop_(
@@ -228,29 +313,38 @@ class RMSProp(Optimizer):
                momentum_acc,
                self._create_param_lr(param_and_grad),
                mean_grad_acc,
master_weight,
                self._epsilon,
                self._rho,
                self._momentum,
                self._centered,
                find_master,
            )
            return None
        else:
inputs = {
"Param": param_and_grad[0],
"Grad": param_and_grad[1],
"Moment": momentum_acc,
"MeanSquare": mean_square_acc,
"MeanGrad": mean_grad_acc,
"LearningRate": self._create_param_lr(param_and_grad),
}
outputs = {
"ParamOut": param_and_grad[0],
"MomentOut": momentum_acc,
"MeanSquareOut": mean_square_acc,
"MeanGradOut": mean_grad_acc,
}
if find_master:
inputs["MasterParam"] = master_weight
outputs["MasterParamOut"] = master_weight
            rmsprop_op = block.append_op(
                type=self.type,
-               inputs={
-                   "Param": param_and_grad[0],
-                   "Grad": param_and_grad[1],
-                   "Moment": momentum_acc,
-                   "MeanSquare": mean_square_acc,
-                   "MeanGrad": mean_grad_acc,
-                   "LearningRate": self._create_param_lr(param_and_grad),
-               },
-               outputs={
-                   "ParamOut": param_and_grad[0],
-                   "MomentOut": momentum_acc,
-                   "MeanSquareOut": mean_square_acc,
-                   "MeanGradOut": mean_grad_acc,
-               },
                inputs=inputs,
                outputs=outputs,
                attrs={
                    "epsilon": self._epsilon,
                    "decay": self._rho,
......
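Finally, a condensed dygraph sketch (CUDA build assumed, illustrative rather than canonical) of how paddle.optimizer.RMSProp picks up the new multi-precision path under AMP O2, mirroring TestRMSPropMultiPrecision2_0 above:

import paddle

paddle.set_device('gpu')
model = paddle.nn.Linear(2, 2)
opt = paddle.optimizer.RMSProp(learning_rate=0.5, parameters=model.parameters())
opt._multi_precision = True
model = paddle.amp.decorate(models=model, level='O2')
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

with paddle.amp.auto_cast(level='O2'):
    loss = paddle.mean(model(paddle.randn((2, 2))))
scaled = scaler.scale(loss)
scaled.backward()
scaler.step(opt)        # the tests also exercise scaler.minimize(opt, scaled)
opt.clear_grad()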