Unverified commit 48060b2e, authored by niuliling123, committed by GitHub

Add multiprecision for rms op (#50132)

Parent 798b527c
......@@ -38,6 +38,7 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker {
"(Tensor, default Tensor<float>)"
" The moving average of gradient")
.AsDispensable();
AddInput("LearningRate",
"(Tensor, default Tensor<float>) "
"The learning rate should be a tensor of size 1.");
......@@ -46,12 +47,17 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker {
"Input gradient of the parameter.");
AddInput("Moment",
"(Tensor, default Tensor<float>) The moment that gets updated.");
AddInput("MasterParam", "FP32 master weight for AMP.").AsDispensable();
AddOutput("ParamOut", "(Tensor) Output updated parameter value.");
AddOutput("MomentOut", "(Tensor) Output updated moment.");
AddOutput("MeanSquareOut", "(Tensor) Output Mean squared updated value.");
AddOutput("MeanGradOut",
"(Tensor) Output moving average of gradient updated value.");
AddOutput("MasterParamOut",
"The updated FP32 master weight for AMP. "
"It shared memory with Input(MasterParam).")
.AsDispensable();
AddAttr<float>("epsilon",
"(float, default 1e-10) Constant "
......@@ -65,6 +71,10 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker {
.SetDefault(0.0f);
AddAttr<bool>("centered", "(bool, default false) use centered rmsprop.")
.SetDefault(false);
AddAttr<bool>("multi_precision",
"(bool, default false) "
"Whether to use multi-precision during weight updating.")
.SetDefault(false);
AddComment(R"DOC(
Rmsprop Optimizer.
......
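Below is a minimal NumPy sketch (not the Paddle API; all names are illustrative) of what the new optional MasterParam input, MasterParamOut output, and multi_precision attribute enable: the FP16 parameter is shadowed by an FP32 master copy, the RMSProp update runs in FP32, and the result is cast back to FP16.

```python
import numpy as np

# Illustrative only: the master-weight idea behind MasterParam / multi_precision.
rng = np.random.default_rng(0)
p_fp16 = rng.standard_normal(4).astype(np.float16)   # parameter used by the model
master = p_fp16.astype(np.float32)                    # FP32 master weight
ms = np.zeros(4, dtype=np.float32)                    # mean-square accumulator
mom = np.zeros(4, dtype=np.float32)                   # moment accumulator
lr, rho, momentum, eps = 0.01, 0.95, 0.0, 1e-10       # example hyper-parameters

g = rng.standard_normal(4).astype(np.float16).astype(np.float32)  # FP16 grad, promoted
ms = rho * ms + (1.0 - rho) * g * g
mom = momentum * mom + lr * g / np.sqrt(ms + eps)
master = master - mom                                 # update is performed in FP32
p_fp16 = master.astype(np.float16)                    # cast back, like ParamOut / MasterParamOut
```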
......@@ -148,6 +148,14 @@ std::map<std::string, std::set<std::string>> op_ins_map = {
"Ln2Bias"}},
{"faster_tokenizer", {"Text", "Vocab", "TextPair"}},
{"matrix_rank", {"X", "TolTensor"}},
{"rmsprop",
{"Param",
"MeanSquare",
"Grad",
"Moment",
"LearningRate",
"MeanGrad",
"MasterParam"}},
{"adam",
{"Param",
"Grad",
......@@ -311,6 +319,12 @@ std::map<std::string, std::set<std::string>> op_outs_map = {
{"MultiFpnRois", "RestoreIndex", "MultiLevelRoIsNum"}},
{"moving_average_abs_max_scale",
{"Out", "OutScale", "OutAccum", "OutState"}},
{"rmsprop",
{"ParamOut",
"MomentOut",
"MeanSquareOut",
"MeanGradOut",
"MasterParamOut"}},
{"multiclass_nms3", {"Out", "NmsRoisNum"}},
{"generate_proposals_v2", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}},
{"momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}},
......@@ -377,7 +391,12 @@ std::map<std::string, std::set<std::string>> op_outs_map = {
// For those OPs, we need to manually specify the outs need to pass in this map.
std::map<std::string, std::set<std::string>> op_passing_outs_map = {
{"sgd", {"ParamOut", "MasterParamOut"}},
{"rmsprop", {"ParamOut", "MomentOut", "MeanSquareOut", "MeanGradOut"}},
{"rmsprop",
{"ParamOut",
"MomentOut",
"MeanSquareOut",
"MeanGradOut",
"MasterParamOut"}},
{"ftrl", {"ParamOut", "SquaredAccumOut", "LinearAccumOut"}},
{"adadelta", {"ParamOut", "AvgSquaredGradOut", "AvgSquaredUpdateOut"}},
{"adagrad", {"ParamOut", "MomentOut"}},
......
......@@ -1459,15 +1459,16 @@
backward : reverse_grad
- op : rmsprop_
args : (Tensor param, Tensor mean_square, Tensor grad, Tensor moment, Tensor learning_rate, Tensor mean_grad, float epsilon, float decay, float momentum, bool centered)
output : Tensor(param_out), Tensor(moment_out), Tensor(mean_square_out), Tensor(mean_grad_out)
args : (Tensor param, Tensor mean_square, Tensor grad, Tensor moment, Tensor learning_rate, Tensor mean_grad, Tensor master_param, float epsilon, float decay, float momentum, bool centered, bool multi_precision)
output : Tensor(param_out), Tensor(moment_out), Tensor(mean_square_out), Tensor(mean_grad_out), Tensor(master_param_out)
infer_meta :
func : RmspropInferMeta
kernel :
func : rmsprop {dense, dense, dense, dense, dense, dense -> dense, dense, dense, dense}
rmsprop_dense_param_sparse_grad {dense, dense, selected_rows, dense, dense, dense -> dense, dense, dense, dense}
optional : mean_grad
inplace : (param -> param_out), (moment -> moment_out), (mean_square -> mean_square_out), (mean_grad -> mean_grad_out)
func : rmsprop {dense, dense, dense, dense, dense, dense, dense -> dense, dense, dense, dense, dense}
rmsprop_dense_param_sparse_grad {dense, dense, selected_rows, dense, dense, dense, dense -> dense, dense, dense, dense, dense}
data_type : param
optional : mean_grad, master_param
inplace : (param -> param_out), (moment -> moment_out), (mean_square -> mean_square_out), (mean_grad -> mean_grad_out), (master_param -> master_param_out)
- op : rnn
args: (Tensor x, Tensor[] pre_state, Tensor[] weight_list, Tensor sequence_length, Tensor dropout_state_in, float dropout_prob=0.0, bool is_bidirec=false, int input_size=10, int hidden_size=100, int num_layers=1, str mode="RNN_TANH", int seed=0, bool is_test=false)
......
......@@ -2313,14 +2313,17 @@ void RmspropInferMeta(const MetaTensor& param,
const MetaTensor& moment,
const MetaTensor& learning_rate,
const MetaTensor& mean_grad,
const MetaTensor& master_param,
float epsilon,
float decay,
float momentum,
bool centered,
bool multi_precision,
MetaTensor* param_out,
MetaTensor* moment_out,
MetaTensor* mean_square_out,
MetaTensor* mean_grad_out) {
MetaTensor* mean_grad_out,
MetaTensor* master_param_outs) {
if (centered) {
PADDLE_ENFORCE_NOT_NULL(
mean_grad_out,
......
......@@ -421,14 +421,17 @@ void RmspropInferMeta(const MetaTensor& param,
const MetaTensor& moment,
const MetaTensor& learning_rate,
const MetaTensor& mean_grad,
const MetaTensor& master_param,
float epsilon,
float decay,
float momentum,
bool centered,
bool multi_precision,
MetaTensor* param_out,
MetaTensor* moment_out,
MetaTensor* mean_square_out,
MetaTensor* mean_grad_out);
MetaTensor* mean_grad_out,
MetaTensor* master_param_outs);
void RnnInferMeta(const MetaTensor& x,
const std::vector<const MetaTensor*>& pre_state,
......
......@@ -17,7 +17,99 @@
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/rmsprop_kernel_impl.h"
namespace phi {
template <typename T>
struct RmsFunctor<T, phi::CPUContext> {
RmsFunctor(const phi::CPUContext &ctx,
const DenseTensor &param,
const DenseTensor &mean_square,
const DenseTensor &grad,
const DenseTensor &moment,
const DenseTensor &learning_rate,
const paddle::optional<DenseTensor> &mean_grad_opt,
const paddle::optional<DenseTensor> &master_param,
float epsilon_t,
float decay_t,
float momentum_t,
bool centered,
bool multi_precision,
DenseTensor *param_out,
DenseTensor *moment_out,
DenseTensor *mean_square_out,
DenseTensor *mean_grad_out,
DenseTensor *master_param_outs) {
auto epsilon = static_cast<T>(epsilon_t);
auto rho = static_cast<T>(decay_t);
auto momentum = static_cast<T>(momentum_t);
auto &p_tensor = param;
auto &ms_tensor = mean_square;
auto &lr_tensor = learning_rate;
auto &mom_tensor = moment;
PADDLE_ENFORCE_EQ(p_tensor.IsSharedBufferWith(*param_out),
true,
phi::errors::InvalidArgument(
"Param and ParamOut must be the same Tensor"));
PADDLE_ENFORCE_EQ(mom_tensor.IsSharedBufferWith(*moment_out),
true,
phi::errors::InvalidArgument(
"Moment and MomentOut must be the same Tensor"));
PADDLE_ENFORCE_EQ(
ms_tensor.IsSharedBufferWith(*mean_square_out),
true,
phi::errors::InvalidArgument(
"MeanSquare and MeanSquareOut must be the same Tensor"));
auto &grad_tensor = grad;
auto &place = *ctx.eigen_device();
auto lr_value = lr_tensor.data<T>()[0];
auto p = EigenVector<T>::Flatten(p_tensor);
auto ms = EigenVector<T>::Flatten(ms_tensor);
auto g = EigenVector<T>::Flatten(grad_tensor);
auto mom = EigenVector<T>::Flatten(mom_tensor);
auto p_out = EigenVector<T>::Flatten(*param_out);
auto mom_out = EigenVector<T>::Flatten(*moment_out);
auto ms_out = EigenVector<T>::Flatten(*mean_square_out);
ms_out.device(place) = rho * ms + (1 - rho) * g * g;
if (centered) {
auto mg_tensor = mean_grad_opt.get_ptr();
if (mg_tensor) {
PADDLE_ENFORCE_EQ(
mg_tensor->Holder(),
mean_grad_out->Holder(),
phi::errors::InvalidArgument(
"MeanGrad and MeanGradOut must be the same Tensor"));
} else {
PADDLE_ENFORCE_EQ(
mg_tensor,
mean_grad_out,
phi::errors::InvalidArgument(
"MeanGrad and MeanGradOut must be the same Tensor"));
}
auto mg = EigenVector<T>::Flatten(*mg_tensor);
auto mg_out = EigenVector<T>::Flatten(*mean_grad_out);
mg_out.device(place) = rho * mg + (1 - rho) * g;
mom_out.device(place) =
momentum * mom +
lr_value * g / (ms_out - mg_out.square() + epsilon).sqrt();
} else {
mom_out.device(place) =
momentum * mom + lr_value * g / (ms_out + epsilon).sqrt();
}
p_out.device(place) = p - mom_out;
}
};
template struct RmsFunctor<float, phi::CPUContext>;
template struct RmsFunctor<double, phi::CPUContext>;
template struct RmsFunctor<phi::dtype::float16, phi::CPUContext>;
} // namespace phi
PD_REGISTER_KERNEL(
rmsprop, CPU, ALL_LAYOUT, phi::RmspropDenseKernel, float, double) {}
......
......@@ -18,12 +18,99 @@
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/rmsprop_kernel_impl.h"
PD_REGISTER_KERNEL(
rmsprop, GPU, ALL_LAYOUT, phi::RmspropDenseKernel, float, double) {}
namespace phi {
template <typename T>
struct RmsFunctor<T, phi::GPUContext> {
RmsFunctor(const phi::GPUContext &ctx,
const DenseTensor &param,
const DenseTensor &mean_square,
const DenseTensor &grad,
const DenseTensor &moment,
const DenseTensor &learning_rate,
const paddle::optional<DenseTensor> &mean_grad_opt,
const paddle::optional<DenseTensor> &master_param,
float epsilon_t,
float decay_t,
float momentum_t,
bool centered,
bool multi_precision,
DenseTensor *param_out,
DenseTensor *moment_out,
DenseTensor *mean_square_out,
DenseTensor *mean_grad_out,
DenseTensor *master_param_outs) {
auto &p_tensor = param;
auto &ms_tensor = mean_square;
auto &lr_tensor = learning_rate;
auto &mom_tensor = moment;
auto &grad_tensor = grad;
size_t limit = static_cast<size_t>(ms_tensor.numel());
DenseRmspropGradFunctor<T> grad_func(grad_tensor.data<T>());
funcs::ForRange<phi::GPUContext> for_range(ctx, limit);
using MPDType = typename phi::dtype::MPTypeTrait<T>::Type;
MPDType *master_out_data =
multi_precision ? ctx.template Alloc<MPDType>(master_param_outs)
: nullptr;
if (centered) {
auto mg_tensor = mean_grad_opt.get_ptr();
if (mg_tensor) {
PADDLE_ENFORCE_EQ(
mg_tensor->Holder(),
mean_grad_out->Holder(),
phi::errors::InvalidArgument(
"MeanGrad and MeanGradOut must be the same Tensor"));
} else {
PADDLE_ENFORCE_EQ(
mg_tensor,
mean_grad_out,
phi::errors::InvalidArgument(
"MeanGrad and MeanGradOut must be the same Tensor"));
}
for_range(CenteredRmspropFunctor<T, MPDType, DenseRmspropGradFunctor<T>>(
ctx.template Alloc<T>(param_out),
ctx.template Alloc<MPDType>(mean_square_out),
ctx.template Alloc<MPDType>(moment_out),
ctx.template Alloc<MPDType>(mean_grad_out),
lr_tensor.data<MPDType>(),
master_out_data,
static_cast<MPDType>(decay_t),
static_cast<MPDType>(epsilon_t),
static_cast<MPDType>(momentum_t),
grad_func));
} else {
for_range(
UncenteredRmspropFunctor<T, MPDType, DenseRmspropGradFunctor<T>>(
ctx.template Alloc<T>(param_out),
ctx.template Alloc<MPDType>(mean_square_out),
ctx.template Alloc<MPDType>(moment_out),
lr_tensor.data<MPDType>(),
master_out_data,
static_cast<MPDType>(decay_t),
static_cast<MPDType>(epsilon_t),
static_cast<MPDType>(momentum_t),
grad_func));
}
}
};
template struct RmsFunctor<float, phi::GPUContext>;
template struct RmsFunctor<double, phi::GPUContext>;
template struct RmsFunctor<phi::dtype::float16, phi::GPUContext>;
} // namespace phi
PD_REGISTER_KERNEL(rmsprop,
GPU,
ALL_LAYOUT,
phi::RmspropDenseKernel,
float,
double,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(rmsprop_dense_param_sparse_grad,
GPU,
ALL_LAYOUT,
phi::RmspropSparseKernel,
float,
double) {}
double,
phi::dtype::float16) {}
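With the float16 kernel registered above, an O2-decorated model can be optimized on GPU while the optimizer carries FP32 master weights through MasterParam/MasterParamOut. A hedged usage sketch mirroring the new unit tests later in this diff (`_multi_precision` is the internal flag those tests toggle):

```python
import paddle

if paddle.is_compiled_with_cuda():
    paddle.set_device('gpu')
    model = paddle.amp.decorate(models=paddle.nn.Linear(2, 2), level='O2')  # FP16 weights
    opt = paddle.optimizer.RMSProp(learning_rate=0.01, parameters=model.parameters())
    opt._multi_precision = True          # same internal flag the new tests set
    scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
    with paddle.amp.auto_cast(level='O2'):
        loss = paddle.mean(model(paddle.randn((2, 2))))
    scaler.scale(loss).backward()
    scaler.step(opt)                     # dispatches to the float16 rmsprop GPU kernel
    opt.clear_grad()
```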
......@@ -16,14 +16,36 @@
#include <math.h>
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/kernels/funcs/algorithm.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/for_range.h"
#include "paddle/phi/kernels/funcs/selected_rows_functor.h"
#include "paddle/phi/kernels/rmsprop_kernel.h"
namespace phi {
template <typename T, typename Context>
struct RmsFunctor {
RmsFunctor(const Context &ctx,
const DenseTensor &param,
const DenseTensor &mean_square,
const DenseTensor &grad,
const DenseTensor &moment,
const DenseTensor &learning_rate,
const paddle::optional<DenseTensor> &mean_grad_opt,
const paddle::optional<DenseTensor> &master_param,
float epsilon_t,
float decay_t,
float momentum_t,
bool centered,
bool multi_precision,
DenseTensor *param_out,
DenseTensor *moment_out,
DenseTensor *mean_square_out,
DenseTensor *mean_grad_out,
DenseTensor *master_param_outs);
};
template <typename T>
struct DenseRmspropGradFunctor {
inline explicit DenseRmspropGradFunctor(const T *grad) : grad_(grad) {}
......@@ -47,7 +69,8 @@ struct SparseRmspropGradFunctor {
HOSTDEVICE inline T operator()(int64_t idx) const {
auto row_idx =
phi::funcs::BinarySearch(rows_, row_count_, idx / row_numel_);
return row_idx >= 0 ? grad_[row_idx * row_numel_ + idx % row_numel_] : 0;
return row_idx >= 0 ? grad_[row_idx * row_numel_ + idx % row_numel_]
: static_cast<T>(0);
}
const T *grad_;
......@@ -56,19 +79,21 @@ struct SparseRmspropGradFunctor {
int64_t row_count_;
};
template <typename T, typename GradFunctor>
template <typename T, typename MT, typename GradFunctor>
struct UncenteredRmspropFunctor {
UncenteredRmspropFunctor(T *param,
T *ms,
T *mom,
const T *lr,
T rho,
T epsilon,
T momentum,
MT *ms,
MT *mom,
const MT *lr,
MT *master_p,
MT rho,
MT epsilon,
MT momentum,
const GradFunctor &grad_functor)
: param_(param),
ms_(ms),
mom_(mom),
master_p_(master_p),
lr_(lr),
rho_(rho),
epsilon_(epsilon),
......@@ -76,38 +101,46 @@ struct UncenteredRmspropFunctor {
grad_functor_(grad_functor) {}
HOSTDEVICE inline void operator()(int64_t idx) const {
T g = grad_functor_(idx);
T ms_out = rho_ * ms_[idx] + (1 - rho_) * g * g;
T mom_out = momentum_ * mom_[idx] + lr_[0] * g / sqrt(ms_out + epsilon_);
param_[idx] -= mom_out;
MT g = static_cast<MT>(grad_functor_(idx));
MT l_rho = static_cast<MT>(1) - rho_;
MT ms_out = rho_ * ms_[idx] + l_rho * g * g;
MT mom_out = momentum_ * mom_[idx] +
static_cast<MT>(lr_[0]) * g / sqrt(ms_out + epsilon_);
MT p = master_p_ ? master_p_[idx] : static_cast<MT>(param_[idx]);
MT p_m = p - mom_out;
param_[idx] = static_cast<T>(p_m);
ms_[idx] = ms_out;
mom_[idx] = mom_out;
if (master_p_) master_p_[idx] = p_m;
}
T *param_;
T *ms_;
T *mom_;
const T *lr_;
T rho_;
T epsilon_;
T momentum_;
MT *ms_;
MT *mom_;
MT *master_p_;
const MT *lr_;
MT rho_;
MT epsilon_;
MT momentum_;
GradFunctor grad_functor_;
};
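For reference, the uncentered functor above now carries out the whole element-wise update in the promoted type MT (FP32 when T is float16). In symbols, with $s$ the mean-square accumulator (`ms_`), $v$ the moment (`mom_`), $\rho$ the decay, $\mu$ the momentum, and $\eta$ the learning rate:

$$s_t = \rho\, s_{t-1} + (1-\rho)\, g_t^2, \qquad v_t = \mu\, v_{t-1} + \frac{\eta\, g_t}{\sqrt{s_t + \epsilon}}, \qquad \theta_t = \theta_{t-1} - v_t$$

$\theta_{t-1}$ is read from `master_p_` when a master weight is supplied (otherwise it is the cast of `param_`), and $\theta_t$ is written back to `param_` (cast to T) and, if present, kept in FP32 in `master_p_`.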
template <typename T, typename GradFunctor>
template <typename T, typename MT, typename GradFunctor>
struct CenteredRmspropFunctor {
CenteredRmspropFunctor(T *param,
T *ms,
T *mom,
T *mean_grad,
const T *lr,
T rho,
T epsilon,
T momentum,
MT *ms,
MT *mom,
MT *mean_grad,
const MT *lr,
MT *master_param,
MT rho,
MT epsilon,
MT momentum,
const GradFunctor &grad_functor)
: param_(param),
ms_(ms),
mom_(mom),
master_p_(master_param),
mean_grad_(mean_grad),
lr_(lr),
rho_(rho),
......@@ -116,25 +149,32 @@ struct CenteredRmspropFunctor {
grad_functor_(grad_functor) {}
HOSTDEVICE inline void operator()(int64_t idx) const {
T g = grad_functor_(idx);
T ms_out = rho_ * ms_[idx] + (1 - rho_) * g * g;
T mg_out = rho_ * mean_grad_[idx] + (1 - rho_) * g;
T mom_out = momentum_ * mom_[idx] +
lr_[0] * g / sqrt(ms_out - mg_out * mg_out + epsilon_);
param_[idx] -= mom_out;
MT g = static_cast<MT>(grad_functor_(idx));
MT l_rho = static_cast<MT>(1) - rho_;
MT ms_out = rho_ * ms_[idx] + l_rho * g * g;
MT mg_out = rho_ * mean_grad_[idx] + l_rho * g;
MT mom_out =
momentum_ * mom_[idx] +
static_cast<MT>(lr_[0]) * g / sqrt(ms_out - mg_out * mg_out + epsilon_);
MT p = master_p_ ? master_p_[idx] : static_cast<MT>(param_[idx]);
MT p_m = p - mom_out;
param_[idx] = static_cast<T>(p_m);
ms_[idx] = ms_out;
mom_[idx] = mom_out;
mean_grad_[idx] = mg_out;
if (master_p_) master_p_[idx] = p_m;
}
T *param_;
T *ms_;
T *mom_;
T *mean_grad_;
const T *lr_;
T rho_;
T epsilon_;
T momentum_;
MT *ms_;
MT *mom_;
MT *master_p_;
MT *mean_grad_;
const MT *lr_;
MT rho_;
MT epsilon_;
MT momentum_;
GradFunctor grad_functor_;
};
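The centered functor additionally tracks a moving average of the gradient $m$ (`mean_grad_`) and subtracts its square inside the root, i.e. it divides by an estimate of the gradient's standard deviation rather than its raw second moment:

$$m_t = \rho\, m_{t-1} + (1-\rho)\, g_t, \qquad v_t = \mu\, v_{t-1} + \frac{\eta\, g_t}{\sqrt{s_t - m_t^2 + \epsilon}}$$

with $s_t$ and the parameter/master-weight handling the same as in the uncentered case.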
......@@ -146,120 +186,35 @@ void RmspropDenseKernel(const Context &ctx,
const DenseTensor &moment,
const DenseTensor &learning_rate,
const paddle::optional<DenseTensor> &mean_grad_opt,
const paddle::optional<DenseTensor> &master_param,
float epsilon_t,
float decay_t,
float momentum_t,
bool centered,
bool multi_precision,
DenseTensor *param_out,
DenseTensor *moment_out,
DenseTensor *mean_square_out,
DenseTensor *mean_grad_out) {
auto epsilon = static_cast<T>(epsilon_t);
auto rho = static_cast<T>(decay_t);
auto momentum = static_cast<T>(momentum_t);
auto &p_tensor = param;
auto &ms_tensor = mean_square;
auto &lr_tensor = learning_rate;
auto &mom_tensor = moment;
PADDLE_ENFORCE_EQ(p_tensor.IsSharedBufferWith(*param_out),
true,
phi::errors::InvalidArgument(
"Param and ParamOut must be the same Tensor"));
PADDLE_ENFORCE_EQ(mom_tensor.IsSharedBufferWith(*moment_out),
true,
phi::errors::InvalidArgument(
"Moment and MomentOut must be the same Tensor"));
PADDLE_ENFORCE_EQ(
ms_tensor.IsSharedBufferWith(*mean_square_out),
true,
phi::errors::InvalidArgument(
"MeanSquare and MeanSquareOut must be the same Tensor"));
size_t limit = static_cast<size_t>(ms_tensor.numel());
auto &grad_tensor = grad;
if (paddle::platform::is_cpu_place(ctx.GetPlace())) {
auto &place = *ctx.eigen_device();
auto lr_value = lr_tensor.data<T>()[0];
auto p = EigenVector<T>::Flatten(p_tensor);
auto ms = EigenVector<T>::Flatten(ms_tensor);
auto g = EigenVector<T>::Flatten(grad_tensor);
auto mom = EigenVector<T>::Flatten(mom_tensor);
auto p_out = EigenVector<T>::Flatten(*param_out);
auto mom_out = EigenVector<T>::Flatten(*moment_out);
auto ms_out = EigenVector<T>::Flatten(*mean_square_out);
ms_out.device(place) = rho * ms + (1 - rho) * g * g;
if (centered) {
auto mg_tensor = mean_grad_opt.get_ptr();
auto mg = EigenVector<T>::Flatten(*mg_tensor);
if (mg_tensor) {
PADDLE_ENFORCE_EQ(
mg_tensor->Holder(),
mean_grad_out->Holder(),
phi::errors::InvalidArgument(
"MeanGrad and MeanGradOut must be the same Tensor"));
} else {
PADDLE_ENFORCE_EQ(
mg_tensor,
mean_grad_out,
phi::errors::InvalidArgument(
"MeanGrad and MeanGradOut must be the same Tensor"));
}
auto mg_out = EigenVector<T>::Flatten(*mean_grad_out);
mg_out.device(place) = rho * mg + (1 - rho) * g;
mom_out.device(place) =
momentum * mom +
lr_value * g / (ms_out - mg_out.square() + epsilon).sqrt();
} else {
mom_out.device(place) =
momentum * mom + lr_value * g / (ms_out + epsilon).sqrt();
}
p_out.device(place) = p - mom_out;
} else {
DenseRmspropGradFunctor<T> grad_func(grad_tensor.data<T>());
funcs::ForRange<Context> for_range(ctx, limit);
if (centered) {
auto mg_tensor = mean_grad_opt.get_ptr();
if (mg_tensor) {
PADDLE_ENFORCE_EQ(
mg_tensor->Holder(),
mean_grad_out->Holder(),
phi::errors::InvalidArgument(
"MeanGrad and MeanGradOut must be the same Tensor"));
} else {
PADDLE_ENFORCE_EQ(
mg_tensor,
mean_grad_out,
phi::errors::InvalidArgument(
"MeanGrad and MeanGradOut must be the same Tensor"));
}
for_range(CenteredRmspropFunctor<T, DenseRmspropGradFunctor<T>>(
ctx.template Alloc<T>(param_out),
ctx.template Alloc<T>(mean_square_out),
ctx.template Alloc<T>(moment_out),
ctx.template Alloc<T>(mean_grad_out),
lr_tensor.data<T>(),
rho,
epsilon,
momentum,
grad_func));
} else {
for_range(UncenteredRmspropFunctor<T, DenseRmspropGradFunctor<T>>(
ctx.template Alloc<T>(param_out),
ctx.template Alloc<T>(mean_square_out),
ctx.template Alloc<T>(moment_out),
lr_tensor.data<T>(),
rho,
epsilon,
momentum,
grad_func));
}
}
DenseTensor *mean_grad_out,
DenseTensor *master_param_outs) {
RmsFunctor<T, Context> functor(ctx,
param,
mean_square,
grad,
moment,
learning_rate,
mean_grad_opt,
master_param,
epsilon_t,
decay_t,
momentum_t,
centered,
multi_precision,
param_out,
moment_out,
mean_square_out,
mean_grad_out,
master_param_outs);
}
template <typename T, typename Context>
......@@ -270,17 +225,21 @@ void RmspropSparseKernel(const Context &ctx,
const DenseTensor &moment,
const DenseTensor &learning_rate,
const paddle::optional<DenseTensor> &mean_grad_opt,
const paddle::optional<DenseTensor> &master_param,
float epsilon_t,
float decay_t,
float momentum_t,
bool centered,
bool multi_precision,
DenseTensor *param_out,
DenseTensor *moment_out,
DenseTensor *mean_square_out,
DenseTensor *mean_grad_out) {
auto epsilon = static_cast<T>(epsilon_t);
auto rho = static_cast<T>(decay_t);
auto momentum = static_cast<T>(momentum_t);
DenseTensor *mean_grad_out,
DenseTensor *master_param_outs) {
using MPDType = typename phi::dtype::MPTypeTrait<T>::Type;
auto epsilon = static_cast<MPDType>(epsilon_t);
auto rho = static_cast<MPDType>(decay_t);
auto momentum = static_cast<MPDType>(momentum_t);
auto &p_tensor = param;
auto &ms_tensor = mean_square;
......@@ -318,6 +277,10 @@ void RmspropSparseKernel(const Context &ctx,
SparseRmspropGradFunctor<T> grad_func(
merged_tensor.data<T>(), rows, row_numel, row_count);
MPDType *master_out_data =
multi_precision ? ctx.template Alloc<MPDType>(master_param_outs)
: nullptr;
if (centered) {
auto mg_tensor = mean_grad_opt.get_ptr();
if (mg_tensor) {
......@@ -334,22 +297,24 @@ void RmspropSparseKernel(const Context &ctx,
"MeanGrad and MeanGradOut must be the same Tensor"));
}
for_range(CenteredRmspropFunctor<T, SparseRmspropGradFunctor<T>>(
for_range(CenteredRmspropFunctor<T, MPDType, SparseRmspropGradFunctor<T>>(
ctx.template Alloc<T>(param_out),
ctx.template Alloc<T>(mean_square_out),
ctx.template Alloc<T>(moment_out),
ctx.template Alloc<T>(mean_grad_out),
lr_tensor.data<T>(),
ctx.template Alloc<MPDType>(mean_square_out),
ctx.template Alloc<MPDType>(moment_out),
ctx.template Alloc<MPDType>(mean_grad_out),
lr_tensor.data<MPDType>(),
master_out_data,
rho,
epsilon,
momentum,
grad_func));
} else {
for_range(UncenteredRmspropFunctor<T, SparseRmspropGradFunctor<T>>(
for_range(UncenteredRmspropFunctor<T, MPDType, SparseRmspropGradFunctor<T>>(
ctx.template Alloc<T>(param_out),
ctx.template Alloc<T>(mean_square_out),
ctx.template Alloc<T>(moment_out),
lr_tensor.data<T>(),
ctx.template Alloc<MPDType>(mean_square_out),
ctx.template Alloc<MPDType>(moment_out),
lr_tensor.data<MPDType>(),
master_out_data,
rho,
epsilon,
momentum,
......
......@@ -27,14 +27,17 @@ void RmspropDenseKernel(const Context& dev_ctx,
const DenseTensor& moment,
const DenseTensor& learning_rate,
const paddle::optional<DenseTensor>& mean_grad,
const paddle::optional<DenseTensor>& master_param,
float epsilon,
float decay,
float momentum,
bool centered,
bool multi_precision,
DenseTensor* param_out,
DenseTensor* moment_out,
DenseTensor* mean_square_out,
DenseTensor* mean_grad_out);
DenseTensor* mean_grad_out,
DenseTensor* master_param_outs);
template <typename T, typename Context>
void RmspropSparseKernel(const Context& dev_ctx,
......@@ -44,13 +47,16 @@ void RmspropSparseKernel(const Context& dev_ctx,
const DenseTensor& moment,
const DenseTensor& learning_rate,
const paddle::optional<DenseTensor>& mean_grad,
const paddle::optional<DenseTensor>& master_param,
float epsilon,
float decay,
float momentum,
bool centered,
bool multi_precision,
DenseTensor* param_out,
DenseTensor* moment_out,
DenseTensor* mean_square_out,
DenseTensor* mean_grad_out);
DenseTensor* mean_grad_out,
DenseTensor* master_param_outs);
} // namespace phi
......@@ -29,14 +29,17 @@ void RmspropDenseKernel(const Context& dev_ctx,
const DenseTensor& moment,
const DenseTensor& learning_rate,
const paddle::optional<DenseTensor>& mean_grad,
const paddle::optional<DenseTensor>& master_param,
float epsilon,
float decay,
float momentum,
bool centered,
bool multi_precision,
DenseTensor* param_out,
DenseTensor* moment_out,
DenseTensor* mean_square_out,
DenseTensor* mean_grad_out) {
DenseTensor* mean_grad_out,
DenseTensor* master_param_outs) {
// copy learning_rate to cpu
PADDLE_ENFORCE_EQ(
learning_rate.dims().size(),
......
......@@ -20,15 +20,35 @@ KernelSignature RmspropOpArgumentMapping(const ArgumentMappingContext& ctx) {
if (ctx.IsDenseTensorInput("Grad")) {
return KernelSignature(
"rmsprop",
{"Param", "MeanSquare", "Grad", "Moment", "LearningRate", "MeanGrad"},
{"epsilon", "decay", "momentum", "centered"},
{"ParamOut", "MomentOut", "MeanSquareOut", "MeanGradOut"});
{"Param",
"MeanSquare",
"Grad",
"Moment",
"LearningRate",
"MeanGrad",
"MasterParam"},
{"epsilon", "decay", "momentum", "centered", "multi_precision"},
{"ParamOut",
"MomentOut",
"MeanSquareOut",
"MeanGradOut",
"MasterParamOut"});
} else if (ctx.IsSelectedRowsInput("Grad")) {
return KernelSignature(
"rmsprop_dense_param_sparse_grad",
{"Param", "MeanSquare", "Grad", "Moment", "LearningRate", "MeanGrad"},
{"epsilon", "decay", "momentum", "centered"},
{"ParamOut", "MomentOut", "MeanSquareOut", "MeanGradOut"});
{"Param",
"MeanSquare",
"Grad",
"Moment",
"LearningRate",
"MeanGrad",
"MasterParam"},
{"epsilon", "decay", "momentum", "centered", "multi_precision"},
{"ParamOut",
"MomentOut",
"MeanSquareOut",
"MeanGradOut",
"MasterParamOut"});
}
return KernelSignature("unregistered", {}, {}, {});
......
......@@ -3287,12 +3287,84 @@ class RMSPropOptimizer(Optimizer):
self._epsilon = epsilon
self._momentum = momentum
self._centered = centered
self._multi_precision = False
self._master_weights = {}
def _create_master_weight(self, param):
if param.name in self._master_weights:
var = self._master_weights[param.name]
else:
assert isinstance(self.helper, LayerHelper)
var_name = param.name + '_fp32_master'
var_name = unique_name.generate(var_name)
var = paddle.static.create_global_var(
name=var_name,
shape=param.shape,
value=0,
dtype='float32',
persistable=True,
)
block = self.helper.startup_program.global_block()
block.append_op(
type="cast",
inputs={"X": [param]},
outputs={"Out": [var]},
attrs={
"in_dtype": param.dtype,
"out_dtype": core.VarDesc.VarType.FP32,
},
)
self._master_weights[param.name] = var
return var
def _get_accumulator(self, name, param):
"""Utility function to fetch an accumulator for a parameter
Args:
name: name of the accumulator
param: parameter variable for which accumulator is to be fetched
Returns:
accumulator variable for the parameter
"""
if self._name is not None:
name = self._name + "_" + name
find_master = (
self._multi_precision and param.dtype == core.VarDesc.VarType.FP16
)
target_param = (
self._master_weights[param.name] if find_master else param
)
target_name = target_param.name
if (
name not in self._accumulators
or target_name not in self._accumulators[name]
):
raise Exception(
"Accumulator {} does not exist for parameter {}".format(
name, target_name
)
)
return self._accumulators[name][target_name]
def _create_accumulators(self, block, parameters):
if not isinstance(block, framework.Block):
raise TypeError("block is not instance of framework.Block.")
for p in parameters:
if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16:
master_p = self._create_master_weight(p)
self._add_accumulator(self._momentum_acc_str, master_p)
self._add_accumulator(self._mean_square_acc_str, master_p)
self._add_accumulator(self._mean_grad_acc_str, master_p)
continue
if (
p.dtype == core.VarDesc.VarType.FP16
and not self._multi_precision
):
warnings.warn(
"Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence."
"Consider using multi_precision=True option of the Lars optimizer."
)
self._add_accumulator(self._momentum_acc_str, p)
self._add_accumulator(self._mean_square_acc_str, p)
self._add_accumulator(self._mean_grad_acc_str, p)
......@@ -3310,6 +3382,15 @@ class RMSPropOptimizer(Optimizer):
mean_grad_acc = self._get_accumulator(
self._mean_grad_acc_str, param_and_grad[0]
)
find_master = (
self._multi_precision
and param_and_grad[0].dtype == core.VarDesc.VarType.FP16
)
master_weight = (
self._master_weights[param_and_grad[0].name]
if find_master
else None
)
if in_dygraph_mode():
_C_ops.rmsprop_(
param_and_grad[0],
......@@ -3318,34 +3399,45 @@ class RMSPropOptimizer(Optimizer):
momentum_acc,
self._create_param_lr(param_and_grad),
mean_grad_acc,
master_weight,
self._epsilon,
self._rho,
self._momentum,
self._centered,
find_master,
)
return None
else:
inputs = {
"Param": param_and_grad[0],
"Grad": param_and_grad[1],
"Moment": momentum_acc,
"MeanSquare": mean_square_acc,
"MeanGrad": mean_grad_acc,
"LearningRate": self._create_param_lr(param_and_grad),
}
outputs = {
"ParamOut": param_and_grad[0],
"MomentOut": momentum_acc,
"MeanSquareOut": mean_square_acc,
"MeanGradOut": mean_grad_acc,
}
if find_master:
inputs["MasterParam"] = master_weight
outputs["MasterParamOut"] = master_weight
rmsprop_op = block.append_op(
type=self.type,
inputs={
"Param": param_and_grad[0],
"Grad": param_and_grad[1],
"Moment": momentum_acc,
"MeanSquare": mean_square_acc,
"MeanGrad": mean_grad_acc,
"LearningRate": self._create_param_lr(param_and_grad),
},
outputs={
"ParamOut": param_and_grad[0],
"MomentOut": momentum_acc,
"MeanSquareOut": mean_square_acc,
"MeanGradOut": mean_grad_acc,
},
inputs=inputs,
outputs=outputs,
attrs={
"epsilon": self._epsilon,
"decay": self._rho,
"momentum": self._momentum,
"centered": self._centered,
"multi_precision": find_master,
},
stop_gradient=True,
)
......
......@@ -356,6 +356,280 @@ class TestRMSPropV2Group(TestRMSPropV2):
adam.clear_gradients()
class TestRMSOpMultiPrecision(unittest.TestCase):
def _test_rms_op_dygraph_place_amp(self, place, use_amp=False):
import paddle
paddle.disable_static()
paddle.seed(10)
paddle.set_device(place)
input = paddle.randn((5, 5))
model = paddle.nn.Linear(5, 5)
optimizer = paddle.optimizer.RMSProp(
learning_rate=0.01,
parameters=model.parameters(),
weight_decay=0.01,
)
optimizer._multi_precision = use_amp
for idx in range(2):
if place == 'gpu' and use_amp:
model = paddle.amp.decorate(models=model, level='O2')
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
if place == 'gpu' and use_amp:
with paddle.amp.auto_cast(level='O2'):
output = model(input)
loss = paddle.mean(output)
scaled = scaler.scale(loss)
scaled.backward()
scaler.step(optimizer)
optimizer.clear_grad()
else:
output = model(input)
loss = paddle.mean(output)
loss.backward()
optimizer.step()
optimizer.clear_grad()
paddle.enable_static()
def _get_places(self):
import paddle
places = ['cpu']
if paddle.is_compiled_with_cuda():
places.append('gpu')
return places
def test_main(self):
for place in self._get_places():
use_amp_list = [True, False]
for use_amp in use_amp_list:
self._test_rms_op_dygraph_place_amp(place, use_amp)
class TestRMSPropMultiPrecision2_0(unittest.TestCase):
def dygraph_rmsprop_mp(self, mp, use_amp):
paddle.disable_static()
paddle.seed(100)
paddle.set_device('gpu')
input = paddle.randn((2, 2))
model = paddle.nn.Linear(2, 2)
optimizer = paddle.optimizer.RMSProp(0.5, parameters=model.parameters())
optimizer._multi_precision = mp
if use_amp:
model = paddle.amp.decorate(models=model, level='O2')
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
for idx in range(5):
if use_amp:
with paddle.amp.auto_cast(level='O2'):
output = model(input)
loss = paddle.mean(output)
scaled = scaler.scale(loss)
scaled.backward()
scaler.minimize(optimizer, scaled)
optimizer.clear_grad()
else:
output = model(input)
loss = paddle.mean(output)
loss.backward()
optimizer.step()
optimizer.clear_grad()
return output, model.parameters()
def static_rmsprop_mp(self, mp, use_amp):
paddle.enable_static()
paddle.seed(100)
np.random.seed(100)
exe = paddle.static.Executor('gpu')
train_program = paddle.static.Program()
startup_program = paddle.static.Program()
optimizer = paddle.optimizer.RMSProp(0.1)
optimizer._multi_precision = mp
if use_amp:
optimizer = paddle.static.amp.decorate(
optimizer,
init_loss_scaling=128.0,
use_dynamic_loss_scaling=True,
use_pure_fp16=True,
use_fp16_guard=False,
)
with paddle.static.program_guard(train_program, startup_program):
if use_amp:
data = paddle.static.data(
shape=[2, 2], name='X', dtype='float16'
)
else:
data = paddle.static.data(
shape=[2, 2], name='X', dtype='float32'
)
hidden = paddle.static.nn.fc(x=data, size=10)
loss = paddle.mean(hidden)
optimizer.minimize(loss)
exe.run(startup_program)
if use_amp:
optimizer.amp_init(place='gpu', scope=paddle.static.global_scope())
x = np.random.random(size=(2, 2)).astype('float16')
else:
x = np.random.random(size=(2, 2)).astype('float32')
out = []
for idx in range(5):
(loss_data,) = exe.run(
train_program, feed={"X": x}, fetch_list=[loss.name]
)
out.append(loss_data)
return out
def test_main(self):
if not paddle.is_compiled_with_cuda():
return
"Test dygraph mode"
output1_dy, params1_dy = self.dygraph_rmsprop_mp(use_amp=True, mp=True)
output2_dy, params2_dy = self.dygraph_rmsprop_mp(
use_amp=False, mp=False
)
np.testing.assert_allclose(
output1_dy.astype('float32').numpy(),
output2_dy.astype('float32').numpy(),
rtol=1e-05,
atol=0.1,
)
for idx in range(len(params1_dy)):
np.testing.assert_allclose(
params1_dy[idx].astype('float32').numpy(),
params2_dy[idx].astype('float32').numpy(),
rtol=1e-05,
atol=0.1,
)
"Test static mode"
output1_st = self.static_rmsprop_mp(use_amp=True, mp=True)
output2_st = self.static_rmsprop_mp(use_amp=False, mp=False)
for idx in range(len(output1_st)):
np.testing.assert_allclose(
output1_st[idx].astype('float32'),
output2_st[idx].astype('float32'),
rtol=1e-05,
atol=0.1,
)
class TestRMSPropMultiPrecision1_0(unittest.TestCase):
def dygraph_rmsprop_mp(self, use_amp, mp):
paddle.disable_static()
paddle.seed(10)
paddle.set_device('gpu')
input = paddle.randn((2, 2))
model = paddle.nn.Linear(2, 2)
optimizer = paddle.fluid.optimizer.RMSProp(
learning_rate=0.001,
parameter_list=model.parameters(),
)
optimizer._multi_precision = mp
if use_amp:
model = paddle.amp.decorate(models=model, level='O2')
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
for idx in range(5):
if use_amp:
with paddle.amp.auto_cast(level='O2'):
output = model(input)
loss = paddle.mean(output)
scaled = scaler.scale(loss)
scaled.backward()
scaler.minimize(optimizer, scaled)
optimizer.clear_gradients()
else:
output = model(input)
loss = paddle.mean(output)
optimizer.minimize(loss)
optimizer.clear_gradients()
return output, model.parameters()
def static_rmsprop_mp(self, use_amp, mp):
paddle.enable_static()
paddle.seed(100)
np.random.seed(100)
exe = paddle.static.Executor('gpu')
train_program = paddle.static.Program()
startup_program = paddle.static.Program()
optimizer = paddle.fluid.optimizer.RMSProp(learning_rate=0.001)
optimizer._multi_precision = mp
if use_amp:
optimizer = paddle.static.amp.decorate(
optimizer,
init_loss_scaling=128.0,
use_dynamic_loss_scaling=True,
use_pure_fp16=True,
use_fp16_guard=False,
)
with paddle.static.program_guard(train_program, startup_program):
if use_amp:
data = paddle.static.data(
shape=[2, 2], name='X', dtype='float16'
)
else:
data = paddle.static.data(
shape=[2, 2], name='X', dtype='float32'
)
hidden = paddle.static.nn.fc(x=data, size=10)
loss = paddle.mean(hidden)
optimizer.minimize(loss)
exe.run(startup_program)
if use_amp:
optimizer.amp_init(place='gpu', scope=paddle.static.global_scope())
x = np.random.random(size=(2, 2)).astype('float16')
else:
x = np.random.random(size=(2, 2)).astype('float32')
out = []
for idx in range(5):
(loss_data,) = exe.run(
train_program, feed={"X": x}, fetch_list=[loss.name]
)
out.append(loss_data)
return out
def test_main(self):
if not paddle.is_compiled_with_cuda():
return
"Test dygraph mode"
output1_dy, params1_dy = self.dygraph_rmsprop_mp(use_amp=True, mp=True)
output2_dy, params2_dy = self.dygraph_rmsprop_mp(
use_amp=False, mp=False
)
np.testing.assert_allclose(
output1_dy.astype('float32').numpy(),
output2_dy.astype('float32').numpy(),
rtol=1e-05,
atol=0.1,
)
for idx in range(len(params1_dy)):
np.testing.assert_allclose(
params1_dy[idx].astype('float32').numpy(),
params2_dy[idx].astype('float32').numpy(),
rtol=1e-05,
atol=0.1,
)
"Test static mode"
output1_st = self.static_rmsprop_mp(use_amp=True, mp=True)
output2_st = self.static_rmsprop_mp(use_amp=False, mp=False)
for idx in range(len(output1_st)):
np.testing.assert_allclose(
output1_st[idx].astype('float32'),
output2_st[idx].astype('float32'),
rtol=1e-05,
atol=0.1,
)
if __name__ == "__main__":
paddle.enable_static()
unittest.main()
......@@ -12,10 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
import paddle
from paddle import _C_ops
from ..fluid import framework
from ..fluid import core, framework, unique_name
from ..fluid.framework import in_dygraph_mode
from ..fluid.layer_helper import LayerHelper
from .optimizer import Optimizer
__all__ = []
......@@ -184,6 +188,8 @@ class RMSProp(Optimizer):
self._epsilon = epsilon
self._momentum = momentum
self._centered = centered
self._multi_precision = False
self._master_weights = {}
self._default_dict = {
'rho': rho,
'epsilon': epsilon,
......@@ -191,6 +197,62 @@ class RMSProp(Optimizer):
'centered': centered,
}
def _create_master_weight(self, param):
if param.name in self._master_weights:
var = self._master_weights[param.name]
else:
assert isinstance(self.helper, LayerHelper)
var_name = param.name + "_fp32_master"
var_name = unique_name.generate(var_name)
var = paddle.static.create_global_var(
name=var_name,
shape=param.shape,
value=0,
dtype='float32',
persistable=True,
)
block = self.helper.startup_program.global_block()
block.append_op(
type="cast",
inputs={"X": [param]},
outputs={"Out": [var]},
attrs={
"in_dtype": param.dtype,
"out_dtype": core.VarDesc.VarType.FP32,
},
)
self._master_weights[param.name] = var
return var
def _get_accumulator(self, name, param):
"""Utility function to fetch an accumulator for a parameter
Args:
name: name of the accumulator
param: parameter variable for which accumulator is to be fetched
Returns:
accumulator variable for the parameter
"""
if self._name is not None:
name = self._name + "_" + name
find_master = (
self._multi_precision and param.dtype == core.VarDesc.VarType.FP16
)
target_param = (
self._master_weights[param.name] if find_master else param
)
target_name = target_param.name
if (
name not in self._accumulators
or target_name not in self._accumulators[name]
):
raise Exception(
"Accumulator {} does not exist for parameter {}".format(
name, target_name
)
)
return self._accumulators[name][target_name]
def _create_accumulators(self, block, parameters):
if not isinstance(block, framework.Block):
raise TypeError("block is not instance of framework.Block.")
......@@ -199,6 +261,20 @@ class RMSProp(Optimizer):
parameters = parameters.get('params')
for p in parameters:
if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16:
master_p = self._create_master_weight(p)
self._add_accumulator(self._momentum_acc_str, master_p)
self._add_accumulator(self._mean_square_acc_str, master_p)
self._add_accumulator(self._mean_grad_acc_str, master_p)
continue
if (
p.dtype == core.VarDesc.VarType.FP16
and not self._multi_precision
):
warnings.warn(
"Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence."
"Consider using multi_precision=True option of the Lars optimizer."
)
self._add_accumulator(self._momentum_acc_str, p)
self._add_accumulator(self._mean_square_acc_str, p)
self._add_accumulator(self._mean_grad_acc_str, p)
......@@ -219,6 +295,15 @@ class RMSProp(Optimizer):
mean_grad_acc = self._get_accumulator(
self._mean_grad_acc_str, param_and_grad[0]
)
find_master = (
self._multi_precision
and param_and_grad[0].dtype == core.VarDesc.VarType.FP16
)
master_weight = (
self._master_weights[param_and_grad[0].name]
if find_master
else None
)
if in_dygraph_mode():
_C_ops.rmsprop_(
......@@ -228,29 +313,38 @@ class RMSProp(Optimizer):
momentum_acc,
self._create_param_lr(param_and_grad),
mean_grad_acc,
master_weight,
self._epsilon,
self._rho,
self._momentum,
self._centered,
find_master,
)
return None
else:
inputs = {
"Param": param_and_grad[0],
"Grad": param_and_grad[1],
"Moment": momentum_acc,
"MeanSquare": mean_square_acc,
"MeanGrad": mean_grad_acc,
"LearningRate": self._create_param_lr(param_and_grad),
}
outputs = {
"ParamOut": param_and_grad[0],
"MomentOut": momentum_acc,
"MeanSquareOut": mean_square_acc,
"MeanGradOut": mean_grad_acc,
}
if find_master:
inputs["MasterParam"] = master_weight
outputs["MasterParamOut"] = master_weight
rmsprop_op = block.append_op(
type=self.type,
inputs={
"Param": param_and_grad[0],
"Grad": param_and_grad[1],
"Moment": momentum_acc,
"MeanSquare": mean_square_acc,
"MeanGrad": mean_grad_acc,
"LearningRate": self._create_param_lr(param_and_grad),
},
outputs={
"ParamOut": param_and_grad[0],
"MomentOut": momentum_acc,
"MeanSquareOut": mean_square_acc,
"MeanGradOut": mean_grad_acc,
},
inputs=inputs,
outputs=outputs,
attrs={
"epsilon": self._epsilon,
"decay": self._rho,
......
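Note, as the new tests show: this change wires multi-precision through the rmsprop op but does not add a public constructor argument. It is enabled by setting the optimizer's internal `_multi_precision` flag, which makes `_append_optimize_op` pass MasterParam/MasterParamOut and set `multi_precision=True`. A minimal sketch:

```python
import paddle

model = paddle.nn.Linear(2, 2)
opt = paddle.optimizer.RMSProp(learning_rate=0.01, parameters=model.parameters())
opt._multi_precision = True   # internal flag; no public argument is added in this change
```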