Unverified commit dace3fd5, authored by Leo Chen, committed by GitHub

[NPU] support global accumulator for adam (#32780)

* add use_global_beta_pow

* add use_global_beta_pow

* update npu kernel

* update python api

* refine code

* add ut for use_global_beta_pow

* fix npu kernel

* add ut for api

* add ut for exception

* add ut for save/load
Parent 48fc16f2
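In short, use_global_beta_pow=True makes Adam keep a single beta1_pow/beta2_pow pair for the whole model and advance it once per step, instead of one pair per parameter advanced inside every adam op. A minimal usage sketch against a build that contains this change (the tiny network is illustrative; only the use_global_beta_pow argument comes from this commit):

import paddle
import paddle.fluid as fluid

paddle.enable_static()
main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    x = paddle.static.data(name='x', shape=[32, 32], dtype='float32')
    label = paddle.static.data(name='label', shape=[32, 1], dtype='int64')
    prediction = fluid.layers.fc(input=x, size=2, act='softmax')
    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    loss = fluid.layers.reduce_mean(cost)

    # A single global beta1_pow/beta2_pow pair is created for the whole model
    # and updated once per step (see _finish_update in optimizer.py below).
    adam = fluid.optimizer.Adam(learning_rate=0.01, use_global_beta_pow=True)
    adam.minimize(loss)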
@@ -198,6 +198,13 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
              "(bool, default false) "
              "Whether to use multi-precision during weight updating.")
       .SetDefault(false);
+  // TODO(zhiqiu): We could set Beta1PowOut and Beta2PowOut
+  // as dispensable since they are not used when use_global_beta_pow is true.
+  AddAttr<bool>("use_global_beta_pow",
+                "(bool, default false) "
+                "Whether to use global beta_pow for whole model instead of "
+                "creating beta_pow for each parameter.")
+      .SetDefault(false);
   AddComment(R"DOC(
 Adam Optimizer.
@@ -246,4 +253,16 @@ REGISTER_OP_VERSION(adam)
            "EpsilonTensor",
            "If provided, Adam will use this as epsilon, "
            "this has a higher priority than attr(epsilon). "
-           "For better performance in npu kernel. "));
+           "For better performance in npu kernel. "))
+    .AddCheckpoint(
+        R"ROC(
+      Upgrade adam, add 1 attribute [use_global_beta_pow].
+    )ROC",
+        paddle::framework::compatible::OpVersionDesc().NewAttr(
+            "use_global_beta_pow",
+            "If true, Adam will use a global beta_pow for the whole model "
+            "instead of creating beta_pow for each parameter. "
+            "In that case, the outputs (Beta1PowOut, Beta2PowOut) will not be "
+            "used in the adam op, and beta_pow will be updated once after all "
+            "adam ops in the model.",
+            false));
@@ -154,6 +154,8 @@ class AdamOpCUDAKernel : public framework::OpKernel<T> {
     int64_t min_row_size_to_use_multithread =
         ctx.Attr<int64_t>("min_row_size_to_use_multithread");
     bool lazy_mode = ctx.Attr<bool>("lazy_mode");
+    bool use_global_beta_pow = ctx.Attr<bool>("use_global_beta_pow");
+    VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;
     auto* param = ctx.Input<LoDTensor>("Param");
     auto* grad_var = ctx.InputVar("Grad");

@@ -254,11 +256,13 @@ class AdamOpCUDAKernel : public framework::OpKernel<T> {
            lr->data<MPDType>(), grad->data<T>(), param->data<T>(),
            param_out->mutable_data<T>(ctx.GetPlace()), master_in_data,
            master_out_data, param->numel());
-        // Cpu update
-        beta1_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
-            beta1 * beta1_pow->data<MPDType>()[0];
-        beta2_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
-            beta2 * beta2_pow->data<MPDType>()[0];
+        if (!use_global_beta_pow) {
+          // Cpu update
+          beta1_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
+              beta1 * beta1_pow->data<MPDType>()[0];
+          beta2_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
+              beta2 * beta2_pow->data<MPDType>()[0];
+        }
      } else {
        AdamKernelMEM<T, MPDType><<<blocks, threads, 0, dev_ctx.stream()>>>(
            beta1, beta2, epsilon, beta1_pow->data<MPDType>(),

@@ -269,14 +273,15 @@ class AdamOpCUDAKernel : public framework::OpKernel<T> {
            lr->data<MPDType>(), grad->data<T>(), param->data<T>(),
            param_out->mutable_data<T>(ctx.GetPlace()), master_in_data,
            master_out_data, param->numel());
-        // Update with gpu
-        UpdateBetaPow<MPDType><<<1, 32, 0, dev_ctx.stream()>>>(
-            beta1, beta2, beta1_pow->data<MPDType>(),
-            beta2_pow->data<MPDType>(),
-            beta1_pow_out->mutable_data<MPDType>(ctx.GetPlace()),
-            beta2_pow_out->mutable_data<MPDType>(ctx.GetPlace()));
+        if (!use_global_beta_pow) {
+          // Update with gpu
+          UpdateBetaPow<MPDType><<<1, 32, 0, dev_ctx.stream()>>>(
+              beta1, beta2, beta1_pow->data<MPDType>(),
+              beta2_pow->data<MPDType>(),
+              beta1_pow_out->mutable_data<MPDType>(ctx.GetPlace()),
+              beta2_pow_out->mutable_data<MPDType>(ctx.GetPlace()));
+        }
      }
    } else if (grad_var->IsType<framework::SelectedRows>()) {
      auto* grad = ctx.Input<framework::SelectedRows>("Grad");
      if (grad->rows().size() == 0) {

@@ -328,11 +333,13 @@ class AdamOpCUDAKernel : public framework::OpKernel<T> {
            param_out->mutable_data<T>(ctx.GetPlace()), master_in_data,
            master_out_data, rows, row_numel, grad_merge.rows().size(),
            lazy_mode, ndim);
-        // Update with cpu
-        beta1_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
-            beta1 * beta1_pow->data<MPDType>()[0];
-        beta2_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
-            beta2 * beta2_pow->data<MPDType>()[0];
+        if (!use_global_beta_pow) {
+          // Update with cpu
+          beta1_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
+              beta1 * beta1_pow->data<MPDType>()[0];
+          beta2_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
+              beta2 * beta2_pow->data<MPDType>()[0];
+        }
      } else {
        SparseAdamFunctor<T, GPUAdam, MPDType> functor(
            beta1, beta2, epsilon, beta1_pow->data<MPDType>(),

@@ -351,12 +358,14 @@ class AdamOpCUDAKernel : public framework::OpKernel<T> {
                ctx.device_context()),
            param->numel());
        for_range(functor);
-        // update beta1 and beta2
-        UpdateBetaPow<MPDType><<<1, 32, 0, dev_ctx.stream()>>>(
-            beta1, beta2, beta1_pow->data<MPDType>(),
-            beta2_pow->data<MPDType>(),
-            beta1_pow_out->mutable_data<MPDType>(ctx.GetPlace()),
-            beta2_pow_out->mutable_data<MPDType>(ctx.GetPlace()));
+        if (!use_global_beta_pow) {
+          // update beta1 and beta2
+          UpdateBetaPow<MPDType><<<1, 32, 0, dev_ctx.stream()>>>(
+              beta1, beta2, beta1_pow->data<MPDType>(),
+              beta2_pow->data<MPDType>(),
+              beta1_pow_out->mutable_data<MPDType>(ctx.GetPlace()),
+              beta2_pow_out->mutable_data<MPDType>(ctx.GetPlace()));
+        }
      }
    } else {
      PADDLE_THROW(platform::errors::InvalidArgument(
......
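All of the guards above have the same effect: with use_global_beta_pow enabled, the per-parameter kernels stop advancing beta1_pow/beta2_pow and leave that to a single once-per-step update. A toy NumPy sketch (not Paddle code) of why the two bookkeeping schemes produce the same values:

import numpy as np

beta1, beta2, steps, n_params = 0.78, 0.836, 10, 4

# Per-parameter scheme: every parameter owns a (beta1^t, beta2^t) pair that
# its adam kernel multiplies by (beta1, beta2) after each update.
per_param = np.ones((n_params, 2))
for _ in range(steps):
    for i in range(n_params):
        per_param[i] *= (beta1, beta2)

# Global scheme: the kernels skip that multiplication; one shared pair is
# scaled once per step after all adam ops have run.
global_pow = np.ones(2)
for _ in range(steps):
    global_pow *= (beta1, beta2)

assert np.allclose(per_param, global_pow)  # identical values, stored once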
@@ -406,6 +406,8 @@ class AdamOpKernel : public framework::OpKernel<T> {
     int64_t min_row_size_to_use_multithread =
         ctx.Attr<int64_t>("min_row_size_to_use_multithread");
     bool lazy_mode = ctx.Attr<bool>("lazy_mode");
+    bool use_global_beta_pow = ctx.Attr<bool>("use_global_beta_pow");
+    VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;
     auto* param = ctx.Input<LoDTensor>("Param");
     auto* grad_var = ctx.InputVar("Grad");

@@ -475,11 +477,12 @@ class AdamOpKernel : public framework::OpKernel<T> {
          lr->data<T>(), grad->data<T>(), param->data<T>(),
          param_out->mutable_data<T>(ctx.GetPlace()));
      functor(param->numel());
-      beta1_pow_out->mutable_data<T>(ctx.GetPlace())[0] =
-          beta1 * beta1_pow->data<T>()[0];
-      beta2_pow_out->mutable_data<T>(ctx.GetPlace())[0] =
-          beta2 * beta2_pow->data<T>()[0];
+      if (!use_global_beta_pow) {
+        beta1_pow_out->mutable_data<T>(ctx.GetPlace())[0] =
+            beta1 * beta1_pow->data<T>()[0];
+        beta2_pow_out->mutable_data<T>(ctx.GetPlace())[0] =
+            beta2 * beta2_pow->data<T>()[0];
+      }
    } else if (grad_var->IsType<framework::SelectedRows>()) {
      auto* grad = ctx.Input<framework::SelectedRows>("Grad");
      if (grad->rows().size() == 0) {

@@ -523,10 +526,12 @@ class AdamOpKernel : public framework::OpKernel<T> {
            param_out->mutable_data<T>(ctx.GetPlace()), rows, row_numel,
            grad_merge.rows().size(), lazy_mode);
        // update beta1 and beta2
-        beta1_pow_out->mutable_data<T>(ctx.GetPlace())[0] =
-            beta1 * beta1_pow->data<T>()[0];
-        beta2_pow_out->mutable_data<T>(ctx.GetPlace())[0] =
-            beta2 * beta2_pow->data<T>()[0];
+        if (!use_global_beta_pow) {
+          beta1_pow_out->mutable_data<T>(ctx.GetPlace())[0] =
+              beta1 * beta1_pow->data<T>()[0];
+          beta2_pow_out->mutable_data<T>(ctx.GetPlace())[0] =
+              beta2 * beta2_pow->data<T>()[0];
+        }
        if (lazy_mode) {
          VLOG(3) << "run cpu lazy mode";
          size_t row_count = grad_merge.rows().size();
......
@@ -49,8 +49,8 @@ class AdamNPUKernel : public framework::OpKernel<T> {
     auto* mom2 = ctx.Input<LoDTensor>("Moment2");
     auto* lr = ctx.Input<LoDTensor>("LearningRate");
-    auto* beta1_pow = ctx.Input<LoDTensor>("Beta1Pow");
-    auto* beta2_pow = ctx.Input<LoDTensor>("Beta2Pow");
+    auto* beta1_pow = ctx.Input<Tensor>("Beta1Pow");
+    auto* beta2_pow = ctx.Input<Tensor>("Beta2Pow");
     auto* param_out = ctx.Output<LoDTensor>("ParamOut");
     auto* mom1_out = ctx.Output<LoDTensor>("Moment1Out");

@@ -58,25 +58,28 @@ class AdamNPUKernel : public framework::OpKernel<T> {
     auto* beta1_pow_out = ctx.Output<LoDTensor>("Beta1PowOut");
     auto* beta2_pow_out = ctx.Output<LoDTensor>("Beta2PowOut");
+    bool use_global_beta_pow = ctx.Attr<bool>("use_global_beta_pow");
+    VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;

     param_out->mutable_data<T>(ctx.GetPlace());
     mom1_out->mutable_data<T>(ctx.GetPlace());
     mom2_out->mutable_data<T>(ctx.GetPlace());

-    // NOTE(zhiqiu): beta1_pow and beta2_pow may on CPU and not transform place.
+    // NOTE(zhiqiu): beta1_pow and beta2_pow may on CPU and not transform
+    // place.
+    LoDTensor beta1_pow_tmp;
+    LoDTensor beta2_pow_tmp;
     if (beta1_pow->place() == platform::CPUPlace()) {
       T beta1 = *beta1_pow->data<T>();
-      // `mutable_data` operation needs to be done after getting data
-      beta1_pow_out->mutable_data<T>(ctx.GetPlace());
-      FillNpuTensorWithConstant<T>(beta1_pow_out, beta1);
-    } else {
-      beta1_pow_out->mutable_data<T>(ctx.GetPlace());
+      beta1_pow_tmp.mutable_data<T>({1}, ctx.GetPlace());
+      FillNpuTensorWithConstant<T>(&beta1_pow_tmp, beta1);
+      beta1_pow = &beta1_pow_tmp;
     }
     if (beta2_pow->place() == platform::CPUPlace()) {
       T beta2 = *beta2_pow->data<T>();
-      beta2_pow_out->mutable_data<T>(ctx.GetPlace());
-      FillNpuTensorWithConstant<T>(beta2_pow_out, beta2);
-    } else {
-      beta2_pow_out->mutable_data<T>(ctx.GetPlace());
+      beta2_pow_tmp.mutable_data<T>({1}, ctx.GetPlace());
+      FillNpuTensorWithConstant<T>(&beta2_pow_tmp, beta2);
+      beta2_pow = &beta2_pow_tmp;
     }

     const Tensor* beta1_tensor = nullptr;

@@ -173,12 +176,16 @@ class AdamNPUKernel : public framework::OpKernel<T> {
          *mom2, ctx.GetPlace(),
          ctx.template device_context<platform::DeviceContext>(), mom2_out);
    }
-    auto runner_m1 =
-        NpuOpRunner("Mul", {*beta1_pow, *beta1_tensor}, {*beta1_pow_out}, {});
-    runner_m1.Run(stream);
-    auto runner_m2 =
-        NpuOpRunner("Mul", {*beta2_pow, *beta2_tensor}, {*beta2_pow_out}, {});
-    runner_m2.Run(stream);
+    if (!use_global_beta_pow) {
+      beta1_pow_out->mutable_data<T>(ctx.GetPlace());
+      beta2_pow_out->mutable_data<T>(ctx.GetPlace());
+      auto runner_m1 =
+          NpuOpRunner("Mul", {*beta1_pow, *beta1_tensor}, {*beta1_pow_out}, {});
+      runner_m1.Run(stream);
+      auto runner_m2 =
+          NpuOpRunner("Mul", {*beta2_pow, *beta2_tensor}, {*beta2_pow_out}, {});
+      runner_m2.Run(stream);
+    }
  }
};
......
@@ -73,6 +73,9 @@ class AdamOpXPUKernel : public framework::OpKernel<T> {
                           "value is:%d.",
                           beta2_pow_out->numel()));
+    bool use_global_beta_pow = ctx.Attr<bool>("use_global_beta_pow");
+    VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;
+
     T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
     if (ctx.HasInput("Beta1Tensor")) {
       auto* beta1_tensor = ctx.Input<framework::Tensor>("Beta1Tensor");

@@ -111,45 +114,48 @@ class AdamOpXPUKernel : public framework::OpKernel<T> {
          mom1_out.template mutable_data<T>(ctx.GetPlace()),
          mom2_out.template mutable_data<T>(ctx.GetPlace()),
          param_out.template mutable_data<T>(ctx.GetPlace()), param.numel());
-      // update in cpu and then copy to xpu
-      if (beta1_pow.place() == platform::CPUPlace() &&
-          beta2_pow.place() == platform::CPUPlace()) {
-        const T* beta1_pow_p = beta1_pow.template data<T>();
-        beta1_pow_out->mutable_data<T>(platform::CPUPlace())[0] =
-            beta1 * beta1_pow_p[0];
-        const T* beta2_pow_p = beta2_pow.template data<T>();
-        beta2_pow_out->mutable_data<T>(platform::CPUPlace())[0] =
-            beta2 * beta2_pow_p[0];
-      } else {
-        T cpu_beta1_pow_out_data;
-        T cpu_beta2_pow_out_data;
-        memory::Copy(platform::CPUPlace(), &cpu_beta1_pow_out_data,
-                     BOOST_GET_CONST(platform::XPUPlace, beta1_pow.place()),
-                     beta1_pow_ptr, sizeof(T));
-
-        cpu_beta1_pow_out_data = cpu_beta1_pow_out_data * beta1;
-        memory::Copy(platform::CPUPlace(), &cpu_beta2_pow_out_data,
-                     BOOST_GET_CONST(platform::XPUPlace, beta2_pow.place()),
-                     beta2_pow_ptr, sizeof(T));
-
-        cpu_beta2_pow_out_data = cpu_beta2_pow_out_data * beta2;
-        T* beta1_pow_out_p = beta1_pow_out->mutable_data<T>(ctx.GetPlace());
-        T* beta2_pow_out_p = beta2_pow_out->mutable_data<T>(ctx.GetPlace());
-        memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()),
-                     beta1_pow_out_p, platform::CPUPlace(),
-                     &cpu_beta1_pow_out_data, sizeof(T));
-        memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()),
-                     beta2_pow_out_p, platform::CPUPlace(),
-                     &cpu_beta2_pow_out_data, sizeof(T));
-      }
-      PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true,
-                        platform::errors::External(
-                            "XPU API return wrong value[%d], please check "
-                            "where Baidu Kunlun Card is properly installed.",
-                            r));
+      if (!use_global_beta_pow) {
+        // update in cpu and then copy to xpu
+        if (beta1_pow.place() == platform::CPUPlace() &&
+            beta2_pow.place() == platform::CPUPlace()) {
+          const T* beta1_pow_p = beta1_pow.template data<T>();
+          beta1_pow_out->mutable_data<T>(platform::CPUPlace())[0] =
+              beta1 * beta1_pow_p[0];
+          const T* beta2_pow_p = beta2_pow.template data<T>();
+          beta2_pow_out->mutable_data<T>(platform::CPUPlace())[0] =
+              beta2 * beta2_pow_p[0];
+        } else {
+          T cpu_beta1_pow_out_data;
+          T cpu_beta2_pow_out_data;
+
+          memory::Copy(platform::CPUPlace(), &cpu_beta1_pow_out_data,
+                       BOOST_GET_CONST(platform::XPUPlace, beta1_pow.place()),
+                       beta1_pow_ptr, sizeof(T));
+
+          cpu_beta1_pow_out_data = cpu_beta1_pow_out_data * beta1;
+          memory::Copy(platform::CPUPlace(), &cpu_beta2_pow_out_data,
+                       BOOST_GET_CONST(platform::XPUPlace, beta2_pow.place()),
+                       beta2_pow_ptr, sizeof(T));
+
+          cpu_beta2_pow_out_data = cpu_beta2_pow_out_data * beta2;
+
+          T* beta1_pow_out_p = beta1_pow_out->mutable_data<T>(ctx.GetPlace());
+          T* beta2_pow_out_p = beta2_pow_out->mutable_data<T>(ctx.GetPlace());
+          memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()),
+                       beta1_pow_out_p, platform::CPUPlace(),
+                       &cpu_beta1_pow_out_data, sizeof(T));
+          memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()),
+                       beta2_pow_out_p, platform::CPUPlace(),
+                       &cpu_beta2_pow_out_data, sizeof(T));
+        }
+        PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true,
+                          platform::errors::External(
+                              "XPU API return wrong value[%d], please check "
+                              "where Baidu Kunlun Card is properly installed.",
+                              r));
+      }
    } else {
      PADDLE_ENFORCE_EQ(1, 2, platform::errors::InvalidArgument(
                                  "Variable type not supported by adam_op"));
......
@@ -125,6 +125,8 @@ class Optimizer(object):
         # to train. These variables are called accumulators.
         # {accum_name : { paramter_name : accumulator_for_parameter, ...}, ...}
         self._accumulators = defaultdict(lambda: dict())
+        # global_accumulator dict, {accum_name : acc_variable, ...}
+        self._global_accumulators = {}
         self.helper = None
         self._opti_name_list = []
         self._accumulators_holder = {}

@@ -157,6 +159,8 @@ class Optimizer(object):
         for k, v in self._accumulators.items():
             for para_name, var_tmp in v.items():
                 state_dict[var_tmp.name] = var_tmp
+        for k, v in self._global_accumulators.items():
+            state_dict[v.name] = v
         # global step if use lr decay
         if isinstance(self._learning_rate, LRScheduler):
             state_dict["LR_Scheduler"] = self._learning_rate.state_dict()
@@ -236,36 +240,42 @@ class Optimizer(object):
                     "Type not supprt, value in state dict must be [VarBase, Variable, numpy], the type is ",
                     type(global_step))

+        def _load_state_para(state_dict, param):
+            var = param.value()
+            tensor = var.get_tensor()
+            model_np = np.array(tensor)
+            load_para = state_dict[param.name]
+            if isinstance(load_para, Variable):
+                load_para_np = load_para.numpy()
+            elif isinstance(load_para, core.VarBase):
+                load_para_np = load_para.numpy()
+            elif isinstance(load_para, np.ndarray):
+                load_para_np = load_para
+            else:
+                raise RuntimeError("State dict type {} not supprt".format(
+                    str(type(load_para))))
+            assert model_np.shape == load_para_np.shape, \
+                "Parameter shape not match, Dygraph Parameter [ {} ] need tensor with shape {} but load tensor with shape {}".format(
+                    param.name, model_np.shape, load_para_np.shape)
+            assert model_np.dtype == load_para_np.dtype, \
+                "Parameter dtype not match, Dygraph Parameter [ {} ] need tensor with dtype {} but load tensor with dtype {}".format(
+                    param.name, model_np.dtype, load_para_np.dtype)
+            tensor.set(load_para_np, framework._current_expected_place())
+
         self._accumulators_holder = state_dict
         for k, v in self._accumulators.items():
             for para_name, var_tmp in v.items():
                 assert var_tmp.name in state_dict, \
                         "optimizer variable {} not found".format( var_tmp.name )
-                var = var_tmp.value()
-                tensor = var.get_tensor()
-                model_np = np.array(tensor)
-                load_para = state_dict[var_tmp.name]
-                if isinstance(load_para, Variable):
-                    load_para_np = load_para.numpy()
-                elif isinstance(load_para, core.VarBase):
-                    load_para_np = load_para.numpy()
-                elif isinstance(load_para, np.ndarray):
-                    load_para_np = load_para
-                else:
-                    raise RuntimeError("State dict type {} not supprt".format(
-                        str(type(load_para))))
-                assert model_np.shape == load_para_np.shape, \
-                    "Parameter shape not match, Dygraph Parameter [ {} ] need tensor with shape {} but load tensor with shape {}".format(
-                        item.name, model_np.shape, load_para_np.shape)
-                assert model_np.dtype == load_para_np.dtype, \
-                    "Parameter dtype not match, Dygraph Parameter [ {} ] need tensor with dtype {} but load tensor with dtype {}".format(
-                        item.name, model_np.dtype, load_para_np.dtype)
-                tensor.set(load_para_np, framework._current_expected_place())
+                _load_state_para(state_dict, var_tmp)
+
+        for k, v in self._global_accumulators.items():
+            assert v.name in state_dict, \
+                    "optimizer variable {} not found".format( v.name )
+            _load_state_para(state_dict, v)

         # [aliases] Compatible with old method names
         set_dict = set_state_dict
@@ -589,6 +599,60 @@ class Optimizer(object):
             self._accumulators[name][param.name] = var
         return var

+    def _add_global_accumulator(self,
+                                name,
+                                dtype=None,
+                                fill_value=0.0,
+                                shape=None,
+                                type=None,
+                                device=None):
+        """Utility function to add a global accumulator for all parameters in the model
+
+        Args:
+            block: the block in which the loss variable is present
+            name: name of the accumulator
+            dtype: data type of the accumulator variable
+            fill_value: value to initialize the accumulator variable
+            shape: the shape of the accumulator
+            type: the variable type of the accumulator
+            device: the target place of the accumulator
+        """
+        if self._name is not None:
+            name = self._name + "_" + name
+        if (name in self._global_accumulators):
+            if framework.in_dygraph_mode():
+                return self._global_accumulators[name]
+            raise Exception("Global accumulator {} already exists".format(name))
+        if shape == None:
+            shape = [1]  # most case, global accumulator is of shape [1]
+        assert isinstance(self.helper, LayerHelper)
+
+        var_name = name
+        var_name = unique_name.generate(var_name)
+        self._opti_name_list.append(var_name)
+
+        var = self.helper.create_global_variable(
+            name=var_name,
+            persistable=True,
+            dtype=dtype if dtype else self._dtype,
+            type=type,
+            shape=shape,
+            belong_to_optimizer=True)
+        if device is None:
+            device = 'cpu'
+        with device_guard(device):
+            self.helper.set_variable_initializer(
+                var, initializer=Constant(value=float(fill_value)))
+
+        if framework.in_dygraph_mode():
+            if len(self._accumulators_holder) > 0:
+                assert var_name in self._accumulators_holder, \
+                        "Optimizer set error, {} should in state dict".format(var_name)
+                var.set_value(self._accumulators_holder[var_name])
+
+        self._global_accumulators[name] = var
+        return var
+
     def _get_accumulator(self, name, param):
         """Utility function to fetch an accumulator for a parameter
@@ -597,7 +661,7 @@ class Optimizer(object):
             param: parameter variable for which accumulator is to be fetched

         Returns:
-            accumulator variable for the parameter
+            accumulator variable
         """
         if self._name is not None:
             name = self._name + "_" + name

@@ -607,6 +671,21 @@ class Optimizer(object):
                 format(name, param.name))
         return self._accumulators[name][param.name]

+    def _get_global_accumulator(self, name):
+        """Utility function to fetch a global accumulator
+
+        Args:
+            name: name of the accumulator
+
+        Returns:
+            accumulator variable
+        """
+        if self._name is not None:
+            name = self._name + "_" + name
+        if (name not in self._global_accumulators):
+            raise Exception("Global accumulator {} does not exist".format(name))
+        return self._global_accumulators[name]
+
     def _update_param_device_map(self, parameters_and_grads, target_block):
         for param_and_grad in parameters_and_grads:
             if param_and_grad[0].trainable is True:
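Because a global accumulator is stored in self._global_accumulators and exported by state_dict() (see the hunks above), it appears exactly once in the optimizer state instead of once per parameter. A small dygraph sketch modeled on the test_adam_save_load case added further below; the key filter on 'beta' is only illustrative:

import paddle
import paddle.fluid as fluid

paddle.disable_static()
linear = paddle.nn.Linear(10, 10)
adam = fluid.optimizer.Adam(
    learning_rate=0.01,
    parameter_list=linear.parameters(),
    use_global_beta_pow=True)
out = linear(paddle.rand([4, 10]))
adam.minimize(out)

# The two global beta_pow accumulators are exported once in the state dict
# and restored through the same _load_state_para path as per-parameter ones.
state = adam.state_dict()
print([name for name in state if 'beta' in name])
adam.set_state_dict(state)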
@@ -1915,6 +1994,8 @@ class AdamOptimizer(Optimizer):
             gradient in current mini-batch, so it will be much more faster. But this mode has
             different semantics with the original Adam algorithm and may lead to different result.
             The default value is False.
+        use_global_beta_pow (bool, optional): Whether to use global beta_pow. If true, Adam will use a single
+            global beta_pow for the whole model instead of creating a beta_pow pair for each parameter.
+            The default value is False.

     Examples:
         .. code-block:: python
@@ -2024,7 +2105,8 @@ class AdamOptimizer(Optimizer):
                  regularization=None,
                  grad_clip=None,
                  name=None,
-                 lazy_mode=False):
+                 lazy_mode=False,
+                 use_global_beta_pow=False):
         assert learning_rate is not None
         assert beta1 is not None
         assert beta2 is not None

@@ -2040,6 +2122,7 @@ class AdamOptimizer(Optimizer):
         self._beta2 = beta2
         self._epsilon = epsilon
         self._lazy_mode = lazy_mode
+        self._use_global_beta_pow = use_global_beta_pow

     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)
@@ -2048,16 +2131,30 @@ class AdamOptimizer(Optimizer):
         for p in parameters:
             self._add_accumulator(self._moment1_acc_str, p)
             self._add_accumulator(self._moment2_acc_str, p)
-            self._add_accumulator(
-                name=self._beta1_pow_acc_str,
-                param=p,
-                fill_value=0.9 if isinstance(self._beta1, Variable) \
-                    else self._beta1,
-                shape=[1],
-                type=core.VarDesc.VarType.LOD_TENSOR, device='cpu')
-            self._add_accumulator(
-                name=self._beta2_pow_acc_str,
-                param=p,
-                fill_value=0.999 if isinstance(self._beta2, Variable) \
-                    else self._beta2,
-                shape=[1],
-                type=core.VarDesc.VarType.LOD_TENSOR, device='cpu')
+            if not self._use_global_beta_pow:
+                self._add_accumulator(
+                    name=self._beta1_pow_acc_str,
+                    param=p,
+                    fill_value=0.9 if isinstance(self._beta1, Variable) \
+                        else self._beta1,
+                    shape=[1],
+                    type=core.VarDesc.VarType.LOD_TENSOR, device='cpu')
+                self._add_accumulator(
+                    name=self._beta2_pow_acc_str,
+                    param=p,
+                    fill_value=0.999 if isinstance(self._beta2, Variable) \
+                        else self._beta2,
+                    shape=[1],
+                    type=core.VarDesc.VarType.LOD_TENSOR, device='cpu')
+        if self._use_global_beta_pow:
+            self._add_global_accumulator(
+                name=self._beta1_pow_acc_str,
+                fill_value=0.9 if isinstance(self._beta1, Variable) \
+                    else self._beta1,
+                shape=[1],
+                type=core.VarDesc.VarType.LOD_TENSOR, device='cpu')
+            self._add_global_accumulator(
+                name=self._beta2_pow_acc_str,
+                fill_value=0.999 if isinstance(self._beta2, Variable) \
+                    else self._beta2,
+                shape=[1],
+                type=core.VarDesc.VarType.LOD_TENSOR, device='cpu')
@@ -2070,10 +2167,16 @@ class AdamOptimizer(Optimizer):
                                         param_and_grad[0])
         moment2 = self._get_accumulator(self._moment2_acc_str,
                                         param_and_grad[0])
-        beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
-                                              param_and_grad[0])
-        beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
-                                              param_and_grad[0])
+        if self._use_global_beta_pow:
+            beta1_pow_acc = self._get_global_accumulator(
+                self._beta1_pow_acc_str)
+            beta2_pow_acc = self._get_global_accumulator(
+                self._beta2_pow_acc_str)
+        else:
+            beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
+                                                  param_and_grad[0])
+            beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
+                                                  param_and_grad[0])
         lr = self._create_param_lr(param_and_grad)
         # create the adam optimize op
@@ -2087,7 +2190,8 @@ class AdamOptimizer(Optimizer):
                 beta1_pow_acc, beta2_pow_acc, param_and_grad[0], moment1,
                 moment2, beta1_pow_acc, beta2_pow_acc, 'epsilon', self._epsilon,
                 'lazy_mode', self._lazy_mode, 'min_row_size_to_use_multithread',
-                1000, 'beta1', _beta1, 'beta2', _beta2)
+                1000, 'beta1', _beta1, 'beta2', _beta2, 'use_global_beta_pow',
+                self._use_global_beta_pow)

             return None

@@ -2109,7 +2213,8 @@ class AdamOptimizer(Optimizer):
         }
         attrs = {
             "lazy_mode": self._lazy_mode,
-            "min_row_size_to_use_multithread": 1000
+            "min_row_size_to_use_multithread": 1000,
+            'use_global_beta_pow': self._use_global_beta_pow
         }

         if isinstance(self._beta1, Variable):
@@ -2134,6 +2239,43 @@ class AdamOptimizer(Optimizer):
         return adam_op

+    def _finish_update(self, block, parameters_and_grads):
+        r"""Update beta1_pow and beta2_pow accumulator
+        """
+        assert isinstance(block, framework.Block)
+        if self._use_global_beta_pow:
+            beta1_pow_acc = self._get_global_accumulator(
+                self._beta1_pow_acc_str)
+            beta2_pow_acc = self._get_global_accumulator(
+                self._beta2_pow_acc_str)
+
+            with block.program._optimized_guard([]):
+                inputs = {"X": beta1_pow_acc}
+                attrs = {}
+                if isinstance(self._beta1, Variable):
+                    inputs['ScaleTensor'] = self._beta1
+                else:
+                    attrs['scale'] = self._beta1
+                block.append_op(
+                    type="scale",
+                    inputs=inputs,
+                    outputs={"Out": beta1_pow_acc},
+                    attrs=attrs,
+                    stop_gradient=True)
+
+                inputs = {"X": beta2_pow_acc}
+                attrs = {}
+                if isinstance(self._beta2, Variable):
+                    inputs['ScaleTensor'] = self._beta2
+                else:
+                    attrs['scale'] = self._beta2
+                block.append_op(
+                    type="scale",
+                    inputs=inputs,
+                    outputs={"Out": beta2_pow_acc},
+                    attrs=attrs,
+                    stop_gradient=True)
+
 class AdamaxOptimizer(Optimizer):
     r"""
......
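_finish_update is called by the base Optimizer once, after the adam ops for all parameters have been appended, so a program built with use_global_beta_pow=True ends with two scale ops that advance the shared accumulators. A quick way to see this (a sketch assuming a build with this commit; the tiny network is illustrative):

import paddle
import paddle.fluid as fluid

paddle.enable_static()
main_prog = paddle.static.Program()
with paddle.static.program_guard(main_prog, paddle.static.Program()):
    x = paddle.static.data(name='x', shape=[None, 8], dtype='float32')
    loss = fluid.layers.reduce_mean(fluid.layers.fc(input=x, size=1))
    fluid.optimizer.Adam(use_global_beta_pow=True).minimize(loss)

op_types = [op.type for op in main_prog.global_block().ops]
# One 'adam' op per trainable parameter (fc weight and bias), then the two
# 'scale' ops appended by _finish_update for the global beta_pow update.
print(op_types.count('adam'), op_types.count('scale'))  # expected: 2 2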
@@ -134,6 +134,65 @@ class TestAdamWithEpsilonTensor(OpTest):
         self.check_output_with_place(self.place, atol=1e-5, check_dygraph=False)


+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestAdamOpWithGlobalBetaPow(OpTest):
+    def setUp(self):
+        self.set_npu()
+        self.place = paddle.NPUPlace(0)
+        self.op_type = "adam"
+        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        # The second moment is positive
+        moment2 = np.random.random((102, 105)).astype("float32")
+
+        learning_rate = 0.004
+        beta1 = 0.78
+        beta2 = 0.836
+        epsilon = 1e-4
+        beta1_pow = beta1**10
+        beta2_pow = beta2**10
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Moment1': moment1,
+            'Moment2': moment2,
+            'LearningRate': np.array([learning_rate]).astype("float32"),
+            'Beta1Pow': np.array([beta1_pow]).astype("float32"),
+            'Beta2Pow': np.array([beta2_pow]).astype("float32"),
+            'Beta1Tensor': np.array([beta1]).astype("float32"),
+            'Beta2Tensor': np.array([beta2]).astype("float32"),
+            'EpsilonTensor': np.array([epsilon]).astype("float32"),
+        }
+
+        attributes = {'epsilon': epsilon}
+
+        param_out, moment1_out, \
+            moment2_out = adam_step(self.inputs, attributes)
+
+        self.attrs = {'use_global_beta_pow': True}
+
+        # use_global_beta_pow=True, Beta1PowOut and Beta2PowOut are empty.
+        self.outputs = {
+            'Moment1Out': moment1_out,
+            'Moment2Out': moment2_out,
+            'ParamOut': param_out,
+            'Beta1PowOut': np.array([]),
+            'Beta2PowOut': np.array([])
+        }
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+
+    def init_dtype(self):
+        self.dtype = np.float32
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, atol=1e-5, check_dygraph=False)
+
+
 @unittest.skipIf(not paddle.is_compiled_with_npu(),
                  "core is not compiled with NPU")
 class TestNet(unittest.TestCase):
......
@@ -1087,6 +1087,7 @@ class OpTest(unittest.TestCase):
             dygraph_outs = self._calc_dygraph_output(
                 place, no_check_set=no_check_set)
         outs, fetch_list = self._calc_output(place, no_check_set=no_check_set)
+
         for out_name, out_dup in Operator.get_op_outputs(self.op_type):
             if out_name not in self.outputs:
                 continue

@@ -1177,6 +1178,11 @@ class OpTest(unittest.TestCase):
                 actual_t = convert_uint16_to_float(actual_t)
                 atol = 0.03

+            # NOTE(zhiqiu): np.allclose([], [1.]) returns True
+            # see details: https://stackoverflow.com/questions/38331703/why-does-numpys-broadcasting-sometimes-allow-comparing-arrays-of-different-leng
+            if expect_t.size == 0:
+                self.assertTrue(actual_t.size == 0)
+
             self.assertTrue(
                 np.allclose(
                     actual_t, expect_t, atol=atol, equal_nan=equal_nan),
......
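The size check added to OpTest above guards against a silent false positive: comparing an empty expected array with a non-empty actual one still satisfies np.allclose. A two-line demonstration:

import numpy as np

# Broadcasting an empty array against [1.] yields an empty comparison, and
# all() of an empty array is True, so np.allclose "passes" vacuously.
print(np.allclose([], [1.]))    # True
print(np.array([]).size == 0)   # the explicit check OpTest now makes instead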
@@ -404,7 +404,7 @@ class TestAdamOpBetaVariable(OpTest):
 class TestAdamOpBetaEpsilonVariable(OpTest):
     def setUp(self):
-        '''Test Adam Op with beta as Variable
+        '''Test Adam Op with beta/epsilon as Variable
         '''
         self.op_type = "adam"
         param = np.random.uniform(-1, 1, (102, 105)).astype("float32")

@@ -450,6 +450,57 @@ class TestAdamOpBetaEpsilonVariable(OpTest):
         self.check_output()


+class TestAdamOpWithGlobalBetaPow(OpTest):
+    def setUp(self):
+        '''Test Adam Op with global_beta_pow
+        '''
+        self.op_type = "adam"
+        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        # The second moment is positive
+        moment2 = np.random.random((102, 105)).astype("float32")
+        beta1 = 0.85
+        beta2 = 0.95
+
+        learning_rate = 0.001
+        epsilon = 1e-8
+        beta1_pow = beta1**10
+        beta2_pow = beta2**10
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Moment1': moment1,
+            'Moment2': moment2,
+            'LearningRate': np.array([learning_rate]).astype("float32"),
+            'Beta1Pow': np.array([beta1_pow]).astype("float32"),
+            'Beta2Pow': np.array([beta2_pow]).astype("float32"),
+            "Beta1Tensor": np.array([beta1]).astype("float32"),
+            "Beta2Tensor": np.array([beta2]).astype("float32"),
+            "EpsilonTensor": np.array([epsilon]).astype("float32"),
+        }
+
+        attributes = {'epsilon': epsilon}
+
+        param_out, moment1_out, \
+            moment2_out = adam_step(self.inputs, attributes)
+
+        self.attrs = {'use_global_beta_pow': True}
+
+        # use_global_beta_pow=True, Beta1PowOut and Beta2PowOut are empty.
+        self.outputs = {
+            'Moment1Out': moment1_out,
+            'Moment2Out': moment2_out,
+            'ParamOut': param_out,
+            'Beta1PowOut': np.array([]),
+            'Beta2PowOut': np.array([])
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
 class TestAdamOpV2(unittest.TestCase):
     def test_adam_op(self):
         place = fluid.CPUPlace()
@@ -493,6 +544,7 @@ class TestAdamOpV2(unittest.TestCase):
             out.backward()
             adam.step()
             adam.clear_gradients()
+        paddle.enable_static()

     def test_adam_op_with_state_dict(self):

@@ -523,6 +575,7 @@ class TestAdamOpV2(unittest.TestCase):
         params = adam.get_opti_var_name_list()
         assert (params is not None)
+        paddle.enable_static()

     def test_adam_with_grad_clip(self):
         paddle.disable_static()

@@ -536,6 +589,7 @@ class TestAdamOpV2(unittest.TestCase):
         out.backward()
         adam.step()
         adam.clear_gradients()
+        paddle.enable_static()

     def test_adam_op_with_set_lr(self):
         paddle.disable_static()

@@ -550,6 +604,7 @@ class TestAdamOpV2(unittest.TestCase):
             lr_var = paddle.fluid.layers.create_global_var(
                 shape=[1], value=lr, dtype='float32')
             adam.set_lr(lr_var)
+        paddle.enable_static()

     def test_adam_op_invalid_input(self):
         paddle.disable_static()

@@ -563,6 +618,7 @@ class TestAdamOpV2(unittest.TestCase):
         with self.assertRaises(ValueError):
             adam = paddle.optimizer.Adam(
                 0.1, epsilon=-1, parameters=linear.parameters())
+        paddle.enable_static()

     def test_adam_op_with_sparse_input_and_weight_decay(self):

@@ -577,10 +633,15 @@ class TestAdamOpV2(unittest.TestCase):
         out = emb(x)
         out.backward()
         adam.step()
+        paddle.enable_static()


 class TestNetWithEpsilonTensor(unittest.TestCase):
-    def _test(self, place, use_tensor=True, use_fluid_api=True):
+    def _test(self,
+              place,
+              use_tensor=True,
+              use_fluid_api=True,
+              use_global_beta_pow=False):
         paddle.enable_static()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
@@ -633,7 +694,8 @@ class TestNetWithEpsilonTensor(unittest.TestCase):
                         learning_rate=0.01,
                         beta1=beta1,
                         beta2=beta2,
-                        epsilon=epsilon)
+                        epsilon=epsilon,
+                        use_global_beta_pow=use_global_beta_pow)
                 else:
                     adam = paddle.optimizer.Adam(
                         learning_rate=0.01,

@@ -646,7 +708,9 @@ class TestNetWithEpsilonTensor(unittest.TestCase):
                         learning_rate=0.01,
                         beta1=beta1_init,
                         beta2=beta2_init,
-                        epsilon=epsilon_init)
+                        epsilon=epsilon_init,
+                        use_global_beta_pow=use_global_beta_pow,
+                        name='a')
                 else:
                     adam = fluid.optimizer.Adam(
                         learning_rate=0.01,

@@ -680,9 +744,11 @@ class TestNetWithEpsilonTensor(unittest.TestCase):
         for use_tensor in [True, False]:
             for use_fluid_api in [True, False]:
-                pred, loss = self._test(place, use_tensor, use_fluid_api)
-                preds.append(pred)
-                losses.append(loss)
+                for use_global_beta_pow in [True, False]:
+                    pred, loss = self._test(place, use_tensor, use_fluid_api,
+                                            use_global_beta_pow)
+                    preds.append(pred)
+                    losses.append(loss)
         for pred in preds:
             self.assertTrue(np.allclose(pred, preds[0]))
         for loss in losses:
@@ -694,6 +760,55 @@ class TestNetWithEpsilonTensor(unittest.TestCase):
         if core.is_compiled_with_cuda():
             self._test_with_place(paddle.CUDAPlace(0))

+    def test_adam_exception(self):
+        paddle.enable_static()
+        a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
+        b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
+        label = paddle.static.data(name="label", shape=[32, 1], dtype='int64')
+
+        sum = paddle.add(a, b)
+        z = paddle.pow(sum, 2.0)
+
+        fc_1 = fluid.layers.fc(input=z, size=128)
+        prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
+        cost = fluid.layers.cross_entropy(input=prediction, label=label)
+        loss = fluid.layers.reduce_mean(cost)
+        adam = fluid.optimizer.Adam(use_global_beta_pow=True)
+        adam.minimize(loss)
+        self.assertRaises(Exception, adam._get_global_accumulator, 'tmp')
+        adam._add_global_accumulator(
+            'tmp', type=core.VarDesc.VarType.LOD_TENSOR)
+        adam._get_global_accumulator('tmp')
+        self.assertRaises(
+            Exception,
+            adam._add_global_accumulator,
+            adam._beta1_pow_acc_str,
+            type=core.VarDesc.VarType.LOD_TENSOR)
+        paddle.disable_static()
+
+    def test_adam_save_load(self):
+        paddle.disable_static()
+        a = paddle.rand([4, 10])
+        linear = paddle.nn.Linear(10, 10)
+        b = linear(a)
+
+        state_dict = linear.state_dict()
+        fluid.save_dygraph(state_dict, "paddle_dy")
+
+        scheduler = paddle.optimizer.lr.NoamDecay(
+            d_model=0.01, warmup_steps=100, verbose=True)
+        adam = paddle.fluid.optimizer.Adam(
+            learning_rate=scheduler,
+            parameter_list=linear.parameters(),
+            use_global_beta_pow=True)
+        adam.minimize(b)
+        state_dict = adam.state_dict()
+        fluid.save_dygraph(state_dict, "paddle_dy")
+
+        para_state_dict, opti_state_dict = fluid.load_dygraph("paddle_dy")
+        adam.set_state_dict(opti_state_dict)
+
+        paddle.enable_static()
+

 if __name__ == "__main__":
     unittest.main()