Unverified commit dace3fd5, authored by Leo Chen, committed by GitHub

[NPU] support global accumulator for adam (#32780)

* add use_global_beta_pow

* add use_global_beta_pow

* update npu kernel

* update python api

* refine code

* add ut for use_global_beta_pow

* fix npu kernel

* add ut for api

* add ut for exception

* add ut for save/load
Parent 48fc16f2
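The change adds a `use_global_beta_pow` attribute to the adam op and a matching `use_global_beta_pow` argument to `fluid.optimizer.Adam`. A minimal static-graph usage sketch, modeled on the unit tests added below (the toy network is only illustrative):

import paddle
import paddle.fluid as fluid

paddle.enable_static()
x = paddle.static.data(name='x', shape=[32, 10], dtype='float32')
fc = fluid.layers.fc(input=x, size=1)
loss = fluid.layers.reduce_mean(fc)

# With use_global_beta_pow=True, a single Beta1Pow/Beta2Pow pair is kept for
# the whole model and advanced once per step, instead of one pair per parameter.
adam = fluid.optimizer.Adam(learning_rate=0.01, use_global_beta_pow=True)
adam.minimize(loss)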
......@@ -198,6 +198,13 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
"(bool, default false) "
"Whether to use multi-precision during weight updating.")
.SetDefault(false);
// TODO(zhiqiu): We could set Beta1PowOut and Beta2PowOut
// as dispensable since they are not used when use_global_beta_pow is true.
AddAttr<bool>("use_global_beta_pow",
"(bool, default false) "
"Whether to use global beta_pow for whole model instead of "
"creating beta_pow for each parameter.")
.SetDefault(false);
AddComment(R"DOC(
Adam Optimizer.
......@@ -246,4 +253,16 @@ REGISTER_OP_VERSION(adam)
"EpsilonTensor",
"If provided, Adam will use this as epsilon, "
"this has a higher priority than attr(epsilon). "
"For better performance in npu kernel. "));
"For better performance in npu kernel. "))
.AddCheckpoint(
R"ROC(
Upgrade adam, add 1 attribute [use_global_beta_pow].
)ROC",
paddle::framework::compatible::OpVersionDesc().NewAttr(
"use_global_beta_pow",
"If true, Adam will use global beta_pow for whole model "
"instead of creating beta_pow for each parameter."
"In that case, the outputs(Beta1PowOut, Beta2PowOut) will not be "
"used in adam op, "
"and beta_pow will be updated after all adam op in the model.",
false));
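For reference, the standard Adam update these kernels implement is (background notation from the original paper, not part of the diff):

m_t = \beta_1 m_{t-1} + (1 - \beta_1) g_t
v_t = \beta_2 v_{t-1} + (1 - \beta_2) g_t^2
\hat{m}_t = m_t / (1 - \beta_1^t), \qquad \hat{v}_t = v_t / (1 - \beta_2^t)
\theta_t = \theta_{t-1} - \eta \, \hat{m}_t / (\sqrt{\hat{v}_t} + \epsilon)

The bias-correction factors depend only on \beta_1^t and \beta_2^t, i.e. on the global step t and not on the parameter, which is why a single global Beta1Pow/Beta2Pow pair, updated once after all adam ops have run, can replace the per-parameter copies.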
......@@ -154,6 +154,8 @@ class AdamOpCUDAKernel : public framework::OpKernel<T> {
int64_t min_row_size_to_use_multithread =
ctx.Attr<int64_t>("min_row_size_to_use_multithread");
bool lazy_mode = ctx.Attr<bool>("lazy_mode");
bool use_global_beta_pow = ctx.Attr<bool>("use_global_beta_pow");
VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;
auto* param = ctx.Input<LoDTensor>("Param");
auto* grad_var = ctx.InputVar("Grad");
......@@ -254,11 +256,13 @@ class AdamOpCUDAKernel : public framework::OpKernel<T> {
lr->data<MPDType>(), grad->data<T>(), param->data<T>(),
param_out->mutable_data<T>(ctx.GetPlace()), master_in_data,
master_out_data, param->numel());
// Cpu update
beta1_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
beta1 * beta1_pow->data<MPDType>()[0];
beta2_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
beta2 * beta2_pow->data<MPDType>()[0];
if (!use_global_beta_pow) {
// Cpu update
beta1_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
beta1 * beta1_pow->data<MPDType>()[0];
beta2_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
beta2 * beta2_pow->data<MPDType>()[0];
}
} else {
AdamKernelMEM<T, MPDType><<<blocks, threads, 0, dev_ctx.stream()>>>(
beta1, beta2, epsilon, beta1_pow->data<MPDType>(),
......@@ -269,14 +273,15 @@ class AdamOpCUDAKernel : public framework::OpKernel<T> {
lr->data<MPDType>(), grad->data<T>(), param->data<T>(),
param_out->mutable_data<T>(ctx.GetPlace()), master_in_data,
master_out_data, param->numel());
// Update with gpu
UpdateBetaPow<MPDType><<<1, 32, 0, dev_ctx.stream()>>>(
beta1, beta2, beta1_pow->data<MPDType>(),
beta2_pow->data<MPDType>(),
beta1_pow_out->mutable_data<MPDType>(ctx.GetPlace()),
beta2_pow_out->mutable_data<MPDType>(ctx.GetPlace()));
if (!use_global_beta_pow) {
// Update with gpu
UpdateBetaPow<MPDType><<<1, 32, 0, dev_ctx.stream()>>>(
beta1, beta2, beta1_pow->data<MPDType>(),
beta2_pow->data<MPDType>(),
beta1_pow_out->mutable_data<MPDType>(ctx.GetPlace()),
beta2_pow_out->mutable_data<MPDType>(ctx.GetPlace()));
}
}
} else if (grad_var->IsType<framework::SelectedRows>()) {
auto* grad = ctx.Input<framework::SelectedRows>("Grad");
if (grad->rows().size() == 0) {
......@@ -328,11 +333,13 @@ class AdamOpCUDAKernel : public framework::OpKernel<T> {
param_out->mutable_data<T>(ctx.GetPlace()), master_in_data,
master_out_data, rows, row_numel, grad_merge.rows().size(),
lazy_mode, ndim);
// Update with cpu
beta1_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
beta1 * beta1_pow->data<MPDType>()[0];
beta2_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
beta2 * beta2_pow->data<MPDType>()[0];
if (!use_global_beta_pow) {
// Update with cpu
beta1_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
beta1 * beta1_pow->data<MPDType>()[0];
beta2_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
beta2 * beta2_pow->data<MPDType>()[0];
}
} else {
SparseAdamFunctor<T, GPUAdam, MPDType> functor(
beta1, beta2, epsilon, beta1_pow->data<MPDType>(),
......@@ -351,12 +358,14 @@ class AdamOpCUDAKernel : public framework::OpKernel<T> {
ctx.device_context()),
param->numel());
for_range(functor);
// update beta1 and beta2
UpdateBetaPow<MPDType><<<1, 32, 0, dev_ctx.stream()>>>(
beta1, beta2, beta1_pow->data<MPDType>(),
beta2_pow->data<MPDType>(),
beta1_pow_out->mutable_data<MPDType>(ctx.GetPlace()),
beta2_pow_out->mutable_data<MPDType>(ctx.GetPlace()));
if (!use_global_beta_pow) {
// update beta1 and beta2
UpdateBetaPow<MPDType><<<1, 32, 0, dev_ctx.stream()>>>(
beta1, beta2, beta1_pow->data<MPDType>(),
beta2_pow->data<MPDType>(),
beta1_pow_out->mutable_data<MPDType>(ctx.GetPlace()),
beta2_pow_out->mutable_data<MPDType>(ctx.GetPlace()));
}
}
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
......
......@@ -406,6 +406,8 @@ class AdamOpKernel : public framework::OpKernel<T> {
int64_t min_row_size_to_use_multithread =
ctx.Attr<int64_t>("min_row_size_to_use_multithread");
bool lazy_mode = ctx.Attr<bool>("lazy_mode");
bool use_global_beta_pow = ctx.Attr<bool>("use_global_beta_pow");
VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;
auto* param = ctx.Input<LoDTensor>("Param");
auto* grad_var = ctx.InputVar("Grad");
......@@ -475,11 +477,12 @@ class AdamOpKernel : public framework::OpKernel<T> {
lr->data<T>(), grad->data<T>(), param->data<T>(),
param_out->mutable_data<T>(ctx.GetPlace()));
functor(param->numel());
beta1_pow_out->mutable_data<T>(ctx.GetPlace())[0] =
beta1 * beta1_pow->data<T>()[0];
beta2_pow_out->mutable_data<T>(ctx.GetPlace())[0] =
beta2 * beta2_pow->data<T>()[0];
if (!use_global_beta_pow) {
beta1_pow_out->mutable_data<T>(ctx.GetPlace())[0] =
beta1 * beta1_pow->data<T>()[0];
beta2_pow_out->mutable_data<T>(ctx.GetPlace())[0] =
beta2 * beta2_pow->data<T>()[0];
}
} else if (grad_var->IsType<framework::SelectedRows>()) {
auto* grad = ctx.Input<framework::SelectedRows>("Grad");
if (grad->rows().size() == 0) {
......@@ -523,10 +526,12 @@ class AdamOpKernel : public framework::OpKernel<T> {
param_out->mutable_data<T>(ctx.GetPlace()), rows, row_numel,
grad_merge.rows().size(), lazy_mode);
// update beta1 and beta2
beta1_pow_out->mutable_data<T>(ctx.GetPlace())[0] =
beta1 * beta1_pow->data<T>()[0];
beta2_pow_out->mutable_data<T>(ctx.GetPlace())[0] =
beta2 * beta2_pow->data<T>()[0];
if (!use_global_beta_pow) {
beta1_pow_out->mutable_data<T>(ctx.GetPlace())[0] =
beta1 * beta1_pow->data<T>()[0];
beta2_pow_out->mutable_data<T>(ctx.GetPlace())[0] =
beta2 * beta2_pow->data<T>()[0];
}
if (lazy_mode) {
VLOG(3) << "run cpu lazy mode";
size_t row_count = grad_merge.rows().size();
......
......@@ -49,8 +49,8 @@ class AdamNPUKernel : public framework::OpKernel<T> {
auto* mom2 = ctx.Input<LoDTensor>("Moment2");
auto* lr = ctx.Input<LoDTensor>("LearningRate");
auto* beta1_pow = ctx.Input<LoDTensor>("Beta1Pow");
auto* beta2_pow = ctx.Input<LoDTensor>("Beta2Pow");
auto* beta1_pow = ctx.Input<Tensor>("Beta1Pow");
auto* beta2_pow = ctx.Input<Tensor>("Beta2Pow");
auto* param_out = ctx.Output<LoDTensor>("ParamOut");
auto* mom1_out = ctx.Output<LoDTensor>("Moment1Out");
......@@ -58,25 +58,28 @@ class AdamNPUKernel : public framework::OpKernel<T> {
auto* beta1_pow_out = ctx.Output<LoDTensor>("Beta1PowOut");
auto* beta2_pow_out = ctx.Output<LoDTensor>("Beta2PowOut");
bool use_global_beta_pow = ctx.Attr<bool>("use_global_beta_pow");
VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;
param_out->mutable_data<T>(ctx.GetPlace());
mom1_out->mutable_data<T>(ctx.GetPlace());
mom2_out->mutable_data<T>(ctx.GetPlace());
// NOTE(zhiqiu): beta1_pow and beta2_pow may on CPU and not transform place.
// NOTE(zhiqiu): beta1_pow and beta2_pow may on CPU and not transform
// place.
LoDTensor beta1_pow_tmp;
LoDTensor beta2_pow_tmp;
if (beta1_pow->place() == platform::CPUPlace()) {
T beta1 = *beta1_pow->data<T>();
// `mutable_data` operation needs to be done after getting data
beta1_pow_out->mutable_data<T>(ctx.GetPlace());
FillNpuTensorWithConstant<T>(beta1_pow_out, beta1);
} else {
beta1_pow_out->mutable_data<T>(ctx.GetPlace());
beta1_pow_tmp.mutable_data<T>({1}, ctx.GetPlace());
FillNpuTensorWithConstant<T>(&beta1_pow_tmp, beta1);
beta1_pow = &beta1_pow_tmp;
}
if (beta2_pow->place() == platform::CPUPlace()) {
T beta2 = *beta2_pow->data<T>();
beta2_pow_out->mutable_data<T>(ctx.GetPlace());
FillNpuTensorWithConstant<T>(beta2_pow_out, beta2);
} else {
beta2_pow_out->mutable_data<T>(ctx.GetPlace());
beta2_pow_tmp.mutable_data<T>({1}, ctx.GetPlace());
FillNpuTensorWithConstant<T>(&beta2_pow_tmp, beta2);
beta2_pow = &beta2_pow_tmp;
}
const Tensor* beta1_tensor = nullptr;
......@@ -173,12 +176,16 @@ class AdamNPUKernel : public framework::OpKernel<T> {
*mom2, ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(), mom2_out);
}
auto runner_m1 =
NpuOpRunner("Mul", {*beta1_pow, *beta1_tensor}, {*beta1_pow_out}, {});
runner_m1.Run(stream);
auto runner_m2 =
NpuOpRunner("Mul", {*beta2_pow, *beta2_tensor}, {*beta2_pow_out}, {});
runner_m2.Run(stream);
if (!use_global_beta_pow) {
beta1_pow_out->mutable_data<T>(ctx.GetPlace());
beta2_pow_out->mutable_data<T>(ctx.GetPlace());
auto runner_m1 =
NpuOpRunner("Mul", {*beta1_pow, *beta1_tensor}, {*beta1_pow_out}, {});
runner_m1.Run(stream);
auto runner_m2 =
NpuOpRunner("Mul", {*beta2_pow, *beta2_tensor}, {*beta2_pow_out}, {});
runner_m2.Run(stream);
}
}
};
......
......@@ -73,6 +73,9 @@ class AdamOpXPUKernel : public framework::OpKernel<T> {
"value is:%d.",
beta2_pow_out->numel()));
bool use_global_beta_pow = ctx.Attr<bool>("use_global_beta_pow");
VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;
T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
if (ctx.HasInput("Beta1Tensor")) {
auto* beta1_tensor = ctx.Input<framework::Tensor>("Beta1Tensor");
......@@ -111,45 +114,48 @@ class AdamOpXPUKernel : public framework::OpKernel<T> {
mom1_out.template mutable_data<T>(ctx.GetPlace()),
mom2_out.template mutable_data<T>(ctx.GetPlace()),
param_out.template mutable_data<T>(ctx.GetPlace()), param.numel());
// update in cpu and then copy to xpu
if (beta1_pow.place() == platform::CPUPlace() &&
beta2_pow.place() == platform::CPUPlace()) {
const T* beta1_pow_p = beta1_pow.template data<T>();
beta1_pow_out->mutable_data<T>(platform::CPUPlace())[0] =
beta1 * beta1_pow_p[0];
const T* beta2_pow_p = beta2_pow.template data<T>();
beta2_pow_out->mutable_data<T>(platform::CPUPlace())[0] =
beta2 * beta2_pow_p[0];
} else {
T cpu_beta1_pow_out_data;
T cpu_beta2_pow_out_data;
memory::Copy(platform::CPUPlace(), &cpu_beta1_pow_out_data,
BOOST_GET_CONST(platform::XPUPlace, beta1_pow.place()),
beta1_pow_ptr, sizeof(T));
cpu_beta1_pow_out_data = cpu_beta1_pow_out_data * beta1;
memory::Copy(platform::CPUPlace(), &cpu_beta2_pow_out_data,
BOOST_GET_CONST(platform::XPUPlace, beta2_pow.place()),
beta2_pow_ptr, sizeof(T));
cpu_beta2_pow_out_data = cpu_beta2_pow_out_data * beta2;
T* beta1_pow_out_p = beta1_pow_out->mutable_data<T>(ctx.GetPlace());
T* beta2_pow_out_p = beta2_pow_out->mutable_data<T>(ctx.GetPlace());
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()),
beta1_pow_out_p, platform::CPUPlace(),
&cpu_beta1_pow_out_data, sizeof(T));
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()),
beta2_pow_out_p, platform::CPUPlace(),
&cpu_beta2_pow_out_data, sizeof(T));
if (!use_global_beta_pow) {
// update in cpu and then copy to xpu
if (beta1_pow.place() == platform::CPUPlace() &&
beta2_pow.place() == platform::CPUPlace()) {
const T* beta1_pow_p = beta1_pow.template data<T>();
beta1_pow_out->mutable_data<T>(platform::CPUPlace())[0] =
beta1 * beta1_pow_p[0];
const T* beta2_pow_p = beta2_pow.template data<T>();
beta2_pow_out->mutable_data<T>(platform::CPUPlace())[0] =
beta2 * beta2_pow_p[0];
} else {
T cpu_beta1_pow_out_data;
T cpu_beta2_pow_out_data;
memory::Copy(platform::CPUPlace(), &cpu_beta1_pow_out_data,
BOOST_GET_CONST(platform::XPUPlace, beta1_pow.place()),
beta1_pow_ptr, sizeof(T));
cpu_beta1_pow_out_data = cpu_beta1_pow_out_data * beta1;
memory::Copy(platform::CPUPlace(), &cpu_beta2_pow_out_data,
BOOST_GET_CONST(platform::XPUPlace, beta2_pow.place()),
beta2_pow_ptr, sizeof(T));
cpu_beta2_pow_out_data = cpu_beta2_pow_out_data * beta2;
T* beta1_pow_out_p = beta1_pow_out->mutable_data<T>(ctx.GetPlace());
T* beta2_pow_out_p = beta2_pow_out->mutable_data<T>(ctx.GetPlace());
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()),
beta1_pow_out_p, platform::CPUPlace(),
&cpu_beta1_pow_out_data, sizeof(T));
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()),
beta2_pow_out_p, platform::CPUPlace(),
&cpu_beta2_pow_out_data, sizeof(T));
}
PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true,
platform::errors::External(
"XPU API return wrong value[%d], please check "
"where Baidu Kunlun Card is properly installed.",
r));
}
PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true,
platform::errors::External(
"XPU API return wrong value[%d], please check "
"where Baidu Kunlun Card is properly installed.",
r));
} else {
PADDLE_ENFORCE_EQ(1, 2, platform::errors::InvalidArgument(
"Variable type not supported by adam_op"));
......
......@@ -125,6 +125,8 @@ class Optimizer(object):
# to train. These variables are called accumulators.
# {accum_name : { paramter_name : accumulator_for_parameter, ...}, ...}
self._accumulators = defaultdict(lambda: dict())
# global_accumulator dict, {accum_name : acc_variable, ...}
self._global_accumulators = {}
self.helper = None
self._opti_name_list = []
self._accumulators_holder = {}
......@@ -157,6 +159,8 @@ class Optimizer(object):
for k, v in self._accumulators.items():
for para_name, var_tmp in v.items():
state_dict[var_tmp.name] = var_tmp
for k, v in self._global_accumulators.items():
state_dict[v.name] = v
# global step if use lr decay
if isinstance(self._learning_rate, LRScheduler):
state_dict["LR_Scheduler"] = self._learning_rate.state_dict()
......@@ -236,36 +240,42 @@ class Optimizer(object):
"Type not supprt, value in state dict must be [VarBase, Variable, numpy], the type is ",
type(global_step))
def _load_state_para(state_dict, param):
var = param.value()
tensor = var.get_tensor()
model_np = np.array(tensor)
load_para = state_dict[param.name]
if isinstance(load_para, Variable):
load_para_np = load_para.numpy()
elif isinstance(load_para, core.VarBase):
load_para_np = load_para.numpy()
elif isinstance(load_para, np.ndarray):
load_para_np = load_para
else:
raise RuntimeError("State dict type {} not supprt".format(
str(type(load_para))))
assert model_np.shape == load_para_np.shape, \
"Parameter shape not match, Dygraph Parameter [ {} ] need tensor with shape {} but load tensor with shape {}".format(
item.name, model_np.shape, load_para_np.shape)
assert model_np.dtype == load_para_np.dtype, \
"Parameter dtype not match, Dygraph Parameter [ {} ] need tensor with dtype {} but load tensor with dtype {}".format(
item.name, model_np.dtype, load_para_np.dtype)
tensor.set(load_para_np, framework._current_expected_place())
self._accumulators_holder = state_dict
for k, v in self._accumulators.items():
for para_name, var_tmp in v.items():
assert var_tmp.name in state_dict, \
"optimizer variable {} not found".format( var_tmp.name )
var = var_tmp.value()
tensor = var.get_tensor()
model_np = np.array(tensor)
load_para = state_dict[var_tmp.name]
if isinstance(load_para, Variable):
load_para_np = load_para.numpy()
elif isinstance(load_para, core.VarBase):
load_para_np = load_para.numpy()
elif isinstance(load_para, np.ndarray):
load_para_np = load_para
else:
raise RuntimeError("State dict type {} not supprt".format(
str(type(load_para))))
assert model_np.shape == load_para_np.shape, \
"Parameter shape not match, Dygraph Parameter [ {} ] need tensor with shape {} but load tensor with shape {}".format(
item.name, model_np.shape, load_para_np.shape)
assert model_np.dtype == load_para_np.dtype, \
"Parameter dtype not match, Dygraph Parameter [ {} ] need tensor with dtype {} but load tensor with dtype {}".format(
item.name, model_np.dtype, load_para_np.dtype)
_load_state_para(state_dict, var_tmp)
tensor.set(load_para_np, framework._current_expected_place())
for k, v in self._global_accumulators.items():
assert v.name in state_dict, \
"optimizer variable {} not found".format( v.name )
_load_state_para(state_dict, v)
# [aliases] Compatible with old method names
set_dict = set_state_dict
......@@ -589,6 +599,60 @@ class Optimizer(object):
self._accumulators[name][param.name] = var
return var
def _add_global_accumulator(self,
name,
dtype=None,
fill_value=0.0,
shape=None,
type=None,
device=None):
"""Utility function to add a global accumulator for all parameters in the model
Args:
block: the block in which the loss variable is present
name: name of the accumulator
dtype: data type of the accumulator variable
fill_value: value to initialize the accumulator variable
shape: the shape of the accumulator
type: the variable type of the accumulator
device: the target place of the accumulator
"""
if self._name is not None:
name = self._name + "_" + name
if (name in self._global_accumulators):
if framework.in_dygraph_mode():
return self._global_accumulators[name]
raise Exception("Global accumulator {} already exists".format(name))
if shape == None:
shape = [1] # most case, global accumulator is of shape [1]
assert isinstance(self.helper, LayerHelper)
var_name = name
var_name = unique_name.generate(var_name)
self._opti_name_list.append(var_name)
var = self.helper.create_global_variable(
name=var_name,
persistable=True,
dtype=dtype if dtype else self._dtype,
type=type,
shape=shape,
belong_to_optimizer=True)
if device is None:
device = 'cpu'
with device_guard(device):
self.helper.set_variable_initializer(
var, initializer=Constant(value=float(fill_value)))
if framework.in_dygraph_mode():
if len(self._accumulators_holder) > 0:
assert var_name in self._accumulators_holder, \
"Optimizer set error, {} should in state dict".format( var_name )
var.set_value(self._accumulators_holder[var_name])
self._global_accumulators[name] = var
return var
def _get_accumulator(self, name, param):
"""Utility function to fetch an accumulator for a parameter
......@@ -597,7 +661,7 @@ class Optimizer(object):
param: parameter variable for which accumulator is to be fetched
Returns:
accumulator variable for the parameter
accumulator variable
"""
if self._name is not None:
name = self._name + "_" + name
......@@ -607,6 +671,21 @@ class Optimizer(object):
format(name, param.name))
return self._accumulators[name][param.name]
def _get_global_accumulator(self, name):
"""Utility function to fetch a global accumulator
Args:
name: name of the accumulator
Returns:
accumulator variable
"""
if self._name is not None:
name = self._name + "_" + name
if (name not in self._global_accumulators):
raise Exception("Global accumulator {} does not exist".format(name))
return self._global_accumulators[name]
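An illustrative sketch of the new helpers, closely following test_adam_exception further down (the toy program and the 'tmp' name are only for illustration):

import paddle
import paddle.fluid as fluid
from paddle.fluid import core

paddle.enable_static()
x = paddle.static.data(name='x', shape=[8, 4], dtype='float32')
loss = fluid.layers.reduce_mean(fluid.layers.fc(input=x, size=1))

adam = fluid.optimizer.Adam(use_global_beta_pow=True)
adam.minimize(loss)  # creates the global beta1/beta2 pow accumulators

# A global accumulator is keyed by name only (no parameter), defaults to
# shape [1], and is placed on CPU unless another device is requested.
adam._add_global_accumulator('tmp', type=core.VarDesc.VarType.LOD_TENSOR)
tmp = adam._get_global_accumulator('tmp')

# Fetching an unknown name, or re-adding an existing one in static-graph
# mode, raises an Exception.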
def _update_param_device_map(self, parameters_and_grads, target_block):
for param_and_grad in parameters_and_grads:
if param_and_grad[0].trainable is True:
......@@ -1915,6 +1994,8 @@ class AdamOptimizer(Optimizer):
gradient in current mini-batch, so it will be much more faster. But this mode has
different semantics with the original Adam algorithm and may lead to different result.
The default value is False.
use_global_beta_pow (bool, optional): Whether to use global beta_pow. If true, Adam will use global beta_pow
for whole model instead of creating beta_pow for each parameter. Default is false.
Examples:
.. code-block:: python
......@@ -2024,7 +2105,8 @@ class AdamOptimizer(Optimizer):
regularization=None,
grad_clip=None,
name=None,
lazy_mode=False):
lazy_mode=False,
use_global_beta_pow=False):
assert learning_rate is not None
assert beta1 is not None
assert beta2 is not None
......@@ -2040,6 +2122,7 @@ class AdamOptimizer(Optimizer):
self._beta2 = beta2
self._epsilon = epsilon
self._lazy_mode = lazy_mode
self._use_global_beta_pow = use_global_beta_pow
def _create_accumulators(self, block, parameters):
assert isinstance(block, framework.Block)
......@@ -2048,16 +2131,30 @@ class AdamOptimizer(Optimizer):
for p in parameters:
self._add_accumulator(self._moment1_acc_str, p)
self._add_accumulator(self._moment2_acc_str, p)
self._add_accumulator(
if not self._use_global_beta_pow:
self._add_accumulator(
name=self._beta1_pow_acc_str,
param=p,
fill_value=0.9 if isinstance(self._beta1, Variable) \
else self._beta1,
shape=[1],
type=core.VarDesc.VarType.LOD_TENSOR, device='cpu')
self._add_accumulator(
name=self._beta2_pow_acc_str,
param=p,
fill_value=0.999 if isinstance(self._beta2, Variable) \
else self._beta2,
shape=[1],
type=core.VarDesc.VarType.LOD_TENSOR, device='cpu')
if self._use_global_beta_pow:
self._add_global_accumulator(
name=self._beta1_pow_acc_str,
param=p,
fill_value=0.9 if isinstance(self._beta1, Variable) \
else self._beta1,
shape=[1],
type=core.VarDesc.VarType.LOD_TENSOR, device='cpu')
self._add_accumulator(
self._add_global_accumulator(
name=self._beta2_pow_acc_str,
param=p,
fill_value=0.999 if isinstance(self._beta2, Variable) \
else self._beta2,
shape=[1],
......@@ -2070,10 +2167,16 @@ class AdamOptimizer(Optimizer):
param_and_grad[0])
moment2 = self._get_accumulator(self._moment2_acc_str,
param_and_grad[0])
beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
param_and_grad[0])
beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
param_and_grad[0])
if self._use_global_beta_pow:
beta1_pow_acc = self._get_global_accumulator(
self._beta1_pow_acc_str)
beta2_pow_acc = self._get_global_accumulator(
self._beta2_pow_acc_str)
else:
beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
param_and_grad[0])
beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
param_and_grad[0])
lr = self._create_param_lr(param_and_grad)
# create the adam optimize op
......@@ -2087,7 +2190,8 @@ class AdamOptimizer(Optimizer):
beta1_pow_acc, beta2_pow_acc, param_and_grad[0], moment1,
moment2, beta1_pow_acc, beta2_pow_acc, 'epsilon', self._epsilon,
'lazy_mode', self._lazy_mode, 'min_row_size_to_use_multithread',
1000, 'beta1', _beta1, 'beta2', _beta2)
1000, 'beta1', _beta1, 'beta2', _beta2, 'use_global_beta_pow',
self._use_global_beta_pow)
return None
......@@ -2109,7 +2213,8 @@ class AdamOptimizer(Optimizer):
}
attrs = {
"lazy_mode": self._lazy_mode,
"min_row_size_to_use_multithread": 1000
"min_row_size_to_use_multithread": 1000,
'use_global_beta_pow': self._use_global_beta_pow
}
if isinstance(self._beta1, Variable):
......@@ -2134,6 +2239,43 @@ class AdamOptimizer(Optimizer):
return adam_op
def _finish_update(self, block, parameters_and_grads):
r"""Update beta1_pow and beta2_pow accumulator
"""
assert isinstance(block, framework.Block)
if self._use_global_beta_pow:
beta1_pow_acc = self._get_global_accumulator(
self._beta1_pow_acc_str)
beta2_pow_acc = self._get_global_accumulator(
self._beta2_pow_acc_str)
with block.program._optimized_guard([]):
inputs = {"X": beta1_pow_acc}
attrs = {}
if isinstance(self._beta1, Variable):
inputs['ScaleTensor'] = self._beta1
else:
attrs['scale'] = self._beta1
block.append_op(
type="scale",
inputs=inputs,
outputs={"Out": beta1_pow_acc},
attrs=attrs,
stop_gradient=True)
inputs = {"X": beta2_pow_acc}
attrs = {}
if isinstance(self._beta2, Variable):
inputs['ScaleTensor'] = self._beta2
else:
attrs['scale'] = self._beta2
block.append_op(
type="scale",
inputs=inputs,
outputs={"Out": beta2_pow_acc},
attrs=attrs,
stop_gradient=True)
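When use_global_beta_pow is enabled, the kernels above skip writing Beta1PowOut/Beta2PowOut, and the two scale ops appended here advance the shared accumulators exactly once per optimization step. Conceptually (a sketch, not library code):

# After every parameter has executed its adam op at step t:
beta1_pow_acc = beta1 * beta1_pow_acc  # now holds beta1 ** (t + 1)
beta2_pow_acc = beta2 * beta2_pow_acc  # now holds beta2 ** (t + 1)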
class AdamaxOptimizer(Optimizer):
r"""
......
......@@ -134,6 +134,65 @@ class TestAdamWithEpsilonTensor(OpTest):
self.check_output_with_place(self.place, atol=1e-5, check_dygraph=False)
@unittest.skipIf(not paddle.is_compiled_with_npu(),
"core is not compiled with NPU")
class TestAdamOpWithGlobalBetaPow(OpTest):
def setUp(self):
self.set_npu()
self.place = paddle.NPUPlace(0)
self.op_type = "adam"
param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32")
# The second moment is positive
moment2 = np.random.random((102, 105)).astype("float32")
learning_rate = 0.004
beta1 = 0.78
beta2 = 0.836
epsilon = 1e-4
beta1_pow = beta1**10
beta2_pow = beta2**10
self.inputs = {
'Param': param,
'Grad': grad,
'Moment1': moment1,
'Moment2': moment2,
'LearningRate': np.array([learning_rate]).astype("float32"),
'Beta1Pow': np.array([beta1_pow]).astype("float32"),
'Beta2Pow': np.array([beta2_pow]).astype("float32"),
'Beta1Tensor': np.array([beta1]).astype("float32"),
'Beta2Tensor': np.array([beta2]).astype("float32"),
'EpsilonTensor': np.array([epsilon]).astype("float32"),
}
attributes = {'epsilon': epsilon}
param_out, moment1_out, \
moment2_out = adam_step(self.inputs, attributes)
self.attrs = {'use_global_beta_pow': True}
# use_global_beta_pow=True, Beta1PowOut and Beta2PowOut are empty.
self.outputs = {
'Moment1Out': moment1_out,
'Moment2Out': moment2_out,
'ParamOut': param_out,
'Beta1PowOut': np.array([]),
'Beta2PowOut': np.array([])
}
def set_npu(self):
self.__class__.use_npu = True
def init_dtype(self):
self.dtype = np.float32
def test_check_output(self):
self.check_output_with_place(self.place, atol=1e-5, check_dygraph=False)
@unittest.skipIf(not paddle.is_compiled_with_npu(),
"core is not compiled with NPU")
class TestNet(unittest.TestCase):
......
......@@ -1087,6 +1087,7 @@ class OpTest(unittest.TestCase):
dygraph_outs = self._calc_dygraph_output(
place, no_check_set=no_check_set)
outs, fetch_list = self._calc_output(place, no_check_set=no_check_set)
for out_name, out_dup in Operator.get_op_outputs(self.op_type):
if out_name not in self.outputs:
continue
......@@ -1177,6 +1178,11 @@ class OpTest(unittest.TestCase):
actual_t = convert_uint16_to_float(actual_t)
atol = 0.03
# NOTE(zhiqiu): np.allclose([], [1.]) returns True
# see details: https://stackoverflow.com/questions/38331703/why-does-numpys-broadcasting-sometimes-allow-comparing-arrays-of-different-leng
if expect_t.size == 0:
self.assertTrue(actual_t.size == 0)
self.assertTrue(
np.allclose(
actual_t, expect_t, atol=atol, equal_nan=equal_nan),
......
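The NumPy behaviour that note refers to can be checked directly (a quick demonstration, not part of the diff):

import numpy as np

# Broadcasting an empty array against shape (1,) yields an empty result,
# and all() over an empty array is vacuously True:
print(np.allclose(np.array([]), np.array([1.0])))  # True

# Hence the explicit size check before the allclose assertion above, so that
# empty expected outputs (Beta1PowOut/Beta2PowOut with use_global_beta_pow)
# only pass when the actual output is empty too.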
......@@ -404,7 +404,7 @@ class TestAdamOpBetaVariable(OpTest):
class TestAdamOpBetaEpsilonVariable(OpTest):
def setUp(self):
'''Test Adam Op with beta as Variable
'''Test Adam Op with beta/epsilon as Variable
'''
self.op_type = "adam"
param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
......@@ -450,6 +450,57 @@ class TestAdamOpBetaEpsilonVariable(OpTest):
self.check_output()
class TestAdamOpWithGlobalBetaPow(OpTest):
def setUp(self):
'''Test Adam Op with global_beta_pow
'''
self.op_type = "adam"
param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32")
# The second moment is positive
moment2 = np.random.random((102, 105)).astype("float32")
beta1 = 0.85
beta2 = 0.95
learning_rate = 0.001
epsilon = 1e-8
beta1_pow = beta1**10
beta2_pow = beta2**10
self.inputs = {
'Param': param,
'Grad': grad,
'Moment1': moment1,
'Moment2': moment2,
'LearningRate': np.array([learning_rate]).astype("float32"),
'Beta1Pow': np.array([beta1_pow]).astype("float32"),
'Beta2Pow': np.array([beta2_pow]).astype("float32"),
"Beta1Tensor": np.array([beta1]).astype("float32"),
"Beta2Tensor": np.array([beta2]).astype("float32"),
"EpsilonTensor": np.array([epsilon]).astype("float32"),
}
attributes = {'epsilon': epsilon}
param_out, moment1_out, \
moment2_out = adam_step(self.inputs, attributes)
self.attrs = {'use_global_beta_pow': True}
# use_global_beta_pow=True, Beta1PowOut and Beta2PowOut are empty.
self.outputs = {
'Moment1Out': moment1_out,
'Moment2Out': moment2_out,
'ParamOut': param_out,
'Beta1PowOut': np.array([]),
'Beta2PowOut': np.array([])
}
def test_check_output(self):
self.check_output()
class TestAdamOpV2(unittest.TestCase):
def test_adam_op(self):
place = fluid.CPUPlace()
......@@ -493,6 +544,7 @@ class TestAdamOpV2(unittest.TestCase):
out.backward()
adam.step()
adam.clear_gradients()
paddle.enable_static()
def test_adam_op_with_state_dict(self):
......@@ -523,6 +575,7 @@ class TestAdamOpV2(unittest.TestCase):
params = adam.get_opti_var_name_list()
assert (params is not None)
paddle.enable_static()
def test_adam_with_grad_clip(self):
paddle.disable_static()
......@@ -536,6 +589,7 @@ class TestAdamOpV2(unittest.TestCase):
out.backward()
adam.step()
adam.clear_gradients()
paddle.enable_static()
def test_adam_op_with_set_lr(self):
paddle.disable_static()
......@@ -550,6 +604,7 @@ class TestAdamOpV2(unittest.TestCase):
lr_var = paddle.fluid.layers.create_global_var(
shape=[1], value=lr, dtype='float32')
adam.set_lr(lr_var)
paddle.enable_static()
def test_adam_op_invalid_input(self):
paddle.disable_static()
......@@ -563,6 +618,7 @@ class TestAdamOpV2(unittest.TestCase):
with self.assertRaises(ValueError):
adam = paddle.optimizer.Adam(
0.1, epsilon=-1, parameters=linear.parameters())
paddle.enable_static()
def test_adam_op_with_sparse_input_and_weight_decay(self):
......@@ -577,10 +633,15 @@ class TestAdamOpV2(unittest.TestCase):
out = emb(x)
out.backward()
adam.step()
paddle.enable_static()
class TestNetWithEpsilonTensor(unittest.TestCase):
def _test(self, place, use_tensor=True, use_fluid_api=True):
def _test(self,
place,
use_tensor=True,
use_fluid_api=True,
use_global_beta_pow=False):
paddle.enable_static()
main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
......@@ -633,7 +694,8 @@ class TestNetWithEpsilonTensor(unittest.TestCase):
learning_rate=0.01,
beta1=beta1,
beta2=beta2,
epsilon=epsilon)
epsilon=epsilon,
use_global_beta_pow=use_global_beta_pow)
else:
adam = paddle.optimizer.Adam(
learning_rate=0.01,
......@@ -646,7 +708,9 @@ class TestNetWithEpsilonTensor(unittest.TestCase):
learning_rate=0.01,
beta1=beta1_init,
beta2=beta2_init,
epsilon=epsilon_init)
epsilon=epsilon_init,
use_global_beta_pow=use_global_beta_pow,
name='a')
else:
adam = fluid.optimizer.Adam(
learning_rate=0.01,
......@@ -680,9 +744,11 @@ class TestNetWithEpsilonTensor(unittest.TestCase):
for use_tensor in [True, False]:
for use_fluid_api in [True, False]:
pred, loss = self._test(place, use_tensor, use_fluid_api)
preds.append(pred)
losses.append(loss)
for use_global_beta_pow in [True, False]:
pred, loss = self._test(place, use_tensor, use_fluid_api,
use_global_beta_pow)
preds.append(pred)
losses.append(loss)
for pred in preds:
self.assertTrue(np.allclose(pred, preds[0]))
for loss in losses:
......@@ -694,6 +760,55 @@ class TestNetWithEpsilonTensor(unittest.TestCase):
if core.is_compiled_with_cuda():
self._test_with_place(paddle.CUDAPlace(0))
def test_adam_exception(self):
paddle.enable_static()
a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
label = paddle.static.data(name="label", shape=[32, 1], dtype='int64')
sum = paddle.add(a, b)
z = paddle.pow(sum, 2.0)
fc_1 = fluid.layers.fc(input=z, size=128)
prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
adam = fluid.optimizer.Adam(use_global_beta_pow=True)
adam.minimize(loss)
self.assertRaises(Exception, adam._get_global_accumulator, 'tmp')
adam._add_global_accumulator(
'tmp', type=core.VarDesc.VarType.LOD_TENSOR)
adam._get_global_accumulator('tmp')
self.assertRaises(
Exception,
adam._add_global_accumulator,
adam._beta1_pow_acc_str,
type=core.VarDesc.VarType.LOD_TENSOR)
paddle.disable_static()
def test_adam_save_load(self):
paddle.disable_static()
a = paddle.rand([4, 10])
linear = paddle.nn.Linear(10, 10)
b = linear(a)
state_dict = linear.state_dict()
fluid.save_dygraph(state_dict, "paddle_dy")
scheduler = paddle.optimizer.lr.NoamDecay(
d_model=0.01, warmup_steps=100, verbose=True)
adam = paddle.fluid.optimizer.Adam(
learning_rate=scheduler,
parameter_list=linear.parameters(),
use_global_beta_pow=True)
adam.minimize(b)
state_dict = adam.state_dict()
fluid.save_dygraph(state_dict, "paddle_dy")
para_state_dict, opti_state_dict = fluid.load_dygraph("paddle_dy")
adam.set_state_dict(opti_state_dict)
paddle.enable_static()
if __name__ == "__main__":
unittest.main()