Unverified · Commit c269a160, authored by Leo Chen, committed by GitHub

[NPU] flatten params and grads, fuse grad_clip and optimizer op (#33461)

* enable npu alignment

* support flatten_params/grads

* support clip by global norm

* remove memset in coalesce_tensor_op

* fix npu kernel of sum op when input is one tensor

* add ut for flatten_param_grads+regularizer

* fix ut

* fix typo
Parent fa821ef9
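
For orientation, here is a minimal sketch (not part of the commit) of how the new switches are meant to be used, modeled on the fluid.optimizer.Adam test added further below; the toy network, shapes and variable names are illustrative only.

    import paddle
    import paddle.fluid as fluid

    paddle.enable_static()
    main_prog, startup_prog = paddle.static.Program(), paddle.static.Program()
    with paddle.static.program_guard(main_prog, startup_prog):
        x = paddle.static.data(name='x', shape=[2, 2], dtype='float32')
        label = paddle.static.data(name='label', shape=[2, 1], dtype='int64')
        fc = fluid.layers.fc(input=x, size=2, act='softmax')
        loss = fluid.layers.reduce_mean(
            fluid.layers.cross_entropy(input=fc, label=label))

        # Per this change, flatten_param_grads is only applied together with
        # ClipGradByGlobalNorm (or no grad_clip) and without regularization.
        clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
        adam = fluid.optimizer.Adam(
            learning_rate=0.01,
            grad_clip=clip,
            flatten_param_grads=True,  # coalesce params/grads into one tensor each
            align_size=256)            # alignment passed down to coalesce_tensor
        adam.minimize(loss)

With the flag enabled, the gradient clipping and optimizer updates then operate on a single fused parameter/gradient pair instead of one op per parameter.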
@@ -60,6 +60,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
             << dst_place;
     return;
   }
+  VLOG(4) << "src:" << src_ptr << ", dst:" << dst_ptr;
 #ifdef PADDLE_WITH_MKLDNN
   auto size = src.layout() == DataLayout::kMKLDNN
......
@@ -30,6 +30,7 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
                                                   platform::CPUPlace,
                                                   const void* src, size_t num) {
   if (UNLIKELY(num == 0)) return;
+  VLOG(4) << "src: " << src << ", dst: " << dst << ", num: " << num;
   std::memcpy(dst, src, num);
 }
......
@@ -69,6 +69,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
     auto in_tensors = context.MultiInput<framework::LoDTensor>("Input");
     bool use_align = context.Attr<bool>("use_align");
+    auto align_size = context.Attr<int>("align_size");

     if (context.Attr<bool>("check_name")) {
       for (size_t i = 0; i < in_var_names.size(); ++i) {
@@ -95,7 +96,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
         context.Attr<int>("dtype"));
     size_t size_of_dtype = framework::SizeOfType(dtype);
     GetMemSizeAndDtype(in_tensors, in_var_names, &numel, size_of_dtype,
-                       context.GetPlace(), use_align);
+                       context.GetPlace(), use_align, align_size);

     // Alloc the continuous space
     auto fused_tensor = context.Output<framework::LoDTensor>("FusedOutput");
@@ -113,9 +114,9 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
         framework::TensorCopy(*in_tensors[i], context.GetPlace(), dev_ctx,
                               &sub_tensor);

-        offset +=
-            use_align
-                ? platform::Alignment(len * size_of_dtype, context.GetPlace()) /
-                      size_of_dtype
-                : len;
+        offset += use_align
+                      ? platform::Alignment(len * size_of_dtype,
+                                            context.GetPlace(), align_size) /
+                            size_of_dtype
+                      : len;
       }
@@ -134,9 +135,9 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
         framework::TensorCopy(*out_tensors[i], context.GetPlace(), dev_ctx,
                               &sub_tensor);
       }
-      offset +=
-          use_align
-              ? platform::Alignment(len * size_of_dtype, context.GetPlace()) /
-                    size_of_dtype
-              : len;
+      offset += use_align
+                    ? platform::Alignment(len * size_of_dtype,
+                                          context.GetPlace(), align_size) /
+                          size_of_dtype
+                    : len;
     }
@@ -146,28 +147,24 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
     offset = 0;
     std::stringstream ss;
     ss << "alloc_space_for_vars: ";
-#if defined(PADDLE_WITH_ASCEND_CL)
-    auto stream =
-        context.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-    platform::NPUMemsetAsync(
-        static_cast<void *>(fused_tensor->mutable_data<T>(dev_ctx.GetPlace())),
-        0.0, fused_tensor->numel() * sizeof(T), stream);
-#endif
     for (size_t i = 0; i < out_tensors.size(); ++i) {
       size_t len = static_cast<size_t>(out_tensors[i]->numel());
       auto dim = out_tensors[i]->dims();
+      VLOG(4) << len << " " << dim << " " << offset;
       out_tensors[i]
           ->ShareDataWith(fused_tensor->Slice(
               static_cast<int64_t>(offset), static_cast<int64_t>(offset + len)))
           .Resize(dim);
       len = use_align
-                ? platform::Alignment(len * size_of_dtype, context.GetPlace()) /
+                ? platform::Alignment(len * size_of_dtype, context.GetPlace(),
+                                      align_size) /
                       size_of_dtype
                 : len;
-      offset += len;
       ss << "output(" << out_var_names[i] << ") dim:(" << dim << ")"
-         << " address: " << out_tensors[i]->data<void>() << ", ";
+         << " address: " << out_tensors[i]->data<void>() << " len: " << len
+         << ", ";
+      offset += len;
     }
     PADDLE_ENFORCE_EQ(
         (int64_t)offset, fused_tensor->numel(),
@@ -183,7 +180,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
       const std::vector<const framework::LoDTensor *> &lod_tensors,
       const std::vector<std::string> var_names, size_t *numel,
       const size_t &size_of_dtype, const platform::Place &place,
-      const bool use_align = true) const {
+      const bool use_align = true, const int align_size = -1) const {
     PADDLE_ENFORCE_EQ(
         lod_tensors.size(), var_names.size(),
         platform::errors::InvalidArgument(
@@ -203,15 +200,18 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
           size, 0,
           platform::errors::InvalidArgument(
               "The number of tensor `%s`'s elements is 0.", var_names[i]));
-      ss << "input(" << var_names[i] << ") dim:(" << lod_tensors[i]->dims()
-         << ") "
-         << " addres:" << lod_tensors[i]->data<void>() << ", ";
-
-      *numel += use_align
-                    ? platform::Alignment(
-                          static_cast<size_t>(size) * size_of_dtype, place) /
-                          size_of_dtype
-                    : static_cast<size_t>(size);
+      auto len =
+          use_align
+              ? platform::Alignment(static_cast<size_t>(size) * size_of_dtype,
+                                    place, align_size) /
+                    size_of_dtype
+              : static_cast<size_t>(size);
+      VLOG(4) << size << " " << len;
+      ss << "input(" << var_names[i] << ") dim:(" << lod_tensors[i]->dims()
+         << ") "
+         << " addres:" << lod_tensors[i]->data<void>() << " len: " << len
+         << ", ";
+      *numel += len;
     }
     VLOG(10) << ss.str();
   }
@@ -221,7 +221,42 @@ class CoalesceTensorOp : public framework::OperatorWithKernel {
  public:
  using framework::OperatorWithKernel::OperatorWithKernel;

-  void InferShape(framework::InferShapeContext *ctx) const override {}
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    if (ctx->IsRuntime()) {
+      return;
+    }
+    auto use_align = ctx->Attrs().Get<bool>("use_align");
+    auto align_size = ctx->Attrs().Get<int>("align_size");
+
+    auto dtype = static_cast<framework::proto::VarType::Type>(
+        ctx->Attrs().Get<int>("dtype"));
+    size_t size_of_dtype = framework::SizeOfType(dtype);
+
+    auto alignment = [](size_t size, size_t align_size) {
+      size_t remaining = size % align_size;
+      auto aligned_size =
+          remaining == 0 ? size : size + (align_size - remaining);
+      VLOG(4) << remaining << " " << size << " " << align_size << " "
+              << aligned_size;
+      return aligned_size;
+    };
+    VLOG(4) << "align_size: " << align_size;
+    if (use_align && align_size > 0) {
+      int64_t numel = 0;
+      auto dims = ctx->GetInputsDim("Input");
+      for (const auto &dim : dims) {
+        auto size = framework::product(dim);
+        auto len = use_align
+                       ? alignment(static_cast<size_t>(size) * size_of_dtype,
+                                   align_size) /
+                             size_of_dtype
+                       : static_cast<size_t>(size);
+        numel += len;
+      }
+      ctx->SetOutputDim("FusedOutput", framework::make_ddim({numel}));
+      VLOG(4) << "FusedOutput size:" << framework::make_ddim({numel});
+    }
+  }

  protected:
  framework::OpKernelType GetKernelTypeForVar(
@@ -271,6 +306,8 @@ class CoalesceTensorOpMaker : public framework::OpProtoAndCheckerMaker {
                   "Whether to consider memory chunk and take alignment into "
                   "account for inputs and outputs.")
         .SetDefault(true);
+    AddAttr<int>("align_size", "The alignment size when use_align is True")
+        .SetDefault(-1);
     AddComment(R"DOC(
 CoalesceTensor Operator.
@@ -314,6 +351,16 @@ REGISTER_OP_CUDA_KERNEL(
     ops::CoalesceTensorOpKernel<paddle::platform::CUDADeviceContext, double>);
 #endif

+#if defined(PADDLE_WITH_ASCEND_CL)
+REGISTER_OP_CUDA_KERNEL(
+    coalesce_tensor,
+    ops::CoalesceTensorOpKernel<paddle::platform::NPUDeviceContext,
+                                plat::float16>,
+    ops::CoalesceTensorOpKernel<paddle::platform::NPUDeviceContext, int>,
+    ops::CoalesceTensorOpKernel<paddle::platform::NPUDeviceContext, float>,
+    ops::CoalesceTensorOpKernel<paddle::platform::NPUDeviceContext, double>);
+#endif
+
 #ifdef PADDLE_WITH_XPU
 REGISTER_OP_XPU_KERNEL(
     coalesce_tensor,
@@ -343,4 +390,14 @@ REGISTER_OP_VERSION(coalesce_tensor)
             "In order to optionally take memory alignment into account when "
             "coalescing tensors. The default value is true to be compatible "
             "with before.",
-            true));
+            true))
+    .AddCheckpoint(
+        R"ROC(
+            Upgrade coalesce_tensor: add a new attribute [align_size].)ROC",
+        paddle::framework::compatible::OpVersionDesc().NewAttr(
+            "align_size",
+            "In order to optionally take memory alignment into account when "
+            "coalescing tensors. The default value is -1 and use the default "
+            "align_size "
+            "of each place to be compatible with before.",
+            -1));
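
As a side note (not part of the diff), the FusedOutput size that the new InferShape computes can be reproduced in a few lines of Python; the dtype, shapes and align_size below are made-up example values, not defaults from the operator.

    # Mirrors the alignment lambda added to CoalesceTensorOp::InferShape:
    # each input's byte size is rounded up to a multiple of align_size,
    # then converted back to an element count and summed.
    def aligned_len(numel, size_of_dtype, align_size):
        size = numel * size_of_dtype                # bytes used by the tensor
        remaining = size % align_size
        aligned = size if remaining == 0 else size + (align_size - remaining)
        return aligned // size_of_dtype             # back to element count

    size_of_dtype = 4                               # e.g. float32
    align_size = 256
    numels = [4, 2]                                 # two inputs, shapes (2, 2) and (2,)
    fused_numel = sum(aligned_len(n, size_of_dtype, align_size) for n in numels)
    print(fused_numel)                              # 64 + 64 = 128 elements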
@@ -35,9 +35,11 @@ class SumNPUKernel : public framework::OpKernel<T> {
     auto place = ctx.GetPlace();

     int n = static_cast<int>(x.size());
-    PADDLE_ENFORCE_EQ(n > 1, true,
-                      platform::errors::InvalidArgument(
-                          "The size of Input(x) list must larger or equal 2"));
+    if (n == 1) {
+      TensorCopy(*x[0], place, out);
+      return;
+    }

     auto stream =
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
......
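A brief aside on the sum-op change above (not part of the diff): summing a single input is an identity, which is why the NPU kernel can simply TensorCopy when n == 1. The tiny NumPy check below, with an arbitrary 3x3 array, illustrates the expected semantics.

    import numpy as np

    x0 = np.random.random((3, 3)).astype(np.float32)
    # The sum over a one-element input list is the input itself, so copying
    # x[0] to the output (as the patched NPU kernel does) is sufficient.
    assert np.array_equal(np.sum([x0], axis=0), x0)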
@@ -16,23 +16,27 @@ limitations under the License. */

 namespace paddle {
 namespace platform {
-size_t Alignment(size_t size, const platform::Place &place) {
-  size_t alignment = 1024;
+size_t Alignment(size_t size, const platform::Place &place, int align_size) {
+  size_t alignment = 0;
+  if (align_size > 0) {
+    alignment = align_size;
+  } else {
+    alignment = 1024;
     if (platform::is_cpu_place(place)) {
       alignment = CpuMinChunkSize();
     } else {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       alignment = GpuMinChunkSize();
 #elif defined(PADDLE_WITH_XPU)
-    // TODO(wangxi): add XpuMinChunkSize
       alignment = alignment;
 #elif defined(PADDLE_WITH_ASCEND_CL)
       alignment = NPUMinChunkSize();
 #else
       PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "Fluid is not compiled with CUDA or NPU."));
+          "Fluid is not compiled with CUDA/XPU/NPU."));
 #endif
     }
+  }
   size_t remaining = size % alignment;
   return remaining == 0 ? size : size + (alignment - remaining);
 }
......
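To make the rounding rule above concrete, here is an illustrative Python rendering of platform::Alignment; the 1024 and 256 values are example inputs for this sketch, not values taken from any particular device.

    def alignment_bytes(size, min_chunk_size, align_size=-1):
        # align_size > 0 overrides the place-derived minimum chunk size,
        # mirroring the new branch added to platform::Alignment.
        alignment = align_size if align_size > 0 else min_chunk_size
        remaining = size % alignment
        return size if remaining == 0 else size + (alignment - remaining)

    print(alignment_bytes(100, min_chunk_size=1024))                  # 1024
    print(alignment_bytes(100, min_chunk_size=1024, align_size=256))  # 256
    print(alignment_bytes(512, min_chunk_size=1024, align_size=256))  # 512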
@@ -22,9 +22,13 @@ limitations under the License. */
 #elif defined(PADDLE_WITH_ASCEND_CL)
 #include "paddle/fluid/platform/npu_info.h"
 #endif
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/platform/npu_info.h"
+#endif

 namespace paddle {
 namespace platform {
-size_t Alignment(size_t size, const platform::Place &place);
+size_t Alignment(size_t size, const platform::Place &place,
+                 int align_size = -1);
 }  // namespace platform
 }  // namespace paddle
@@ -28,7 +28,7 @@ from . import framework
 from . import layers
 from . import unique_name
 from .backward import append_backward, _some_in_set_, _append_grad_suffix_, _get_no_grad_set_name
-from .clip import GradientClipBase, GradientClipByNorm, error_clip_callback, append_gradient_clip_ops
+from .clip import GradientClipBase, GradientClipByNorm, error_clip_callback, append_gradient_clip_ops, ClipGradByGlobalNorm
 from .framework import program_guard
 from .initializer import Constant
 from .layer_helper import LayerHelper
@@ -42,6 +42,7 @@ from functools import reduce
 from functools import cmp_to_key
 from .wrapped_decorator import signature_safe_contextmanager
 from .. import compat as cpt
+import warnings

 __all__ = [
     'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'Dpsgd', 'DecayedAdagrad',
@@ -68,7 +69,15 @@ class Optimizer(object):
                  parameter_list=None,
                  regularization=None,
                  grad_clip=None,
+                 flatten_param_grads=False,
+                 align_size=-1,
                  name=None):
+        """
+        Args:
+            flatten_param_grads (bool, optional): Whether to flatten all the parameters and grads.
+                If true, the parameters and gradients will be coalesce to contiguous mempry,
+                and the grad_clip ops / optimizer ops will be fuse to one operator.
+        """
         # Because of the loop import, so place it in the function body
         from paddle.optimizer.lr import LRScheduler
         self._parameter_list = list(
@@ -107,6 +116,8 @@ class Optimizer(object):
         self.regularization = regularization
         self._grad_clip = grad_clip
         self._learning_rate = learning_rate
+        self._flatten_param_grads = flatten_param_grads
+        self._align_size = align_size

         self._dtype = None
         # Infer the dtype form parameter
@@ -126,7 +137,7 @@ class Optimizer(object):
         self._accumulators = defaultdict(lambda: dict())
         # global_accumulator dict, {accum_name : acc_variable, ...}
         self._global_accumulators = {}
-        self.helper = None
+        self.helper = LayerHelper(self.__class__.__name__)
         self._opti_name_list = []
         self._accumulators_holder = {}
         self._param_device_map = dict()
@@ -739,7 +750,7 @@ class Optimizer(object):
                 current_block.backward_block_idx]

         start = len(target_block.ops)
-        self.helper = LayerHelper(self.__class__.__name__)
         self._update_param_device_map(parameters_and_grads, target_block)
         self._create_accumulators(
             target_block,
@@ -958,7 +969,9 @@ class Optimizer(object):
             repeate_regularizer = False
             with framework.name_scope('regularization'):
                 for param, grad in parameters_and_grads:
-                    if not repeate_regularizer and param.regularizer is not None and regularization is not None:
+                    if not repeate_regularizer and getattr(
+                            param, 'regularizer',
+                            None) is not None and regularization is not None:
                         repeate_regularizer = True
                         logging.info(
                             "If regularizer of a Parameter has been set by 'fluid.ParamAttr' or 'fluid.WeightNormParamAttr' already. "
@@ -970,6 +983,83 @@ class Optimizer(object):
             params_and_grads.append((param, new_grad))
         return params_and_grads

+    def flatten_param_grads(self, params_grads):
+        need_flatten_params = []
+        need_flatten_grads = []
+        for p, g in params_grads:
+            if g is None:
+                continue
+            g.persistable = True
+            if getattr(p, 'need_clip', True) is False or getattr(
+                    p, 'regularizer', None) is not None:
+                warnings.warn(
+                    "flatten_param_grads=True will be discarded since paramter '{}''s need_clip is False or "
+                    "the regularizer is set".format(p.name))
+                self._flatten_param_grads = False
+                return params_grads
+
+            need_flatten_params.append(p)
+            need_flatten_grads.append(g)
+
+        shape = [np.prod(p.shape) for p in need_flatten_params]
+        block = need_flatten_params[0].block
+
+        flatten_param = self.helper.create_global_variable(
+            name='flatten_param',
+            persistable=True,
+            dtype=need_flatten_params[0].dtype,
+            shape=[np.sum(shape)],
+            belong_to_optimizer=True)
+
+        flatten_param.trainable = True
+        flatten_param.optimize_attr = need_flatten_params[0].optimize_attr
+        flatten_param.regularizer = need_flatten_params[0].regularizer
+
+        flatten_grad = self.helper.create_global_variable(
+            name='flatten_grad',
+            persistable=True,
+            dtype=need_flatten_grads[0].dtype,
+            shape=[np.sum(shape)],
+            belong_to_optimizer=True)
+
+        with program_guard(default_main_program()):
+            block.append_op(
+                type="coalesce_tensor",
+                inputs={"Input": need_flatten_params},
+                outputs={
+                    "Output": need_flatten_params,
+                    "FusedOutput": flatten_param
+                },
+                attrs={
+                    "copy_data": True,
+                    "use_align": True,
+                    "align_size": self._align_size,
+                    "dtype": need_flatten_params[0].dtype
+                })
+
+            block.append_op(
+                type="coalesce_tensor",
+                inputs={"Input": need_flatten_grads},
+                outputs={
+                    "Output": need_flatten_grads,
+                    "FusedOutput": flatten_grad
+                },
+                attrs={
+                    "copy_data": True,
+                    "use_align": True,
+                    "align_size": self._align_size,
+                    "dtype": need_flatten_grads[0].dtype
+                })
+
+        #NOTE(zhiqiu): the initializer should be set after coalesce_tensor op,
+        # so the shape of flatten_param and flatten_grad will be inferred.
+        self.helper.set_variable_initializer(
+            flatten_param, initializer=Constant(0.0))
+        self.helper.set_variable_initializer(
+            flatten_grad, initializer=Constant(0.0))
+
+        return [(flatten_param, flatten_grad)]
+
     def apply_gradients(self, params_grads):
         """
         Second part of `minimize`, appending optimization operators for
@@ -992,9 +1082,14 @@ class Optimizer(object):
                 # ...
                 optimizer.apply_gradients(params_grads)
         """
         params_grads = sorted(params_grads, key=lambda x: x[0].name)

+        # NOTE(zhiqiu): currently, only support ClipGradByGlobalNorm and without regularization.
+        if self._flatten_param_grads and self.regularization is None:
+            if self._grad_clip == None or isinstance(self._grad_clip,
+                                                     ClipGradByGlobalNorm):
+                params_grads = self.flatten_param_grads(params_grads)
+
         # 'optimizer(grad_clip)' or 'set_gradient_clip'
         if self._grad_clip is not None:
             params_grads = self._grad_clip(params_grads)
@@ -2156,6 +2251,9 @@ class AdamOptimizer(Optimizer):
             The default value is False.
         use_global_beta_pow (bool, optional): Whether to use global beta_pow. If true, Adam will use global beta_pow
             for whole model instead of creating beta_pow for each parameter. Default is false.
+        flatten_param_grads (bool, optional): Whether to flatten all parameters and gradients. Default is false.
+        align_size (int, optional): The alignment size when flatten parameters and gradients. Default is -1, which means
+            use same align_size as allocator.

     Examples:
         .. code-block:: python
@@ -2266,7 +2364,9 @@ class AdamOptimizer(Optimizer):
                  grad_clip=None,
                  name=None,
                  lazy_mode=False,
-                 use_global_beta_pow=False):
+                 use_global_beta_pow=False,
+                 flatten_param_grads=False,
+                 align_size=-1):
         assert learning_rate is not None
         assert beta1 is not None
         assert beta2 is not None
@@ -2276,6 +2376,8 @@ class AdamOptimizer(Optimizer):
             parameter_list=parameter_list,
             regularization=regularization,
             grad_clip=grad_clip,
+            flatten_param_grads=flatten_param_grads,
+            align_size=align_size,
             name=name)
         self.type = "adam"
         self._beta1 = beta1
......
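One more aside (not part of the diff): ClipGradByGlobalNorm composes cleanly with flattening because the global norm of the per-parameter gradients equals the norm of their concatenation, so clipping the single fused gradient applies exactly the scale that per-parameter clipping would. A quick NumPy check with arbitrary example shapes:

    import numpy as np

    grads = [np.random.randn(2, 2), np.random.randn(3)]   # illustrative gradients
    per_param = np.sqrt(sum(float((g ** 2).sum()) for g in grads))
    fused = np.linalg.norm(np.concatenate([g.ravel() for g in grads]))
    assert np.isclose(per_param, fused)

    clip_norm = 1.0
    # The single scale applied to the fused gradient is the same one the
    # unfused ClipGradByGlobalNorm would apply to every individual gradient.
    scale = clip_norm / max(clip_norm, fused)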
@@ -82,5 +82,30 @@ class TestSum2(OpTest):
         self.check_output_with_place(self.place, check_dygraph=False)


+class TestSum3(OpTest):
+    def setUp(self):
+        self.set_npu()
+        self.init_dtype()
+        self.op_type = "sum"
+        self.place = paddle.NPUPlace(0)
+
+        x0 = np.random.random((3, 3)).astype(self.dtype)
+        self.inputs = {'X': [("x0", x0)]}
+        y = x0
+        self.outputs = {'Out': y}
+        self.attrs = {'use_mkldnn': False}
+
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, check_dygraph=False)
+
+
 if __name__ == '__main__':
     unittest.main()
@@ -636,12 +636,13 @@ class TestAdamOpV2(unittest.TestCase):
         paddle.enable_static()


-class TestNetWithEpsilonTensor(unittest.TestCase):
+class TestAdamOptimizer(unittest.TestCase):
     def _test(self,
               place,
               use_tensor=True,
               use_fluid_api=True,
-              use_global_beta_pow=False):
+              use_global_beta_pow=False,
+              flatten_param_grads=False):
         paddle.enable_static()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
@@ -649,21 +650,34 @@ class TestNetWithEpsilonTensor(unittest.TestCase):
         paddle.seed(SEED)
         np.random.seed(SEED)

-        a_np = np.random.random(size=(32, 32)).astype('float32')
-        b_np = np.random.random(size=(32, 32)).astype('float32')
-        label_np = np.random.randint(2, size=(32, 1)).astype('int64')
+        a_np = np.random.random(size=(2, 2)).astype('float32')
+        b_np = np.random.random(size=(2, 2)).astype('float32')
+        label_np = np.random.randint(2, size=(2, 1)).astype('int64')
+        weight_attr1 = paddle.ParamAttr(
+            name="weight1",
+            initializer=fluid.initializer.Constant(value=1.0),
+            trainable=True)
+        weight_attr2 = paddle.ParamAttr(
+            name="weight2",
+            initializer=fluid.initializer.Constant(value=2.0),
+            trainable=True)
+        clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)

         with paddle.static.program_guard(main_prog, startup_prog):
-            a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
-            b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
-            label = paddle.static.data(
-                name="label", shape=[32, 1], dtype='int64')
-            sum = paddle.add(a, b)
-            z = paddle.pow(sum, 2.0)
-            fc_1 = fluid.layers.fc(input=z, size=128)
-            prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
-
-            cost = fluid.layers.cross_entropy(input=prediction, label=label)
-            loss = fluid.layers.reduce_mean(cost)
+            with paddle.utils.unique_name.guard():
+                a = paddle.static.data(name="a", shape=[2, 2], dtype='float32')
+                b = paddle.static.data(name="b", shape=[2, 2], dtype='float32')
+                label = paddle.static.data(
+                    name="label", shape=[2, 1], dtype='int64')
+                sum = paddle.add(a, b)
+                z = paddle.pow(sum, 2.0)
+                fc_1 = fluid.layers.fc(input=z, size=2, param_attr=weight_attr1)
+                prediction = fluid.layers.fc(input=fc_1,
+                                             size=2,
+                                             param_attr=weight_attr2,
+                                             act='softmax')
+
+                cost = fluid.layers.cross_entropy(input=prediction, label=label)
+                loss = fluid.layers.reduce_mean(cost)
@@ -695,13 +709,17 @@ class TestNetWithEpsilonTensor(unittest.TestCase):
                         beta1=beta1,
                         beta2=beta2,
                         epsilon=epsilon,
-                        use_global_beta_pow=use_global_beta_pow)
+                        use_global_beta_pow=use_global_beta_pow,
+                        flatten_param_grads=flatten_param_grads,
+                        align_size=256,
+                        grad_clip=clip)
                 else:
                     adam = paddle.optimizer.Adam(
                         learning_rate=0.01,
                         beta1=beta1,
                         beta2=beta2,
-                        epsilon=epsilon)
+                        epsilon=epsilon,
+                        grad_clip=clip)
             else:
                 if use_fluid_api:
                     adam = fluid.optimizer.Adam(
@@ -710,31 +728,34 @@ class TestNetWithEpsilonTensor(unittest.TestCase):
                         beta2=beta2_init,
                         epsilon=epsilon_init,
                         use_global_beta_pow=use_global_beta_pow,
-                        name='a')
+                        flatten_param_grads=flatten_param_grads,
+                        align_size=256,
+                        grad_clip=clip)
                 else:
                     adam = fluid.optimizer.Adam(
                         learning_rate=0.01,
                         beta1=beta1_init,
                         beta2=beta2_init,
-                        epsilon=epsilon_init)
+                        epsilon=epsilon_init,
+                        grad_clip=clip)

             adam.minimize(loss)

-            exe = paddle.static.Executor(place)
-            exe.run(startup_prog)
-            print("Start run on {}".format(place))
-            for epoch in range(10):
-                pred_res, loss_res = exe.run(
-                    main_prog,
-                    feed={"a": a_np,
-                          "b": b_np,
-                          "label": label_np},
-                    fetch_list=[prediction, loss])
-
-                print("Epoch {} | Prediction[0]: {}, Loss: {}".format(epoch, pred_res[
-                    0], loss_res))
+            scope = fluid.Scope()
+            with fluid.scope_guard(scope):
+                exe = paddle.static.Executor(place)
+                exe.run(startup_prog)
+                print("Start run on {}".format(place))
+                for epoch in range(10):
+                    pred_res, loss_res = exe.run(
+                        main_prog,
+                        feed={"a": a_np,
+                              "b": b_np,
+                              "label": label_np},
+                        fetch_list=[prediction, loss])
+                    print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
+                        epoch, pred_res[0], loss_res))

             paddle.disable_static()
             return pred_res, loss_res
@@ -745,8 +766,10 @@ class TestNetWithEpsilonTensor(unittest.TestCase):
         for use_tensor in [True, False]:
             for use_fluid_api in [True, False]:
                 for use_global_beta_pow in [True, False]:
-                    pred, loss = self._test(place, use_tensor, use_fluid_api,
-                                            use_global_beta_pow)
-                    preds.append(pred)
-                    losses.append(loss)
+                    for flatten_param_grads in [True, False]:
+                        pred, loss = self._test(
+                            place, use_tensor, use_fluid_api,
+                            use_global_beta_pow, flatten_param_grads)
+                        preds.append(pred)
+                        losses.append(loss)
         for pred in preds:
@@ -760,6 +783,33 @@ class TestNetWithEpsilonTensor(unittest.TestCase):
         if core.is_compiled_with_cuda():
             self._test_with_place(paddle.CUDAPlace(0))

+    def test_adam_flatten_param_grads_with_regularizer(self):
+        # flatten_param_grads + regularizer is not supported yet.
+        paddle.enable_static()
+        main = fluid.Program()
+        weight_attr = paddle.ParamAttr(
+            name="weight1",
+            initializer=fluid.initializer.Constant(value=1.0),
+            regularizer=fluid.regularizer.L1DecayRegularizer(
+                regularization_coeff=0.1),
+            trainable=True)
+        with fluid.program_guard(main):
+            x = fluid.data(name='x', shape=[None, 13], dtype='float32')
+            y = fluid.data(name='y', shape=[None, 1], dtype='float32')
+            y_predict = fluid.layers.fc(input=x,
+                                        size=1,
+                                        act=None,
+                                        param_attr=weight_attr)
+            cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+            avg_cost = fluid.layers.mean(cost)
+
+            adam = fluid.optimizer.AdamOptimizer(
+                0.01, flatten_param_grads=True, align_size=256)
+            adam.minimize(avg_cost)
+            paddle.disable_static()
+
+            self.assertEqual(adam._flatten_param_grads, False)
+
     def test_adam_exception(self):
         paddle.enable_static()
         a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
......