From c269a160078593d6f66eecab721870f30d3d972f Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Mon, 21 Jun 2021 11:56:18 +0800 Subject: [PATCH] [NPU] flatten params and grads, fuse grad_clip and optimizer op (#33461) * enable npu alignment * support flatten_params/grads * support clip by global norm * remove memset in coalesce_tensor_op * fix npu kernel of sum op when input is one tensor * add ut for flatten_param_grads+regularizer * fix ut * fix typo --- paddle/fluid/framework/tensor_util.cc | 1 + paddle/fluid/memory/memcpy.cc | 1 + paddle/fluid/operators/coalesce_tensor_op.cc | 121 ++++++--- paddle/fluid/operators/sum_op_npu.cc | 8 +- .../fluid/platform/device_memory_aligment.cc | 24 +- .../fluid/platform/device_memory_aligment.h | 6 +- python/paddle/fluid/optimizer.py | 114 ++++++++- .../tests/unittests/npu/test_sum_op_npu.py | 25 ++ .../fluid/tests/unittests/test_adam_op.py | 232 +++++++++++------- 9 files changed, 389 insertions(+), 143 deletions(-) diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 32460a98ce..d8f6df3e0b 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -60,6 +60,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, << dst_place; return; } + VLOG(4) << "src:" << src_ptr << ", dst:" << dst_ptr; #ifdef PADDLE_WITH_MKLDNN auto size = src.layout() == DataLayout::kMKLDNN diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index a925957e1a..f2f8c5d1fb 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -30,6 +30,7 @@ void Copy(platform::CPUPlace, void* dst, platform::CPUPlace, const void* src, size_t num) { if (UNLIKELY(num == 0)) return; + VLOG(4) << "src: " << src << ", dst: " << dst << ", num: " << num; std::memcpy(dst, src, num); } diff --git a/paddle/fluid/operators/coalesce_tensor_op.cc b/paddle/fluid/operators/coalesce_tensor_op.cc index c1c4f14582..6ea8809dae 100644 --- a/paddle/fluid/operators/coalesce_tensor_op.cc +++ b/paddle/fluid/operators/coalesce_tensor_op.cc @@ -69,6 +69,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel { auto in_tensors = context.MultiInput("Input"); bool use_align = context.Attr("use_align"); + auto align_size = context.Attr("align_size"); if (context.Attr("check_name")) { for (size_t i = 0; i < in_var_names.size(); ++i) { @@ -95,7 +96,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel { context.Attr("dtype")); size_t size_of_dtype = framework::SizeOfType(dtype); GetMemSizeAndDtype(in_tensors, in_var_names, &numel, size_of_dtype, - context.GetPlace(), use_align); + context.GetPlace(), use_align, align_size); // Alloc the continuous space auto fused_tensor = context.Output("FusedOutput"); @@ -113,11 +114,11 @@ class CoalesceTensorOpKernel : public framework::OpKernel { framework::TensorCopy(*in_tensors[i], context.GetPlace(), dev_ctx, &sub_tensor); - offset += - use_align - ? platform::Alignment(len * size_of_dtype, context.GetPlace()) / - size_of_dtype - : len; + offset += use_align + ? platform::Alignment(len * size_of_dtype, + context.GetPlace(), align_size) / + size_of_dtype + : len; } } else if (context.Attr("set_constant")) { // TODO(Liu yuang) ADD NPU SET_CONSTANT FUNCTION. @@ -134,11 +135,11 @@ class CoalesceTensorOpKernel : public framework::OpKernel { framework::TensorCopy(*out_tensors[i], context.GetPlace(), dev_ctx, &sub_tensor); } - offset += - use_align - ? 
platform::Alignment(len * size_of_dtype, context.GetPlace()) / - size_of_dtype - : len; + offset += use_align + ? platform::Alignment(len * size_of_dtype, + context.GetPlace(), align_size) / + size_of_dtype + : len; } } @@ -146,28 +147,24 @@ class CoalesceTensorOpKernel : public framework::OpKernel { offset = 0; std::stringstream ss; ss << "alloc_space_for_vars: "; -#if defined(PADDLE_WITH_ASCEND_CL) - auto stream = - context.template device_context() - .stream(); - platform::NPUMemsetAsync( - static_cast(fused_tensor->mutable_data(dev_ctx.GetPlace())), - 0.0, fused_tensor->numel() * sizeof(T), stream); -#endif + for (size_t i = 0; i < out_tensors.size(); ++i) { size_t len = static_cast(out_tensors[i]->numel()); auto dim = out_tensors[i]->dims(); + VLOG(4) << len << " " << dim << " " << offset; out_tensors[i] ->ShareDataWith(fused_tensor->Slice( static_cast(offset), static_cast(offset + len))) .Resize(dim); len = use_align - ? platform::Alignment(len * size_of_dtype, context.GetPlace()) / + ? platform::Alignment(len * size_of_dtype, context.GetPlace(), + align_size) / size_of_dtype : len; - offset += len; ss << "output(" << out_var_names[i] << ") dim:(" << dim << ")" - << " address: " << out_tensors[i]->data() << ", "; + << " address: " << out_tensors[i]->data() << " len: " << len + << ", "; + offset += len; } PADDLE_ENFORCE_EQ( (int64_t)offset, fused_tensor->numel(), @@ -183,7 +180,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel { const std::vector &lod_tensors, const std::vector var_names, size_t *numel, const size_t &size_of_dtype, const platform::Place &place, - const bool use_align = true) const { + const bool use_align = true, const int align_size = -1) const { PADDLE_ENFORCE_EQ( lod_tensors.size(), var_names.size(), platform::errors::InvalidArgument( @@ -203,15 +200,18 @@ class CoalesceTensorOpKernel : public framework::OpKernel { size, 0, platform::errors::InvalidArgument( "The number of tensor `%s`'s elements is 0.", var_names[i])); + auto len = + use_align + ? platform::Alignment(static_cast(size) * size_of_dtype, + place, align_size) / + size_of_dtype + : static_cast(size); + VLOG(4) << size << " " << len; ss << "input(" << var_names[i] << ") dim:(" << lod_tensors[i]->dims() << ") " - << " addres:" << lod_tensors[i]->data() << ", "; - - *numel += use_align - ? platform::Alignment( - static_cast(size) * size_of_dtype, place) / - size_of_dtype - : static_cast(size); + << " addres:" << lod_tensors[i]->data() << " len: " << len + << ", "; + *numel += len; } VLOG(10) << ss.str(); } @@ -221,7 +221,42 @@ class CoalesceTensorOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override {} + void InferShape(framework::InferShapeContext *ctx) const override { + if (ctx->IsRuntime()) { + return; + } + auto use_align = ctx->Attrs().Get("use_align"); + auto align_size = ctx->Attrs().Get("align_size"); + + auto dtype = static_cast( + ctx->Attrs().Get("dtype")); + size_t size_of_dtype = framework::SizeOfType(dtype); + + auto alignment = [](size_t size, size_t align_size) { + size_t remaining = size % align_size; + auto aligned_size = + remaining == 0 ? 
size : size + (align_size - remaining); + VLOG(4) << remaining << " " << size << " " << align_size << " " + << aligned_size; + return aligned_size; + }; + VLOG(4) << "align_size: " << align_size; + if (use_align && align_size > 0) { + int64_t numel = 0; + auto dims = ctx->GetInputsDim("Input"); + for (const auto &dim : dims) { + auto size = framework::product(dim); + auto len = use_align + ? alignment(static_cast(size) * size_of_dtype, + align_size) / + size_of_dtype + : static_cast(size); + numel += len; + } + ctx->SetOutputDim("FusedOutput", framework::make_ddim({numel})); + VLOG(4) << "FusedOutput size:" << framework::make_ddim({numel}); + } + } protected: framework::OpKernelType GetKernelTypeForVar( @@ -271,6 +306,8 @@ class CoalesceTensorOpMaker : public framework::OpProtoAndCheckerMaker { "Whether to consider memory chunk and take alignment into " "account for inputs and outputs.") .SetDefault(true); + AddAttr("align_size", "The alignment size when use_align is True") + .SetDefault(-1); AddComment(R"DOC( CoalesceTensor Operator. @@ -314,6 +351,16 @@ REGISTER_OP_CUDA_KERNEL( ops::CoalesceTensorOpKernel); #endif +#if defined(PADDLE_WITH_ASCEND_CL) +REGISTER_OP_CUDA_KERNEL( + coalesce_tensor, + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel); +#endif + #ifdef PADDLE_WITH_XPU REGISTER_OP_XPU_KERNEL( coalesce_tensor, @@ -343,4 +390,14 @@ REGISTER_OP_VERSION(coalesce_tensor) "In order to optionally take memory alignment into account when " "coalescing tensors. The default value is true to be compatible " "with before.", - true)); + true)) + .AddCheckpoint( + R"ROC( + Upgrade coalesce_tensor: add a new attribute [align_size].)ROC", + paddle::framework::compatible::OpVersionDesc().NewAttr( + "align_size", + "In order to optionally take memory alignment into account when " + "coalescing tensors. The default value is -1 and use the default " + "align_size " + "of each place to be compatible with before.", + -1)); diff --git a/paddle/fluid/operators/sum_op_npu.cc b/paddle/fluid/operators/sum_op_npu.cc index a1550bde69..cbeb6285b6 100644 --- a/paddle/fluid/operators/sum_op_npu.cc +++ b/paddle/fluid/operators/sum_op_npu.cc @@ -35,9 +35,11 @@ class SumNPUKernel : public framework::OpKernel { auto place = ctx.GetPlace(); int n = static_cast(x.size()); - PADDLE_ENFORCE_EQ(n > 1, true, - platform::errors::InvalidArgument( - "The size of Input(x) list must larger or equal 2")); + + if (n == 1) { + TensorCopy(*x[0], place, out); + return; + } auto stream = ctx.template device_context() diff --git a/paddle/fluid/platform/device_memory_aligment.cc b/paddle/fluid/platform/device_memory_aligment.cc index 185646e732..383dbd23ca 100644 --- a/paddle/fluid/platform/device_memory_aligment.cc +++ b/paddle/fluid/platform/device_memory_aligment.cc @@ -16,22 +16,26 @@ limitations under the License. 
*/
 namespace paddle {
 namespace platform {
-size_t Alignment(size_t size, const platform::Place &place) {
-  size_t alignment = 1024;
-  if (platform::is_cpu_place(place)) {
-    alignment = CpuMinChunkSize();
+size_t Alignment(size_t size, const platform::Place &place, int align_size) {
+  size_t alignment = 0;
+  if (align_size > 0) {
+    alignment = align_size;
   } else {
+    alignment = 1024;
+    if (platform::is_cpu_place(place)) {
+      alignment = CpuMinChunkSize();
+    } else {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    alignment = GpuMinChunkSize();
+      alignment = GpuMinChunkSize();
 #elif defined(PADDLE_WITH_XPU)
-    // TODO(wangxi): add XpuMinChunkSize
-    alignment = alignment;
+      alignment = alignment;
 #elif defined(PADDLE_WITH_ASCEND_CL)
-    alignment = NPUMinChunkSize();
+      alignment = NPUMinChunkSize();
 #else
-    PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "Fluid is not compiled with CUDA or NPU."));
+      PADDLE_THROW(platform::errors::PreconditionNotMet(
+          "Fluid is not compiled with CUDA/XPU/NPU."));
 #endif
+    }
   }
   size_t remaining = size % alignment;
   return remaining == 0 ? size : size + (alignment - remaining);
diff --git a/paddle/fluid/platform/device_memory_aligment.h b/paddle/fluid/platform/device_memory_aligment.h
index e0f2f0f11c..dda526a755 100644
--- a/paddle/fluid/platform/device_memory_aligment.h
+++ b/paddle/fluid/platform/device_memory_aligment.h
@@ -22,9 +22,13 @@ limitations under the License. */
 #elif defined(PADDLE_WITH_ASCEND_CL)
 #include "paddle/fluid/platform/npu_info.h"
 #endif
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/platform/npu_info.h"
+#endif
 
 namespace paddle {
 namespace platform {
-size_t Alignment(size_t size, const platform::Place &place);
+size_t Alignment(size_t size, const platform::Place &place,
+                 int align_size = -1);
 }  // namespace platform
 }  // namespace paddle
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index e2ddc20b8f..14eec7af4d 100755
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -28,7 +28,7 @@ from . import framework
 from . import layers
 from . import unique_name
 from .backward import append_backward, _some_in_set_, _append_grad_suffix_, _get_no_grad_set_name
-from .clip import GradientClipBase, GradientClipByNorm, error_clip_callback, append_gradient_clip_ops
+from .clip import GradientClipBase, GradientClipByNorm, error_clip_callback, append_gradient_clip_ops, ClipGradByGlobalNorm
 from .framework import program_guard
 from .initializer import Constant
 from .layer_helper import LayerHelper
@@ -42,6 +42,7 @@ from functools import reduce
 from functools import cmp_to_key
 from .wrapped_decorator import signature_safe_contextmanager
 from .. import compat as cpt
+import warnings
 
 __all__ = [
     'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'Dpsgd', 'DecayedAdagrad',
@@ -68,7 +69,15 @@ class Optimizer(object):
                  parameter_list=None,
                  regularization=None,
                  grad_clip=None,
+                 flatten_param_grads=False,
+                 align_size=-1,
                  name=None):
+        """
+        Args:
+            flatten_param_grads (bool, optional): Whether to flatten all the parameters and grads.
+                If true, the parameters and gradients will be coalesced into contiguous memory,
+                and the grad_clip ops / optimizer ops will be fused into one operator.
+ """ # Because of the loop import, so place it in the function body from paddle.optimizer.lr import LRScheduler self._parameter_list = list( @@ -107,6 +116,8 @@ class Optimizer(object): self.regularization = regularization self._grad_clip = grad_clip self._learning_rate = learning_rate + self._flatten_param_grads = flatten_param_grads + self._align_size = align_size self._dtype = None # Infer the dtype form parameter @@ -126,7 +137,7 @@ class Optimizer(object): self._accumulators = defaultdict(lambda: dict()) # global_accumulator dict, {accum_name : acc_variable, ...} self._global_accumulators = {} - self.helper = None + self.helper = LayerHelper(self.__class__.__name__) self._opti_name_list = [] self._accumulators_holder = {} self._param_device_map = dict() @@ -739,7 +750,7 @@ class Optimizer(object): current_block.backward_block_idx] start = len(target_block.ops) - self.helper = LayerHelper(self.__class__.__name__) + self._update_param_device_map(parameters_and_grads, target_block) self._create_accumulators( target_block, @@ -958,7 +969,9 @@ class Optimizer(object): repeate_regularizer = False with framework.name_scope('regularization'): for param, grad in parameters_and_grads: - if not repeate_regularizer and param.regularizer is not None and regularization is not None: + if not repeate_regularizer and getattr( + param, 'regularizer', + None) is not None and regularization is not None: repeate_regularizer = True logging.info( "If regularizer of a Parameter has been set by 'fluid.ParamAttr' or 'fluid.WeightNormParamAttr' already. " @@ -970,6 +983,83 @@ class Optimizer(object): params_and_grads.append((param, new_grad)) return params_and_grads + def flatten_param_grads(self, params_grads): + need_flatten_params = [] + need_flatten_grads = [] + for p, g in params_grads: + if g is None: + continue + g.persistable = True + if getattr(p, 'need_clip', True) is False or getattr( + p, 'regularizer', None) is not None: + warnings.warn( + "flatten_param_grads=True will be discarded since paramter '{}''s need_clip is False or " + "the regularizer is set".format(p.name)) + self._flatten_param_grads = False + return params_grads + + need_flatten_params.append(p) + need_flatten_grads.append(g) + + shape = [np.prod(p.shape) for p in need_flatten_params] + block = need_flatten_params[0].block + + flatten_param = self.helper.create_global_variable( + name='flatten_param', + persistable=True, + dtype=need_flatten_params[0].dtype, + shape=[np.sum(shape)], + belong_to_optimizer=True) + + flatten_param.trainable = True + flatten_param.optimize_attr = need_flatten_params[0].optimize_attr + flatten_param.regularizer = need_flatten_params[0].regularizer + + flatten_grad = self.helper.create_global_variable( + name='flatten_grad', + persistable=True, + dtype=need_flatten_grads[0].dtype, + shape=[np.sum(shape)], + belong_to_optimizer=True) + + with program_guard(default_main_program()): + block.append_op( + type="coalesce_tensor", + inputs={"Input": need_flatten_params}, + outputs={ + "Output": need_flatten_params, + "FusedOutput": flatten_param + }, + attrs={ + "copy_data": True, + "use_align": True, + "align_size": self._align_size, + "dtype": need_flatten_params[0].dtype + }) + + block.append_op( + type="coalesce_tensor", + inputs={"Input": need_flatten_grads}, + outputs={ + "Output": need_flatten_grads, + "FusedOutput": flatten_grad + }, + attrs={ + "copy_data": True, + "use_align": True, + "align_size": self._align_size, + "dtype": need_flatten_grads[0].dtype + }) + + #NOTE(zhiqiu): the initializer should 
be set after coalesce_tensor op, + # so the shape of flatten_param and flatten_grad will be inferred. + self.helper.set_variable_initializer( + flatten_param, initializer=Constant(0.0)) + self.helper.set_variable_initializer( + flatten_grad, initializer=Constant(0.0)) + + return [(flatten_param, flatten_grad)] + def apply_gradients(self, params_grads): """ Second part of `minimize`, appending optimization operators for @@ -992,9 +1082,14 @@ class Optimizer(object): # ... optimizer.apply_gradients(params_grads) """ - params_grads = sorted(params_grads, key=lambda x: x[0].name) + # NOTE(zhiqiu): currently, only support ClipGradByGlobalNorm and without regularization. + if self._flatten_param_grads and self.regularization is None: + if self._grad_clip == None or isinstance(self._grad_clip, + ClipGradByGlobalNorm): + params_grads = self.flatten_param_grads(params_grads) + # 'optimizer(grad_clip)' or 'set_gradient_clip' if self._grad_clip is not None: params_grads = self._grad_clip(params_grads) @@ -2156,6 +2251,9 @@ class AdamOptimizer(Optimizer): The default value is False. use_global_beta_pow (bool, optional): Whether to use global beta_pow. If true, Adam will use global beta_pow for whole model instead of creating beta_pow for each parameter. Default is false. + flatten_param_grads (bool, optional): Whether to flatten all parameters and gradients. Default is false. + align_size (int, optional): The alignment size when flatten parameters and gradients. Default is -1, which means + use same align_size as allocator. Examples: .. code-block:: python @@ -2266,7 +2364,9 @@ class AdamOptimizer(Optimizer): grad_clip=None, name=None, lazy_mode=False, - use_global_beta_pow=False): + use_global_beta_pow=False, + flatten_param_grads=False, + align_size=-1): assert learning_rate is not None assert beta1 is not None assert beta2 is not None @@ -2276,6 +2376,8 @@ class AdamOptimizer(Optimizer): parameter_list=parameter_list, regularization=regularization, grad_clip=grad_clip, + flatten_param_grads=flatten_param_grads, + align_size=align_size, name=name) self.type = "adam" self._beta1 = beta1 diff --git a/python/paddle/fluid/tests/unittests/npu/test_sum_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_sum_op_npu.py index 6d39aa383c..2ad6cc388f 100755 --- a/python/paddle/fluid/tests/unittests/npu/test_sum_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_sum_op_npu.py @@ -82,5 +82,30 @@ class TestSum2(OpTest): self.check_output_with_place(self.place, check_dygraph=False) +class TestSum3(OpTest): + def setUp(self): + self.set_npu() + self.init_dtype() + self.op_type = "sum" + self.place = paddle.NPUPlace(0) + + x0 = np.random.random((3, 3)).astype(self.dtype) + + self.inputs = {'X': [("x0", x0)]} + y = x0 + self.outputs = {'Out': y} + + self.attrs = {'use_mkldnn': False} + + def init_dtype(self): + self.dtype = np.float16 + + def set_npu(self): + self.__class__.use_npu = True + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index 715e66e563..78ced56913 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -636,12 +636,13 @@ class TestAdamOpV2(unittest.TestCase): paddle.enable_static() -class TestNetWithEpsilonTensor(unittest.TestCase): +class TestAdamOptimizer(unittest.TestCase): def _test(self, place, 
use_tensor=True, use_fluid_api=True, - use_global_beta_pow=False): + use_global_beta_pow=False, + flatten_param_grads=False): paddle.enable_static() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() @@ -649,94 +650,114 @@ class TestNetWithEpsilonTensor(unittest.TestCase): paddle.seed(SEED) np.random.seed(SEED) - a_np = np.random.random(size=(32, 32)).astype('float32') - b_np = np.random.random(size=(32, 32)).astype('float32') - label_np = np.random.randint(2, size=(32, 1)).astype('int64') + a_np = np.random.random(size=(2, 2)).astype('float32') + b_np = np.random.random(size=(2, 2)).astype('float32') + label_np = np.random.randint(2, size=(2, 1)).astype('int64') + weight_attr1 = paddle.ParamAttr( + name="weight1", + initializer=fluid.initializer.Constant(value=1.0), + trainable=True) + weight_attr2 = paddle.ParamAttr( + name="weight2", + initializer=fluid.initializer.Constant(value=2.0), + trainable=True) + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) with paddle.static.program_guard(main_prog, startup_prog): - a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') - b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') - label = paddle.static.data( - name="label", shape=[32, 1], dtype='int64') - - sum = paddle.add(a, b) - z = paddle.pow(sum, 2.0) - - fc_1 = fluid.layers.fc(input=z, size=128) - prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') - - cost = fluid.layers.cross_entropy(input=prediction, label=label) - loss = fluid.layers.reduce_mean(cost) - beta1_init = 0.9 - beta2_init = 0.999 - epsilon_init = 1e-8 - if use_tensor: - beta1 = fluid.layers.create_global_var( - shape=[1], - value=float(beta1_init), - dtype='float32', - persistable=True, - name="beta1") - beta2 = fluid.layers.create_global_var( - shape=[1], - value=float(beta2_init), - dtype='float32', - persistable=True, - name="beta2") - epsilon = fluid.layers.create_global_var( - shape=[1], - value=float(epsilon_init), - dtype='float32', - persistable=True, - name="epsilon") - if use_fluid_api: - adam = fluid.optimizer.Adam( - learning_rate=0.01, - beta1=beta1, - beta2=beta2, - epsilon=epsilon, - use_global_beta_pow=use_global_beta_pow) - else: - adam = paddle.optimizer.Adam( - learning_rate=0.01, - beta1=beta1, - beta2=beta2, - epsilon=epsilon) - else: - if use_fluid_api: - adam = fluid.optimizer.Adam( - learning_rate=0.01, - beta1=beta1_init, - beta2=beta2_init, - epsilon=epsilon_init, - use_global_beta_pow=use_global_beta_pow, - name='a') + with paddle.utils.unique_name.guard(): + a = paddle.static.data(name="a", shape=[2, 2], dtype='float32') + b = paddle.static.data(name="b", shape=[2, 2], dtype='float32') + label = paddle.static.data( + name="label", shape=[2, 1], dtype='int64') + + sum = paddle.add(a, b) + z = paddle.pow(sum, 2.0) + + fc_1 = fluid.layers.fc(input=z, size=2, param_attr=weight_attr1) + prediction = fluid.layers.fc(input=fc_1, + size=2, + param_attr=weight_attr2, + act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + beta1_init = 0.9 + beta2_init = 0.999 + epsilon_init = 1e-8 + if use_tensor: + beta1 = fluid.layers.create_global_var( + shape=[1], + value=float(beta1_init), + dtype='float32', + persistable=True, + name="beta1") + beta2 = fluid.layers.create_global_var( + shape=[1], + value=float(beta2_init), + dtype='float32', + persistable=True, + name="beta2") + epsilon = fluid.layers.create_global_var( + shape=[1], + value=float(epsilon_init), + dtype='float32', + 
persistable=True, + name="epsilon") + if use_fluid_api: + adam = fluid.optimizer.Adam( + learning_rate=0.01, + beta1=beta1, + beta2=beta2, + epsilon=epsilon, + use_global_beta_pow=use_global_beta_pow, + flatten_param_grads=flatten_param_grads, + align_size=256, + grad_clip=clip) + else: + adam = paddle.optimizer.Adam( + learning_rate=0.01, + beta1=beta1, + beta2=beta2, + epsilon=epsilon, + grad_clip=clip) else: - adam = fluid.optimizer.Adam( - learning_rate=0.01, - beta1=beta1_init, - beta2=beta2_init, - epsilon=epsilon_init) - - adam.minimize(loss) - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - print("Start run on {}".format(place)) - for epoch in range(10): - - pred_res, loss_res = exe.run( - main_prog, - feed={"a": a_np, - "b": b_np, - "label": label_np}, - fetch_list=[prediction, loss]) - - print("Epoch {} | Prediction[0]: {}, Loss: {}".format(epoch, pred_res[ - 0], loss_res)) - paddle.disable_static() - return pred_res, loss_res + if use_fluid_api: + adam = fluid.optimizer.Adam( + learning_rate=0.01, + beta1=beta1_init, + beta2=beta2_init, + epsilon=epsilon_init, + use_global_beta_pow=use_global_beta_pow, + flatten_param_grads=flatten_param_grads, + align_size=256, + grad_clip=clip) + else: + adam = fluid.optimizer.Adam( + learning_rate=0.01, + beta1=beta1_init, + beta2=beta2_init, + epsilon=epsilon_init, + grad_clip=clip) + + adam.minimize(loss) + + scope = fluid.Scope() + with fluid.scope_guard(scope): + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("Start run on {}".format(place)) + for epoch in range(10): + pred_res, loss_res = exe.run( + main_prog, + feed={"a": a_np, + "b": b_np, + "label": label_np}, + fetch_list=[prediction, loss]) + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + paddle.disable_static() + return pred_res, loss_res def _test_with_place(self, place): preds = [] @@ -745,10 +766,12 @@ class TestNetWithEpsilonTensor(unittest.TestCase): for use_tensor in [True, False]: for use_fluid_api in [True, False]: for use_global_beta_pow in [True, False]: - pred, loss = self._test(place, use_tensor, use_fluid_api, - use_global_beta_pow) - preds.append(pred) - losses.append(loss) + for flatten_param_grads in [True, False]: + pred, loss = self._test( + place, use_tensor, use_fluid_api, + use_global_beta_pow, flatten_param_grads) + preds.append(pred) + losses.append(loss) for pred in preds: self.assertTrue(np.allclose(pred, preds[0])) for loss in losses: @@ -760,6 +783,33 @@ class TestNetWithEpsilonTensor(unittest.TestCase): if core.is_compiled_with_cuda(): self._test_with_place(paddle.CUDAPlace(0)) + def test_adam_flatten_param_grads_with_regularizer(self): + # flatten_param_grads + regularizer is not supported yet. 
+ paddle.enable_static() + main = fluid.Program() + weight_attr = paddle.ParamAttr( + name="weight1", + initializer=fluid.initializer.Constant(value=1.0), + regularizer=fluid.regularizer.L1DecayRegularizer( + regularization_coeff=0.1), + trainable=True) + with fluid.program_guard(main): + x = fluid.data(name='x', shape=[None, 13], dtype='float32') + y = fluid.data(name='y', shape=[None, 1], dtype='float32') + y_predict = fluid.layers.fc(input=x, + size=1, + act=None, + param_attr=weight_attr) + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + + adam = fluid.optimizer.AdamOptimizer( + 0.01, flatten_param_grads=True, align_size=256) + adam.minimize(avg_cost) + paddle.disable_static() + + self.assertEqual(adam._flatten_param_grads, False) + def test_adam_exception(self): paddle.enable_static() a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') -- GitLab
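
Usage note (editor's addition, not part of the patch): the sketch below shows how the new flatten_param_grads / align_size options introduced by this change can be exercised from the fluid static-graph API. It is distilled from the new TestAdamOptimizer test; the toy network, tensor shapes, clip_norm value, and CPU place are illustrative assumptions, not anything prescribed by the patch.

    # Hypothetical example; names and shapes are made up for illustration.
    import numpy as np
    import paddle
    import paddle.fluid as fluid

    paddle.enable_static()
    main_prog = paddle.static.Program()
    startup_prog = paddle.static.Program()
    with paddle.static.program_guard(main_prog, startup_prog):
        x = paddle.static.data(name='x', shape=[None, 13], dtype='float32')
        y = paddle.static.data(name='y', shape=[None, 1], dtype='float32')
        pred = fluid.layers.fc(input=x, size=1, act=None)
        loss = fluid.layers.reduce_mean(
            fluid.layers.square_error_cost(input=pred, label=y))

        # Only ClipGradByGlobalNorm (or no clip at all) is fused with the
        # flattened gradients; other clips keep the per-parameter path.
        clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
        adam = fluid.optimizer.Adam(
            learning_rate=0.01,
            grad_clip=clip,
            flatten_param_grads=True,  # coalesce params/grads into one buffer
            align_size=256)            # per-tensor alignment inside the buffer
        adam.minimize(loss)

    place = paddle.CPUPlace()  # paddle.NPUPlace(0) on an Ascend build
    exe = paddle.static.Executor(place)
    exe.run(startup_prog)
    exe.run(main_prog,
            feed={'x': np.random.rand(4, 13).astype('float32'),
                  'y': np.random.rand(4, 1).astype('float32')},
            fetch_list=[loss])

With flatten_param_grads=True and no per-parameter regularizer or need_clip=False, the optimizer coalesces all parameters and gradients into a single flatten_param / flatten_grad buffer via coalesce_tensor, so the gradient clip and the Adam update each run as one fused op instead of one op per parameter.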