Unverified · Commit c269a160, authored by Leo Chen, committed by GitHub

[NPU] flatten params and grads, fuse grad_clip and optimizer op (#33461)

* enable npu alignment

* support flatten_params/grads

* support clip by global norm

* remove memset in coalesce_tensor_op

* fix npu kernel of sum op when input is one tensor

* add ut for flatten_param_grads+regularizer

* fix ut

* fix typo
Parent fa821ef9
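
For orientation, here is a minimal sketch (not part of the commit) of how the new switches are meant to be used, modeled on the fluid.optimizer.Adam test added further below; the toy network, shapes and variable names are illustrative only.

    import paddle
    import paddle.fluid as fluid

    paddle.enable_static()
    main_prog, startup_prog = paddle.static.Program(), paddle.static.Program()
    with paddle.static.program_guard(main_prog, startup_prog):
        x = paddle.static.data(name='x', shape=[2, 2], dtype='float32')
        label = paddle.static.data(name='label', shape=[2, 1], dtype='int64')
        fc = fluid.layers.fc(input=x, size=2, act='softmax')
        loss = fluid.layers.reduce_mean(
            fluid.layers.cross_entropy(input=fc, label=label))

        # Per this change, flatten_param_grads is only applied together with
        # ClipGradByGlobalNorm (or no grad_clip) and without regularization.
        clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
        adam = fluid.optimizer.Adam(
            learning_rate=0.01,
            grad_clip=clip,
            flatten_param_grads=True,  # coalesce params/grads into one tensor each
            align_size=256)            # alignment passed down to coalesce_tensor
        adam.minimize(loss)

With the flag enabled, the gradient clipping and optimizer updates then operate on a single fused parameter/gradient pair instead of one op per parameter.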
@@ -60,6 +60,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
             << dst_place;
     return;
   }
+  VLOG(4) << "src:" << src_ptr << ", dst:" << dst_ptr;
 #ifdef PADDLE_WITH_MKLDNN
   auto size = src.layout() == DataLayout::kMKLDNN
......
@@ -30,6 +30,7 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
                                                   platform::CPUPlace,
                                                   const void* src, size_t num) {
   if (UNLIKELY(num == 0)) return;
+  VLOG(4) << "src: " << src << ", dst: " << dst << ", num: " << num;
   std::memcpy(dst, src, num);
 }
......
@@ -69,6 +69,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
     auto in_tensors = context.MultiInput<framework::LoDTensor>("Input");
     bool use_align = context.Attr<bool>("use_align");
+    auto align_size = context.Attr<int>("align_size");

     if (context.Attr<bool>("check_name")) {
       for (size_t i = 0; i < in_var_names.size(); ++i) {
@@ -95,7 +96,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
         context.Attr<int>("dtype"));
     size_t size_of_dtype = framework::SizeOfType(dtype);
     GetMemSizeAndDtype(in_tensors, in_var_names, &numel, size_of_dtype,
-                       context.GetPlace(), use_align);
+                       context.GetPlace(), use_align, align_size);

     // Alloc the continuous space
     auto fused_tensor = context.Output<framework::LoDTensor>("FusedOutput");
@@ -113,9 +114,9 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
         framework::TensorCopy(*in_tensors[i], context.GetPlace(), dev_ctx,
                               &sub_tensor);

-        offset +=
-            use_align
-                ? platform::Alignment(len * size_of_dtype, context.GetPlace()) /
-                      size_of_dtype
-                : len;
+        offset += use_align
+                      ? platform::Alignment(len * size_of_dtype,
+                                            context.GetPlace(), align_size) /
+                            size_of_dtype
+                      : len;
       }
@@ -134,9 +135,9 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
         framework::TensorCopy(*out_tensors[i], context.GetPlace(), dev_ctx,
                               &sub_tensor);
       }
-      offset +=
-          use_align
-              ? platform::Alignment(len * size_of_dtype, context.GetPlace()) /
-                    size_of_dtype
-              : len;
+      offset += use_align
+                    ? platform::Alignment(len * size_of_dtype,
+                                          context.GetPlace(), align_size) /
+                          size_of_dtype
+                    : len;
     }
@@ -146,28 +147,24 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
     offset = 0;
     std::stringstream ss;
     ss << "alloc_space_for_vars: ";
-#if defined(PADDLE_WITH_ASCEND_CL)
-    auto stream =
-        context.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-    platform::NPUMemsetAsync(
-        static_cast<void *>(fused_tensor->mutable_data<T>(dev_ctx.GetPlace())),
-        0.0, fused_tensor->numel() * sizeof(T), stream);
-#endif
     for (size_t i = 0; i < out_tensors.size(); ++i) {
       size_t len = static_cast<size_t>(out_tensors[i]->numel());
       auto dim = out_tensors[i]->dims();
+      VLOG(4) << len << " " << dim << " " << offset;
       out_tensors[i]
           ->ShareDataWith(fused_tensor->Slice(
               static_cast<int64_t>(offset), static_cast<int64_t>(offset + len)))
           .Resize(dim);
       len = use_align
-                ? platform::Alignment(len * size_of_dtype, context.GetPlace()) /
+                ? platform::Alignment(len * size_of_dtype, context.GetPlace(),
+                                      align_size) /
                       size_of_dtype
                 : len;
-      offset += len;
       ss << "output(" << out_var_names[i] << ") dim:(" << dim << ")"
-         << " address: " << out_tensors[i]->data<void>() << ", ";
+         << " address: " << out_tensors[i]->data<void>() << " len: " << len
+         << ", ";
+      offset += len;
     }
     PADDLE_ENFORCE_EQ(
         (int64_t)offset, fused_tensor->numel(),
@@ -183,7 +180,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
       const std::vector<const framework::LoDTensor *> &lod_tensors,
       const std::vector<std::string> var_names, size_t *numel,
       const size_t &size_of_dtype, const platform::Place &place,
-      const bool use_align = true) const {
+      const bool use_align = true, const int align_size = -1) const {
     PADDLE_ENFORCE_EQ(
         lod_tensors.size(), var_names.size(),
         platform::errors::InvalidArgument(
@@ -203,15 +200,18 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
           size, 0,
           platform::errors::InvalidArgument(
               "The number of tensor `%s`'s elements is 0.", var_names[i]));
-      ss << "input(" << var_names[i] << ") dim:(" << lod_tensors[i]->dims()
-         << ") "
-         << " addres:" << lod_tensors[i]->data<void>() << ", ";
-
-      *numel += use_align
-                    ? platform::Alignment(
-                          static_cast<size_t>(size) * size_of_dtype, place) /
-                          size_of_dtype
-                    : static_cast<size_t>(size);
+      auto len =
+          use_align
+              ? platform::Alignment(static_cast<size_t>(size) * size_of_dtype,
+                                    place, align_size) /
+                    size_of_dtype
+              : static_cast<size_t>(size);
+      VLOG(4) << size << " " << len;
+      ss << "input(" << var_names[i] << ") dim:(" << lod_tensors[i]->dims()
+         << ") "
+         << " addres:" << lod_tensors[i]->data<void>() << " len: " << len
+         << ", ";
+      *numel += len;
     }
     VLOG(10) << ss.str();
   }
@@ -221,7 +221,42 @@ class CoalesceTensorOp : public framework::OperatorWithKernel {
  public:
  using framework::OperatorWithKernel::OperatorWithKernel;

-  void InferShape(framework::InferShapeContext *ctx) const override {}
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    if (ctx->IsRuntime()) {
+      return;
+    }
+    auto use_align = ctx->Attrs().Get<bool>("use_align");
+    auto align_size = ctx->Attrs().Get<int>("align_size");
+
+    auto dtype = static_cast<framework::proto::VarType::Type>(
+        ctx->Attrs().Get<int>("dtype"));
+    size_t size_of_dtype = framework::SizeOfType(dtype);
+
+    auto alignment = [](size_t size, size_t align_size) {
+      size_t remaining = size % align_size;
+      auto aligned_size =
+          remaining == 0 ? size : size + (align_size - remaining);
+      VLOG(4) << remaining << " " << size << " " << align_size << " "
+              << aligned_size;
+      return aligned_size;
+    };
+    VLOG(4) << "align_size: " << align_size;
+    if (use_align && align_size > 0) {
+      int64_t numel = 0;
+      auto dims = ctx->GetInputsDim("Input");
+      for (const auto &dim : dims) {
+        auto size = framework::product(dim);
+        auto len = use_align
+                       ? alignment(static_cast<size_t>(size) * size_of_dtype,
+                                   align_size) /
+                             size_of_dtype
+                       : static_cast<size_t>(size);
+        numel += len;
+      }
+      ctx->SetOutputDim("FusedOutput", framework::make_ddim({numel}));
+      VLOG(4) << "FusedOutput size:" << framework::make_ddim({numel});
+    }
+  }

  protected:
  framework::OpKernelType GetKernelTypeForVar(
@@ -271,6 +306,8 @@ class CoalesceTensorOpMaker : public framework::OpProtoAndCheckerMaker {
                   "Whether to consider memory chunk and take alignment into "
                   "account for inputs and outputs.")
         .SetDefault(true);
+    AddAttr<int>("align_size", "The alignment size when use_align is True")
+        .SetDefault(-1);
     AddComment(R"DOC(
 CoalesceTensor Operator.
@@ -314,6 +351,16 @@ REGISTER_OP_CUDA_KERNEL(
     ops::CoalesceTensorOpKernel<paddle::platform::CUDADeviceContext, double>);
 #endif

+#if defined(PADDLE_WITH_ASCEND_CL)
+REGISTER_OP_CUDA_KERNEL(
+    coalesce_tensor,
+    ops::CoalesceTensorOpKernel<paddle::platform::NPUDeviceContext,
+                                plat::float16>,
+    ops::CoalesceTensorOpKernel<paddle::platform::NPUDeviceContext, int>,
+    ops::CoalesceTensorOpKernel<paddle::platform::NPUDeviceContext, float>,
+    ops::CoalesceTensorOpKernel<paddle::platform::NPUDeviceContext, double>);
+#endif
+
 #ifdef PADDLE_WITH_XPU
 REGISTER_OP_XPU_KERNEL(
     coalesce_tensor,
@@ -343,4 +390,14 @@ REGISTER_OP_VERSION(coalesce_tensor)
             "In order to optionally take memory alignment into account when "
             "coalescing tensors. The default value is true to be compatible "
             "with before.",
-            true));
+            true))
+    .AddCheckpoint(
+        R"ROC(
+            Upgrade coalesce_tensor: add a new attribute [align_size].)ROC",
+        paddle::framework::compatible::OpVersionDesc().NewAttr(
+            "align_size",
+            "In order to optionally take memory alignment into account when "
+            "coalescing tensors. The default value is -1 and use the default "
+            "align_size "
+            "of each place to be compatible with before.",
+            -1));
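
As a side note (not part of the diff), the FusedOutput size that the new InferShape computes can be reproduced in a few lines of Python; the dtype, shapes and align_size below are made-up example values, not defaults from the operator.

    # Mirrors the alignment lambda added to CoalesceTensorOp::InferShape:
    # each input's byte size is rounded up to a multiple of align_size,
    # then converted back to an element count and summed.
    def aligned_len(numel, size_of_dtype, align_size):
        size = numel * size_of_dtype                # bytes used by the tensor
        remaining = size % align_size
        aligned = size if remaining == 0 else size + (align_size - remaining)
        return aligned // size_of_dtype             # back to element count

    size_of_dtype = 4                               # e.g. float32
    align_size = 256
    numels = [4, 2]                                 # two inputs, shapes (2, 2) and (2,)
    fused_numel = sum(aligned_len(n, size_of_dtype, align_size) for n in numels)
    print(fused_numel)                              # 64 + 64 = 128 elements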
@@ -35,9 +35,11 @@ class SumNPUKernel : public framework::OpKernel<T> {
     auto place = ctx.GetPlace();

     int n = static_cast<int>(x.size());
-    PADDLE_ENFORCE_EQ(n > 1, true,
-                      platform::errors::InvalidArgument(
-                          "The size of Input(x) list must larger or equal 2"));
+    if (n == 1) {
+      TensorCopy(*x[0], place, out);
+      return;
+    }

     auto stream =
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
......
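A brief aside on the sum-op change above (not part of the diff): summing a single input is an identity, which is why the NPU kernel can simply TensorCopy when n == 1. The tiny NumPy check below, with an arbitrary 3x3 array, illustrates the expected semantics.

    import numpy as np

    x0 = np.random.random((3, 3)).astype(np.float32)
    # The sum over a one-element input list is the input itself, so copying
    # x[0] to the output (as the patched NPU kernel does) is sufficient.
    assert np.array_equal(np.sum([x0], axis=0), x0)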
@@ -16,23 +16,27 @@ limitations under the License. */

 namespace paddle {
 namespace platform {
-size_t Alignment(size_t size, const platform::Place &place) {
-  size_t alignment = 1024;
+size_t Alignment(size_t size, const platform::Place &place, int align_size) {
+  size_t alignment = 0;
+  if (align_size > 0) {
+    alignment = align_size;
+  } else {
+    alignment = 1024;
     if (platform::is_cpu_place(place)) {
       alignment = CpuMinChunkSize();
     } else {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       alignment = GpuMinChunkSize();
 #elif defined(PADDLE_WITH_XPU)
-    // TODO(wangxi): add XpuMinChunkSize
       alignment = alignment;
 #elif defined(PADDLE_WITH_ASCEND_CL)
       alignment = NPUMinChunkSize();
 #else
       PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "Fluid is not compiled with CUDA or NPU."));
+          "Fluid is not compiled with CUDA/XPU/NPU."));
 #endif
     }
+  }
   size_t remaining = size % alignment;
   return remaining == 0 ? size : size + (alignment - remaining);
 }
......
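To make the rounding rule above concrete, here is an illustrative Python rendering of platform::Alignment; the 1024 and 256 values are example inputs for this sketch, not values taken from any particular device.

    def alignment_bytes(size, min_chunk_size, align_size=-1):
        # align_size > 0 overrides the place-derived minimum chunk size,
        # mirroring the new branch added to platform::Alignment.
        alignment = align_size if align_size > 0 else min_chunk_size
        remaining = size % alignment
        return size if remaining == 0 else size + (alignment - remaining)

    print(alignment_bytes(100, min_chunk_size=1024))                  # 1024
    print(alignment_bytes(100, min_chunk_size=1024, align_size=256))  # 256
    print(alignment_bytes(512, min_chunk_size=1024, align_size=256))  # 512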
@@ -22,9 +22,13 @@ limitations under the License. */
 #elif defined(PADDLE_WITH_ASCEND_CL)
 #include "paddle/fluid/platform/npu_info.h"
 #endif
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/platform/npu_info.h"
+#endif

 namespace paddle {
 namespace platform {
-size_t Alignment(size_t size, const platform::Place &place);
+size_t Alignment(size_t size, const platform::Place &place,
+                 int align_size = -1);
 }  // namespace platform
 }  // namespace paddle
@@ -28,7 +28,7 @@ from . import framework
 from . import layers
 from . import unique_name
 from .backward import append_backward, _some_in_set_, _append_grad_suffix_, _get_no_grad_set_name
-from .clip import GradientClipBase, GradientClipByNorm, error_clip_callback, append_gradient_clip_ops
+from .clip import GradientClipBase, GradientClipByNorm, error_clip_callback, append_gradient_clip_ops, ClipGradByGlobalNorm
 from .framework import program_guard
 from .initializer import Constant
 from .layer_helper import LayerHelper
@@ -42,6 +42,7 @@ from functools import reduce
 from functools import cmp_to_key
 from .wrapped_decorator import signature_safe_contextmanager
 from .. import compat as cpt
+import warnings

 __all__ = [
     'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'Dpsgd', 'DecayedAdagrad',
@@ -68,7 +69,15 @@ class Optimizer(object):
                  parameter_list=None,
                  regularization=None,
                  grad_clip=None,
+                 flatten_param_grads=False,
+                 align_size=-1,
                  name=None):
+        """
+        Args:
+            flatten_param_grads (bool, optional): Whether to flatten all the parameters and grads.
+                If true, the parameters and gradients will be coalesce to contiguous mempry,
+                and the grad_clip ops / optimizer ops will be fuse to one operator.
+        """
         # Because of the loop import, so place it in the function body
         from paddle.optimizer.lr import LRScheduler
         self._parameter_list = list(
@@ -107,6 +116,8 @@ class Optimizer(object):
         self.regularization = regularization
         self._grad_clip = grad_clip
         self._learning_rate = learning_rate
+        self._flatten_param_grads = flatten_param_grads
+        self._align_size = align_size

         self._dtype = None
         # Infer the dtype form parameter
@@ -126,7 +137,7 @@ class Optimizer(object):
         self._accumulators = defaultdict(lambda: dict())
         # global_accumulator dict, {accum_name : acc_variable, ...}
         self._global_accumulators = {}
-        self.helper = None
+        self.helper = LayerHelper(self.__class__.__name__)
         self._opti_name_list = []
         self._accumulators_holder = {}
         self._param_device_map = dict()
@@ -739,7 +750,7 @@ class Optimizer(object):
                 current_block.backward_block_idx]

         start = len(target_block.ops)
-        self.helper = LayerHelper(self.__class__.__name__)
         self._update_param_device_map(parameters_and_grads, target_block)
         self._create_accumulators(
             target_block,
@@ -958,7 +969,9 @@ class Optimizer(object):
             repeate_regularizer = False
             with framework.name_scope('regularization'):
                 for param, grad in parameters_and_grads:
-                    if not repeate_regularizer and param.regularizer is not None and regularization is not None:
+                    if not repeate_regularizer and getattr(
+                            param, 'regularizer',
+                            None) is not None and regularization is not None:
                         repeate_regularizer = True
                         logging.info(
                             "If regularizer of a Parameter has been set by 'fluid.ParamAttr' or 'fluid.WeightNormParamAttr' already. "
@@ -970,6 +983,83 @@ class Optimizer(object):
             params_and_grads.append((param, new_grad))
         return params_and_grads

+    def flatten_param_grads(self, params_grads):
+        need_flatten_params = []
+        need_flatten_grads = []
+        for p, g in params_grads:
+            if g is None:
+                continue
+            g.persistable = True
+            if getattr(p, 'need_clip', True) is False or getattr(
+                    p, 'regularizer', None) is not None:
+                warnings.warn(
+                    "flatten_param_grads=True will be discarded since paramter '{}''s need_clip is False or "
+                    "the regularizer is set".format(p.name))
+                self._flatten_param_grads = False
+                return params_grads
+
+            need_flatten_params.append(p)
+            need_flatten_grads.append(g)
+
+        shape = [np.prod(p.shape) for p in need_flatten_params]
+        block = need_flatten_params[0].block
+
+        flatten_param = self.helper.create_global_variable(
+            name='flatten_param',
+            persistable=True,
+            dtype=need_flatten_params[0].dtype,
+            shape=[np.sum(shape)],
+            belong_to_optimizer=True)
+
+        flatten_param.trainable = True
+        flatten_param.optimize_attr = need_flatten_params[0].optimize_attr
+        flatten_param.regularizer = need_flatten_params[0].regularizer
+
+        flatten_grad = self.helper.create_global_variable(
+            name='flatten_grad',
+            persistable=True,
+            dtype=need_flatten_grads[0].dtype,
+            shape=[np.sum(shape)],
+            belong_to_optimizer=True)
+
+        with program_guard(default_main_program()):
+            block.append_op(
+                type="coalesce_tensor",
+                inputs={"Input": need_flatten_params},
+                outputs={
+                    "Output": need_flatten_params,
+                    "FusedOutput": flatten_param
+                },
+                attrs={
+                    "copy_data": True,
+                    "use_align": True,
+                    "align_size": self._align_size,
+                    "dtype": need_flatten_params[0].dtype
+                })
+
+            block.append_op(
+                type="coalesce_tensor",
+                inputs={"Input": need_flatten_grads},
+                outputs={
+                    "Output": need_flatten_grads,
+                    "FusedOutput": flatten_grad
+                },
+                attrs={
+                    "copy_data": True,
+                    "use_align": True,
+                    "align_size": self._align_size,
+                    "dtype": need_flatten_grads[0].dtype
+                })
+
+        #NOTE(zhiqiu): the initializer should be set after coalesce_tensor op,
+        # so the shape of flatten_param and flatten_grad will be inferred.
+        self.helper.set_variable_initializer(
+            flatten_param, initializer=Constant(0.0))
+        self.helper.set_variable_initializer(
+            flatten_grad, initializer=Constant(0.0))
+
+        return [(flatten_param, flatten_grad)]
+
     def apply_gradients(self, params_grads):
         """
         Second part of `minimize`, appending optimization operators for
@@ -992,9 +1082,14 @@ class Optimizer(object):
                 # ...
                 optimizer.apply_gradients(params_grads)
         """
         params_grads = sorted(params_grads, key=lambda x: x[0].name)

+        # NOTE(zhiqiu): currently, only support ClipGradByGlobalNorm and without regularization.
+        if self._flatten_param_grads and self.regularization is None:
+            if self._grad_clip == None or isinstance(self._grad_clip,
+                                                     ClipGradByGlobalNorm):
+                params_grads = self.flatten_param_grads(params_grads)
+
         # 'optimizer(grad_clip)' or 'set_gradient_clip'
         if self._grad_clip is not None:
             params_grads = self._grad_clip(params_grads)
@@ -2156,6 +2251,9 @@ class AdamOptimizer(Optimizer):
             The default value is False.
         use_global_beta_pow (bool, optional): Whether to use global beta_pow. If true, Adam will use global beta_pow
             for whole model instead of creating beta_pow for each parameter. Default is false.
+        flatten_param_grads (bool, optional): Whether to flatten all parameters and gradients. Default is false.
+        align_size (int, optional): The alignment size when flatten parameters and gradients. Default is -1, which means
+            use same align_size as allocator.

     Examples:
         .. code-block:: python
@@ -2266,7 +2364,9 @@ class AdamOptimizer(Optimizer):
                  grad_clip=None,
                  name=None,
                  lazy_mode=False,
-                 use_global_beta_pow=False):
+                 use_global_beta_pow=False,
+                 flatten_param_grads=False,
+                 align_size=-1):
         assert learning_rate is not None
         assert beta1 is not None
         assert beta2 is not None
@@ -2276,6 +2376,8 @@ class AdamOptimizer(Optimizer):
             parameter_list=parameter_list,
             regularization=regularization,
             grad_clip=grad_clip,
+            flatten_param_grads=flatten_param_grads,
+            align_size=align_size,
             name=name)
         self.type = "adam"
         self._beta1 = beta1
......
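One more aside (not part of the diff): ClipGradByGlobalNorm composes cleanly with flattening because the global norm of the per-parameter gradients equals the norm of their concatenation, so clipping the single fused gradient applies exactly the scale that per-parameter clipping would. A quick NumPy check with arbitrary example shapes:

    import numpy as np

    grads = [np.random.randn(2, 2), np.random.randn(3)]   # illustrative gradients
    per_param = np.sqrt(sum(float((g ** 2).sum()) for g in grads))
    fused = np.linalg.norm(np.concatenate([g.ravel() for g in grads]))
    assert np.isclose(per_param, fused)

    clip_norm = 1.0
    # The single scale applied to the fused gradient is the same one the
    # unfused ClipGradByGlobalNorm would apply to every individual gradient.
    scale = clip_norm / max(clip_norm, fused)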
@@ -82,5 +82,30 @@ class TestSum2(OpTest):
         self.check_output_with_place(self.place, check_dygraph=False)


+class TestSum3(OpTest):
+    def setUp(self):
+        self.set_npu()
+        self.init_dtype()
+        self.op_type = "sum"
+        self.place = paddle.NPUPlace(0)
+
+        x0 = np.random.random((3, 3)).astype(self.dtype)
+        self.inputs = {'X': [("x0", x0)]}
+        y = x0
+        self.outputs = {'Out': y}
+        self.attrs = {'use_mkldnn': False}
+
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, check_dygraph=False)
+
+
 if __name__ == '__main__':
     unittest.main()
@@ -636,12 +636,13 @@ class TestAdamOpV2(unittest.TestCase):
         paddle.enable_static()


-class TestNetWithEpsilonTensor(unittest.TestCase):
+class TestAdamOptimizer(unittest.TestCase):
     def _test(self,
               place,
               use_tensor=True,
               use_fluid_api=True,
-              use_global_beta_pow=False):
+              use_global_beta_pow=False,
+              flatten_param_grads=False):
         paddle.enable_static()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
@@ -649,21 +650,34 @@ class TestNetWithEpsilonTensor(unittest.TestCase):
         paddle.seed(SEED)
         np.random.seed(SEED)

-        a_np = np.random.random(size=(32, 32)).astype('float32')
-        b_np = np.random.random(size=(32, 32)).astype('float32')
-        label_np = np.random.randint(2, size=(32, 1)).astype('int64')
+        a_np = np.random.random(size=(2, 2)).astype('float32')
+        b_np = np.random.random(size=(2, 2)).astype('float32')
+        label_np = np.random.randint(2, size=(2, 1)).astype('int64')
+        weight_attr1 = paddle.ParamAttr(
+            name="weight1",
+            initializer=fluid.initializer.Constant(value=1.0),
+            trainable=True)
+        weight_attr2 = paddle.ParamAttr(
+            name="weight2",
+            initializer=fluid.initializer.Constant(value=2.0),
+            trainable=True)
+        clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)

         with paddle.static.program_guard(main_prog, startup_prog):
-            a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
-            b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
-            label = paddle.static.data(
-                name="label", shape=[32, 1], dtype='int64')
-            sum = paddle.add(a, b)
-            z = paddle.pow(sum, 2.0)
-            fc_1 = fluid.layers.fc(input=z, size=128)
-            prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
-
-            cost = fluid.layers.cross_entropy(input=prediction, label=label)
-            loss = fluid.layers.reduce_mean(cost)
+            with paddle.utils.unique_name.guard():
+                a = paddle.static.data(name="a", shape=[2, 2], dtype='float32')
+                b = paddle.static.data(name="b", shape=[2, 2], dtype='float32')
+                label = paddle.static.data(
+                    name="label", shape=[2, 1], dtype='int64')
+                sum = paddle.add(a, b)
+                z = paddle.pow(sum, 2.0)
+                fc_1 = fluid.layers.fc(input=z, size=2, param_attr=weight_attr1)
+                prediction = fluid.layers.fc(input=fc_1,
+                                             size=2,
+                                             param_attr=weight_attr2,
+                                             act='softmax')
+
+                cost = fluid.layers.cross_entropy(input=prediction, label=label)
+                loss = fluid.layers.reduce_mean(cost)
@@ -695,13 +709,17 @@ class TestNetWithEpsilonTensor(unittest.TestCase):
                         beta1=beta1,
                         beta2=beta2,
                         epsilon=epsilon,
-                        use_global_beta_pow=use_global_beta_pow)
+                        use_global_beta_pow=use_global_beta_pow,
+                        flatten_param_grads=flatten_param_grads,
+                        align_size=256,
+                        grad_clip=clip)
                 else:
                     adam = paddle.optimizer.Adam(
                         learning_rate=0.01,
                         beta1=beta1,
                         beta2=beta2,
-                        epsilon=epsilon)
+                        epsilon=epsilon,
+                        grad_clip=clip)
             else:
                 if use_fluid_api:
                     adam = fluid.optimizer.Adam(
@@ -710,31 +728,34 @@ class TestNetWithEpsilonTensor(unittest.TestCase):
                         beta2=beta2_init,
                         epsilon=epsilon_init,
                         use_global_beta_pow=use_global_beta_pow,
-                        name='a')
+                        flatten_param_grads=flatten_param_grads,
+                        align_size=256,
+                        grad_clip=clip)
                 else:
                     adam = fluid.optimizer.Adam(
                         learning_rate=0.01,
                         beta1=beta1_init,
                         beta2=beta2_init,
-                        epsilon=epsilon_init)
+                        epsilon=epsilon_init,
+                        grad_clip=clip)

             adam.minimize(loss)

-            exe = paddle.static.Executor(place)
-            exe.run(startup_prog)
-            print("Start run on {}".format(place))
-            for epoch in range(10):
-                pred_res, loss_res = exe.run(
-                    main_prog,
-                    feed={"a": a_np,
-                          "b": b_np,
-                          "label": label_np},
-                    fetch_list=[prediction, loss])
-
-                print("Epoch {} | Prediction[0]: {}, Loss: {}".format(epoch, pred_res[
-                    0], loss_res))
+            scope = fluid.Scope()
+            with fluid.scope_guard(scope):
+                exe = paddle.static.Executor(place)
+                exe.run(startup_prog)
+                print("Start run on {}".format(place))
+                for epoch in range(10):
+                    pred_res, loss_res = exe.run(
+                        main_prog,
+                        feed={"a": a_np,
+                              "b": b_np,
+                              "label": label_np},
+                        fetch_list=[prediction, loss])
+                    print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
+                        epoch, pred_res[0], loss_res))

             paddle.disable_static()
             return pred_res, loss_res
@@ -745,8 +766,10 @@ class TestNetWithEpsilonTensor(unittest.TestCase):
         for use_tensor in [True, False]:
             for use_fluid_api in [True, False]:
                 for use_global_beta_pow in [True, False]:
-                    pred, loss = self._test(place, use_tensor, use_fluid_api,
-                                            use_global_beta_pow)
-                    preds.append(pred)
-                    losses.append(loss)
+                    for flatten_param_grads in [True, False]:
+                        pred, loss = self._test(
+                            place, use_tensor, use_fluid_api,
+                            use_global_beta_pow, flatten_param_grads)
+                        preds.append(pred)
+                        losses.append(loss)
         for pred in preds:
@@ -760,6 +783,33 @@ class TestNetWithEpsilonTensor(unittest.TestCase):
         if core.is_compiled_with_cuda():
             self._test_with_place(paddle.CUDAPlace(0))

+    def test_adam_flatten_param_grads_with_regularizer(self):
+        # flatten_param_grads + regularizer is not supported yet.
+        paddle.enable_static()
+        main = fluid.Program()
+        weight_attr = paddle.ParamAttr(
+            name="weight1",
+            initializer=fluid.initializer.Constant(value=1.0),
+            regularizer=fluid.regularizer.L1DecayRegularizer(
+                regularization_coeff=0.1),
+            trainable=True)
+        with fluid.program_guard(main):
+            x = fluid.data(name='x', shape=[None, 13], dtype='float32')
+            y = fluid.data(name='y', shape=[None, 1], dtype='float32')
+            y_predict = fluid.layers.fc(input=x,
+                                        size=1,
+                                        act=None,
+                                        param_attr=weight_attr)
+            cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+            avg_cost = fluid.layers.mean(cost)
+
+            adam = fluid.optimizer.AdamOptimizer(
+                0.01, flatten_param_grads=True, align_size=256)
+            adam.minimize(avg_cost)
+            paddle.disable_static()
+
+            self.assertEqual(adam._flatten_param_grads, False)
+
     def test_adam_exception(self):
         paddle.enable_static()
         a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
......