Unverified · Commit d1e8b1e2 authored by Yiqun Liu, committed by GitHub

Cherry-pick fixes for operator precision. (#52705)

* Fix the scale kernel for low-precision types (cherry-pick of #50998).

* Fix the FP16 precision problem of add_n. (#50129)

* Change squared_l2_norm to reuse ReduceKernel and register fp16 and bf16 kernels (cherry-pick of #48315).

* Cherry-pick the fix of MPTypeTrait in KP, implemented in #50993.

* Cherry-pick the multi-precision support of AdamW for bf16 (#48041).

* Fix a compilation error.

* Cherry-pick the fix of CubTensorReduceImpl for bfloat16 in #50993.

* Fix unittest.

---------
Co-authored-by: liuruyan <44316842+liuruyan@users.noreply.github.com>
Parent d12588d2
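All of the kernel changes in this commit target the same failure mode: accumulating or transforming fp16/bf16 values directly in the storage type drifts, while accumulating in a float intermediate (the MPTypeTrait pattern used throughout the diff) and casting the result back once does not. The snippet below is a minimal NumPy sketch of that effect; it is an illustration only, uses no Paddle code, and the tensor size is arbitrary.

```python
# Illustrative sketch only, not Paddle code.
import numpy as np

np.random.seed(0)
xs = np.random.random(4096).astype(np.float16)  # many small fp16 addends, sum ~ 2048

# Accumulate entirely in fp16: once the running sum is large,
# each new addend loses most of its bits to rounding.
acc_fp16 = np.float16(0.0)
for v in xs:
    acc_fp16 = np.float16(acc_fp16 + v)

# Accumulate in fp32 and cast back once at the end (the MPType pattern).
acc_mp = np.float16(xs.astype(np.float32).sum())

reference = xs.astype(np.float64).sum()
print("fp16 accumulation error:", abs(float(acc_fp16) - reference))  # large drift
print("fp32 accumulation error:", abs(float(acc_mp) - reference))    # within one fp16 ulp
```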
......@@ -986,9 +986,11 @@ template <typename Tx,
template <typename>
class ReduceOp,
typename TransformOp>
static typename std::enable_if<!std::is_same<Tx, phi::dtype::float16>::value,
static
typename std::enable_if<!std::is_same<Tx, phi::dtype::float16>::value &&
!std::is_same<Tx, phi::dtype::bfloat16>::value,
void>::type
CubTensorReduceImpl(const Tx* x_data,
CubTensorReduceImpl(const Tx* x_data,
Ty* y_data,
const TransformOp& transform,
int reduce_num,
......@@ -1037,6 +1039,23 @@ CubTensorReduceImpl(const Tx* x_data,
PADDLE_THROW(phi::errors::InvalidArgument(
"Tx should not be float16 when using cub::DeviceReduce::Reduce()."));
}
template <typename Tx,
typename Ty,
template <typename>
class ReduceOp,
typename TransformOp>
static typename std::enable_if<std::is_same<Tx, phi::dtype::bfloat16>::value,
void>::type
CubTensorReduceImpl(const Tx* x_data,
Ty* y_data,
const TransformOp& transform,
int reduce_num,
const KPDevice& dev_ctx,
KPStream stream) {
PADDLE_THROW(phi::errors::InvalidArgument(
"Tx should not be bfloat16 when using cub::DeviceReduce::Reduce()."));
}
#endif // PADDLE_WITH_XPU_KP
template <typename Tx,
......@@ -1081,7 +1100,8 @@ void ReduceKernel(const KPDevice& dev_ctx,
config.SetOutputData(y_data, dev_ctx, &tmp);
constexpr bool kIsTxFP16 = std::is_same<Tx, phi::dtype::float16>::value;
bool use_cub_reduce = config.reduce_num == numel && !kIsTxFP16;
constexpr bool kIsTxBF16 = std::is_same<Tx, phi::dtype::bfloat16>::value;
bool use_cub_reduce = config.reduce_num == numel && !kIsTxFP16 && !kIsTxBF16;
#ifndef PADDLE_WITH_XPU_KP
if (use_cub_reduce) {
if (is_mean) {
......
......@@ -14,10 +14,10 @@
#include "paddle/phi/kernels/add_n_kernel.h"
#include "paddle/phi/kernels/impl/add_n_kernel_impl.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/kernels/impl/add_n_kernel_impl.h"
namespace phi {
......@@ -38,16 +38,18 @@ __global__ void Sum2CUDAKernel(const T *in_0,
template <class T>
__global__ void SumArrayCUDAKernel(
T **in, T *out, int64_t N, size_t in_size, bool read_dst) {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
int id = blockIdx.x * blockDim.x + threadIdx.x;
while (id < N) {
T total(read_dst ? out[id] : static_cast<T>(0));
MPType total(read_dst ? static_cast<MPType>(out[id])
: static_cast<MPType>(0));
for (int i = 0; i < in_size; ++i) {
const T *tmp = in[i];
if (tmp) {
total += tmp[id];
total += static_cast<MPType>(tmp[id]);
}
}
out[id] = total;
out[id] = static_cast<T>(total);
id += blockDim.x * gridDim.x;
}
}
......@@ -116,11 +118,12 @@ void AddNKernel(const Context &dev_ctx,
int64_t length_0 = in_0.numel();
int64_t length_1 = in_1.numel();
if (length_0 && length_1 && in_0.IsInitialized() && in_1.IsInitialized()) {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
auto result = EigenVector<T>::Flatten(*out);
auto &place = *dev_ctx.eigen_device();
auto in_0_e = EigenVector<T>::Flatten(in_0);
auto in_1_e = EigenVector<T>::Flatten(in_1);
result.device(place) = in_0_e + in_1_e;
auto in_0_e = EigenVector<T>::Flatten(in_0).template cast<MPType>();
auto in_1_e = EigenVector<T>::Flatten(in_1).template cast<MPType>();
result.device(place) = (in_0_e + in_1_e).template cast<T>();
} else if (length_0 && in_0.IsInitialized()) {
auto result = EigenVector<T>::Flatten(*out);
auto &place = *dev_ctx.eigen_device();
......
......@@ -15,28 +15,30 @@ limitations under the License. */
#include "paddle/phi/kernels/scale_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"
namespace phi {
template <typename InT>
template <typename DataT, typename ParamT>
struct ScaleFunctor {
InT bias;
InT scale;
ParamT bias;
ParamT scale;
bool bias_after_scale;
ScaleFunctor(InT scale_data, InT bias_data, bool is_bias_after_sacle)
ScaleFunctor(ParamT scale_data, ParamT bias_data, bool is_bias_after_sacle)
: bias(bias_data),
scale(scale_data),
bias_after_scale(is_bias_after_sacle) {}
__device__ __forceinline__ InT operator()(const InT x) const {
__device__ __forceinline__ DataT operator()(const DataT x) const {
if (bias_after_scale) {
return scale * x + bias;
return static_cast<DataT>(scale * static_cast<ParamT>(x) + bias);
} else {
return scale * (x + bias);
return static_cast<DataT>(scale * (static_cast<ParamT>(x) + bias));
}
}
};
......@@ -48,16 +50,21 @@ void ScaleKernel(const Context& dev_ctx,
float bias,
bool bias_after_scale,
DenseTensor* out) {
using MT = typename phi::dtype::MPTypeTrait<T>::Type;
std::vector<const DenseTensor*> inputs;
std::vector<DenseTensor*> outputs;
inputs.emplace_back(&x);
outputs.emplace_back(out);
dev_ctx.template Alloc<T>(out);
if (x.numel() <= 0 || (!x.IsInitialized())) {
return;
}
phi::funcs::ElementwiseKernel<T>(
dev_ctx,
inputs,
&outputs,
ScaleFunctor<T>(scale.to<T>(), static_cast<T>(bias), bias_after_scale));
ScaleFunctor<T, MT>(
scale.to<MT>(), static_cast<MT>(bias), bias_after_scale));
}
} // namespace phi
......
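The ScaleFunctor change above keeps scale and bias in ParamT (the fp32 MPType) and performs the arithmetic there, instead of first casting the attributes to the tensor dtype. Below is a minimal NumPy sketch of the kind of error this avoids; the trigger values are illustrative and not taken from #50998, but any scale that overflows or underflows the low-precision type behaves the same way.

```python
# Illustrative sketch only, not Paddle code; values chosen to overflow fp16.
import numpy as np

x = np.full(4, 2.0 ** -10, dtype=np.float16)  # 0.0009765625, exact in fp16
scale, bias = 131072.0, 0.0                   # 2**17 overflows fp16 but not fp32

# Old behaviour: cast the attributes to the tensor dtype before computing.
old = np.float16(scale) * x + np.float16(bias)    # np.float16(131072.0) is already inf

# New behaviour: keep the attributes in fp32, compute in fp32, cast once.
new = (np.float32(scale) * x.astype(np.float32) + np.float32(bias)).astype(np.float16)

print(old)  # [inf inf inf inf]
print(new)  # [128. 128. 128. 128.]
```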
......@@ -15,12 +15,47 @@
#include "paddle/phi/kernels/squared_l2_norm_grad_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/squared_l2_norm_grad_kernel_impl.h"
#include "paddle/phi/kernels/funcs/broadcast_function.h"
namespace phi {
/**
* x*y*2.0
*/
template <typename T>
struct DoubleMulFunctor {
__device__ __forceinline__ T operator()(const T a, const T b) const {
return b * a * static_cast<T>(2.0f);
}
};
template <typename T, typename Context>
void SquaredL2NormGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& dout,
DenseTensor* dx) {
dev_ctx.template Alloc<T>(dx);
PADDLE_ENFORCE_EQ(
dout.numel(),
1,
phi::errors::InvalidArgument(
"Input(GRAD@Out) of SquaredL2NormGradOP should be a scalar."));
std::vector<const DenseTensor*> ins{&x, &dout};
std::vector<DenseTensor*> outs{dx};
funcs::BroadcastKernel<ElementwiseType::kBinary, T, T>(
dev_ctx, ins, &outs, -1, phi::DoubleMulFunctor<T>());
}
} // namespace phi
PD_REGISTER_KERNEL(squared_l2_norm_grad,
GPU,
ALL_LAYOUT,
phi::SquaredL2NormGradKernel,
float,
double) {}
double,
phi::dtype::float16,
phi::dtype::bfloat16) {}
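DoubleMulFunctor implements the analytic gradient of y = sum(x_i^2), namely dy/dx_i = 2 * x_i, scaled by the incoming scalar gradient dout that the broadcast pairs with every element of x. A quick NumPy check of that identity against central finite differences (illustrative only):

```python
# Illustrative check only, not Paddle code.
import numpy as np

np.random.seed(0)
x = np.random.randn(8).astype(np.float64)
dout = 3.0  # incoming gradient of the scalar output

analytic = 2.0 * x * dout  # what DoubleMulFunctor computes, broadcast over x

eps = 1e-6                 # central differences are exact for a quadratic, up to rounding
numeric = np.array([
    ((np.sum((x + d) ** 2) - np.sum((x - d) ** 2)) / (2 * eps)) * dout
    for d in eps * np.eye(8)
])

print(np.max(np.abs(analytic - numeric)))  # ~1e-9, the two agree
```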
......@@ -15,9 +15,34 @@
#include "paddle/phi/kernels/squared_l2_norm_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/squared_l2_norm_kernel_impl.h"
#include "paddle/phi/kernels/funcs/reduce_function.h"
PD_REGISTER_KERNEL(
squared_l2_norm, GPU, ALL_LAYOUT, phi::SquaredL2NormKernel, float, double) {
namespace phi {
template <typename T, typename Context>
void SquaredL2NormKernel(const Context& dev_ctx,
const DenseTensor& x,
DenseTensor* out) {
dev_ctx.template Alloc<T>(out);
std::vector<int> origin_reduce_dims;
for (size_t i = 0; i < x.dims().size(); i++) {
origin_reduce_dims.push_back(i);
}
phi::funcs::ReduceKernel<T, T, kps::AddFunctor, kps::SquareFunctor<T, T>>(
dev_ctx, x, out, kps::SquareFunctor<T, T>(), origin_reduce_dims, false);
}
} // namespace phi
PD_REGISTER_KERNEL(squared_l2_norm,
GPU,
ALL_LAYOUT,
phi::SquaredL2NormKernel,
float,
double,
phi::dtype::float16,
phi::dtype::bfloat16) {}
......@@ -52,6 +52,12 @@ class MPTypeTrait<phi::dtype::float16> {
using Type = float;
};
template <>
class MPTypeTrait<phi::dtype::bfloat16> {
public:
using Type = float;
};
/**
* @brief Will be used in BlockYReduce, get the index of reduce_num in shared
* memory.
......
......@@ -32,8 +32,11 @@ from .framework import default_main_program
from paddle import _C_ops, _legacy_C_ops
__all__ = [
'set_gradient_clip', 'ErrorClipByValue', 'ClipGradByValue',
'ClipGradByNorm', 'ClipGradByGlobalNorm'
'set_gradient_clip',
'ErrorClipByValue',
'ClipGradByValue',
'ClipGradByNorm',
'ClipGradByGlobalNorm',
]
_clip_by_global_norm_using_mp_type_flag = False
......@@ -52,7 +55,8 @@ def _clip_by_global_norm_using_mp_type(*args):
def _cast_to_mp_type_if_enabled(x):
if (x.dtype == core.VarDesc.VarType.FP16
if (
x.dtype == core.VarDesc.VarType.FP16
or x.dtype == core.VarDesc.VarType.BF16
) and _clip_by_global_norm_using_mp_type():
return x.astype(core.VarDesc.VarType.FP32)
......@@ -66,8 +70,7 @@ def _squared_l2_norm(x):
"""
x = _cast_to_mp_type_if_enabled(x)
if core.is_compiled_with_xpu(
) or x.dtype == core.VarDesc.VarType.FP16 or x.dtype == core.VarDesc.VarType.BF16:
if core.is_compiled_with_xpu():
square = layers.square(x)
sum_square = layers.reduce_sum(square)
return sum_square
......@@ -78,7 +81,9 @@ def _squared_l2_norm(x):
return _legacy_C_ops.squared_l2_norm(x)
op_type = 'squared_l2_norm'
check_variable_and_dtype(x, 'x', ['float32', 'float64'], op_type)
check_variable_and_dtype(
x, 'x', ['float32', 'float64', 'float16', 'uint16'], op_type
)
helper = LayerHelper(op_type, **locals())
out = helper.create_variable_for_type_inference(x.dtype)
......@@ -89,7 +94,6 @@ def _squared_l2_norm(x):
class BaseErrorClipAttr(object):
def __str__(self):
raise NotImplementedError()
......@@ -164,8 +168,9 @@ def error_clip_callback(block, context):
for grad_n in [n for n in op_desc.output_arg_names() if n in grad_to_var]:
fwd_var = block._var_recursive(grad_to_var[grad_n])
error_clip = getattr(fwd_var, "error_clip", None)
if not (error_clip is None
or isinstance(error_clip, BaseErrorClipAttr)):
if not (
error_clip is None or isinstance(error_clip, BaseErrorClipAttr)
):
raise TypeError(
"Variable's error_clip should be an instance of BaseErrorClipAttr or None."
)
......@@ -174,7 +179,6 @@ def error_clip_callback(block, context):
class ClipGradBase(object):
def __init__(self):
super(ClipGradBase, self).__init__()
......@@ -197,7 +201,8 @@ class ClipGradBase(object):
warnings.warn(
"'set_gradient_clip' will be ineffective, because you have "
"set 'need_clip' in 'ParamAttr'. So, 'set_gradient_clip' "
"is redundant and you can remove it.")
"is redundant and you can remove it."
)
break
return self._static_clip(params_grads)
......@@ -252,7 +257,7 @@ class ClipGradByValue(ClipGradBase):
def __init__(self, max, min=None):
super(ClipGradByValue, self).__init__()
if min is None:
assert (max > 0.0)
assert max > 0.0
min = -max
self.max = float(max)
self.min = float(min)
......@@ -468,10 +473,9 @@ class ClipGradByGlobalNorm(ClipGradBase):
sdg.step()
"""
def __init__(self,
clip_norm,
group_name="default_group",
auto_skip_clip=False):
def __init__(
self, clip_norm, group_name="default_group", auto_skip_clip=False
):
super(ClipGradByGlobalNorm, self).__init__()
self.clip_norm = float(clip_norm)
self.group_name = group_name
......@@ -503,7 +507,10 @@ class ClipGradByGlobalNorm(ClipGradBase):
merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
sum_square = _squared_l2_norm(merge_grad)
if sum_square.dtype == core.VarDesc.VarType.FP16 or sum_square.dtype == core.VarDesc.VarType.BF16:
if (
sum_square.dtype == core.VarDesc.VarType.FP16
or sum_square.dtype == core.VarDesc.VarType.BF16
):
sum_square_list_fp16.append(sum_square)
elif sum_square.dtype == core.VarDesc.VarType.FP32:
sum_square_list_fp32.append(sum_square)
......@@ -511,8 +518,12 @@ class ClipGradByGlobalNorm(ClipGradBase):
sum_square_list.append(sum_square)
# all parameters have been filterd out
if len(sum_square_list) + len(sum_square_list_fp16) + len(
sum_square_list_fp32) == 0:
if (
len(sum_square_list)
+ len(sum_square_list_fp16)
+ len(sum_square_list_fp32)
== 0
):
return params_grads
sum_dtype = 'float64' if len(sum_square_list) > 0 else "float32"
......@@ -531,22 +542,23 @@ class ClipGradByGlobalNorm(ClipGradBase):
global_norm_var.append(global_norm_var_fp64)
global_norm_var = paddle.add_n(global_norm_var)
global_norm_var = layers.sqrt(global_norm_var)
max_global_norm = layers.fill_constant(shape=[1],
dtype=global_norm_var.dtype,
value=self.clip_norm)
max_global_norm = layers.fill_constant(
shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm
)
need_clip = False
if not self.auto_skip_clip: # always apply clip
need_clip = True
clip_var = layers.elementwise_div(x=max_global_norm,
y=layers.elementwise_max(
x=global_norm_var,
y=max_global_norm))
clip_var = layers.elementwise_div(
x=max_global_norm,
y=layers.elementwise_max(x=global_norm_var, y=max_global_norm),
)
elif global_norm_var > max_global_norm:
# only when global_norm_var > max_global_norm, grad need clip
need_clip = True
clip_var = layers.elementwise_div(x=max_global_norm,
y=global_norm_var)
clip_var = layers.elementwise_div(
x=max_global_norm, y=global_norm_var
)
for p, g in params_grads:
if g is None:
......@@ -556,8 +568,11 @@ class ClipGradByGlobalNorm(ClipGradBase):
continue
# TODO(wangxi): use inplace elementwise_mul
if need_clip:
clip_input = (clip_var.astype(g.dtype)
if clip_var.dtype != g.dtype else clip_var)
clip_input = (
clip_var.astype(g.dtype)
if clip_var.dtype != g.dtype
else clip_var
)
new_grad = layers.elementwise_mul(g, clip_input)
params_and_grads.append((p, new_grad))
else:
......@@ -581,7 +596,8 @@ class ClipGradByGlobalNorm(ClipGradBase):
if g.type == core.VarDesc.VarType.SELECTED_ROWS:
merge_grad = layers.merge_selected_rows(g)
merge_grad = layers.get_tensor_from_selected_rows(
merge_grad)
merge_grad
)
sum_square = _squared_l2_norm(merge_grad)
if sum_square.dtype == core.VarDesc.VarType.FP16:
sum_square_list_fp16.append(sum_square)
......@@ -591,8 +607,12 @@ class ClipGradByGlobalNorm(ClipGradBase):
sum_square_list.append(sum_square)
# all parameters have been filterd out
if len(sum_square_list) + len(sum_square_list_fp16) + len(
sum_square_list_fp32) == 0:
if (
len(sum_square_list)
+ len(sum_square_list_fp16)
+ len(sum_square_list_fp32)
== 0
):
return params_grads
with p.block.program._optimized_guard([p, g]):
......@@ -601,10 +621,14 @@ class ClipGradByGlobalNorm(ClipGradBase):
global_norm_var = []
if len(sum_square_list_fp16) > 0:
global_norm_var_fp16 = layers.sums(sum_square_list_fp16)
if sum_square_list_fp32 or sum_square_list or not _allow_pure_fp16_global_norm_clip(
if (
sum_square_list_fp32
or sum_square_list
or not _allow_pure_fp16_global_norm_clip()
):
global_norm_var.append(
global_norm_var_fp16.astype(sum_dtype))
global_norm_var_fp16.astype(sum_dtype)
)
else:
global_norm_var.append(global_norm_var_fp16)
if len(sum_square_list_fp32) > 0:
......@@ -613,23 +637,28 @@ class ClipGradByGlobalNorm(ClipGradBase):
global_norm_var.append(global_norm_var_fp32)
else:
global_norm_var.append(
global_norm_var_fp32.astype(sum_dtype))
global_norm_var_fp32.astype(sum_dtype)
)
if len(sum_square_list) > 0:
# fp64
global_norm_var_other_dtype = layers.sums(sum_square_list)
global_norm_var.append(global_norm_var_other_dtype)
global_norm_var = layers.sums(global_norm_var) if len(
global_norm_var) > 1 else global_norm_var[0]
global_norm_var = (
layers.sums(global_norm_var)
if len(global_norm_var) > 1
else global_norm_var[0]
)
global_norm_var = layers.sqrt(x=global_norm_var)
max_global_norm = layers.fill_constant(
shape=[1],
dtype=global_norm_var.dtype,
value=self.clip_norm)
scale_var = layers.elementwise_div(x=max_global_norm,
y=layers.elementwise_max(
shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm
)
scale_var = layers.elementwise_div(
x=max_global_norm,
y=global_norm_var))
y=layers.elementwise_max(
x=max_global_norm, y=global_norm_var
),
)
param_new_grad_name_dict = dict()
for p, g in params_grads:
if g is None:
......@@ -641,29 +670,32 @@ class ClipGradByGlobalNorm(ClipGradBase):
with p.block.program._optimized_guard([p, g]):
new_g = _cast_to_mp_type_if_enabled(g)
# inplace
scale_input = (scale_var.astype('float16') if
new_g.dtype == core.VarDesc.VarType.FP16 and
scale_var.dtype != core.VarDesc.VarType.FP16
else scale_var)
scale_input = (
scale_var.astype('float16')
if new_g.dtype == core.VarDesc.VarType.FP16
and scale_var.dtype != core.VarDesc.VarType.FP16
else scale_var
)
# NOTE(Yuang Liu): For pure dp with gradient merge, the p and g
# will be in different blocks with the gradient clip related ops.
# We need to handle the correct block, otherwise will encounter
# a 'NotFoundError' during compile time.
block = default_main_program().current_block()
block.append_op(type='elementwise_mul',
inputs={
'X': new_g,
'Y': scale_input
},
outputs={'Out': new_g})
block.append_op(
type='elementwise_mul',
inputs={'X': new_g, 'Y': scale_input},
outputs={'Out': new_g},
)
if new_g is not g:
block.append_op(type='cast',
block.append_op(
type='cast',
inputs={'X': new_g},
outputs={'Out': g},
attrs={
'in_dtype': new_g.dtype,
'out_dtype': g.dtype
})
'out_dtype': g.dtype,
},
)
param_new_grad_name_dict[p.name] = g.name
params_and_grads.append((p, g))
......@@ -676,7 +708,8 @@ class ClipGradByGlobalNorm(ClipGradBase):
context[self.group_name] = []
context[self.group_name + "_clip_value"] = self.clip_norm
context[self.group_name + "_clip"] = layers.fill_constant(
shape=[1], dtype=grad.dtype, value=self.clip_norm)
shape=[1], dtype=grad.dtype, value=self.clip_norm
)
else:
if not self.clip_norm == context[self.group_name + "_clip_value"]:
raise ValueError(
......@@ -699,20 +732,19 @@ class ClipGradByGlobalNorm(ClipGradBase):
group_norm_var = layers.sums(input=self.context[self.group_name])
group_norm_var = layers.sqrt(x=group_norm_var)
clip_var = self.context[self.group_name + "_clip"]
group_scale_var = layers.elementwise_div(x=clip_var,
y=layers.elementwise_max(
group_scale_var = layers.elementwise_div(
x=clip_var,
y=group_norm_var))
assert group_scale_var.shape == (1, )
y=layers.elementwise_max(x=clip_var, y=group_norm_var),
)
assert group_scale_var.shape == (1,)
self.context[group_scale_name] = group_scale_var
# inplace
param.block.append_op(type='elementwise_mul',
inputs={
'X': grad,
'Y': self.context[group_scale_name]
},
outputs={'Out': grad})
param.block.append_op(
type='elementwise_mul',
inputs={'X': grad, 'Y': self.context[group_scale_name]},
outputs={'Out': grad},
)
return param, grad
......@@ -807,22 +839,26 @@ def set_gradient_clip(clip, param_list=None, program=None):
"""
warnings.warn("Caution! 'set_gradient_clip' is not recommended "
warnings.warn(
"Caution! 'set_gradient_clip' is not recommended "
"and may be deprecated in future! "
"We recommend a new strategy: set 'grad_clip' "
"when initializing the 'optimizer'. "
"This method can reduce the mistakes, please "
"refer to documention of 'optimizer'.")
"refer to documention of 'optimizer'."
)
if not isinstance(clip, ClipGradBase):
raise TypeError(
"'clip' should be an instance of ClipGradBase's derived class")
"'clip' should be an instance of ClipGradBase's derived class"
)
if program is None:
program = framework.default_main_program()
for op in program.block(0).ops:
if 'op_namescope' in op.all_attrs() and "optimizer" in op.attr(
"op_namescope"):
"op_namescope"
):
warnings.warn(
"'minimize' has been invoked before, this will make 'set_gradient_clip' "
"be ineffective! Please invoke 'set_gradient_clip' before 'minimize'."
......@@ -847,14 +883,16 @@ def append_gradient_clip_ops(param_grads):
for p, g in param_grads:
if g is None:
continue
with p.block.program._optimized_guard(
[p, g]), framework.name_scope('gradient_clip'):
with p.block.program._optimized_guard([p, g]), framework.name_scope(
'gradient_clip'
):
clip_attr = getattr(p, 'gradient_clip_attr', None)
if clip_attr is None:
return param_grads
if not isinstance(clip_attr, ClipGradBase):
raise TypeError(
"clip attribute should be an instance of GradientClipBase")
"clip attribute should be an instance of GradientClipBase"
)
clip_attr._process_context(context=context, param=p, grad=g)
......@@ -863,8 +901,9 @@ def append_gradient_clip_ops(param_grads):
for p, g in param_grads:
if g is None:
continue
with p.block.program._optimized_guard(
[p, g]), framework.name_scope('gradient_clip'):
with p.block.program._optimized_guard([p, g]), framework.name_scope(
'gradient_clip'
):
param, new_grad = clip_attr._create_operators(param=p, grad=g)
param_new_grad_name_dict[param.name] = new_grad.name
res.append([param, new_grad])
......@@ -888,12 +927,16 @@ def _correct_clip_op_role_var(params_grads, param_new_grad_name_dict):
continue
block_id_list.append(block_id)
for op in param.block.program.global_block().ops:
if op.has_attr("op_namescope") and "gradient_clip" in op.attr(
"op_namescope") and op.attr('op_role_var'):
if (
op.has_attr("op_namescope")
and "gradient_clip" in op.attr("op_namescope")
and op.attr('op_role_var')
):
param_name = op.attr('op_role_var')[0]
if param_name in param_new_grad_name_dict:
correct_p_g = [
param_name, param_new_grad_name_dict[param_name]
param_name,
param_new_grad_name_dict[param_name],
]
op._set_attr('op_role_var', correct_p_g)
......
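Both the dynamic-graph and static-graph paths above compute the same rule: the global norm is the square root of the summed squared L2 norms of every gradient (via _squared_l2_norm), and each gradient is multiplied by clip_norm / max(global_norm, clip_norm). The sketch below restates that rule in plain NumPy; the helper name and shapes are hypothetical and this is not the Paddle API.

```python
# Illustrative sketch only; clip_by_global_norm is a hypothetical helper, not Paddle API.
import numpy as np

def clip_by_global_norm(grads, clip_norm):
    """Scale every gradient so the joint L2 norm does not exceed clip_norm."""
    global_norm = np.sqrt(sum(np.sum(g.astype(np.float32) ** 2) for g in grads))
    scale = clip_norm / max(global_norm, clip_norm)  # <= 1.0, a no-op when already small
    return [(g * scale).astype(g.dtype) for g in grads]

np.random.seed(0)
grads = [np.random.randn(4, 4).astype(np.float16), np.random.randn(8).astype(np.float32)]
clipped = clip_by_global_norm(grads, clip_norm=1.0)
print(np.sqrt(sum(np.sum(g.astype(np.float32) ** 2) for g in clipped)))  # ~1.0 after clipping
```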
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import paddle
class TestAddnOp(unittest.TestCase):
def setUp(self):
np.random.seed(20)
l = 32
self.x_np = np.random.random([l, 16, 256])
def check_main(self, x_np, dtype, axis=None):
paddle.disable_static()
x = []
for i in range(x_np.shape[0]):
val = paddle.to_tensor(x_np[i].astype(dtype))
val.stop_gradient = False
x.append(val)
y = paddle.add_n(x)
x_g = paddle.grad(y, x)
y_np = y.numpy().astype('float32')
x_g_np = []
for val in x_g:
x_g_np.append(val.numpy().astype('float32'))
paddle.enable_static()
return y_np, x_g_np
def test_add_n_fp16(self):
if not paddle.is_compiled_with_cuda():
return
y_np_16, x_g_np_16 = self.check_main(self.x_np, 'float16')
y_np_32, x_g_np_32 = self.check_main(self.x_np, 'float32')
np.testing.assert_allclose(y_np_16, y_np_32, rtol=1e-03)
for i in range(len(x_g_np_32)):
np.testing.assert_allclose(x_g_np_16[i], x_g_np_32[i], rtol=1e-03)
def test_add_n_api(self):
if not paddle.is_compiled_with_cuda():
return
y_np_32, x_g_np_32 = self.check_main(self.x_np, 'float32')
y_np_gt = np.sum(self.x_np, axis=0).astype('float32')
np.testing.assert_allclose(y_np_32, y_np_gt, rtol=1e-06)
if __name__ == "__main__":
unittest.main()
This diff has been collapsed.
......@@ -440,16 +440,22 @@ class Optimizer(object):
return self._opti_name_list
def _create_global_learning_rate(self):
# lr var can't be float16, for pure fp16 training, should extra handle the dtype for lr
# The lr var can't be float16 or bfloat16; for pure fp16 or bf16 training, the dtype of lr needs extra handling.
_lr_dtype = (
paddle.get_default_dtype() if self._dtype is None else self._dtype
)
_lr_dtype = (
paddle.float32
if (
(
paddle.get_default_dtype() != "float16"
and _lr_dtype == paddle.float16
)
or (
paddle.get_default_dtype() != "bfloat16"
and _lr_dtype == paddle.bfloat16
)
)
else _lr_dtype
)
if isinstance(self._learning_rate, LRScheduler):
......
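Read as a plain rule, the new expression keeps the learning-rate variable in float32 whenever the optimizer dtype resolves to fp16 or bf16, unless the user has deliberately set the matching low-precision global default dtype. The stand-alone restatement below is hypothetical (the function name and plain-string dtypes are illustrative) and only mirrors the selection logic shown in the diff:

```python
# Hypothetical restatement of the _lr_dtype selection, not Paddle code.
def resolve_lr_dtype(opt_dtype, default_dtype):
    """Mirror of the _lr_dtype selection in _create_global_learning_rate."""
    lr_dtype = default_dtype if opt_dtype is None else opt_dtype
    low_precision = (
        (lr_dtype == "float16" and default_dtype != "float16")
        or (lr_dtype == "bfloat16" and default_dtype != "bfloat16")
    )
    return "float32" if low_precision else lr_dtype

assert resolve_lr_dtype("float16", "float32") == "float32"   # AMP: keep lr in fp32
assert resolve_lr_dtype("bfloat16", "float32") == "float32"  # same rule now covers bf16
assert resolve_lr_dtype(None, "float16") == "float16"        # a pure fp16 default is respected
```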