Unverified commit 90cb9a0d authored by 201716010711, committed by GitHub

[AMP OP&Test] Norm bf16 (#51083)

Parent 4c9e34dc
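This commit registers bfloat16 for the GPU norm and norm_grad PHI kernels, has the CUDA kernel take epsilon as float rather than the element type T, and adds a TestNormBF16Op unit test. As a rough illustration (an editor's addition, not part of the commit), the Python snippet below estimates the error introduced by a float32 -> bfloat16 -> float32 round trip, which is why the new test uses the loose atol=1e-1 and max_relative_error=1e-2 seen further down:

import numpy as np

def bf16_round_trip(x):
    # bfloat16 is, roughly, float32 with the low 16 mantissa bits dropped;
    # truncating those bits and zero-filling them approximates a bf16 round trip.
    bits = np.asarray(x, dtype=np.float32).view(np.uint32)
    return ((bits >> 16) << 16).view(np.float32)

x = np.random.random([2, 3, 4, 5]).astype(np.float32)
print(np.max(np.abs(bf16_round_trip(x) - x)))  # typically on the order of 1e-3 for values in [0, 1)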
@@ -116,4 +116,5 @@ PD_REGISTER_KERNEL(norm_grad,
phi::NormGradKernel,
float,
double,
phi::dtype::float16) {}
phi::dtype::float16,
phi::dtype::bfloat16) {}
@@ -43,7 +43,7 @@ __global__ void Normalize(const T* x,
const int pre,
const int axis_n, // dim in axis
const int post,
const T eps,
const float eps,
T* y,
T* out_norm) {
using MT = typename phi::dtype::MPTypeTrait<T>::Type;
@@ -86,7 +86,6 @@ void NormKernel(const Context& ctx,
auto xdim = in_x->dims();
if (axis < 0) axis = xdim.size() + axis;
T eps = static_cast<T>(epsilon);
DenseTensor* out_norm;
DenseTensor out_norm_tmp;
@@ -117,8 +116,8 @@ void NormKernel(const Context& ctx,
int max_threads = ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1);
int grid = std::min(max_blocks, pre * post);
Normalize<T, block>
<<<grid, block, 0, ctx.stream()>>>(x_ptr, pre, n, post, eps, y, norm_ptr);
Normalize<T, block><<<grid, block, 0, ctx.stream()>>>(
x_ptr, pre, n, post, epsilon, y, norm_ptr);
}
} // namespace phi
@@ -129,4 +128,5 @@ PD_REGISTER_KERNEL(norm,
phi::NormKernel,
float,
double,
phi::dtype::float16) {}
phi::dtype::float16,
phi::dtype::bfloat16) {}
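A note on the eps change in the kernel hunks above (an editor's reading of the diff, not the author's stated rationale): epsilon is now passed to Normalize as float instead of being cast to the element type T first. For low-precision element types that cast can change the value; the hedged sketch below shows that float16 cannot represent the default epsilon of 1e-8 at all, while a float keeps it:

import numpy as np

eps = 1e-8  # the default epsilon used by the norm tests below
print(np.float16(eps))  # 0.0 -- underflows below float16's smallest subnormal (~6e-8)
print(np.float32(eps))  # 1e-08 -- preserved when eps stays in float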
@@ -16,9 +16,11 @@ import unittest
import numpy as np
from eager_op_test import OpTest, skip_check_grad_ci
from op_test import convert_float_to_uint16
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
def l2_norm(x, axis, epsilon):
@@ -157,6 +159,37 @@ class TestNormTestOp(OpTest):
self.epsilon = 1e-8
@unittest.skipIf(
not core.is_compiled_with_cuda(),
"core is not compiled with CUDA and not support the bfloat16",
)
class TestNormBF16Op(OpTest):
def setUp(self):
self.op_type = "norm"
self.python_api = norm_wrapper
self.init_test_case()
self.dtype = "float32"
x = np.random.random(self.shape).astype(self.dtype)
y, norm = l2_norm(x, self.axis, self.epsilon)
self.inputs = {'X': convert_float_to_uint16(x)}
self.attrs = {'epsilon': self.epsilon, 'axis': self.axis}
self.outputs = {'Out': convert_float_to_uint16(y), 'Norm': norm}
self.python_out_sig = ['Out']
def test_check_output(self):
self.check_output_with_place(core.CUDAPlace(0), atol=1e-1)
def test_check_grad(self):
self.check_grad_with_place(
core.CUDAPlace(0), ['X'], 'Out', max_relative_error=1e-2
)
def init_test_case(self):
self.shape = [2, 3, 4, 5]
self.axis = 1
self.epsilon = 1e-8
class API_NormTest(unittest.TestCase):
def test_errors(self):
with fluid.program_guard(fluid.Program()):
......
@@ -148,12 +148,10 @@ class TestScaleFp16Op(TestScaleOp):
self.dtype = np.float16
def test_check_output(self):
place = core.CUDAPlace(0)
self.check_output_with_place(place, check_eager=True)
self.check_output(check_eager=True)
def test_check_grad(self):
place = core.CUDAPlace(0)
self.check_grad_with_place(place, ["X"], "Out", check_eager=True)
self.check_grad(["X"], "Out", check_eager=True)
class TestScaleBF16Op(OpTest):
......