From 98c427e2a5584191507c1bdce8baa0e9fc1dd88e Mon Sep 17 00:00:00 2001
From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com>
Date: Mon, 7 Mar 2022 11:38:43 +0800
Subject: [PATCH] [bf16] add bf16 kernel: sigmoid & sqrt & softplus & square (#40004)

* add activ
* refine unittest
* refine unittest
* refine unittest
* refine unittest
* refine code
---
 paddle/fluid/operators/activation_op.kps      |  31 +++--
 paddle/fluid/operators/amp/fp16_type_traits.h |   7 ++
 paddle/fluid/operators/dropout_impl.cu.h      |   3 +-
 paddle/phi/common/bfloat16.h                  |   4 +
 .../tests/unittests/test_activation_op.py     | 113 ++++++++++++++++++
 5 files changed, 150 insertions(+), 8 deletions(-)

diff --git a/paddle/fluid/operators/activation_op.kps b/paddle/fluid/operators/activation_op.kps
index e1afb3919f8..3b7ce9eaf2b 100644
--- a/paddle/fluid/operators/activation_op.kps
+++ b/paddle/fluid/operators/activation_op.kps
@@ -1509,7 +1509,9 @@ namespace plat = paddle::platform;
       ops::ActivationCudaKernel<plat::CUDADeviceContext,                  \
                                 ops::functor<double>>,                    \
       ops::ActivationCudaKernel<plat::CUDADeviceContext,                  \
-                                ops::functor<plat::float16>>);            \
+                                ops::functor<plat::float16>>,             \
+      ops::ActivationCudaKernel<plat::CUDADeviceContext,                  \
+                                ops::functor<plat::bfloat16>>);           \
   REGISTER_OP_CUDA_KERNEL(                                                \
       act_type##_grad,                                                    \
       ops::ActivationGradCudaKernel<plat::CUDADeviceContext,              \
                                     ops::grad_functor<double>>,           \
       ops::ActivationGradCudaKernel<plat::CUDADeviceContext,              \
-                                    ops::grad_functor<plat::float16>>);
+                                    ops::grad_functor<plat::float16>>,    \
+      ops::ActivationGradCudaKernel<plat::CUDADeviceContext,              \
+                                    ops::grad_functor<plat::bfloat16>>);
 
 #define REGISTER_ACTIVATION_CUDA_KERNEL_INT(act_type, op_name, functor,   \
                                             grad_functor)                 \
@@ -1531,7 +1535,9 @@ namespace plat = paddle::platform;
       ops::ActivationCudaKernel<plat::CUDADeviceContext,                  \
                                 ops::functor<int64_t>>,                   \
       ops::ActivationCudaKernel<plat::CUDADeviceContext,                  \
-                                ops::functor<plat::float16>>);            \
+                                ops::functor<plat::float16>>,             \
+      ops::ActivationCudaKernel<plat::CUDADeviceContext,                  \
+                                ops::functor<plat::bfloat16>>);           \
   REGISTER_OP_CUDA_KERNEL(                                                \
       act_type##_grad,                                                    \
       ops::ActivationGradCudaKernel<plat::CUDADeviceContext,              \
                                     ops::grad_functor<int64_t>>,          \
       ops::ActivationGradCudaKernel<plat::CUDADeviceContext,              \
-                                    ops::grad_functor<plat::float16>>);
+                                    ops::grad_functor<plat::float16>>,    \
+      ops::ActivationGradCudaKernel<plat::CUDADeviceContext,              \
+                                    ops::grad_functor<plat::bfloat16>>);
 
 /* ======================== leaky relu register  ============================ */
 REGISTER_ACTIVATION_CUDA_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor,
@@ -1650,7 +1658,9 @@ REGISTER_OP_CUDA_KERNEL(
     ops::SigmoidDoubleGradKernel<plat::CUDADeviceContext,
                                  ops::SigmoidGradGradFunctor<double>>,
     ops::SigmoidDoubleGradKernel<plat::CUDADeviceContext,
-                                 ops::SigmoidGradGradFunctor<plat::float16>>);
+                                 ops::SigmoidGradGradFunctor<plat::float16>>,
+    ops::SigmoidDoubleGradKernel<plat::CUDADeviceContext,
+                                 ops::SigmoidGradGradFunctor<plat::bfloat16>>);
 
 REGISTER_OP_CUDA_KERNEL(
     sigmoid_triple_grad,
@@ -1659,7 +1669,10 @@ REGISTER_OP_CUDA_KERNEL(
     ops::SigmoidTripleGradKernel<plat::CUDADeviceContext,
                                  ops::SigmoidTripleGradFunctor<double>>,
     ops::SigmoidTripleGradKernel<plat::CUDADeviceContext,
-                                 ops::SigmoidTripleGradFunctor<plat::float16>>);
+                                 ops::SigmoidTripleGradFunctor<plat::float16>>,
+    ops::SigmoidTripleGradKernel<
+        plat::CUDADeviceContext,
+        ops::SigmoidTripleGradFunctor<plat::bfloat16>>);
 /* ========================================================================== */
 
 /* ===========================    tanh register  ============================ */
@@ -1696,7 +1709,9 @@ REGISTER_OP_CUDA_KERNEL(
     ops::SqrtDoubleGradKernel<plat::CUDADeviceContext,
                               ops::SqrtGradGradFunctor<double>>,
     ops::SqrtDoubleGradKernel<plat::CUDADeviceContext,
-                              ops::SqrtGradGradFunctor<plat::float16>>);
+                              ops::SqrtGradGradFunctor<plat::float16>>,
+    ops::SqrtDoubleGradKernel<plat::CUDADeviceContext,
+                              ops::SqrtGradGradFunctor<plat::bfloat16>>);
 /* ========================================================================== */
 
 /* ===========================   rsqrt register  =============================
@@ -1726,6 +1741,8 @@ REGISTER_OP_CUDA_KERNEL(
                                 ops::SquareGradGradFunctor<double>>,
     ops::SquareDoubleGradKernel<plat::CUDADeviceContext,
                                 ops::SquareGradGradFunctor<plat::float16>>,
+    ops::SquareDoubleGradKernel<plat::CUDADeviceContext,
+                                ops::SquareGradGradFunctor<plat::bfloat16>>,
     ops::SquareDoubleGradKernel<plat::CUDADeviceContext,
                                 ops::SquareGradGradFunctor<int>>,
     ops::SquareDoubleGradKernel<plat::CUDADeviceContext,
diff --git a/paddle/fluid/operators/amp/fp16_type_traits.h b/paddle/fluid/operators/amp/fp16_type_traits.h
--- a/paddle/fluid/operators/amp/fp16_type_traits.h
+++ b/paddle/fluid/operators/amp/fp16_type_traits.h
@@ ... @@ class MPTypeTrait<platform::float16> {
   using Type = float;
 };
 
+template <>
+class MPTypeTrait<platform::bfloat16> {
+ public:
+  using Type = float;
+};
+
 }  // namespace details
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h
index dcdab033e8f..17665ad67e4 100644
--- a/paddle/fluid/operators/dropout_impl.cu.h
+++ b/paddle/fluid/operators/dropout_impl.cu.h
@@ -266,7 +266,8 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
                                          cudaMemcpyDeviceToDevice, stream));
 #endif
   } else {
-    T factor = static_cast<T>(1.0f - dropout_prob);
+    using MT = typename details::MPTypeTrait<T>::Type;
+    MT factor = static_cast<MT>(1.0f - dropout_prob);
     std::vector<const framework::Tensor*> ins = {&x};
     std::vector<framework::Tensor*> outs = {y};
     auto functor = phi::funcs::ScaleFunctor<T>(factor);
diff --git a/paddle/phi/common/bfloat16.h b/paddle/phi/common/bfloat16.h
index cf99bb8f19a..5f30ee4077b 100644
--- a/paddle/phi/common/bfloat16.h
+++ b/paddle/phi/common/bfloat16.h
@@ -310,6 +310,10 @@ HOSTDEVICE inline bool(isfinite)(const bfloat16& a) {
   return !((isnan)(a)) && !((isinf)(a));
 }
 
+HOSTDEVICE inline bfloat16(abs)(const bfloat16& a) {
+  return bfloat16(std::abs(static_cast<float>(a)));
+}
+
 inline std::ostream& operator<<(std::ostream& os, const bfloat16& a) {
   os << static_cast<float>(a);
   return os;
diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py
index b4b5944e27c..5c40b898d23 100755
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
@@ -183,6 +183,34 @@ class TestSigmoid(TestActivation):
         self.check_grad(['X'], 'Out', max_relative_error=0.01)
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestSigmoidBF16(OpTest):
+    def setUp(self):
+        self.op_type = "sigmoid"
+        self.init_dtype()
+
+        np.random.seed(1024)
+        x = np.random.uniform(-1, 1, [11, 17]).astype(np.float32)
+        out = 1 / (1 + np.exp(-x))
+
+        self.inputs = {
+            'X': OpTest.np_dtype_to_fluid_dtype(convert_float_to_uint16(x))
+        }
+        self.outputs = {'Out': convert_float_to_uint16(out)}
+
+    def init_dtype(self):
+        self.dtype = np.uint16
+
+    def test_check_output(self):
+        place = core.CUDAPlace(0)
+        self.check_output_with_place(place)
+
+    def test_check_grad(self):
+        place = core.CUDAPlace(0)
+        self.check_grad_with_place(place, ['X'], 'Out')
+
+
 class TestSilu(TestActivation):
     def setUp(self):
         self.op_type = "silu"
@@ -945,6 +973,34 @@ class TestSqrt(TestActivation, TestParameter):
         self.check_grad(['X'], 'Out')
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestSqrtBF16(OpTest):
+    def setUp(self):
+        self.op_type = "sqrt"
+        self.init_dtype()
+
+        np.random.seed(1023)
+        x = np.random.uniform(0.1, 1, [11, 17]).astype(np.float32)
+        out = np.sqrt(x)
+
+        self.inputs = {
+            'X': OpTest.np_dtype_to_fluid_dtype(convert_float_to_uint16(x))
+        }
+        self.outputs = {'Out': convert_float_to_uint16(out)}
+
+    def init_dtype(self):
+        self.dtype = np.uint16
+
+    def test_check_output(self):
+        place = core.CUDAPlace(0)
+        self.check_output_with_place(place)
+
+    def test_check_grad(self):
+        place = core.CUDAPlace(0)
+        self.check_grad_with_place(place, ['X'], 'Out')
+
+
 class TestRsqrt(TestActivation):
     def setUp(self):
         self.op_type = "rsqrt"
@@ -2195,6 +2251,34 @@ class TestSquare(TestActivation):
         self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestSquareBF16(OpTest):
+    def setUp(self):
+        self.op_type = "square"
+        self.init_dtype()
+
+        np.random.seed(1024)
+        x = np.random.uniform(0.1, 1, [11, 17]).astype(np.float32)
+        out = np.square(x)
+
+        self.inputs = {
+            'X': OpTest.np_dtype_to_fluid_dtype(convert_float_to_uint16(x))
+        }
+        self.outputs = {'Out': convert_float_to_uint16(out)}
+
+    def init_dtype(self):
+        self.dtype = np.uint16
+
+    def test_check_output(self):
+        place = core.CUDAPlace(0)
+        self.check_output_with_place(place)
+
+    def test_check_grad(self):
+        place = core.CUDAPlace(0)
+        self.check_grad_with_place(place, ['X'],
+                                   'Out', numeric_grad_delta=0.5)
+
+
 class TestPow(TestActivation):
     def setUp(self):
         self.op_type = "pow"
@@ -2433,6 +2517,34 @@ class TestSoftplus(TestActivation):
         self.check_grad(['X'], 'Out')
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestSoftplusBF16(OpTest):
+    def setUp(self):
+        self.op_type = "softplus"
+        self.init_dtype()
+
+        beta = 2
+        threshold = 15
+
+        np.random.seed(1024)
+        x = np.random.uniform(-1, 1, [10, 12]).astype(np.float32)
+        out = ref_softplus(x, beta, threshold)
+        self.inputs = {'X': convert_float_to_uint16(x)}
+        self.attrs = {'beta': beta, "threshold": threshold}
+        self.outputs = {'Out': convert_float_to_uint16(out)}
+
+    def init_dtype(self):
+        self.dtype = np.uint16
+
+    def test_check_output(self):
+        place = core.CUDAPlace(0)
+        self.check_output_with_place(place)
+
+    def test_check_grad(self):
+        place = core.CUDAPlace(0)
+        self.check_grad_with_place(place, ['X'], 'Out', numeric_grad_delta=0.05)
+
+
 class TestSoftplusAPI(unittest.TestCase):
     # test paddle.nn.Softplus, paddle.nn.functional.softplus
     def setUp(self):
-- 
GitLab
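
Note on how the new bf16 unit tests exercise these kernels: each Test*BF16 case builds its reference output in float32 and passes both 'X' and 'Out' through convert_float_to_uint16, so bfloat16 tensors travel as uint16 bit patterns, and the loose tolerances (numeric_grad_delta, large max_relative_error) account for the 8-bit mantissa. The snippet below is a minimal, self-contained sketch of that float32 <-> bfloat16 round trip, assuming plain truncation of the low 16 bits; the helper names float32_to_bf16_bits and bf16_bits_to_float32 are illustrative only, not Paddle APIs, and Paddle's own convert_float_to_uint16 may differ in rounding details.

import numpy as np

def float32_to_bf16_bits(x):
    # Keep the high 16 bits of each float32 value (sign, 8 exponent bits,
    # 7 mantissa bits) -- the bfloat16 layout, stored as uint16.
    # Assumed truncation, no rounding.
    x = np.ascontiguousarray(x, dtype=np.float32)
    return (x.view(np.uint32) >> 16).astype(np.uint16)

def bf16_bits_to_float32(bits):
    # Widen the uint16 bfloat16 pattern back to float32 by zero-filling the
    # low 16 bits.
    bits = np.ascontiguousarray(bits, dtype=np.uint16)
    return (bits.astype(np.uint32) << 16).view(np.float32)

if __name__ == "__main__":
    np.random.seed(1024)
    x = np.random.uniform(-1, 1, [11, 17]).astype(np.float32)
    ref = 1 / (1 + np.exp(-x))            # float32 sigmoid reference, as in TestSigmoidBF16
    x_bf16 = float32_to_bf16_bits(x)      # what the test would store under 'X'
    approx = 1 / (1 + np.exp(-bf16_bits_to_float32(x_bf16)))
    # bfloat16 keeps roughly 3 significant decimal digits, hence the loose
    # test tolerances above.
    print("max abs error vs float32:", np.max(np.abs(approx - ref)))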