From e8ac7fc30a25e5d4626d8b483bf936bb9abe2e93 Mon Sep 17 00:00:00 2001
From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com>
Date: Thu, 10 Feb 2022 13:26:09 +0800
Subject: [PATCH] [bf16] add bf16 kernel: dropout & reshape & slice (#39395)

* add dropout

* add reshape

* add slice

* refine slice unittest

* refine slice unittest

* add cpu bf16 kernel
---
 paddle/fluid/operators/dropout_op.cc          |  8 +++--
 paddle/fluid/operators/dropout_op.cu          |  3 ++
 paddle/fluid/operators/reshape_op.cc          | 11 ++++---
 paddle/fluid/operators/slice_op.cc            | 12 +++++--
 paddle/pten/kernels/funcs/eigen/pad.cc        |  2 ++
 .../fluid/tests/unittests/test_dropout_op.py  | 23 +++++++++++++-
 .../fluid/tests/unittests/test_reshape_op.py  | 30 +++++++++++++++++-
 .../fluid/tests/unittests/test_slice_op.py    | 31 ++++++++++++++++++-
 8 files changed, 109 insertions(+), 11 deletions(-)

diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc
index cbfb795d6a..7613b04bcc 100644
--- a/paddle/fluid/operators/dropout_op.cc
+++ b/paddle/fluid/operators/dropout_op.cc
@@ -179,8 +179,12 @@ REGISTER_OPERATOR(dropout, ops::DropoutOp, ops::DropoutOpMaker,
 REGISTER_OPERATOR(dropout_grad, ops::DropoutOpGrad);
 REGISTER_OP_CPU_KERNEL(
     dropout, ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, double>);
+    ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext,
+                          paddle::platform::bfloat16>);
 REGISTER_OP_CPU_KERNEL(
     dropout_grad,
     ops::DropoutGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::DropoutGradKernel<paddle::platform::CPUDeviceContext, double>);
+    ops::DropoutGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::DropoutGradKernel<paddle::platform::CPUDeviceContext,
+                           paddle::platform::bfloat16>);
diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu
index 0d5ee41c5c..f6ddff1d03 100644
--- a/paddle/fluid/operators/dropout_op.cu
+++ b/paddle/fluid/operators/dropout_op.cu
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/dropout_impl.cu.h"
 #include "paddle/fluid/operators/dropout_op.h"
+#include "paddle/fluid/platform/bfloat16.h"
 #include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
@@ -84,8 +85,10 @@ namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
     dropout, ops::GPUDropoutKernel<plat::CUDADeviceContext, float>,
     ops::GPUDropoutKernel<plat::CUDADeviceContext, plat::float16>,
+    ops::GPUDropoutKernel<plat::CUDADeviceContext, plat::bfloat16>,
     ops::GPUDropoutKernel<plat::CUDADeviceContext, double>);
 REGISTER_OP_CUDA_KERNEL(
     dropout_grad, ops::GPUDropoutGradKernel<plat::CUDADeviceContext, float>,
     ops::GPUDropoutGradKernel<plat::CUDADeviceContext, plat::float16>,
+    ops::GPUDropoutGradKernel<plat::CUDADeviceContext, plat::bfloat16>,
     ops::GPUDropoutGradKernel<plat::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
index 43da63aae7..1ef90ff2b7 100644
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -698,13 +698,14 @@ REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double,
                                 ops::ReshapeKernel, int, ops::ReshapeKernel,
                                 uint8_t, ops::ReshapeKernel, int64_t,
                                 ops::ReshapeKernel, plat::float16,
+                                ops::ReshapeKernel, plat::bfloat16,
                                 ops::ReshapeKernel);
 REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel,
                                 double, ops::ReshapeGradKernel, int,
                                 ops::ReshapeGradKernel, int64_t,
                                 ops::ReshapeGradKernel, uint8_t,
                                 ops::ReshapeGradKernel, plat::float16,
-
+                                ops::ReshapeGradKernel, plat::bfloat16,
                                 ops::ReshapeGradKernel);
 REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double,
                                 ops::ReshapeKernel, int, ops::ReshapeKernel,
@@ -712,13 +713,15 @@ REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double,
                                 ops::ReshapeKernel, plat::float16,
                                 ops::ReshapeKernel, bool, ops::ReshapeKernel,
                                 plat::complex<float>, ops::ReshapeKernel,
-                                plat::complex<double>, ops::ReshapeKernel);
+                                plat::complex<double>, ops::ReshapeKernel,
+                                plat::bfloat16, ops::ReshapeKernel);
 REGISTER_OP_CUDA_KERNEL_FUNCTOR(
     reshape2_grad, float, ops::ReshapeGradKernel, double,
     ops::ReshapeGradKernel, int, ops::ReshapeGradKernel, uint8_t,
     ops::ReshapeGradKernel, int64_t, ops::ReshapeGradKernel, plat::float16,
     ops::ReshapeGradKernel, bool, ops::ReshapeGradKernel, plat::complex<float>,
-    ops::ReshapeGradKernel, plat::complex<double>, ops::ReshapeGradKernel);
+    ops::ReshapeGradKernel, plat::complex<double>, ops::ReshapeGradKernel,
+    plat::bfloat16, ops::ReshapeGradKernel);
 REGISTER_OP_CUDA_KERNEL_FUNCTOR(
     reshape2_grad_grad, float, ops::ReshapeDoubleGradKernel, double,
@@ -727,7 +730,7 @@ REGISTER_OP_CUDA_KERNEL_FUNCTOR(
     plat::float16, ops::ReshapeDoubleGradKernel, bool,
     ops::ReshapeDoubleGradKernel, plat::complex<float>,
     ops::ReshapeDoubleGradKernel, plat::complex<double>,
-    ops::ReshapeDoubleGradKernel);
+    ops::ReshapeDoubleGradKernel, plat::bfloat16, ops::ReshapeDoubleGradKernel);
 #endif
 
 #ifdef PADDLE_WITH_XPU
diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc
index 4965e5e156..37c79b9898 100644
--- a/paddle/fluid/operators/slice_op.cc
+++ b/paddle/fluid/operators/slice_op.cc
@@ -442,7 +442,9 @@ REGISTER_OP_CPU_KERNEL(
     ops::SliceKernel<paddle::platform::CPUDeviceContext,
                      paddle::platform::complex<float>>,
     ops::SliceKernel<paddle::platform::CPUDeviceContext,
-                     paddle::platform::complex<double>>);
+                     paddle::platform::complex<double>>,
+    ops::SliceKernel<paddle::platform::CPUDeviceContext,
+                     paddle::platform::bfloat16>);
 
 REGISTER_OP_CPU_KERNEL(
     slice_grad, ops::SliceGradKernel<paddle::platform::CPUDeviceContext, bool>,
@@ -453,7 +455,9 @@ REGISTER_OP_CPU_KERNEL(
     ops::SliceGradKernel<paddle::platform::CPUDeviceContext,
                          paddle::platform::complex<float>>,
     ops::SliceGradKernel<paddle::platform::CPUDeviceContext,
-                         paddle::platform::complex<double>>);
+                         paddle::platform::complex<double>>,
+    ops::SliceGradKernel<paddle::platform::CPUDeviceContext,
+                         paddle::platform::bfloat16>);
 
 REGISTER_OP_CUDA_KERNEL(
     slice, ops::SliceKernel<paddle::platform::CUDADeviceContext, float>,
@@ -463,6 +467,8 @@ REGISTER_OP_CUDA_KERNEL(
     ops::SliceKernel<paddle::platform::CUDADeviceContext, int64_t>,
     ops::SliceKernel<paddle::platform::CUDADeviceContext,
                      paddle::platform::float16>,
+    ops::SliceKernel<paddle::platform::CUDADeviceContext,
+                     paddle::platform::bfloat16>,
     ops::SliceKernel<paddle::platform::CUDADeviceContext,
                      paddle::platform::complex<float>>,
     ops::SliceKernel<paddle::platform::CUDADeviceContext,
@@ ... @@ REGISTER_OP_CUDA_KERNEL(
     ops::SliceGradKernel<paddle::platform::CUDADeviceContext,
                          paddle::platform::float16>,
+    ops::SliceGradKernel<paddle::platform::CUDADeviceContext,
+                         paddle::platform::bfloat16>,
     ops::SliceGradKernel<paddle::platform::CUDADeviceContext,
                          paddle::platform::complex<float>>,
     ops::SliceGradKernel<paddle::platform::CUDADeviceContext,
                          paddle::platform::complex<double>>);
diff --git a/paddle/pten/kernels/funcs/eigen/pad.cc b/paddle/pten/kernels/funcs/eigen/pad.cc
--- a/paddle/pten/kernels/funcs/eigen/pad.cc
+++ b/paddle/pten/kernels/funcs/eigen/pad.cc
@@ ... @@
+INSTANTIATION(EigenPad, dtype::bfloat16);
 INSTANTIATION(EigenPad, dtype::complex<float>);
 INSTANTIATION(EigenPad, dtype::complex<double>);
 #undef INSTANTIATION
diff --git a/python/paddle/fluid/tests/unittests/test_dropout_op.py b/python/paddle/fluid/tests/unittests/test_dropout_op.py
index cb72248b15..f670f7c380 100644
--- a/python/paddle/fluid/tests/unittests/test_dropout_op.py
+++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py
@@ -17,7 +17,7 @@ from __future__ import print_function
 import unittest
 import numpy as np
 import paddle.fluid.core as core
-from op_test import OpTest, skip_check_grad_ci
+from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16
 import paddle
 import paddle.static as static
 import paddle.fluid as fluid
@@ -233,6 +233,27 @@ class TestFP16DropoutOp2(TestFP16DropoutOp):
         self.fix_seed = False
 
 
+class TestBF16DropoutOp(OpTest):
+    def setUp(self):
+        self.op_type = "dropout"
+        self.dtype = np.uint16
+
+        x = np.random.random((32, 64)).astype("float32")
+        self.inputs = {'X': convert_float_to_uint16(x)}
+        self.attrs = {'dropout_prob': 1.0, 'fix_seed': True, 'is_test': False}
+        self.outputs = {
+            'Out':
+            convert_float_to_uint16(np.zeros((32, 64)).astype('float32')),
+            'Mask': np.zeros((32, 64)).astype('uint8')
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X'], 'Out')
+
+
 class TestDropoutOpWithSeedOnCPUPlace(unittest.TestCase):
     def test_seed_cpu_place(self):
         paddle.enable_static()
diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py
index a0063738d3..c860d6972f 100755
--- a/python/paddle/fluid/tests/unittests/test_reshape_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py
@@ -17,11 +17,12 @@ from __future__ import print_function
 
 import unittest
 import numpy as np
-from op_test import OpTest
+from op_test import OpTest, convert_float_to_uint16
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid import compiler
 from paddle.static import Program, program_guard
+import paddle.fluid.core as core
 
 
 # situation 1: have shape( list, no tensor), no actual shape(Tensor)
@@ -48,6 +49,33 @@ class TestReshapeOp(OpTest):
         self.check_grad(["X"], "Out")
 
 
+class TestReshapeBF16Op(OpTest):
+    def setUp(self):
+        self.init_data()
+        self.op_type = "reshape2"
+        self.dtype = np.uint16
+        x = np.random.random(self.ori_shape).astype("float32")
+        out = x.reshape(self.infered_shape)
+        self.inputs = {"X": convert_float_to_uint16(x)}
+        self.attrs = {"shape": self.new_shape}
+        self.outputs = {
+            "Out": convert_float_to_uint16(out),
+            'XShape': convert_float_to_uint16(
+                np.random.random(self.ori_shape).astype("float32"))
+        }
+
+    def init_data(self):
+        self.ori_shape = (2, 60)
+        self.new_shape = (12, 10)
+        self.infered_shape = (12, 10)
+
+    def test_check_output(self):
+        self.check_output(no_check_set=['XShape'])
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
 class TestReshapeOpDimInfer1(TestReshapeOp):
     def init_data(self):
         self.ori_shape = (5, 25)
diff --git a/python/paddle/fluid/tests/unittests/test_slice_op.py b/python/paddle/fluid/tests/unittests/test_slice_op.py
index 57d5453ec9..629d61d01b 100644
--- a/python/paddle/fluid/tests/unittests/test_slice_op.py
+++ b/python/paddle/fluid/tests/unittests/test_slice_op.py
@@ -17,7 +17,7 @@ from __future__ import print_function
 import unittest
 import numpy as np
 import paddle.fluid.core as core
-from op_test import OpTest
+from op_test import OpTest, convert_float_to_uint16
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
 import paddle
@@ -484,6 +484,35 @@ class TestFP16_2(OpTest):
                                    numeric_grad_delta=0.5)
 
 
+class TestBF16(OpTest):
+    def setUp(self):
+        self.op_type = "slice"
+        self.config()
+        self.inputs = {'Input': convert_float_to_uint16(self.input)}
+        self.outputs = {'Out': convert_float_to_uint16(self.out)}
+        self.attrs = {
+            'axes': self.axes,
+            'starts': self.starts,
+            'ends': self.ends,
+            'infer_flags': self.infer_flags
+        }
+
+    def config(self):
+        self.dtype = np.uint16
+        self.input = np.random.random([3, 4, 5, 6]).astype(np.float32)
+        self.starts = [-3, 0, 2]
+        self.ends = [3, 100, -1]
+        self.axes = [0, 1, 3]
+        self.out = self.input[-3:3, 0:100, :, 2:-1]
+        self.infer_flags = [1, 1, 1]
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['Input'], 'Out')
+
+
 # Test python API
 class TestSliceAPI(unittest.TestCase):
     def test_1(self):
--
GitLab
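
A note on the bf16 convention shared by all three new tests: NumPy has no native bfloat16 dtype, so the tests store bf16 tensors as np.uint16 arrays holding the upper 16 bits of each float32 value, built with op_test.convert_float_to_uint16. The sketch below illustrates that encoding only; it is a minimal stand-in, not Paddle's actual helper. The function names are hypothetical, it truncates rather than rounds (a production converter may round to nearest even instead), and NaN handling is ignored.

import numpy as np

def float_to_bf16_as_uint16(x):
    """Keep the high 16 bits of each float32 value (bfloat16's bit layout)."""
    bits = np.ascontiguousarray(x, dtype=np.float32).view(np.uint32)
    return (bits >> 16).astype(np.uint16)

def bf16_uint16_to_float(y):
    """Widen stored bf16 bits back to float32 by zero-filling the low 16 bits."""
    return (y.astype(np.uint32) << 16).view(np.float32)

# Mirrors the input built in TestBF16DropoutOp: a (32, 64) float32 array.
x = np.random.random((32, 64)).astype("float32")
x_bf16 = float_to_bf16_as_uint16(x)
assert x_bf16.dtype == np.uint16 and x_bf16.shape == x.shape
# Truncation loses at most one bf16 ulp; for values in [0, 1) that is < 2**-8.
assert np.max(np.abs(x - bf16_uint16_to_float(x_bf16))) < 2.0**-8

This encoding is also why the tests can compare outputs exactly: dropout runs with dropout_prob=1.0 so the expected Out is all zeros, and reshape and slice only move elements, so each expected output is a bit-identical uint16 buffer rather than a float result needing a bf16 rounding tolerance.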