diff --git a/paddle/fluid/operators/concat_op.cu.cc b/paddle/fluid/operators/concat_op.cu.cc
index 3aee2c6504fdd7bf2a803f8757a974728aa262dd..f7b64f16e2d8bc42063685bd62e9d2bddc6fbd33 100644
--- a/paddle/fluid/operators/concat_op.cu.cc
+++ b/paddle/fluid/operators/concat_op.cu.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/concat_op.h"
+#include "paddle/fluid/platform/bfloat16.h"
 #include "paddle/fluid/platform/complex.h"
 #include "paddle/fluid/platform/float16.h"
 
@@ -25,6 +26,7 @@ REGISTER_OP_CUDA_KERNEL(
     ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, float>,
     ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, bool>,
     ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, plat::float16>,
+    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, plat::bfloat16>,
     ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
     ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, int>,
     ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, uint8_t>,
diff --git a/paddle/fluid/operators/split_op.cu.cc b/paddle/fluid/operators/split_op.cu.cc
index d1da64b158c145e8cfa9b7343ce8ddf8af77777f..a8a1383614bddb24b285734edb6f74e2789fdfeb 100644
--- a/paddle/fluid/operators/split_op.cu.cc
+++ b/paddle/fluid/operators/split_op.cu.cc
@@ -21,4 +21,5 @@ REGISTER_OP_CUDA_KERNEL(
     ops::SplitOpKernel<plat::CUDADeviceContext, int64_t>,
     ops::SplitOpKernel<plat::CUDADeviceContext, int>,
     ops::SplitOpKernel<plat::CUDADeviceContext, bool>,
-    ops::SplitOpKernel<plat::CUDADeviceContext, plat::float16>);
+    ops::SplitOpKernel<plat::CUDADeviceContext, plat::float16>,
+    ops::SplitOpKernel<plat::CUDADeviceContext, plat::bfloat16>);
diff --git a/paddle/pten/kernels/gpu/concat_kernel.cu b/paddle/pten/kernels/gpu/concat_kernel.cu
index e52e3a3d6446c7debdc0fa603ba326173f064181..093af0d54f6eb36633e52f9aef90068275dea3dd 100644
--- a/paddle/pten/kernels/gpu/concat_kernel.cu
+++ b/paddle/pten/kernels/gpu/concat_kernel.cu
@@ -121,5 +121,6 @@ PT_REGISTER_KERNEL(concat,
                    int,
                    uint8_t,
                    paddle::platform::float16,
+                   paddle::platform::bfloat16,
                    paddle::platform::complex<float>,
                    paddle::platform::complex<double>) {}
diff --git a/python/paddle/fluid/tests/unittests/test_concat_op.py b/python/paddle/fluid/tests/unittests/test_concat_op.py
index 5f936e577a06fd611a149a2501be7bd845cc7905..10b7e13dcc334dbc6b2f7b4c614cf888168c34ab 100644
--- a/python/paddle/fluid/tests/unittests/test_concat_op.py
+++ b/python/paddle/fluid/tests/unittests/test_concat_op.py
@@ -16,7 +16,7 @@ from __future__ import print_function
 
 import unittest
 import numpy as np
-from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci
+from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16
 import paddle.fluid as fluid
 from paddle.fluid import compiler, Program, program_guard, core
 import paddle
@@ -44,17 +44,35 @@ class TestConcatOp(OpTest):
         return "float64"
 
     def test_check_output(self):
-        self.check_output()
+        if self.dtype == np.uint16:
+            place = core.CUDAPlace(0)
+            self.check_output_with_place(place)
+        else:
+            self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['x0'], 'Out')
-        self.check_grad(['x1'], 'Out')
-        self.check_grad(['x2'], 'Out')
+        if self.dtype == np.uint16:
+            place = core.CUDAPlace(0)
+            self.check_grad_with_place(place, ['x0'], 'Out')
+            self.check_grad_with_place(place, ['x1'], 'Out')
+            self.check_grad_with_place(place, ['x2'], 'Out')
+        else:
+            self.check_grad(['x0'], 'Out')
+            self.check_grad(['x1'], 'Out')
+            self.check_grad(['x2'], 'Out')
 
     def init_test_data(self):
-        self.x0 = np.random.random((5, 1, 4, 5)).astype(self.dtype)
-        self.x1 = np.random.random((5, 2, 4, 5)).astype(self.dtype)
-        self.x2 = np.random.random((5, 3, 4, 5)).astype(self.dtype)
+        if self.dtype == np.uint16:
+            x0 = np.random.random((5, 1, 4, 5)).astype(np.float32)
+            self.x0 = convert_float_to_uint16(x0)
+            x1 = np.random.random((5, 2, 4, 5)).astype(np.float32)
+            self.x1 = convert_float_to_uint16(x1)
+            x2 = np.random.random((5, 3, 4, 5)).astype(np.float32)
+            self.x2 = convert_float_to_uint16(x2)
+        else:
+            self.x0 = np.random.random((5, 1, 4, 5)).astype(self.dtype)
+            self.x1 = np.random.random((5, 2, 4, 5)).astype(self.dtype)
+            self.x2 = np.random.random((5, 3, 4, 5)).astype(self.dtype)
         self.axis = 1
 
 
@@ -193,6 +211,22 @@ create_test_fp16(TestConcatOp5)
 create_test_fp16(TestConcatOp6)
 
 
+#----------------Concat Bf16----------------
+def create_test_bf16(parent):
+    @unittest.skipIf(not paddle.is_compiled_with_cuda(),
+                     "core is not compiled with CUDA")
+    class TestConcatBf16(parent):
+        def get_dtype(self):
+            return np.uint16
+
+    cls_name = "{0}_{1}".format(parent.__name__, "Bf16")
+    TestConcatBf16.__name__ = cls_name
+    globals()[cls_name] = TestConcatBf16
+
+
+create_test_bf16(TestConcatOp)
+
+
 class TestConcatOpError(unittest.TestCase):
     def test_errors(self):
         with program_guard(Program(), Program()):
diff --git a/python/paddle/fluid/tests/unittests/test_split_op.py b/python/paddle/fluid/tests/unittests/test_split_op.py
index b261ce93c0a63a2feff5421ab0a90fbd23c09ae9..aac904dc2e15d47d2d2439142363afcaae9e2d67 100644
--- a/python/paddle/fluid/tests/unittests/test_split_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_op.py
@@ -16,7 +16,7 @@ from __future__ import print_function
 
 import paddle
 import unittest
 import numpy as np
-from op_test import OpTest
+from op_test import OpTest, convert_float_to_uint16
 import paddle.fluid as fluid
 from paddle.fluid import compiler, Program, program_guard, core
@@ -26,12 +26,19 @@ class TestSplitOp(OpTest):
         self._set_op_type()
         self.dtype = self.get_dtype()
         axis = 1
-        x = np.random.random((4, 5, 6)).astype(self.dtype)
-        out = np.split(x, [2, 3], axis)
-        self.inputs = {'X': x}
+        if self.dtype == np.uint16:
+            x = np.random.random((4, 5, 6)).astype(np.float32)
+            out = np.split(x, [2, 3], axis)
+            self.inputs = {'X': convert_float_to_uint16(x)}
+            self.outputs = {'Out': [('out%d' % i, convert_float_to_uint16(out[i])) \
+                            for i in range(len(out))]}
+        else:
+            x = np.random.random((4, 5, 6)).astype(self.dtype)
+            out = np.split(x, [2, 3], axis)
+            self.inputs = {'X': x}
+            self.outputs = {'Out': [('out%d' % i, out[i]) \
+                            for i in range(len(out))]}
         self.attrs = {'axis': axis, 'sections': [2, 1, 2]}
-        self.outputs = {'Out': [('out%d' % i, out[i]) \
-                        for i in range(len(out))]}
 
     def get_dtype(self):
         return "float64"
@@ -226,6 +233,30 @@ def create_test_fp16(parent):
 create_test_fp16(TestSplitOp)
 
 
+#----------------Split Bf16----------------
+
+
+def create_test_bf16(parent):
+    @unittest.skipIf(not core.is_compiled_with_cuda(),
+                     "core is not compiled with CUDA")
+    class TestSplitBf16(parent):
+        def get_dtype(self):
+            return np.uint16
+
+        def test_check_output(self):
+            place = core.CUDAPlace(0)
+            self.check_output_with_place(place)
+
+        def test_check_grad(self):
+            pass
+
+    cls_name = "{0}_{1}".format(parent.__name__, "Bf16")
+    TestSplitBf16.__name__ = cls_name
+    globals()[cls_name] = TestSplitBf16
+
+
+create_test_bf16(TestSplitOp)
+
 class TestSplitAPI(unittest.TestCase):
     def test_api(self):