Unverified commit 1e6047f1, authored by zhangbo9674, committed by GitHub

[bf16] add bf16 kernel: transpose & unbind (#39457)

* add transpose unbind

* add unittest

* refine transpose unittest
Parent 89aa8b1a
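For context, a minimal usage sketch of what the new kernels enable, not part of the commit; it assumes this Paddle build exposes the 'bfloat16' dtype through paddle.cast:

```python
import paddle

# Minimal sketch (assumption: paddle.cast supports 'bfloat16' in this build).
x = paddle.rand([3, 2, 2])
x_bf16 = paddle.cast(x, 'bfloat16')

y = paddle.transpose(x_bf16, perm=[2, 0, 1])  # dispatches to the new bf16 transpose kernel
parts = paddle.unbind(x_bf16, axis=0)         # dispatches to the new bf16 unbind kernel
print(y.shape, [p.shape for p in parts])
```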
paddle/fluid/operators/transpose_op.cc
@@ -356,7 +356,9 @@ REGISTER_OP_CPU_KERNEL(
     ops::TransposeKernel<paddle::platform::CPUDeviceContext,
                          paddle::platform::complex<float>>,
     ops::TransposeKernel<paddle::platform::CPUDeviceContext,
-                         paddle::platform::complex<double>>);
+                         paddle::platform::complex<double>>,
+    ops::TransposeKernel<paddle::platform::CPUDeviceContext,
+                         paddle::platform::bfloat16>);

 REGISTER_OP_CPU_KERNEL(
     transpose_grad,
     ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, bool>,
@@ -365,7 +367,9 @@ REGISTER_OP_CPU_KERNEL(
     ops::TransposeGradKernel<paddle::platform::CPUDeviceContext,
                              paddle::platform::complex<float>>,
     ops::TransposeGradKernel<paddle::platform::CPUDeviceContext,
-                             paddle::platform::complex<double>>);
+                             paddle::platform::complex<double>>,
+    ops::TransposeGradKernel<paddle::platform::CPUDeviceContext,
+                             paddle::platform::bfloat16>);

 REGISTER_OPERATOR(transpose2, ops::Transpose2Op, ops::Transpose2OpMaker,
                   ops::Transpose2GradMaker<paddle::framework::OpDesc>,
@@ -383,7 +387,9 @@ REGISTER_OP_CPU_KERNEL(
     ops::TransposeKernel<paddle::platform::CPUDeviceContext,
                          paddle::platform::complex<float>>,
     ops::TransposeKernel<paddle::platform::CPUDeviceContext,
-                         paddle::platform::complex<double>>);
+                         paddle::platform::complex<double>>,
+    ops::TransposeKernel<paddle::platform::CPUDeviceContext,
+                         paddle::platform::bfloat16>);

 REGISTER_OP_CPU_KERNEL(
     transpose2_grad,
     ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, bool>,
@@ -394,4 +400,6 @@ REGISTER_OP_CPU_KERNEL(
     ops::TransposeGradKernel<paddle::platform::CPUDeviceContext,
                              paddle::platform::complex<float>>,
     ops::TransposeGradKernel<paddle::platform::CPUDeviceContext,
-                             paddle::platform::complex<double>>);
+                             paddle::platform::complex<double>>,
+    ops::TransposeGradKernel<paddle::platform::CPUDeviceContext,
+                             paddle::platform::bfloat16>);
paddle/fluid/operators/transpose_op.cu
@@ -14,6 +14,7 @@ limitations under the License. */

 #include "paddle/fluid/operators/transpose_op.cu.h"
 #include "paddle/fluid/operators/transpose_op.h"
+#include "paddle/fluid/platform/bfloat16.h"
 #include "paddle/fluid/platform/float16.h"

 namespace paddle {
@@ -87,6 +88,8 @@ REGISTER_OP_CUDA_KERNEL(
     ops::TransposeGPUKernel<paddle::platform::CUDADeviceContext, float>,
     ops::TransposeGPUKernel<paddle::platform::CUDADeviceContext, double>,
     ops::TransposeGPUKernel<paddle::platform::CUDADeviceContext, plat::float16>,
+    ops::TransposeGPUKernel<paddle::platform::CUDADeviceContext,
+                            plat::bfloat16>,
     ops::TransposeGPUKernel<paddle::platform::CUDADeviceContext,
                             paddle::platform::complex<float>>,
     ops::TransposeGPUKernel<paddle::platform::CUDADeviceContext,
@@ -98,6 +101,8 @@ REGISTER_OP_CUDA_KERNEL(
     ops::TransposeGradGPUKernel<paddle::platform::CUDADeviceContext, double>,
     ops::TransposeGradGPUKernel<paddle::platform::CUDADeviceContext,
                                 plat::float16>,
+    ops::TransposeGradGPUKernel<paddle::platform::CUDADeviceContext,
+                                plat::bfloat16>,
     ops::TransposeGradGPUKernel<paddle::platform::CUDADeviceContext,
                                 paddle::platform::complex<float>>,
     ops::TransposeGradGPUKernel<paddle::platform::CUDADeviceContext,
@@ -111,6 +116,8 @@ REGISTER_OP_CUDA_KERNEL(
     ops::TransposeGPUKernel<paddle::platform::CUDADeviceContext, float>,
     ops::TransposeGPUKernel<paddle::platform::CUDADeviceContext, double>,
     ops::TransposeGPUKernel<paddle::platform::CUDADeviceContext, plat::float16>,
+    ops::TransposeGPUKernel<paddle::platform::CUDADeviceContext,
+                            plat::bfloat16>,
     ops::TransposeGPUKernel<paddle::platform::CUDADeviceContext,
                             paddle::platform::complex<float>>,
     ops::TransposeGPUKernel<paddle::platform::CUDADeviceContext,
@@ -124,6 +131,8 @@ REGISTER_OP_CUDA_KERNEL(
     ops::TransposeGradGPUKernel<paddle::platform::CUDADeviceContext, double>,
     ops::TransposeGradGPUKernel<paddle::platform::CUDADeviceContext,
                                 plat::float16>,
+    ops::TransposeGradGPUKernel<paddle::platform::CUDADeviceContext,
+                                plat::bfloat16>,
     ops::TransposeGradGPUKernel<paddle::platform::CUDADeviceContext,
                                 paddle::platform::complex<float>>,
     ops::TransposeGradGPUKernel<paddle::platform::CUDADeviceContext,
paddle/fluid/operators/unbind_op.cc
@@ -85,4 +85,5 @@ REGISTER_OP_CPU_KERNEL(
     ops::UnbindOpKernel<plat::CPUDeviceContext, float>,
     ops::UnbindOpKernel<plat::CPUDeviceContext, int64_t>,
     ops::UnbindOpKernel<plat::CPUDeviceContext, int>,
-    ops::UnbindOpKernel<plat::CPUDeviceContext, plat::float16>);
+    ops::UnbindOpKernel<plat::CPUDeviceContext, plat::float16>,
+    ops::UnbindOpKernel<plat::CPUDeviceContext, plat::bfloat16>);
paddle/fluid/operators/unbind_op.cu.cc
@@ -20,4 +20,5 @@ REGISTER_OP_CUDA_KERNEL(
     ops::UnbindOpKernel<plat::CUDADeviceContext, float>,
     ops::UnbindOpKernel<plat::CUDADeviceContext, int64_t>,
     ops::UnbindOpKernel<plat::CUDADeviceContext, int>,
-    ops::UnbindOpKernel<plat::CUDADeviceContext, plat::float16>);
+    ops::UnbindOpKernel<plat::CUDADeviceContext, plat::float16>,
+    ops::UnbindOpKernel<plat::CUDADeviceContext, plat::bfloat16>);
python/paddle/fluid/tests/unittests/test_transpose_op.py
@@ -16,10 +16,11 @@ from __future__ import print_function

 import unittest
 import numpy as np
-from op_test import OpTest
+from op_test import OpTest, convert_float_to_uint16
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid import Program, program_guard
+import paddle.fluid.core as core

 paddle.enable_static()
@@ -113,6 +114,39 @@ class TestCase9(TestTransposeOp):
         self.axis = (6, 1, 3, 5, 0, 2, 4, 7)

+
+class TestTransposeBF16Op(OpTest):
+    def setUp(self):
+        self.init_op_type()
+        self.initTestCase()
+        self.dtype = np.uint16
+        x = np.random.random(self.shape).astype("float32")
+
+        self.inputs = {'X': convert_float_to_uint16(x)}
+        self.attrs = {
+            'axis': list(self.axis),
+            'use_mkldnn': self.use_mkldnn,
+        }
+        self.outputs = {
+            'XShape': convert_float_to_uint16(
+                np.random.random(self.shape).astype("float32")),
+            'Out': self.inputs['X'].transpose(self.axis)
+        }
+
+    def init_op_type(self):
+        self.op_type = "transpose2"
+        self.use_mkldnn = False
+
+    def test_check_output(self):
+        self.check_output(no_check_set=['XShape'])
+
+    def test_check_grad(self):
+        pass
+
+    def initTestCase(self):
+        self.shape = (3, 2)
+        self.axis = (1, 0)
+
+
 class TestTransposeOpBool(TestTransposeOp):
     def test_check_grad(self):
         pass
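Since NumPy has no native bfloat16 dtype, these tests store bf16 values as np.uint16 bit patterns via op_test's convert_float_to_uint16 helper. A minimal sketch of that conversion (hypothetical function name, assuming the usual truncation scheme: bfloat16 is the top 16 bits of a float32):

```python
import numpy as np

# Sketch of the float32 -> bfloat16-bits conversion the tests rely on.
def to_bf16_bits(x):
    x = np.ascontiguousarray(x, dtype=np.float32)
    # Reinterpret each float32 as uint32 and keep the high 16 bits.
    return (x.view(np.uint32) >> 16).astype(np.uint16)

x = np.random.random((3, 2)).astype("float32")
print(to_bf16_bits(x))  # same shape as x, dtype uint16
```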
python/paddle/fluid/tests/unittests/test_unbind_op.py
@@ -16,7 +16,7 @@ from __future__ import print_function

 import unittest
 import numpy as np
-from op_test import OpTest
+from op_test import OpTest, convert_float_to_uint16
 import paddle.fluid as fluid
 import paddle.tensor as tensor
 from paddle.fluid import compiler, Program, program_guard, core
@@ -154,6 +154,32 @@ class TestUnbindOp4(TestUnbindOp):
         self.out[1] = self.out[1].reshape((3, 2))

+
+class TestUnbindBF16Op(OpTest):
+    def setUp(self):
+        self._set_op_type()
+        self.dtype = self.get_dtype()
+        self.axis = 0
+        self.num = 3
+        x = np.arange(12).reshape(3, 2, 2).astype(self.dtype)
+        self.out = np.split(x, self.num, self.axis)
+        self.inputs = {'X': convert_float_to_uint16(x)}
+        self.attrs = {'axis': self.axis}
+        self.outputs = {'Out': [('out%d' % i, convert_float_to_uint16(self.out[i])) \
+                        for i in range(len(self.out))]}
+
+    def get_dtype(self):
+        return np.uint16
+
+    def _set_op_type(self):
+        self.op_type = "unbind"
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        pass
+
+
 class TestUnbindAxisError(unittest.TestCase):
     def test_errors(self):
         with program_guard(Program(), Program()):
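Both BF16 tests leave test_check_grad as a pass, presumably because bfloat16's 8-bit mantissa is too coarse for OpTest's numeric gradient comparison. A quick sketch of that precision loss (hypothetical helper names, truncation assumed):

```python
import numpy as np

def to_bf16_bits(x):
    # float32 -> bfloat16 bit pattern (keep high 16 bits).
    return (np.ascontiguousarray(x, np.float32).view(np.uint32) >> 16).astype(np.uint16)

def from_bf16_bits(b):
    # bfloat16 bit pattern -> float32 (zero-fill low 16 bits).
    return (b.astype(np.uint32) << 16).view(np.float32)

x = np.random.random((3, 2, 2)).astype(np.float32)
err = np.abs(from_bf16_bits(to_bf16_bits(x)) - x).max()
print(err)  # on the order of 1e-3: only ~2-3 decimal digits survive a round trip
```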