Unverified commit e8ac7fc3 authored by zhangbo9674, committed by GitHub

[bf16] add bf16 kernel: dropout & reshape & slice (#39395)

* add dropout

* add reshape

* add slice

* refine slice unittest

* refine slice unittest

* add cpu bf16 kernel
Parent 14ed2f54
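The new CPU, CUDA, and unit-test changes below all represent bfloat16 tensors as `np.uint16` arrays on the Python side, converted with `convert_float_to_uint16` from `op_test`. As a rough sketch of what such a conversion does (the helper names below are illustrative, and the real helper may round to nearest even rather than truncate), a bfloat16 value is simply the upper 16 bits of the corresponding float32:

```python
import numpy as np

def float32_to_bf16_bits(x):
    # Keep the upper 16 bits of each float32 value; bfloat16 shares float32's
    # sign and exponent, so dropping the low 16 mantissa bits yields the bf16
    # bit pattern, stored here as uint16 (truncation; the real helper may
    # round to nearest even instead).
    return np.right_shift(
        np.asarray(x, dtype=np.float32).view(np.uint32), 16).astype(np.uint16)

def bf16_bits_to_float32(bits):
    # Inverse direction: put the 16-bit pattern back into the high half of a
    # 32-bit word and reinterpret it as float32.
    return (np.asarray(bits, dtype=np.uint16).astype(np.uint32) << 16).view(np.float32)

x = np.random.random((32, 64)).astype("float32")
roundtrip = bf16_bits_to_float32(float32_to_bf16_bits(x))
assert np.allclose(roundtrip, x, atol=1e-2)  # bf16 keeps only ~3 decimal digits
```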
@@ -179,8 +179,12 @@ REGISTER_OPERATOR(dropout, ops::DropoutOp, ops::DropoutOpMaker,
 REGISTER_OPERATOR(dropout_grad, ops::DropoutOpGrad);
 REGISTER_OP_CPU_KERNEL(
     dropout, ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, double>);
+    ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext,
+                          paddle::platform::bfloat16>);
 REGISTER_OP_CPU_KERNEL(
     dropout_grad,
     ops::DropoutGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::DropoutGradKernel<paddle::platform::CPUDeviceContext, double>);
+    ops::DropoutGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::DropoutGradKernel<paddle::platform::CPUDeviceContext,
+                           paddle::platform::bfloat16>);
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/dropout_impl.cu.h"
 #include "paddle/fluid/operators/dropout_op.h"
+#include "paddle/fluid/platform/bfloat16.h"
 #include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
@@ -84,8 +85,10 @@ namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
     dropout, ops::GPUDropoutKernel<plat::CUDADeviceContext, float>,
     ops::GPUDropoutKernel<plat::CUDADeviceContext, plat::float16>,
+    ops::GPUDropoutKernel<plat::CUDADeviceContext, plat::bfloat16>,
    ops::GPUDropoutKernel<plat::CUDADeviceContext, double>);
 REGISTER_OP_CUDA_KERNEL(
     dropout_grad, ops::GPUDropoutGradKernel<plat::CUDADeviceContext, float>,
     ops::GPUDropoutGradKernel<plat::CUDADeviceContext, plat::float16>,
+    ops::GPUDropoutGradKernel<plat::CUDADeviceContext, plat::bfloat16>,
     ops::GPUDropoutGradKernel<plat::CUDADeviceContext, double>);
@@ -698,13 +698,14 @@ REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double,
                                 ops::ReshapeKernel, int, ops::ReshapeKernel,
                                 uint8_t, ops::ReshapeKernel, int64_t,
                                 ops::ReshapeKernel, plat::float16,
+                                ops::ReshapeKernel, plat::bfloat16,
                                 ops::ReshapeKernel);
 REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel,
                                 double, ops::ReshapeGradKernel, int,
                                 ops::ReshapeGradKernel, int64_t,
                                 ops::ReshapeGradKernel, uint8_t,
                                 ops::ReshapeGradKernel, plat::float16,
+                                ops::ReshapeGradKernel, plat::bfloat16,
                                 ops::ReshapeGradKernel);
 REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double,
                                 ops::ReshapeKernel, int, ops::ReshapeKernel,
@@ -712,13 +713,15 @@ REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double,
                                 ops::ReshapeKernel, plat::float16,
                                 ops::ReshapeKernel, bool, ops::ReshapeKernel,
                                 plat::complex<float>, ops::ReshapeKernel,
-                                plat::complex<double>, ops::ReshapeKernel);
+                                plat::complex<double>, ops::ReshapeKernel,
+                                plat::bfloat16, ops::ReshapeKernel);
 REGISTER_OP_CUDA_KERNEL_FUNCTOR(
     reshape2_grad, float, ops::ReshapeGradKernel, double,
     ops::ReshapeGradKernel, int, ops::ReshapeGradKernel, uint8_t,
     ops::ReshapeGradKernel, int64_t, ops::ReshapeGradKernel, plat::float16,
     ops::ReshapeGradKernel, bool, ops::ReshapeGradKernel, plat::complex<float>,
-    ops::ReshapeGradKernel, plat::complex<double>, ops::ReshapeGradKernel);
+    ops::ReshapeGradKernel, plat::complex<double>, ops::ReshapeGradKernel,
+    plat::bfloat16, ops::ReshapeGradKernel);
 REGISTER_OP_CUDA_KERNEL_FUNCTOR(
     reshape2_grad_grad, float, ops::ReshapeDoubleGradKernel, double,
@@ -727,7 +730,7 @@ REGISTER_OP_CUDA_KERNEL_FUNCTOR(
     plat::float16, ops::ReshapeDoubleGradKernel, bool,
     ops::ReshapeDoubleGradKernel, plat::complex<float>,
     ops::ReshapeDoubleGradKernel, plat::complex<double>,
-    ops::ReshapeDoubleGradKernel);
+    ops::ReshapeDoubleGradKernel, plat::bfloat16, ops::ReshapeDoubleGradKernel);
 #endif
 
 #ifdef PADDLE_WITH_XPU
......
@@ -442,7 +442,9 @@ REGISTER_OP_CPU_KERNEL(
     ops::SliceKernel<paddle::platform::CPUDeviceContext,
                      paddle::platform::complex<float>>,
     ops::SliceKernel<paddle::platform::CPUDeviceContext,
-                     paddle::platform::complex<double>>);
+                     paddle::platform::complex<double>>,
+    ops::SliceKernel<paddle::platform::CPUDeviceContext,
+                     paddle::platform::bfloat16>);
 
 REGISTER_OP_CPU_KERNEL(
     slice_grad, ops::SliceGradKernel<paddle::platform::CPUDeviceContext, bool>,
@@ -453,7 +455,9 @@ REGISTER_OP_CPU_KERNEL(
     ops::SliceGradKernel<paddle::platform::CPUDeviceContext,
                          paddle::platform::complex<float>>,
     ops::SliceGradKernel<paddle::platform::CPUDeviceContext,
-                         paddle::platform::complex<double>>);
+                         paddle::platform::complex<double>>,
+    ops::SliceGradKernel<paddle::platform::CPUDeviceContext,
+                         paddle::platform::bfloat16>);
 
 REGISTER_OP_CUDA_KERNEL(
     slice, ops::SliceKernel<paddle::platform::CUDADeviceContext, bool>,
@@ -463,6 +467,8 @@ REGISTER_OP_CUDA_KERNEL(
     ops::SliceKernel<paddle::platform::CUDADeviceContext, int64_t>,
     ops::SliceKernel<paddle::platform::CUDADeviceContext,
                      paddle::platform::float16>,
+    ops::SliceKernel<paddle::platform::CUDADeviceContext,
+                     paddle::platform::bfloat16>,
     ops::SliceKernel<paddle::platform::CUDADeviceContext,
                      paddle::platform::complex<float>>,
     ops::SliceKernel<paddle::platform::CUDADeviceContext,
@@ -476,6 +482,8 @@ REGISTER_OP_CUDA_KERNEL(
     ops::SliceGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
     ops::SliceGradKernel<paddle::platform::CUDADeviceContext,
                          paddle::platform::float16>,
+    ops::SliceGradKernel<paddle::platform::CUDADeviceContext,
+                         paddle::platform::bfloat16>,
     ops::SliceGradKernel<paddle::platform::CUDADeviceContext,
                          paddle::platform::complex<float>>,
     ops::SliceGradKernel<paddle::platform::CUDADeviceContext,
......
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/pten/common/bfloat16.h"
 #include "paddle/pten/common/complex.h"
 #include "paddle/pten/kernels/funcs/eigen/eigen_function.h"
@@ -61,6 +62,7 @@ INSTANTIATION(EigenPad, int);
 INSTANTIATION(EigenPad, int64_t);
 INSTANTIATION(EigenPad, float);
 INSTANTIATION(EigenPad, double);
+INSTANTIATION(EigenPad, dtype::bfloat16);
 INSTANTIATION(EigenPad, dtype::complex<float>);
 INSTANTIATION(EigenPad, dtype::complex<double>);
 #undef INSTANTIATION
......
@@ -17,7 +17,7 @@ from __future__ import print_function
 import unittest
 import numpy as np
 import paddle.fluid.core as core
-from op_test import OpTest, skip_check_grad_ci
+from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16
 import paddle
 import paddle.static as static
 import paddle.fluid as fluid
@@ -233,6 +233,27 @@ class TestFP16DropoutOp2(TestFP16DropoutOp):
         self.fix_seed = False
 
 
+class TestBF16DropoutOp(OpTest):
+    def setUp(self):
+        self.op_type = "dropout"
+        self.dtype = np.uint16
+        x = np.random.random((32, 64)).astype("float32")
+
+        self.inputs = {'X': convert_float_to_uint16(x)}
+        self.attrs = {'dropout_prob': 1.0, 'fix_seed': True, 'is_test': False}
+        self.outputs = {
+            'Out':
+            convert_float_to_uint16(np.zeros((32, 64)).astype('float32')),
+            'Mask': np.zeros((32, 64)).astype('uint8')
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X'], 'Out')
+
+
 class TestDropoutOpWithSeedOnCPUPlace(unittest.TestCase):
     def test_seed_cpu_place(self):
         paddle.enable_static()
......
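For reference, the expected values in `TestBF16DropoutOp` follow directly from the dropout definition: with `dropout_prob = 1.0` every element is dropped, so both `Out` and `Mask` are all zeros. A minimal numpy illustration of that reference computation (the helper below is a sketch, not Paddle's kernel, and ignores any output scaling):

```python
import numpy as np

def dropout_reference(x, dropout_prob, seed=0):
    # Keep an element with probability (1 - dropout_prob); dropped elements
    # become zero and the mask records which elements were kept.
    rng = np.random.RandomState(seed)
    mask = (rng.uniform(size=x.shape) >= dropout_prob).astype("uint8")
    return x * mask, mask

x = np.random.random((32, 64)).astype("float32")
out, mask = dropout_reference(x, dropout_prob=1.0)
assert not out.any() and not mask.any()  # everything dropped
```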
@@ -17,11 +17,12 @@ from __future__ import print_function
 import unittest
 import numpy as np
-from op_test import OpTest
+from op_test import OpTest, convert_float_to_uint16
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid import compiler
 from paddle.static import Program, program_guard
+import paddle.fluid.core as core
 
 
 # situation 1: have shape( list, no tensor), no actual shape(Tensor)
@@ -48,6 +49,33 @@ class TestReshapeOp(OpTest):
         self.check_grad(["X"], "Out")
 
 
+class TestReshapeBF16Op(OpTest):
+    def setUp(self):
+        self.init_data()
+        self.op_type = "reshape2"
+        self.dtype = np.uint16
+        x = np.random.random(self.ori_shape).astype("float32")
+        out = x.reshape(self.infered_shape)
+        self.inputs = {"X": convert_float_to_uint16(x)}
+        self.attrs = {"shape": self.new_shape}
+        self.outputs = {
+            "Out": convert_float_to_uint16(out),
+            'XShape': convert_float_to_uint16(
+                np.random.random(self.ori_shape).astype("float32"))
+        }
+
+    def init_data(self):
+        self.ori_shape = (2, 60)
+        self.new_shape = (12, 10)
+        self.infered_shape = (12, 10)
+
+    def test_check_output(self):
+        self.check_output(no_check_set=['XShape'])
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
 class TestReshapeOpDimInfer1(TestReshapeOp):
     def init_data(self):
         self.ori_shape = (5, 25)
......
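Reshape (like slice) only rearranges values, so once the input has been converted to bfloat16 the expected `Out` can be compared exactly; the only lossy step is the float32-to-bf16 conversion itself. A quick illustration of that precision loss, reusing the truncation sketch above (assumed behaviour, not the exact Paddle helper):

```python
import numpy as np

# Truncate a float32 to its bfloat16 bit pattern and expand it back.
x = np.float32(3.1415927)
bits = np.right_shift(np.array([x], dtype=np.float32).view(np.uint32), 16).astype(np.uint16)
back = (bits.astype(np.uint32) << 16).view(np.float32)[0]
print(x, "->", back)  # 3.1415927 -> 3.140625: only ~8 mantissa bits survive
```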
@@ -17,7 +17,7 @@ from __future__ import print_function
 import unittest
 import numpy as np
 import paddle.fluid.core as core
-from op_test import OpTest
+from op_test import OpTest, convert_float_to_uint16
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
 import paddle
@@ -484,6 +484,35 @@ class TestFP16_2(OpTest):
             numeric_grad_delta=0.5)
 
 
+class TestBF16(OpTest):
+    def setUp(self):
+        self.op_type = "slice"
+        self.config()
+        self.inputs = {'Input': convert_float_to_uint16(self.input)}
+        self.outputs = {'Out': convert_float_to_uint16(self.out)}
+        self.attrs = {
+            'axes': self.axes,
+            'starts': self.starts,
+            'ends': self.ends,
+            'infer_flags': self.infer_flags
+        }
+
+    def config(self):
+        self.dtype = np.uint16
+        self.input = np.random.random([3, 4, 5, 6]).astype(np.float32)
+        self.starts = [-3, 0, 2]
+        self.ends = [3, 100, -1]
+        self.axes = [0, 1, 3]
+        self.out = self.input[-3:3, 0:100, :, 2:-1]
+        self.infer_flags = [1, 1, 1]
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['Input'], 'Out')
+
+
 # Test python API
 class TestSliceAPI(unittest.TestCase):
     def test_1(self):
......
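In `TestBF16`, the `axes`/`starts`/`ends` attributes describe exactly the numpy indexing used to build `self.out`: each listed axis is narrowed to `starts[i]:ends[i]` while every other axis is taken in full. A small sketch of that mapping (`slice_reference` is a hypothetical helper, not a Paddle API):

```python
import numpy as np

def slice_reference(x, axes, starts, ends):
    # Start from a full slice on every axis, then narrow the listed axes.
    index = [slice(None)] * x.ndim
    for axis, start, end in zip(axes, starts, ends):
        index[axis] = slice(start, end)
    return x[tuple(index)]

x = np.random.random([3, 4, 5, 6]).astype(np.float32)
out = slice_reference(x, axes=[0, 1, 3], starts=[-3, 0, 2], ends=[3, 100, -1])
assert np.array_equal(out, x[-3:3, 0:100, :, 2:-1])
```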