Unverified commit 5981bee2 authored by wuhuanzhou and committed by GitHub

conv2d support bfloat16 (#32221)

Parent 0f154961
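This commit adds bfloat16 (bf16) support to the cuDNN conv2d, conv2d_grad and conv2d_grad_grad kernels, gated on cuDNN >= 8.1.0, and extends the Python op tests accordingly. The new tests store bf16 tensors as np.uint16 arrays, which works because a bfloat16 value is simply the upper 16 bits of an IEEE-754 float32. The NumPy sketch below only illustrates that storage convention; the helper names here are ours, not Paddle's (the tests themselves use convert_float_to_uint16 from op_test), and this version truncates where a production helper may round to nearest.

import numpy as np

def to_bf16_bits(x):
    # bfloat16 keeps only the upper 16 bits of a float32; the tests store
    # these bit patterns in np.uint16 arrays (truncating conversion).
    x = np.ascontiguousarray(x, dtype=np.float32)
    return (x.view(np.uint32) >> 16).astype(np.uint16)

def from_bf16_bits(b):
    # Widen back to float32 by re-attaching 16 zero mantissa bits.
    return (np.asarray(b, dtype=np.uint16).astype(np.uint32) << 16).view(np.float32)

x = np.random.uniform(1.0, 2.0, (2, 3)).astype(np.float32)
err = np.max(np.abs(x - from_bf16_bits(to_bf16_bits(x))) / np.abs(x))
# bf16 keeps only 8 significand bits, so err stays well below 1e-2, which is
# why the tests in this commit compare with relaxed atol/rtol of 1e-2.
print(err)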
...
@@ -211,20 +211,31 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
 #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
-        args.cdesc.desc(), CUDNN_DEFAULT_MATH));
-    VLOG(5) << "NOT use cudnn_tensor_op_math";
     if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
       PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(),
                                                          CUDNN_TENSOR_OP_MATH));
       VLOG(5) << "use cudnn_tensor_op_math";
-    } else if (dtype == CUDNN_DATA_FLOAT && !args.cdesc.allow_tf32_) {
 #if CUDA_VERSION >= 11000
+#if CUDNN_VERSION_MIN(8, 1, 0)
+    } else if (dev_ctx.GetComputeCapability() >= 80 &&
+               dtype == CUDNN_DATA_BFLOAT16) {
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(),
+                                                         CUDNN_TENSOR_OP_MATH));
+      VLOG(5) << "use cudnn_tensor_op_math";
+#endif  // CUDNN_VERSION >= 8100
+    } else if (dtype == CUDNN_DATA_FLOAT && !args.cdesc.allow_tf32_) {
       PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(),
                                                          CUDNN_FMA_MATH));
+      VLOG(5) << "use cudnn_fma_math";
 #endif  // CUDA_VERSION >= 11000
+    } else {
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(),
+                                                         CUDNN_DEFAULT_MATH));
+      VLOG(5) << "use cudnn_default_math";
     }
 #endif
...
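In words, the hunk above replaces the unconditional CUDNN_DEFAULT_MATH call with an explicit selection: FP16 on compute capability >= 7.0 and, newly, BF16 on compute capability >= 8.0 (CUDA 11+, cuDNN 8.1+) use tensor-core math; FP32 with TF32 disabled uses FMA math on CUDA 11+; everything else falls back to cuDNN's default math. A purely illustrative Python restatement of that branch order (the parameter names are not Paddle or cuDNN APIs):

def pick_cudnn_math_mode(dtype, compute_capability, cuda_version, cudnn_version,
                         allow_tf32):
    # Mirrors the compile-time (#if) and run-time checks in the C++ hunk above.
    if compute_capability >= 70 and dtype == "float16":
        return "CUDNN_TENSOR_OP_MATH"
    if (cuda_version >= 11000 and cudnn_version >= 8100 and
            compute_capability >= 80 and dtype == "bfloat16"):
        return "CUDNN_TENSOR_OP_MATH"
    if cuda_version >= 11000 and dtype == "float32" and not allow_tf32:
        return "CUDNN_FMA_MATH"
    return "CUDNN_DEFAULT_MATH"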
...
@@ -1413,6 +1413,31 @@ REGISTER_OP_KERNEL(
     paddle::operators::CUDNNConvDoubleGradOpKernel<float>,
     paddle::operators::CUDNNConvDoubleGradOpKernel<plat::float16>);
 #else
+#if CUDNN_VERSION_MIN(8, 1, 0)
+REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
+                   paddle::operators::CUDNNConvOpKernel<float>,
+                   paddle::operators::CUDNNConvOpKernel<double>,
+                   paddle::operators::CUDNNConvOpKernel<plat::float16>,
+                   paddle::operators::CUDNNConvOpKernel<plat::bfloat16>);
+REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace,
+                   paddle::operators::CUDNNConvGradOpKernel<float>,
+                   paddle::operators::CUDNNConvGradOpKernel<double>,
+                   paddle::operators::CUDNNConvGradOpKernel<plat::float16>,
+                   paddle::operators::CUDNNConvGradOpKernel<plat::bfloat16>);
+REGISTER_OP_KERNEL(
+    conv2d_grad_grad, CUDNN, plat::CUDAPlace,
+    paddle::operators::CUDNNConvDoubleGradOpKernel<float>,
+    paddle::operators::CUDNNConvDoubleGradOpKernel<double>,
+    paddle::operators::CUDNNConvDoubleGradOpKernel<plat::float16>,
+    paddle::operators::CUDNNConvDoubleGradOpKernel<plat::bfloat16>);
+REGISTER_OP_CUDA_KERNEL(
+    depthwise_conv2d_grad_grad,
+    paddle::operators::CUDNNConvDoubleGradOpKernel<float>,
+    paddle::operators::CUDNNConvDoubleGradOpKernel<double>,
+    paddle::operators::CUDNNConvDoubleGradOpKernel<plat::float16>,
+    paddle::operators::CUDNNConvDoubleGradOpKernel<plat::bfloat16>);
+#else
 REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
                    paddle::operators::CUDNNConvOpKernel<float>,
                    paddle::operators::CUDNNConvOpKernel<double>,
...
@@ -1432,6 +1457,7 @@ REGISTER_OP_CUDA_KERNEL(
     paddle::operators::CUDNNConvDoubleGradOpKernel<float>,
     paddle::operators::CUDNNConvDoubleGradOpKernel<double>,
     paddle::operators::CUDNNConvDoubleGradOpKernel<plat::float16>);
+#endif

 REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
                    paddle::operators::CUDNNConvOpKernel<float>,
...
...
@@ -199,6 +199,15 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
                       platform::errors::InvalidArgument(
                           "float16 can only be used when CUDNN is used"));
   }
+#if PADDLE_WITH_CUDA
+  if (input_data_type == framework::proto::VarType::BF16 &&
+      library == framework::LibraryType::kCUDNN) {
+    PADDLE_ENFORCE_GE(
+        platform::CudnnVersion(), 8100,
+        platform::errors::InvalidArgument(
+            "bfloat16 can only be used when CUDNN_VERSION >= 8100"));
+  }
+#endif  // PADDLE_WITH_CUDA
   auto type = framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
                                       library, customized_type_value);
...
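The check added above makes ConvOp reject a BF16 input when the cuDNN build is older than 8.1.0. The same precondition can be probed from Python before building a bf16 model; a minimal sketch using the two core helpers the new unit test already calls (the wrapper function name is ours):

import paddle.fluid.core as core

def cudnn_conv2d_supports_bf16():
    # Matches the C++ guard: a CUDA build linked against cuDNN >= 8.1.0
    # (reported by core.cudnn_version() as 8100).
    return core.is_compiled_with_cuda() and core.cudnn_version() >= 8100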
...
@@ -79,6 +79,11 @@ inline cudnnDataType_t ToCudnnDataType(
     case framework::proto::VarType::FP64:
       type = CUDNN_DATA_DOUBLE;
       break;
+#if CUDNN_VERSION_MIN(8, 1, 0)
+    case framework::proto::VarType::BF16:
+      type = CUDNN_DATA_BFLOAT16;
+      break;
+#endif
     default:
       break;
   }
...
...
@@ -102,6 +102,25 @@ inline ActivationMode StringToActivationMode(const std::string& str) {
 template <typename T>
 class CudnnDataType;

+// CUDNN_DATA_BFLOAT16 is not valid before cudnn8.1
+#if CUDNN_VERSION_MIN(8, 1, 0)
+template <>
+class CudnnDataType<bfloat16> {
+ public:
+  static const cudnnDataType_t type = CUDNN_DATA_BFLOAT16;
+  using ScalingParamType = const float;
+  using BatchNormParamType = float;
+  static ScalingParamType* kOne() {
+    static ScalingParamType v = 1.0;
+    return &v;
+  }
+  static ScalingParamType* kZero() {
+    static ScalingParamType v = 0.0;
+    return &v;
+  }
+};
+#endif
+
 template <>
 class CudnnDataType<float16> {
  public:
...
...
@@ -32,7 +32,8 @@ class TestFusionLSTMBF16ONEDNNOp(OpTest):
     def test_check_output(self):
         for use_seq in {True, False}:
             self.attrs['use_seq'] = use_seq
-            self.check_output(check_dygraph=False, no_check_set=["Cell"])
+            self.check_output(
+                check_dygraph=False, no_check_set=["Cell"], atol=2e-2)

     def setUp(self):
         self.op_type = 'fusion_lstm'
...
...
@@ -1191,7 +1191,9 @@ class OpTest(unittest.TestCase):
                     np.float32, np.float64
             ]:
                 actual_t = convert_uint16_to_float(actual_t)
-                atol = max(atol, 0.03)
+                rtol = 1.e-2
+            else:
+                rtol = 1.e-5

             if expect_t.dtype == np.uint16 and actual_t.dtype == np.uint16:
                 expect_t = convert_uint16_to_float(expect_t)
...
@@ -1204,7 +1206,11 @@ class OpTest(unittest.TestCase):
             self.assertTrue(
                 np.allclose(
-                    actual_t, expect_t, atol=atol, equal_nan=equal_nan),
+                    actual_t,
+                    expect_t,
+                    rtol=rtol,
+                    atol=atol,
+                    equal_nan=equal_nan),
                 "Output (" + out_name + ") has diff at " + str(place) +
                 "\nExpect " + str(expect_t) + "\n" + "But Got" +
                 str(actual_t) + " in class " + self.__class__.__name__)
...
...
@@ -20,7 +20,8 @@ import numpy as np
 import paddle
 import paddle.fluid.core as core
 import paddle.fluid as fluid
-from op_test import OpTest
+from op_test import OpTest, convert_float_to_uint16, get_numeric_gradient
+from paddle.fluid.tests.unittests.testsuite import create_op
 from paddle.fluid import Program, program_guard
...
@@ -167,6 +168,52 @@ def create_test_cudnn_fp16_class(parent, grad_check=True):
     globals()[cls_name] = TestConv2DCUDNNFp16

+
+def create_test_cudnn_bf16_class(parent):
+    @unittest.skipIf(
+        not core.is_compiled_with_cuda() or core.cudnn_version() < 8100,
+        "core is not compiled with CUDA and cudnn version need larger than 8.1.0"
+    )
+    class TestConv2DCUDNNBF16(parent):
+        def get_numeric_grad(self, place, check_name):
+            scope = core.Scope()
+            self._check_grad_helper()
+            op = create_op(scope, self.op_type, self.inputs, self.outputs,
+                           self.attrs)
+            return get_numeric_gradient(place, scope, op, self.inputs_fp32,
+                                        check_name, ['Output'])
+
+        def init_kernel_type(self):
+            self.use_cudnn = True
+            self.no_need_check_grad = True
+            self.dtype = np.uint16
+
+        def test_check_output(self):
+            place = core.CUDAPlace(0)
+            self.check_output_with_place(place, atol=1e-2)
+
+        def test_check_grad_no_filter(self):
+            place = core.CUDAPlace(0)
+            numeric_grads = self.get_numeric_grad(place, 'Input')
+            self.check_grad_with_place(
+                place, ['Input'],
+                'Output',
+                no_grad_set=set(['Filter']),
+                user_defined_grads=[numeric_grads])
+
+        def test_check_grad_no_input(self):
+            place = core.CUDAPlace(0)
+            numeric_grads = self.get_numeric_grad(place, 'Filter')
+            self.check_grad_with_place(
+                place, ['Filter'],
+                'Output',
+                no_grad_set=set(['Input']),
+                user_defined_grads=[numeric_grads])
+
+    cls_name = "{0}_{1}".format(parent.__name__, "CUDNNBF16")
+    TestConv2DCUDNNBF16.__name__ = cls_name
+    globals()[cls_name] = TestConv2DCUDNNBF16
+
+
 def create_test_channel_last_class(parent):
     class TestChannelLastCase(parent):
         def init_data_format(self):
...
@@ -319,7 +366,15 @@ class TestConv2DOp(OpTest):
             'dilation': self.dilations
         }

-        input = np.random.random(self.input_size).astype(self.dtype)
+        if self.is_bfloat16_op():
+            input = np.random.random(self.input_size).astype(np.float32)
+            filter = np.random.uniform(-1, 1,
+                                       self.filter_size).astype(np.float32)
+        else:
+            input = np.random.random(self.input_size).astype(self.dtype)
+            filter = np.random.uniform(-1, 1,
+                                       self.filter_size).astype(self.dtype)
+
         if not self.has_cuda():
             self.fuse_relu_before_depthwise_conv = False
         if self.fuse_relu_before_depthwise_conv:
...
@@ -329,16 +384,27 @@ class TestConv2DOp(OpTest):
                 input2 = np.maximum(input, 0.0)
         else:
             input2 = input
-        filter = np.random.uniform(-1, 1, self.filter_size).astype(self.dtype)
         output, _, _, _, _ = conv2d_forward_naive(input2, filter, self.groups,
                                                   conv2d_param)
-        output = output.astype(self.dtype)

-        self.inputs = {
-            'Input': OpTest.np_dtype_to_fluid_dtype(input),
-            'Filter': OpTest.np_dtype_to_fluid_dtype(filter)
-        }
+        if self.is_bfloat16_op():
+            output = output.astype(np.float32)
+            self.inputs = {
+                'Input': convert_float_to_uint16(input),
+                'Filter': convert_float_to_uint16(filter)
+            }
+            self.inputs_fp32 = {
+                'Input': OpTest.np_dtype_to_fluid_dtype(input),
+                'Filter': OpTest.np_dtype_to_fluid_dtype(filter)
+            }
+        else:
+            output = output.astype(self.dtype)
+            self.inputs = {
+                'Input': OpTest.np_dtype_to_fluid_dtype(input),
+                'Filter': OpTest.np_dtype_to_fluid_dtype(filter)
+            }

         self.attrs = {
             'strides': self.stride,
             'paddings': self.pad,
...
@@ -554,6 +620,15 @@ create_test_cudnn_fp16_class(TestWithGroup, grad_check=False)
 create_test_cudnn_fp16_class(TestWith1x1, grad_check=False)
 create_test_cudnn_fp16_class(TestWithInput1x1Filter1x1, grad_check=False)

+#----------------Conv2DCUDNN bf16----------------
+create_test_cudnn_bf16_class(TestConv2DOp)
+create_test_cudnn_bf16_class(TestWithPad)
+create_test_cudnn_bf16_class(TestWithStride)
+create_test_cudnn_bf16_class(TestWithGroup)
+create_test_cudnn_bf16_class(TestWith1x1)
+create_test_cudnn_bf16_class(TestWithInput1x1Filter1x1)
+
 class TestCUDNNExhaustiveSearch(TestConv2DOp):
     def init_kernel_type(self):
...