From 5981bee2af22e58b48fafb2d69b1c4e243bb1227 Mon Sep 17 00:00:00 2001 From: wuhuanzhou Date: Wed, 2 Jun 2021 10:30:58 +0800 Subject: [PATCH] conv2d support bfloat16 (#32221) --- paddle/fluid/operators/conv_cudnn_helper.h | 19 +++- paddle/fluid/operators/conv_cudnn_op.cu | 26 ++++++ paddle/fluid/operators/conv_op.cc | 9 ++ paddle/fluid/platform/cudnn_desc.h | 5 + paddle/fluid/platform/cudnn_helper.h | 19 ++++ .../mkldnn/test_fusion_lstm_bf16_mkldnn_op.py | 3 +- .../paddle/fluid/tests/unittests/op_test.py | 10 +- .../fluid/tests/unittests/test_conv2d_op.py | 91 +++++++++++++++++-- 8 files changed, 167 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index c7eac903a8c..c6cd45dc18b 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -211,20 +211,31 @@ struct SearchAlgorithm { #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) auto& dev_ctx = ctx.template device_context(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( - args.cdesc.desc(), CUDNN_DEFAULT_MATH)); - VLOG(5) << "NOT use cudnn_tensor_op_math"; if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) { PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(), CUDNN_TENSOR_OP_MATH)); VLOG(5) << "use cudnn_tensor_op_math"; - } else if (dtype == CUDNN_DATA_FLOAT && !args.cdesc.allow_tf32_) { #if CUDA_VERSION >= 11000 +#if CUDNN_VERSION_MIN(8, 1, 0) + } else if (dev_ctx.GetComputeCapability() >= 80 && + dtype == CUDNN_DATA_BFLOAT16) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(), + CUDNN_TENSOR_OP_MATH)); + VLOG(5) << "use cudnn_tensor_op_math"; +#endif // CUDNN_VERSION >= 8100 + } else if (dtype == CUDNN_DATA_FLOAT && !args.cdesc.allow_tf32_) { PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(), CUDNN_FMA_MATH)); + VLOG(5) << "use cudnn_fma_math"; #endif // CUDA_VERSION >= 11000 + } else { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(), + CUDNN_DEFAULT_MATH)); + VLOG(5) << "use cudnn_default_math"; } #endif diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu index 7fdb1ccfe96..c49a3ee1c20 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu +++ b/paddle/fluid/operators/conv_cudnn_op.cu @@ -1413,6 +1413,31 @@ REGISTER_OP_KERNEL( paddle::operators::CUDNNConvDoubleGradOpKernel, paddle::operators::CUDNNConvDoubleGradOpKernel); #else +#if CUDNN_VERSION_MIN(8, 1, 0) +REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace, + paddle::operators::CUDNNConvOpKernel, + paddle::operators::CUDNNConvOpKernel, + paddle::operators::CUDNNConvOpKernel, + paddle::operators::CUDNNConvOpKernel); +REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace, + paddle::operators::CUDNNConvGradOpKernel, + paddle::operators::CUDNNConvGradOpKernel, + paddle::operators::CUDNNConvGradOpKernel, + paddle::operators::CUDNNConvGradOpKernel); +REGISTER_OP_KERNEL( + conv2d_grad_grad, CUDNN, plat::CUDAPlace, + paddle::operators::CUDNNConvDoubleGradOpKernel, + paddle::operators::CUDNNConvDoubleGradOpKernel, + paddle::operators::CUDNNConvDoubleGradOpKernel, + paddle::operators::CUDNNConvDoubleGradOpKernel); + +REGISTER_OP_CUDA_KERNEL( + depthwise_conv2d_grad_grad, + paddle::operators::CUDNNConvDoubleGradOpKernel, + paddle::operators::CUDNNConvDoubleGradOpKernel, + 
paddle::operators::CUDNNConvDoubleGradOpKernel, + paddle::operators::CUDNNConvDoubleGradOpKernel); +#else REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace, paddle::operators::CUDNNConvOpKernel, paddle::operators::CUDNNConvOpKernel, @@ -1432,6 +1457,7 @@ REGISTER_OP_CUDA_KERNEL( paddle::operators::CUDNNConvDoubleGradOpKernel, paddle::operators::CUDNNConvDoubleGradOpKernel, paddle::operators::CUDNNConvDoubleGradOpKernel); +#endif REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace, paddle::operators::CUDNNConvOpKernel, diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 17ce109610b..1266cfe6081 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -199,6 +199,15 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( platform::errors::InvalidArgument( "float16 can only be used when CUDNN is used")); } +#if PADDLE_WITH_CUDA + if (input_data_type == framework::proto::VarType::BF16 && + library == framework::LibraryType::kCUDNN) { + PADDLE_ENFORCE_GE( + platform::CudnnVersion(), 8100, + platform::errors::InvalidArgument( + "bfloat16 can only be used when CUDNN_VERSION >= 8100")); + } +#endif // PADDLE_WITH_CUDA auto type = framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, library, customized_type_value); diff --git a/paddle/fluid/platform/cudnn_desc.h b/paddle/fluid/platform/cudnn_desc.h index 05a431e731e..8e969588afb 100644 --- a/paddle/fluid/platform/cudnn_desc.h +++ b/paddle/fluid/platform/cudnn_desc.h @@ -79,6 +79,11 @@ inline cudnnDataType_t ToCudnnDataType( case framework::proto::VarType::FP64: type = CUDNN_DATA_DOUBLE; break; +#if CUDNN_VERSION_MIN(8, 1, 0) + case framework::proto::VarType::BF16: + type = CUDNN_DATA_BFLOAT16; + break; +#endif default: break; } diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h index 0d2a770ad82..65dd69a37d3 100644 --- a/paddle/fluid/platform/cudnn_helper.h +++ b/paddle/fluid/platform/cudnn_helper.h @@ -102,6 +102,25 @@ inline ActivationMode StringToActivationMode(const std::string& str) { template class CudnnDataType; +// CUDNN_DATA_BFLOAT16 is not valid before cudnn8.1 +#if CUDNN_VERSION_MIN(8, 1, 0) +template <> +class CudnnDataType { + public: + static const cudnnDataType_t type = CUDNN_DATA_BFLOAT16; + using ScalingParamType = const float; + using BatchNormParamType = float; + static ScalingParamType* kOne() { + static ScalingParamType v = 1.0; + return &v; + } + static ScalingParamType* kZero() { + static ScalingParamType v = 0.0; + return &v; + } +}; +#endif + template <> class CudnnDataType { public: diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_lstm_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_lstm_bf16_mkldnn_op.py index 46bdbb1a420..d65919aa434 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_lstm_bf16_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_lstm_bf16_mkldnn_op.py @@ -32,7 +32,8 @@ class TestFusionLSTMBF16ONEDNNOp(OpTest): def test_check_output(self): for use_seq in {True, False}: self.attrs['use_seq'] = use_seq - self.check_output(check_dygraph=False, no_check_set=["Cell"]) + self.check_output( + check_dygraph=False, no_check_set=["Cell"], atol=2e-2) def setUp(self): self.op_type = 'fusion_lstm' diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 654723d8629..9bf4d09cc36 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ 
b/python/paddle/fluid/tests/unittests/op_test.py @@ -1191,7 +1191,9 @@ class OpTest(unittest.TestCase): np.float32, np.float64 ]: actual_t = convert_uint16_to_float(actual_t) - atol = max(atol, 0.03) + rtol = 1.e-2 + else: + rtol = 1.e-5 if expect_t.dtype == np.uint16 and actual_t.dtype == np.uint16: expect_t = convert_uint16_to_float(expect_t) @@ -1204,7 +1206,11 @@ class OpTest(unittest.TestCase): self.assertTrue( np.allclose( - actual_t, expect_t, atol=atol, equal_nan=equal_nan), + actual_t, + expect_t, + rtol=rtol, + atol=atol, + equal_nan=equal_nan), "Output (" + out_name + ") has diff at " + str(place) + "\nExpect " + str(expect_t) + "\n" + "But Got" + str(actual_t) + " in class " + self.__class__.__name__) diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py index e55997c229e..db05801c722 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py @@ -20,7 +20,8 @@ import numpy as np import paddle import paddle.fluid.core as core import paddle.fluid as fluid -from op_test import OpTest +from op_test import OpTest, convert_float_to_uint16, get_numeric_gradient +from paddle.fluid.tests.unittests.testsuite import create_op from paddle.fluid import Program, program_guard @@ -167,6 +168,52 @@ def create_test_cudnn_fp16_class(parent, grad_check=True): globals()[cls_name] = TestConv2DCUDNNFp16 +def create_test_cudnn_bf16_class(parent): + @unittest.skipIf( + not core.is_compiled_with_cuda() or core.cudnn_version() < 8100, + "core is not compiled with CUDA and cudnn version need larger than 8.1.0" + ) + class TestConv2DCUDNNBF16(parent): + def get_numeric_grad(self, place, check_name): + scope = core.Scope() + self._check_grad_helper() + op = create_op(scope, self.op_type, self.inputs, self.outputs, + self.attrs) + return get_numeric_gradient(place, scope, op, self.inputs_fp32, + check_name, ['Output']) + + def init_kernel_type(self): + self.use_cudnn = True + self.no_need_check_grad = True + self.dtype = np.uint16 + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place, atol=1e-2) + + def test_check_grad_no_filter(self): + place = core.CUDAPlace(0) + numeric_grads = self.get_numeric_grad(place, 'Input') + self.check_grad_with_place( + place, ['Input'], + 'Output', + no_grad_set=set(['Filter']), + user_defined_grads=[numeric_grads]) + + def test_check_grad_no_input(self): + place = core.CUDAPlace(0) + numeric_grads = self.get_numeric_grad(place, 'Filter') + self.check_grad_with_place( + place, ['Filter'], + 'Output', + no_grad_set=set(['Input']), + user_defined_grads=[numeric_grads]) + + cls_name = "{0}_{1}".format(parent.__name__, "CUDNNBF16") + TestConv2DCUDNNBF16.__name__ = cls_name + globals()[cls_name] = TestConv2DCUDNNBF16 + + def create_test_channel_last_class(parent): class TestChannelLastCase(parent): def init_data_format(self): @@ -319,7 +366,15 @@ class TestConv2DOp(OpTest): 'dilation': self.dilations } - input = np.random.random(self.input_size).astype(self.dtype) + if self.is_bfloat16_op(): + input = np.random.random(self.input_size).astype(np.float32) + filter = np.random.uniform(-1, 1, + self.filter_size).astype(np.float32) + else: + input = np.random.random(self.input_size).astype(self.dtype) + filter = np.random.uniform(-1, 1, + self.filter_size).astype(self.dtype) + if not self.has_cuda(): self.fuse_relu_before_depthwise_conv = False if self.fuse_relu_before_depthwise_conv: @@ -329,16 +384,27 @@ 
class TestConv2DOp(OpTest): input2 = np.maximum(input, 0.0) else: input2 = input - filter = np.random.uniform(-1, 1, self.filter_size).astype(self.dtype) output, _, _, _, _ = conv2d_forward_naive(input2, filter, self.groups, conv2d_param) - output = output.astype(self.dtype) - self.inputs = { - 'Input': OpTest.np_dtype_to_fluid_dtype(input), - 'Filter': OpTest.np_dtype_to_fluid_dtype(filter) - } + if self.is_bfloat16_op(): + output = output.astype(np.float32) + self.inputs = { + 'Input': convert_float_to_uint16(input), + 'Filter': convert_float_to_uint16(filter) + } + self.inputs_fp32 = { + 'Input': OpTest.np_dtype_to_fluid_dtype(input), + 'Filter': OpTest.np_dtype_to_fluid_dtype(filter) + } + else: + output = output.astype(self.dtype) + self.inputs = { + 'Input': OpTest.np_dtype_to_fluid_dtype(input), + 'Filter': OpTest.np_dtype_to_fluid_dtype(filter) + } + self.attrs = { 'strides': self.stride, 'paddings': self.pad, @@ -554,6 +620,15 @@ create_test_cudnn_fp16_class(TestWithGroup, grad_check=False) create_test_cudnn_fp16_class(TestWith1x1, grad_check=False) create_test_cudnn_fp16_class(TestWithInput1x1Filter1x1, grad_check=False) +#----------------Conv2DCUDNN bf16---------------- + +create_test_cudnn_bf16_class(TestConv2DOp) +create_test_cudnn_bf16_class(TestWithPad) +create_test_cudnn_bf16_class(TestWithStride) +create_test_cudnn_bf16_class(TestWithGroup) +create_test_cudnn_bf16_class(TestWith1x1) +create_test_cudnn_bf16_class(TestWithInput1x1Filter1x1) + class TestCUDNNExhaustiveSearch(TestConv2DOp): def init_kernel_type(self): -- GitLab
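
Editor's note on the relaxed tolerances introduced above (atol=1e-2 in check_output_with_place, rtol=1e-2 in op_test.py): bfloat16 keeps only the sign bit, the 8-bit exponent, and 7 mantissa bits of an IEEE-754 float32, i.e. roughly 2-3 significant decimal digits. That is why the new test class stores inputs as uint16 bit patterns via convert_float_to_uint16 while keeping float32 copies in self.inputs_fp32 for the numeric-gradient reference. The snippet below is a minimal, standalone NumPy sketch of that truncation round trip; the function names (float32_to_bfloat16_bits, bfloat16_bits_to_float32) are illustrative stand-ins, not the actual helpers imported from op_test.

import numpy as np

def float32_to_bfloat16_bits(x):
    # Keep the upper 16 bits of each float32 (sign + exponent + 7 mantissa
    # bits); the result is a uint16 array, matching the dtype the bf16
    # tests feed to the operator.
    x = np.ascontiguousarray(x, dtype=np.float32)
    return np.right_shift(x.view(np.uint32), np.uint32(16)).astype(np.uint16)

def bfloat16_bits_to_float32(b):
    # Zero-fill the discarded lower 16 bits to recover a float32 value.
    b = np.asarray(b, dtype=np.uint16)
    return np.left_shift(b.astype(np.uint32), np.uint32(16)).view(np.float32)

if __name__ == "__main__":
    x = np.random.random((2, 3, 5, 5)).astype(np.float32)
    x_rt = bfloat16_bits_to_float32(float32_to_bfloat16_bits(x))
    # For values in [0, 1) the truncation error is at most about 4e-3,
    # which motivates rtol=1e-2 / atol=1e-2 rather than the fp32 defaults.
    print(np.max(np.abs(x - x_rt)))

The same precision argument explains the user_defined_grads path in create_test_cudnn_bf16_class: gradients are checked against numeric gradients computed from the float32 copies of the inputs, since finite differences on bf16-quantized data would be dominated by rounding noise.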