diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h
index fe0150cca521976e1ff473ec3d83704657f16339..82c8aa50afc024de5e61944108f925b24f5830cf 100644
--- a/paddle/fluid/operators/conv_cudnn_helper.h
+++ b/paddle/fluid/operators/conv_cudnn_helper.h
@@ -210,16 +210,20 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgo_t> {
 #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
+        args.cdesc.desc(), CUDNN_DEFAULT_MATH));
+    VLOG(5) << "NOT use cudnn_tensor_op_math";
     if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
       PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(),
                                                          CUDNN_TENSOR_OP_MATH));
       VLOG(5) << "use cudnn_tensor_op_math";
-    } else {
+    } else if (dtype == CUDNN_DATA_FLOAT && !args.cdesc.allow_tf32_) {
+#if CUDA_VERSION >= 11000
       PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(),
-                                                         CUDNN_DEFAULT_MATH));
-      VLOG(5) << "NOT use cudnn_tensor_op_math";
+                                                         CUDNN_FMA_MATH));
+#endif  // CUDA_VERSION >= 11000
     }
 #endif
@@ -340,16 +344,20 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgo_t> {
     algo_t algo;
 #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
+        args.cdesc.desc(), CUDNN_DEFAULT_MATH));
+    VLOG(5) << "NOT use cudnn_tensor_op_math";
     if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
       PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(),
                                                          CUDNN_TENSOR_OP_MATH));
       VLOG(5) << "use cudnn_tensor_op_math";
-    } else {
+    } else if (dtype == CUDNN_DATA_FLOAT && !args.cdesc.allow_tf32_) {
+#if CUDA_VERSION >= 11000
       PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(),
-                                                         CUDNN_DEFAULT_MATH));
-      VLOG(5) << "NOT use cudnn_tensor_op_math";
+                                                         CUDNN_FMA_MATH));
+#endif  // CUDA_VERSION >= 11000
     }
 #endif
@@ -485,16 +493,20 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgo_t> {
 #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
+        args.cdesc.desc(), CUDNN_DEFAULT_MATH));
+    VLOG(5) << "NOT use cudnn_tensor_op_math";
     if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
       PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(),
                                                          CUDNN_TENSOR_OP_MATH));
       VLOG(5) << "use cudnn_tensor_op_math";
-    } else {
+    } else if (dtype == CUDNN_DATA_FLOAT && !args.cdesc.allow_tf32_) {
+#if CUDA_VERSION >= 11000
       PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(),
-                                                         CUDNN_DEFAULT_MATH));
-      VLOG(5) << "NOT use cudnn_tensor_op_math";
+                                                         CUDNN_FMA_MATH));
+#endif  // CUDA_VERSION >= 11000
     }
 #endif
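
The three hunks above apply one policy to each algorithm-search
specialization: reset the descriptor to CUDNN_DEFAULT_MATH, upgrade to
CUDNN_TENSOR_OP_MATH for FP16 on compute capability 7.0 and above, and drop
to CUDNN_FMA_MATH for FP32 when the TF32 switch is off (CUDNN_FMA_MATH only
exists from cuDNN 8 / CUDA 11, hence the inner version guard). As a minimal
sketch, the selection reduces to the following; the Python function and its
argument names are illustrative, not part of the patch:

    def choose_conv_math_type(dtype, compute_capability, allow_tf32,
                              cuda_version):
        # Mirrors the C++ hunks above; returned names are cuDNN constants.
        if compute_capability >= 70 and dtype == "half":
            return "CUDNN_TENSOR_OP_MATH"  # FP16 Tensor Cores
        if dtype == "float" and not allow_tf32 and cuda_version >= 11000:
            return "CUDNN_FMA_MATH"  # strict FP32; TF32 disabled
        return "CUDNN_DEFAULT_MATH"  # cuDNN may use TF32 on Ampere
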
diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu
index 5f469e6a0f5276c0666750d6a8be2e25fc64f33c..5ef22b81869f6aa7dd2fdea19529033e457d082a 100644
--- a/paddle/fluid/operators/conv_cudnn_op.cu
+++ b/paddle/fluid/operators/conv_cudnn_op.cu
@@ -240,7 +240,8 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     auto layout_format = GetCudnnTensorFormat(layout);
 
     args.handle = handle;
-    args.cdesc.set(dtype, padding_common, strides, dilations);
+    args.cdesc.set(dtype, padding_common, strides, dilations,
+                   platform::AllowTF32Cudnn());
 
 #if CUDNN_VERSION_MIN(7, 0, 1)
     // cudnn 7 can support groups, no need to do it manually
@@ -603,7 +604,8 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
       args1.idesc.set(transformed_input_grad, layout_tensor);
       args1.wdesc.set(transformed_filter_channel, layout_tensor, iwo_groups);
       args1.odesc.set(transformed_output_grad_channel, layout_tensor);
-      args1.cdesc.set(dtype, padding_common, strides, dilations, c_groups);
+      args1.cdesc.set(dtype, padding_common, strides, dilations,
+                      platform::AllowTF32Cudnn(), c_groups);
 
       using search1 = SearchAlgorithm<cudnnConvolutionBwdDataAlgo_t>;
       data_algo =
@@ -620,7 +622,8 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
       args2.wdesc.set(transformed_filter_grad_channel, layout_tensor,
                      iwo_groups);
       args2.odesc.set(transformed_output_grad_channel, layout_tensor);
-      args2.cdesc.set(dtype, padding_common, strides, dilations, c_groups);
+      args2.cdesc.set(dtype, padding_common, strides, dilations,
+                      platform::AllowTF32Cudnn(), c_groups);
 
       using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgo_t>;
       filter_algo =
@@ -980,7 +983,8 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
         args1.idesc.set(transformed_ddX, iwo_group);
         args1.wdesc.set(*W, layout, iwo_group);
         args1.odesc.set(transformed_ddO_channel, iwo_group);
-        args1.cdesc.set(dtype, padding_common, strides, dilations, c_group);
+        args1.cdesc.set(dtype, padding_common, strides, dilations,
+                        platform::AllowTF32Cudnn(), c_group);
 
         using search1 = SearchAlgorithm<cudnnConvolutionFwdAlgo_t>;
         fwd_algo1 = search1::Find<T>(args1, exhaustive_search, false, ctx);
@@ -995,7 +999,8 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
         args2.wdesc.set(*ddW, layout, iwo_group);
         args2.odesc.set(transformed_ddO_channel, iwo_group);
-        args2.cdesc.set(dtype, padding_common, strides, dilations, c_group);
+        args2.cdesc.set(dtype, padding_common, strides, dilations,
+                        platform::AllowTF32Cudnn(), c_group);
 
         using search2 = SearchAlgorithm<cudnnConvolutionFwdAlgo_t>;
         fwd_algo2 = search2::Find<T>(args2, exhaustive_search, false, ctx);
@@ -1012,7 +1017,8 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
         args3.odesc.set(transformed_dO_channel, iwo_group);
 
-        args3.cdesc.set(dtype, padding_common, strides, dilations, c_group);
+        args3.cdesc.set(dtype, padding_common, strides, dilations,
+                        platform::AllowTF32Cudnn(), c_group);
 
         using search3 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgo_t>;
         filter_algo =
@@ -1028,7 +1034,8 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
         args4.idesc.set(transformed_dX, iwo_group);
         args4.wdesc.set(*ddW, layout, iwo_group);
         args4.odesc.set(transformed_dO_channel, iwo_group);
-        args4.cdesc.set(dtype, padding_common, strides, dilations, c_group);
+        args4.cdesc.set(dtype, padding_common, strides, dilations,
+                        platform::AllowTF32Cudnn(), c_group);
 
         using search4 = SearchAlgorithm<cudnnConvolutionBwdDataAlgo_t>;
         data_algo =
diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu b/paddle/fluid/operators/conv_transpose_cudnn_op.cu
index 94148109c7369fa15572e3e9d27912c82cdb150e..a12629b7a4959179d176f24c26a42af7aad3a277 100644
--- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu
+++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu
@@ -232,7 +232,8 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
     args.idesc.set(transformed_output, iwo_groups);
     args.wdesc.set(*filter, layout_tensor, iwo_groups);
     args.odesc.set(transformed_input, iwo_groups);
-    args.cdesc.set(dtype, padding_common, strides, dilations, c_groups);
+    args.cdesc.set(dtype, padding_common, strides, dilations,
+                   platform::AllowTF32Cudnn(), c_groups);
 
     using search = SearchAlgorithm<cudnnConvolutionBwdDataAlgo_t>;
     algo = search::Find<T>(args, false, deterministic, ctx);
@@ -468,7 +469,8 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
       args1.idesc.set(transformed_output_grad, iwo_groups);
      args1.wdesc.set(*filter, layout_tensor, iwo_groups);
       args1.odesc.set(input_transpose, iwo_groups);
-      args1.cdesc.set(dtype, padding_common, strides, dilations, c_groups);
+      args1.cdesc.set(dtype, padding_common, strides, dilations,
+                      platform::AllowTF32Cudnn(), c_groups);
       using search1 = SearchAlgorithm<cudnnConvolutionFwdAlgo_t>;
       data_algo = search1::Find<T>(args1, false, deterministic, ctx);
       workspace_size =
@@ -481,7 +483,8 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
       args2.idesc.set(transformed_output_grad, iwo_groups);
       args2.wdesc.set(*filter_grad, layout_tensor, iwo_groups);
       args2.odesc.set(input_transpose, iwo_groups);
-      args2.cdesc.set(dtype, padding_common, strides, dilations, c_groups);
+      args2.cdesc.set(dtype, padding_common, strides, dilations,
+                      platform::AllowTF32Cudnn(), c_groups);
       using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgo_t>;
       filter_algo = search2::Find<T>(args2, false, deterministic, ctx);
       workspace_size = std::max(workspace_size,
diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu
index 49fded886a0339a0456ee55d0d4d1249461f93b9..33d408582ff48504ed7fce2950934fcd43cabc90 100644
--- a/paddle/fluid/operators/fused/conv_fusion_op.cu
+++ b/paddle/fluid/operators/fused/conv_fusion_op.cu
@@ -200,6 +200,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
         cudnn_conv_desc, CUDNN_DEFAULT_MATH));
+#if CUDA_VERSION >= 11000
+    if (!platform::AllowTF32Cudnn()) {
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnSetConvolutionMathType(cudnn_conv_desc,
+                                                         CUDNN_FMA_MATH));
+    }
+#endif  // CUDA_VERSION >= 11000
 
     auto x_dims = framework::vectorize(transformed_input.dims());
     auto f_dims = framework::vectorize(filter->dims());
diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu
index 3529ff1f94aab259661640925f5096890dd95566..c448c529f569158835020eec78d9092845247cdc 100644
--- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu
+++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu
@@ -153,6 +153,13 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
       PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnSetConvolutionMathType(conv_desc[i],
                                                          CUDNN_DEFAULT_MATH));
+#if CUDA_VERSION >= 11000
+      if (!platform::AllowTF32Cudnn()) {
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            platform::dynload::cudnnSetConvolutionMathType(conv_desc[i],
+                                                           CUDNN_FMA_MATH));
+      }
+#endif  // CUDA_VERSION >= 11000
     }
 
     in_dims[2][1] *= 2;
     in_strides[2][0] = oc * h * w;
diff --git a/paddle/fluid/platform/cudnn_desc.h b/paddle/fluid/platform/cudnn_desc.h
index 0e0218dcca3fc4d7ea661fbcfe89d260a4c93a2d..05a431e731e32c2b36f0aebfa11cb95f2607929c 100644
--- a/paddle/fluid/platform/cudnn_desc.h
+++ b/paddle/fluid/platform/cudnn_desc.h
@@ -24,6 +24,7 @@
 #include <vector>
 
 #include "paddle/fluid/platform/cudnn_helper.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace framework {
@@ -229,7 +230,8 @@ class ConvolutionDescriptor {
   void set(cudnnDataType_t dtype, const std::vector<int>& pads,
            const std::vector<int>& strides, const std::vector<int>& dilations,
-           const int groups = 1) {
+           bool allow_tf32, const int groups = 1) {
+    allow_tf32_ = allow_tf32;
     cudnnDataType_t compute_type =
         (dtype == CUDNN_DATA_DOUBLE) ? CUDNN_DATA_DOUBLE : CUDNN_DATA_FLOAT;
     T* desc = desc_.get();
@@ -246,11 +248,18 @@
       PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnSetConvolutionMathType(desc,
                                                          CUDNN_TENSOR_OP_MATH));
+    } else if (dtype == CUDNN_DATA_FLOAT && !allow_tf32) {
+#if CUDA_VERSION >= 11000
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnSetConvolutionMathType(desc, CUDNN_FMA_MATH));
+#endif  // CUDA_VERSION >= 11000
     }
 #endif
 #endif
   }
 
+  bool allow_tf32_;
+
  private:
   std::unique_ptr<T, Deleter> desc_;
 };
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 8aa67c877ab58cf2f01b34b792eeb905b8995cd2..57c5ccefaee855eb609f01ef0ac861be2cfe2f18 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -74,6 +74,10 @@ namespace platform {
 bool allow_tf32_cublas = true;
 void SetAllowTF32Cublas(bool active) { allow_tf32_cublas = active; }
 bool AllowTF32Cublas() { return allow_tf32_cublas; }
+
+bool allow_tf32_cudnn = true;
+void SetAllowTF32Cudnn(bool active) { allow_tf32_cudnn = active; }
+bool AllowTF32Cudnn() { return allow_tf32_cudnn; }
 #endif  // PADDLE_WITH_CUDA
 
 DeviceContextPool* DeviceContextPool::pool = nullptr;
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index 4e79e645aaae12c563f5ceb82fdd85ec6416aac5..f058da97b5cfa2358873dea6e3efec997fb40dff 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -67,6 +67,10 @@ namespace platform {
 void SetAllowTF32Cublas(bool active);
 /*Get the global variable allow_tf32_cublas value*/
 bool AllowTF32Cublas();
+/*Set the value of the global variable allow_tf32_cudnn*/
+void SetAllowTF32Cudnn(bool active);
+/*Get the global variable allow_tf32_cudnn value*/
+bool AllowTF32Cudnn();
 #endif  // PADDLE_WITH_CUDA
 
 enum DeviceType {
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 58145f72487e3fe26c75a43fed842fd54adccfce..5f4c5fd2c30a453eca30d431d4a3fcdcf70da5b4 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1988,6 +1988,8 @@ All parameter, weight, gradient are variables in Paddle.
 #ifdef PADDLE_WITH_CUDA
   m.def("set_cublas_switch", platform::SetAllowTF32Cublas);
   m.def("get_cublas_switch", platform::AllowTF32Cublas);
+  m.def("set_cudnn_switch", platform::SetAllowTF32Cudnn);
+  m.def("get_cudnn_switch", platform::AllowTF32Cudnn);
 #endif  // PADDLE_WITH_CUDA
 
 using VarQuantScale =
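
The two bindings above expose the switch to Python whenever Paddle is built
with CUDA. A hedged usage sketch; the surrounding workload is illustrative:

    import paddle.fluid.core as core

    # TF32 for cuDNN convolutions is allowed by default; turn it off to
    # force full-FP32 math, e.g. while debugging a numerical regression.
    core.set_cudnn_switch(0)
    assert not core.get_cudnn_switch()

    # ... run the convolution workload here ...

    core.set_cudnn_switch(1)  # restore the default
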
diff --git a/python/paddle/fluid/tests/unittests/test_tf32_cudnn.py b/python/paddle/fluid/tests/unittests/test_tf32_cudnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..48127c2a90b49f66b60ba5830ac858c73b218cd8
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_tf32_cudnn.py
@@ -0,0 +1,38 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import six
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+
+
+class TestTF32Switch(unittest.TestCase):
+    def test_on_off(self):
+        if core.is_compiled_with_cuda():
+            self.assertTrue(core.get_cudnn_switch())  # default
+            core.set_cudnn_switch(0)
+            self.assertFalse(core.get_cudnn_switch())  # turn off
+            core.set_cudnn_switch(1)
+            self.assertTrue(core.get_cudnn_switch())  # turn on
+
+            core.set_cudnn_switch(1)  # restore the switch
+        else:
+            pass
+
+
+if __name__ == '__main__':
+    unittest.main()
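
The test above only exercises the getter and setter; the numeric effect of
the switch is not observable without an Ampere GPU. A possible end-to-end
check is sketched below. It is an assumption-laden sketch, not part of the
patch: it assumes the paddle.nn.functional.conv2d API of the Paddle 2.x
line, and the tolerance is illustrative.

    import numpy as np
    import paddle
    import paddle.fluid.core as core

    x = paddle.rand([1, 3, 32, 32])
    w = paddle.rand([8, 3, 3, 3])

    core.set_cudnn_switch(0)  # strict FP32 (CUDNN_FMA_MATH)
    y_fp32 = paddle.nn.functional.conv2d(x, w).numpy()

    core.set_cudnn_switch(1)  # allow TF32 (CUDNN_DEFAULT_MATH)
    y_tf32 = paddle.nn.functional.conv2d(x, w).numpy()

    # Identical on pre-Ampere GPUs; on Ampere, TF32's 10-bit mantissa
    # permits roughly 1e-3 relative deviation.
    np.testing.assert_allclose(y_fp32, y_tf32, rtol=1e-3, atol=1e-3)
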