diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc
index a32aba4c1ff2f5e775aeb41f25b02322dbc6a64a..c70e3cc3c9198008d9eca5f462000aa67ff7e5ba 100644
--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
@@ -128,10 +128,32 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
         handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
         cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
         workspace_size_limit, &algo));
+
+#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
+    // Tensor Core is supported since the Volta GPU and
+    // is only enabled when input and filter data are float16
+    if (dev_ctx.GetComputeCapability() >= 70 &&
+        std::type_index(typeid(T)) ==
+            std::type_index(typeid(platform::float16))) {
+      PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
+          cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
+      // Currently Tensor Core is only enabled using this algo
+      algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
+    } else {
+      PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
+          cudnn_conv_desc, CUDNN_DEFAULT_MATH));
+    }
+#endif
+
     // get workspace size able to allocate
     PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
         handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
         cudnn_output_desc, algo, &workspace_size_in_bytes));
+    // It is possible for float16 on a Volta GPU to allocate more memory than
+    // the limit because the algo is overridden to use Tensor Core.
+    PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
+                      "workspace_size to be allocated exceeds the limit");
+
     // Allocate on GPU memory
     platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
     cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h
index 7c604e14eb245232ed92f53a00b9bde45c2fbaec..c0d399d078f73743836fc2a0c1d4b1e6b31ecd83 100644
--- a/paddle/fluid/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
@@ -257,9 +257,11 @@ class ScopedConvolutionDescriptor {
     }
 #endif
 
+    cudnnDataType_t compute_type =
+        (type == CUDNN_DATA_DOUBLE) ? CUDNN_DATA_DOUBLE : CUDNN_DATA_FLOAT;
     PADDLE_ENFORCE(dynload::cudnnSetConvolutionNdDescriptor(
         desc_, pads.size(), pads.data(), strides.data(), dilations.data(),
-        CUDNN_CROSS_CORRELATION, type));
+        CUDNN_CROSS_CORRELATION, compute_type));
     return desc_;
   }
diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h
index 81acc445bd3803dede158ff09507a72fb6e293ac..49a54d8478e9a4e507d31a67b924802def356bfa 100644
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #include <cudnn.h>
 #include <dlfcn.h>
-#include <mutex>
+#include <mutex>  // NOLINT
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
 
 namespace paddle {
@@ -140,7 +140,8 @@ CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 
 #if CUDNN_VERSION >= 7001
 #define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \
-  __macro(cudnnSetConvolutionGroupCount);
+  __macro(cudnnSetConvolutionGroupCount); \
+  __macro(cudnnSetConvolutionMathType);
 CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 #endif
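
For reference, below is a minimal standalone sketch of the pattern this patch implements, written directly against the raw cuDNN v7 / CUDA 9 API rather than Paddle's dynload wrappers: opt in to CUDNN_TENSOR_OP_MATH on Volta-class devices (compute capability >= 70), fall back to CUDNN_DEFAULT_MATH otherwise, and only then query the forward workspace size. The tensor shapes, the CHECK_CUDNN macro, the file name, and the use of device 0 are illustrative assumptions, not part of the PR.

// tensor_core_conv.cc -- build with something like: nvcc tensor_core_conv.cc -lcudnn
#include <cudnn.h>
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

#define CHECK_CUDNN(call)                                           \
  do {                                                              \
    cudnnStatus_t s = (call);                                       \
    if (s != CUDNN_STATUS_SUCCESS) {                                \
      fprintf(stderr, "cuDNN error: %s\n", cudnnGetErrorString(s)); \
      exit(1);                                                      \
    }                                                               \
  } while (0)

int main() {
  cudnnHandle_t handle;
  CHECK_CUDNN(cudnnCreate(&handle));

  cudnnTensorDescriptor_t x_desc, y_desc;
  cudnnFilterDescriptor_t w_desc;
  cudnnConvolutionDescriptor_t conv_desc;
  CHECK_CUDNN(cudnnCreateTensorDescriptor(&x_desc));
  CHECK_CUDNN(cudnnCreateTensorDescriptor(&y_desc));
  CHECK_CUDNN(cudnnCreateFilterDescriptor(&w_desc));
  CHECK_CUDNN(cudnnCreateConvolutionDescriptor(&conv_desc));

  // float16 input and filter; a 3x3 conv with pad 1 and stride 1 keeps the
  // spatial size, so the output is also 1x64x56x56.
  CHECK_CUDNN(cudnnSetTensor4dDescriptor(x_desc, CUDNN_TENSOR_NCHW,
                                         CUDNN_DATA_HALF, 1, 64, 56, 56));
  CHECK_CUDNN(cudnnSetFilter4dDescriptor(w_desc, CUDNN_DATA_HALF,
                                         CUDNN_TENSOR_NCHW, 64, 64, 3, 3));
  CHECK_CUDNN(cudnnSetTensor4dDescriptor(y_desc, CUDNN_TENSOR_NCHW,
                                         CUDNN_DATA_HALF, 1, 64, 56, 56));
  // As in the cudnn_helper.h change: float16 convolutions accumulate in
  // float, not half.
  CHECK_CUDNN(cudnnSetConvolution2dDescriptor(
      conv_desc, /*pad*/ 1, 1, /*stride*/ 1, 1, /*dilation*/ 1, 1,
      CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT));

  cudnnConvolutionFwdAlgo_t algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, 0);
  if (prop.major * 10 + prop.minor >= 70) {
    // Volta or newer: enable Tensor Core math and force the one forward
    // algo the patch enables it for.
    CHECK_CUDNN(cudnnSetConvolutionMathType(conv_desc, CUDNN_TENSOR_OP_MATH));
    algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
  } else {
    CHECK_CUDNN(cudnnSetConvolutionMathType(conv_desc, CUDNN_DEFAULT_MATH));
  }

  // The math type must be set before this query: the Tensor Core path may
  // need a different (possibly larger) workspace than the default path.
  size_t workspace_bytes = 0;
  CHECK_CUDNN(cudnnGetConvolutionForwardWorkspaceSize(
      handle, x_desc, w_desc, conv_desc, y_desc, algo, &workspace_bytes));
  printf("forward workspace: %zu bytes\n", workspace_bytes);

  cudnnDestroyConvolutionDescriptor(conv_desc);
  cudnnDestroyFilterDescriptor(w_desc);
  cudnnDestroyTensorDescriptor(y_desc);
  cudnnDestroyTensorDescriptor(x_desc);
  cudnnDestroy(handle);
  return 0;
}

The ordering is the point of the sketch, and it also explains the new PADDLE_ENFORCE_LE in the patch: cudnnGetConvolutionForwardAlgorithm picked an algo under workspace_size_limit, but overriding that choice with IMPLICIT_PRECOMP_GEMM plus Tensor Core math voids the guarantee, so the workspace size returned afterwards has to be checked against the limit explicitly.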