From 187ba08789619a5a92b67d457dbed165c1b086af Mon Sep 17 00:00:00 2001
From: Kexin Zhao
Date: Tue, 3 Apr 2018 17:50:55 -0700
Subject: [PATCH] enable tensor core for conv cudnn

---
 paddle/fluid/operators/conv_cudnn_op.cu.cc | 22 ++++++++++++++++++++++
 paddle/fluid/platform/cudnn_helper.h       |  4 +++-
 paddle/fluid/platform/dynload/cudnn.h      |  4 ++--
 3 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc
index a32aba4c1f..c70e3cc3c9 100644
--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
@@ -128,10 +128,32 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
         handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
         cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
         workspace_size_limit, &algo));
+
+#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
+    // Tensor Core is supported since the Volta GPU and
+    // is only enabled when input and filter data are float16
+    if (dev_ctx.GetComputeCapability() >= 70 &&
+        std::type_index(typeid(T)) ==
+            std::type_index(typeid(platform::float16))) {
+      PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
+          cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
+      // Currently Tensor Core is only enabled when using this algo
+      algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
+    } else {
+      PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
+          cudnn_conv_desc, CUDNN_DEFAULT_MATH));
+    }
+#endif
+
     // get workspace size able to allocate
     PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
         handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
         cudnn_output_desc, algo, &workspace_size_in_bytes));
+    // It is possible for float16 on a Volta GPU to allocate more memory than
+    // the limit because the algo is overridden to use Tensor Core.
+    PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
+                      "workspace_size to be allocated exceeds the limit");
+
     // Allocate on GPU memory
     platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
     cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h
index 7c604e14eb..c0d399d078 100644
--- a/paddle/fluid/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
@@ -257,9 +257,11 @@ class ScopedConvolutionDescriptor {
     }
 #endif
 
+    cudnnDataType_t compute_type =
+        (type == CUDNN_DATA_DOUBLE) ? CUDNN_DATA_DOUBLE : CUDNN_DATA_FLOAT;
     PADDLE_ENFORCE(dynload::cudnnSetConvolutionNdDescriptor(
         desc_, pads.size(), pads.data(), strides.data(), dilations.data(),
-        CUDNN_CROSS_CORRELATION, type));
+        CUDNN_CROSS_CORRELATION, compute_type));
     return desc_;
   }
 
diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h
index 81acc445bd..3f91bd2fe4 100644
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -16,7 +16,6 @@ limitations under the License. */
 
 #include <cudnn.h>
 #include 
-#include 
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
 
 namespace paddle {
@@ -140,7 +139,8 @@ CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 
 #if CUDNN_VERSION >= 7001
 #define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \
-  __macro(cudnnSetConvolutionGroupCount);
+  __macro(cudnnSetConvolutionGroupCount); \
+  __macro(cudnnSetConvolutionMathType);
 CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 #endif
-- 
GitLab
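
Note (sketch, not part of the patch): outside the Paddle tree, the math-type
selection in conv_cudnn_op.cu.cc reduces to the standalone C++ sketch below.
Only cudaGetDeviceProperties and cudnnSetConvolutionMathType are real library
calls; ConfigureMathType and input_is_float16 are illustrative names.

    #include <cuda_runtime.h>
    #include <cudnn.h>

    // Request Tensor Core math only on Volta+ (compute capability 7.0) with
    // float16 data; otherwise reset to the default FMA path. The explicit
    // reset matters because convolution descriptors may be cached and reused.
    cudnnStatus_t ConfigureMathType(cudnnConvolutionDescriptor_t conv_desc,
                                    bool input_is_float16) {
    #if CUDNN_VERSION >= 7001
      cudaDeviceProp prop;
      if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) {
        return CUDNN_STATUS_INTERNAL_ERROR;
      }
      // prop.major * 10 + prop.minor mirrors GetComputeCapability() >= 70.
      if (prop.major * 10 + prop.minor >= 70 && input_is_float16) {
        return cudnnSetConvolutionMathType(conv_desc, CUDNN_TENSOR_OP_MATH);
      }
      return cudnnSetConvolutionMathType(conv_desc, CUDNN_DEFAULT_MATH);
    #else
      return CUDNN_STATUS_SUCCESS;  // cuDNN < 7.0.1 has no math-type switch.
    #endif
    }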
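
Note (sketch): the new PADDLE_ENFORCE_LE exists because
cudnnGetConvolutionForwardWorkspaceSize is not bounded by the limit that was
passed to the algorithm search; once the algo is overridden to
IMPLICIT_PRECOMP_GEMM, the query can come back larger. A hedged restatement
of that check, where QueryWorkspaceOrThrow is a made-up helper:

    #include <cudnn.h>

    #include <stdexcept>

    size_t QueryWorkspaceOrThrow(cudnnHandle_t handle,
                                 cudnnTensorDescriptor_t input_desc,
                                 cudnnFilterDescriptor_t filter_desc,
                                 cudnnConvolutionDescriptor_t conv_desc,
                                 cudnnTensorDescriptor_t output_desc,
                                 cudnnConvolutionFwdAlgo_t algo,
                                 size_t workspace_limit) {
      size_t workspace_bytes = 0;
      // The size query takes the algo as given; it knows nothing about the
      // limit used earlier by cudnnGetConvolutionForwardAlgorithm.
      if (cudnnGetConvolutionForwardWorkspaceSize(
              handle, input_desc, filter_desc, conv_desc, output_desc, algo,
              &workspace_bytes) != CUDNN_STATUS_SUCCESS) {
        throw std::runtime_error("workspace size query failed");
      }
      if (workspace_bytes > workspace_limit) {
        // Fail fast instead of silently over-allocating GPU memory.
        throw std::runtime_error("workspace to be allocated exceeds the limit");
      }
      return workspace_bytes;
    }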
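
Note (sketch): the cudnn_helper.h change relies on the fact that the dataType
argument of cudnnSetConvolutionNdDescriptor selects the compute (accumulation)
type, not the tensor data type. Mapping everything except double to float
means float16 tensors accumulate in fp32, which preserves precision and stays
compatible with CUDNN_TENSOR_OP_MATH. ConvComputeType is an illustrative name
for the same mapping:

    #include <cudnn.h>

    inline cudnnDataType_t ConvComputeType(cudnnDataType_t tensor_type) {
      // Passing CUDNN_DATA_HALF here would request true fp16 accumulation,
      // which is unsupported or slow on pre-Volta parts and costs accuracy.
      return tensor_type == CUDNN_DATA_DOUBLE ? CUDNN_DATA_DOUBLE
                                              : CUDNN_DATA_FLOAT;
    }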
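
Note (sketch): the dynload/cudnn.h hunk only appends cudnnSetConvolutionMathType
to the R7 wrapper list. For context, the mechanism behind
DECLARE_DYNAMIC_LOAD_CUDNN_WRAP is roughly the dlopen/dlsym pattern below,
much simplified: the real macro caches the handle via std::call_once and can
fall back to the statically linked symbol. LoadCudnnSymbol and SetMathType
are made-up names.

    #include <dlfcn.h>

    #include <cudnn.h>

    template <typename Func>
    Func LoadCudnnSymbol(const char* name) {
      // One process-wide handle; real code guards this with std::call_once.
      static void* handle = dlopen("libcudnn.so", RTLD_LAZY | RTLD_GLOBAL);
      return handle ? reinterpret_cast<Func>(dlsym(handle, name)) : nullptr;
    }

    using SetMathTypeFn = cudnnStatus_t (*)(cudnnConvolutionDescriptor_t,
                                            cudnnMathType_t);

    inline cudnnStatus_t SetMathType(cudnnConvolutionDescriptor_t desc,
                                     cudnnMathType_t math_type) {
      static auto fn =
          LoadCudnnSymbol<SetMathTypeFn>("cudnnSetConvolutionMathType");
      return fn ? fn(desc, math_type) : CUDNN_STATUS_NOT_INITIALIZED;
    }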