From 187ba08789619a5a92b67d457dbed165c1b086af Mon Sep 17 00:00:00 2001
From: Kexin Zhao
Date: Tue, 3 Apr 2018 17:50:55 -0700
Subject: [PATCH] enable tensor core for conv cudnn

---
 paddle/fluid/operators/conv_cudnn_op.cu.cc | 22 ++++++++++++++++++++++
 paddle/fluid/platform/cudnn_helper.h       |  4 +++-
 paddle/fluid/platform/dynload/cudnn.h      |  4 ++--
 3 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc
index a32aba4c1f..c70e3cc3c9 100644
--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
@@ -128,10 +128,32 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
         handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
         cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
         workspace_size_limit, &algo));
+
+#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
+    // Tensor Core is supported since the Volta GPU and
+    // is only enabled when input and filter data are float16
+    if (dev_ctx.GetComputeCapability() >= 70 &&
+        std::type_index(typeid(T)) ==
+            std::type_index(typeid(platform::float16))) {
+      PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
+          cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
+      // Currently Tensor Core is only enabled when using this algo
+      algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
+    } else {
+      PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
+          cudnn_conv_desc, CUDNN_DEFAULT_MATH));
+    }
+#endif
+
     // get workspace size able to allocate
     PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
         handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
         cudnn_output_desc, algo, &workspace_size_in_bytes));
+    // It is possible for float16 on a Volta GPU to allocate more memory than
+    // the limit because the algo is overridden to use Tensor Core.
+    PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
+                      "workspace_size to be allocated exceeds the limit");
+
     // Allocate on GPU memory
     platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
     cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h
index 7c604e14eb..c0d399d078 100644
--- a/paddle/fluid/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
@@ -257,9 +257,11 @@ class ScopedConvolutionDescriptor {
     }
 #endif
 
+    cudnnDataType_t compute_type =
+        (type == CUDNN_DATA_DOUBLE) ? CUDNN_DATA_DOUBLE : CUDNN_DATA_FLOAT;
     PADDLE_ENFORCE(dynload::cudnnSetConvolutionNdDescriptor(
         desc_, pads.size(), pads.data(), strides.data(), dilations.data(),
-        CUDNN_CROSS_CORRELATION, type));
+        CUDNN_CROSS_CORRELATION, compute_type));
     return desc_;
   }
 
diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h
index 81acc445bd..3f91bd2fe4 100644
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -16,7 +16,6 @@ limitations under the License. */
 
 #include <cudnn.h>
 #include 
-#include 
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
 
 namespace paddle {
@@ -140,7 +139,8 @@ CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 
 #if CUDNN_VERSION >= 7001
 #define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \
-  __macro(cudnnSetConvolutionGroupCount);
+  __macro(cudnnSetConvolutionGroupCount); \
+  __macro(cudnnSetConvolutionMathType);
 CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 #endif
-- 
GitLab
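
Note (sketch, not part of the patch): outside the Paddle tree, the math-type
selection in conv_cudnn_op.cu.cc reduces to the standalone C++ sketch below.
Only cudaGetDeviceProperties and cudnnSetConvolutionMathType are real library
calls; ConfigureMathType and input_is_float16 are illustrative names.

    #include <cuda_runtime.h>
    #include <cudnn.h>

    // Request Tensor Core math only on Volta+ (compute capability 7.0) with
    // float16 data; otherwise reset to the default FMA path. The explicit
    // reset matters because convolution descriptors may be cached and reused.
    cudnnStatus_t ConfigureMathType(cudnnConvolutionDescriptor_t conv_desc,
                                    bool input_is_float16) {
    #if CUDNN_VERSION >= 7001
      cudaDeviceProp prop;
      if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) {
        return CUDNN_STATUS_INTERNAL_ERROR;
      }
      // prop.major * 10 + prop.minor mirrors GetComputeCapability() >= 70.
      if (prop.major * 10 + prop.minor >= 70 && input_is_float16) {
        return cudnnSetConvolutionMathType(conv_desc, CUDNN_TENSOR_OP_MATH);
      }
      return cudnnSetConvolutionMathType(conv_desc, CUDNN_DEFAULT_MATH);
    #else
      return CUDNN_STATUS_SUCCESS;  // cuDNN < 7.0.1 has no math-type switch.
    #endif
    }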
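
Note (sketch): the new PADDLE_ENFORCE_LE exists because
cudnnGetConvolutionForwardWorkspaceSize is not bounded by the limit that was
passed to the algorithm search; once the algo is overridden to
IMPLICIT_PRECOMP_GEMM, the query can come back larger. A hedged restatement
of that check, where QueryWorkspaceOrThrow is a made-up helper:

    #include <cudnn.h>

    #include <stdexcept>

    size_t QueryWorkspaceOrThrow(cudnnHandle_t handle,
                                 cudnnTensorDescriptor_t input_desc,
                                 cudnnFilterDescriptor_t filter_desc,
                                 cudnnConvolutionDescriptor_t conv_desc,
                                 cudnnTensorDescriptor_t output_desc,
                                 cudnnConvolutionFwdAlgo_t algo,
                                 size_t workspace_limit) {
      size_t workspace_bytes = 0;
      // The size query takes the algo as given; it knows nothing about the
      // limit used earlier by cudnnGetConvolutionForwardAlgorithm.
      if (cudnnGetConvolutionForwardWorkspaceSize(
              handle, input_desc, filter_desc, conv_desc, output_desc, algo,
              &workspace_bytes) != CUDNN_STATUS_SUCCESS) {
        throw std::runtime_error("workspace size query failed");
      }
      if (workspace_bytes > workspace_limit) {
        // Fail fast instead of silently over-allocating GPU memory.
        throw std::runtime_error("workspace to be allocated exceeds the limit");
      }
      return workspace_bytes;
    }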
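
Note (sketch): the cudnn_helper.h change relies on the fact that the dataType
argument of cudnnSetConvolutionNdDescriptor selects the compute (accumulation)
type, not the tensor data type. Mapping everything except double to float
means float16 tensors accumulate in fp32, which preserves precision and stays
compatible with CUDNN_TENSOR_OP_MATH. ConvComputeType is an illustrative name
for the same mapping:

    #include <cudnn.h>

    inline cudnnDataType_t ConvComputeType(cudnnDataType_t tensor_type) {
      // Passing CUDNN_DATA_HALF here would request true fp16 accumulation,
      // which is unsupported or slow on pre-Volta parts and costs accuracy.
      return tensor_type == CUDNN_DATA_DOUBLE ? CUDNN_DATA_DOUBLE
                                              : CUDNN_DATA_FLOAT;
    }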
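
Note (sketch): the dynload/cudnn.h hunk only appends cudnnSetConvolutionMathType
to the R7 wrapper list. For context, the mechanism behind
DECLARE_DYNAMIC_LOAD_CUDNN_WRAP is roughly the dlopen/dlsym pattern below,
much simplified: the real macro caches the handle via std::call_once and can
fall back to the statically linked symbol. LoadCudnnSymbol and SetMathType
are made-up names.

    #include <dlfcn.h>

    #include <cudnn.h>

    template <typename Func>
    Func LoadCudnnSymbol(const char* name) {
      // One process-wide handle; real code guards this with std::call_once.
      static void* handle = dlopen("libcudnn.so", RTLD_LAZY | RTLD_GLOBAL);
      return handle ? reinterpret_cast<Func>(dlsym(handle, name)) : nullptr;
    }

    using SetMathTypeFn = cudnnStatus_t (*)(cudnnConvolutionDescriptor_t,
                                            cudnnMathType_t);

    inline cudnnStatus_t SetMathType(cudnnConvolutionDescriptor_t desc,
                                     cudnnMathType_t math_type) {
      static auto fn =
          LoadCudnnSymbol<SetMathTypeFn>("cudnnSetConvolutionMathType");
      return fn ? fn(desc, math_type) : CUDNN_STATUS_NOT_INITIALIZED;
    }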