Add tf32 support for A100 tensor core acceleration for cuBLAS (#28732)

efea540c · AshburnLee · GitHub · 7779768b · efea540c · efea540c
5 changed file
--- a/paddle/fluid/platform/cuda_helper.h
+++ b/paddle/fluid/platform/cuda_helper.h
@@ -84,8 +84,13 @@ class CublasHandleHolder {
    if (math_type == CUBLAS_TENSOR_OP_MATH) {
      PADDLE_ENFORCE_CUDA_SUCCESS(
          dynload::cublasSetMathMode(handle_, CUBLAS_TENSOR_OP_MATH));
+#if CUDA_VERSION >= 11000
+    } else if (math_type == CUBLAS_TF32_TENSOR_OP_MATH) {
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          dynload::cublasSetMathMode(handle_, CUBLAS_TF32_TENSOR_OP_MATH));
+#endif  // CUDA_VERSION >= 11000
    }
-#endif
+#endif  // CUDA_VERSION >= 9000
  }
  ~CublasHandleHolder() PADDLE_MAY_THROW {

--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -54,6 +54,12 @@ AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) {
 namespace paddle {
 namespace platform {
+#ifdef PADDLE_WITH_CUDA
+bool allow_tf32_cublas = true;
+void SetAllowTF32Cublas(bool active) { allow_tf32_cublas = active; }
+bool AllowTF32Cublas() { return allow_tf32_cublas; }
+#endif  // PADDLE_WITH_CUDA
 DeviceContextPool* DeviceContextPool::pool = nullptr;
 platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {

--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -57,6 +57,13 @@ struct GpuDevice;
 namespace paddle {
 namespace platform {
+#ifdef PADDLE_WITH_CUDA
+/*Set the value of the global variable allow_tf32_cublas*/
+void SetAllowTF32Cublas(bool active);
+/*Get the global variable allow_tf32_cublas value*/
+bool AllowTF32Cublas();
+#endif  // PADDLE_WITH_CUDA
 class DeviceContext {
 public:
  virtual ~DeviceContext() PADDLE_MAY_THROW {}
@@ -161,7 +168,11 @@ class CUDAContext {
  /*! \brief  Call cublas function safely. */
  template <typename Callback>
  inline void CublasCall(Callback&& callback) const {
-    cublas_handle_->Call(std::forward<Callback>(callback));
+    if (cublas_tf32_tensor_core_handle_) {
+      cublas_tf32_tensor_core_handle_->Call(std::forward<Callback>(callback));
+    } else {
+      cublas_handle_->Call(std::forward<Callback>(callback));
+    }
  }
  /*! \brief  Check whether tensor core is supported */
@@ -188,7 +199,11 @@ class CUDAContext {
 #if CUDA_VERSION >= 9000
      cublas_tensor_core_handle_.reset(
          new CublasHandleHolder(RawStream(), CUBLAS_TENSOR_OP_MATH));
-#endif
+#if CUDA_VERSION >= 11000
+      cublas_tf32_tensor_core_handle_.reset(
+          new CublasHandleHolder(RawStream(), CUBLAS_TF32_TENSOR_OP_MATH));
+#endif  // CUDA_VERSION >= 11000
+#endif  // CUDA_VERSION >= 9000
    }
  }
@@ -231,6 +246,7 @@ class CUDAContext {
  void DestoryCuBlasContext() {
    cublas_handle_.reset();
    cublas_tensor_core_handle_.reset();
+    cublas_tf32_tensor_core_handle_.reset();
  }
  void DestoryCuSolverContext() {
@@ -247,6 +263,7 @@ class CUDAContext {
  cudnnHandle_t cudnn_handle_;
  std::unique_ptr<CublasHandleHolder> cublas_handle_;
  std::unique_ptr<CublasHandleHolder> cublas_tensor_core_handle_;
+  std::unique_ptr<CublasHandleHolder> cublas_tf32_tensor_core_handle_;
  cusolverDnHandle_t cusolver_dn_handle_;
  DISABLE_COPY_AND_ASSIGN(CUDAContext);
 };

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -58,6 +58,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/py_func_op.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/init.h"
@@ -1980,6 +1981,11 @@ All parameter, weight, gradient are variables in Paddle.
  m.def("size_of_dtype", framework::SizeOfType);
+#ifdef PADDLE_WITH_CUDA
+  m.def("set_cublas_switch", platform::SetAllowTF32Cublas);
+  m.def("get_cublas_switch", platform::AllowTF32Cublas);
+#endif  // PADDLE_WITH_CUDA
  using VarQuantScale =
      std::unordered_map<std::string, std::pair<bool, LoDTensor>>;

--- a/python/paddle/fluid/tests/unittests/test_tf32_cublas.py
+++ b/python/paddle/fluid/tests/unittests/test_tf32_cublas.py
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import six
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+class TestTF32Switch(unittest.TestCase):
+    def test_on_off(self):
+        if core.is_compiled_with_cuda():
+            place = fluid.CUDAPlace(0)
+            self.assertTrue(core.get_cublas_switch())  # default
+            core.set_cublas_switch(False)
+            self.assertFalse(core.get_cublas_switch())  # turn off
+            core.set_cublas_switch(True)
+            self.assertTrue(core.get_cublas_switch())  # turn on
+            core.set_cublas_switch(True)  # restore the switch
+        else:
+            pass
+class TestTF32OnMatmul(unittest.TestCase):
+    def test_dygraph_without_out(self):
+        if core.is_compiled_with_cuda():
+            place = fluid.CUDAPlace(0)
+            core.set_cublas_switch(False)  # turn off
+            with fluid.dygraph.guard(place):
+                input_array1 = np.random.rand(4, 12, 64, 88).astype("float32")
+                input_array2 = np.random.rand(4, 12, 88, 512).astype("float32")
+                data1 = paddle.to_tensor(input_array1)
+                data2 = paddle.to_tensor(input_array2)
+                out = paddle.matmul(data1, data2)
+                expected_result = np.matmul(input_array1, input_array2)
+            self.assertTrue(np.allclose(expected_result, out.numpy(), 1e-03))
+            core.set_cublas_switch(True)  # restore the switch
+        else:
+            pass
+if __name__ == '__main__':
+    unittest.main()