diff --git a/paddle/fluid/platform/cuda_helper.h b/paddle/fluid/platform/cuda_helper.h index 6b3f91d52057ed804a61d1e72867bc30c19afbd9..d6da830c9c4c7b53cba0d9054ef1a93a73b8f9f5 100644 --- a/paddle/fluid/platform/cuda_helper.h +++ b/paddle/fluid/platform/cuda_helper.h @@ -84,8 +84,13 @@ class CublasHandleHolder { if (math_type == CUBLAS_TENSOR_OP_MATH) { PADDLE_ENFORCE_CUDA_SUCCESS( dynload::cublasSetMathMode(handle_, CUBLAS_TENSOR_OP_MATH)); +#if CUDA_VERSION >= 11000 + } else if (math_type == CUBLAS_TF32_TENSOR_OP_MATH) { + PADDLE_ENFORCE_CUDA_SUCCESS( + dynload::cublasSetMathMode(handle_, CUBLAS_TF32_TENSOR_OP_MATH)); +#endif // CUDA_VERSION >= 11000 } -#endif +#endif // CUDA_VERSION >= 9000 } ~CublasHandleHolder() PADDLE_MAY_THROW { diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 297466e8e5a624359406c5551941ceaa73e5c5c5..beb1db93f483e9b224acf6d40c6df9fb3c4ece9c 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -54,6 +54,12 @@ AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) { namespace paddle { namespace platform { +#ifdef PADDLE_WITH_CUDA +bool allow_tf32_cublas = true; +void SetAllowTF32Cublas(bool active) { allow_tf32_cublas = active; } +bool AllowTF32Cublas() { return allow_tf32_cublas; } +#endif // PADDLE_WITH_CUDA + DeviceContextPool* DeviceContextPool::pool = nullptr; platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) { diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 56438a95f2a8907bfb13bd192a9eb30e5082b4be..f0ce89aa5efd86b5f6b11a04388acb8d4166e302 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -57,6 +57,13 @@ struct GpuDevice; namespace paddle { namespace platform { +#ifdef PADDLE_WITH_CUDA +/*Set the value of the global variable allow_tf32_cublas*/ +void SetAllowTF32Cublas(bool active); +/*Get the global variable allow_tf32_cublas value*/ +bool AllowTF32Cublas(); +#endif // PADDLE_WITH_CUDA + class DeviceContext { public: virtual ~DeviceContext() PADDLE_MAY_THROW {} @@ -161,7 +168,11 @@ class CUDAContext { /*! \brief Call cublas function safely. */ template inline void CublasCall(Callback&& callback) const { - cublas_handle_->Call(std::forward(callback)); + if (cublas_tf32_tensor_core_handle_) { + cublas_tf32_tensor_core_handle_->Call(std::forward(callback)); + } else { + cublas_handle_->Call(std::forward(callback)); + } } /*! \brief Check whether tensor core is supported */ @@ -188,7 +199,11 @@ class CUDAContext { #if CUDA_VERSION >= 9000 cublas_tensor_core_handle_.reset( new CublasHandleHolder(RawStream(), CUBLAS_TENSOR_OP_MATH)); -#endif +#if CUDA_VERSION >= 11000 + cublas_tf32_tensor_core_handle_.reset( + new CublasHandleHolder(RawStream(), CUBLAS_TF32_TENSOR_OP_MATH)); +#endif // CUDA_VERSION >= 11000 +#endif // CUDA_VERSION >= 9000 } } @@ -231,6 +246,7 @@ class CUDAContext { void DestoryCuBlasContext() { cublas_handle_.reset(); cublas_tensor_core_handle_.reset(); + cublas_tf32_tensor_core_handle_.reset(); } void DestoryCuSolverContext() { @@ -247,6 +263,7 @@ class CUDAContext { cudnnHandle_t cudnn_handle_; std::unique_ptr cublas_handle_; std::unique_ptr cublas_tensor_core_handle_; + std::unique_ptr cublas_tf32_tensor_core_handle_; cusolverDnHandle_t cusolver_dn_handle_; DISABLE_COPY_AND_ASSIGN(CUDAContext); }; diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 9930acff00ad72b8aa19aa30701fdb6d44cfd60b..44b5614b9a1a119f885d0064c6c6413214239b8e 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -58,6 +58,7 @@ limitations under the License. */ #include "paddle/fluid/operators/py_func_op.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/init.h" @@ -1980,6 +1981,11 @@ All parameter, weight, gradient are variables in Paddle. m.def("size_of_dtype", framework::SizeOfType); +#ifdef PADDLE_WITH_CUDA + m.def("set_cublas_switch", platform::SetAllowTF32Cublas); + m.def("get_cublas_switch", platform::AllowTF32Cublas); +#endif // PADDLE_WITH_CUDA + using VarQuantScale = std::unordered_map>; diff --git a/python/paddle/fluid/tests/unittests/test_tf32_cublas.py b/python/paddle/fluid/tests/unittests/test_tf32_cublas.py new file mode 100644 index 0000000000000000000000000000000000000000..32d8c3dc322e49102150985dcaaeb27d05ea1a44 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_tf32_cublas.py @@ -0,0 +1,57 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import six +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core + + +class TestTF32Switch(unittest.TestCase): + def test_on_off(self): + if core.is_compiled_with_cuda(): + place = fluid.CUDAPlace(0) + self.assertTrue(core.get_cublas_switch()) # default + core.set_cublas_switch(False) + self.assertFalse(core.get_cublas_switch()) # turn off + core.set_cublas_switch(True) + self.assertTrue(core.get_cublas_switch()) # turn on + + core.set_cublas_switch(True) # restore the switch + else: + pass + + +class TestTF32OnMatmul(unittest.TestCase): + def test_dygraph_without_out(self): + if core.is_compiled_with_cuda(): + place = fluid.CUDAPlace(0) + core.set_cublas_switch(False) # turn off + with fluid.dygraph.guard(place): + input_array1 = np.random.rand(4, 12, 64, 88).astype("float32") + input_array2 = np.random.rand(4, 12, 88, 512).astype("float32") + data1 = paddle.to_tensor(input_array1) + data2 = paddle.to_tensor(input_array2) + out = paddle.matmul(data1, data2) + expected_result = np.matmul(input_array1, input_array2) + self.assertTrue(np.allclose(expected_result, out.numpy(), 1e-03)) + core.set_cublas_switch(True) # restore the switch + else: + pass + + +if __name__ == '__main__': + unittest.main()