From f36a9464bf19b07eb613471b4f39c7c1c756fe47 Mon Sep 17 00:00:00 2001
From: Leo Chen
Date: Fri, 20 May 2022 20:02:11 +0800
Subject: [PATCH] use fp32 compute type for cublasGemmStridedBatchedEx with fp16 input/output (#42851)

* use fp32 compute type for cublasGemmStridedBatchedEx with fp16 input/output
* add flags to control compute type
* default to false
* add unit test
* default to true
---
 paddle/fluid/platform/flags.cc               | 15 ++++
 paddle/phi/kernels/funcs/blas/blas_impl.cu.h | 46 +++++++++++++---
 .../tests/unittests/test_matmul_v2_op.py     | 52 +++++++++++++++++++
 3 files changed, 107 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc
index 054a804e6b3..600a4cbcc3e 100644
--- a/paddle/fluid/platform/flags.cc
+++ b/paddle/fluid/platform/flags.cc
@@ -88,6 +88,21 @@ PADDLE_DEFINE_EXPORTED_bool(
     "input and output must be half precision) and recurrent neural networks "
     "(RNNs).");
 
+/**
+ * CUDA related FLAG
+ * Name: FLAGS_gemm_use_half_precision_compute_type
+ * Since Version: 2.4
+ * Value Range: bool, default=true
+ * Example:
+ * Note: whether to use fp16 compute type when the input and output are fp16,
+ * faster but it may lose precision.
+ */
+PADDLE_DEFINE_EXPORTED_bool(
+    gemm_use_half_precision_compute_type, true,
+    "Whether to use fp16 compute type when the input and output are fp16, "
+    "which is faster but may lose precision. If true, the compute type "
+    "will be fp16; otherwise fp32 is used. Default is true.");
+
 /**
  * CUDA related FLAG
  * Name: FLAGS_selected_gpus
diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h
index cd7a24c6d24..e2b16a1eb7f 100644
--- a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h
+++ b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h
@@ -21,6 +21,7 @@
 #include "paddle/phi/backends/gpu/gpu_context.h"
 
 DECLARE_bool(enable_cublas_tensor_op_math);
+DECLARE_bool(gemm_use_half_precision_compute_type);
 
 namespace phi {
 namespace funcs {
@@ -2255,8 +2256,25 @@ void Blas::BatchedGEMM(
     }
     VLOG(5) << "use_tensor_op_math: "
             << (use_tensor_op_math ? "True" : "False");
+    VLOG(4) << "use_half_precision_compute_type: "
+            << FLAGS_gemm_use_half_precision_compute_type;
 
     auto fp = std::is_same<T, float>::value ? CUDA_R_32F : CUDA_R_16F;
+    cudaDataType_t compute_type = CUDA_R_32F;
+
+    float h_alpha = static_cast<float>(alpha);
+    float h_beta = static_cast<float>(beta);
+    void *a = static_cast<void *>(&h_alpha);
+    void *b = static_cast<void *>(&h_beta);
+    // use fp16 (CUDA_R_16F) compute type only when the flag is enabled
+    if (FLAGS_gemm_use_half_precision_compute_type == true &&
+        std::is_same<T, phi::dtype::float16>::value) {
+      a = static_cast<void *>(&alpha);
+      b = static_cast<void *>(&beta);
+      compute_type = CUDA_R_16F;
+    }
+
+    // otherwise keep the compute type as CUDA_R_32F, for better accuracy
     context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
       PADDLE_ENFORCE_GPU_SUCCESS(
           paddle::platform::dynload::cublasGemmStridedBatchedEx(handle,
@@ -2265,7 +2283,7 @@
                 N,
                 M,
                 K,
-                &alpha,
+                a,
                 B,
                 fp,
                 ldb,
@@ -2274,13 +2292,13 @@
                 fp,
                 lda,
                 strideA,
-                &beta,
+                b,
                 C,
                 fp,
                 ldc,
                 strideC,
                 batchCount,
-                fp,
+                compute_type,
                 algo));
     });
   } else {
@@ -2348,8 +2366,24 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA,
     }
     VLOG(5) << "use_tensor_op_math: "
             << (use_tensor_op_math ? "True" : "False");
+    VLOG(4) << "use_half_precision_compute_type: "
+            << FLAGS_gemm_use_half_precision_compute_type;
 
     auto fp = std::is_same<T, float>::value ? CUDA_R_32F : CUDA_R_16F;
+    cudaDataType_t compute_type = CUDA_R_32F;
+
+    float h_alpha = static_cast<float>(alpha);
+    float h_beta = static_cast<float>(beta);
+    void *a = static_cast<void *>(&h_alpha);
+    void *b = static_cast<void *>(&h_beta);
+    // use fp16 (CUDA_R_16F) compute type only when the flag is enabled
+    if (FLAGS_gemm_use_half_precision_compute_type == true &&
+        std::is_same<T, phi::dtype::float16>::value) {
+      a = static_cast<void *>(&alpha);
+      b = static_cast<void *>(&beta);
+      compute_type = CUDA_R_16F;
+    }
+
     context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
       PADDLE_ENFORCE_GPU_SUCCESS(
           paddle::platform::dynload::cublasGemmStridedBatchedEx(handle,
@@ -2358,7 +2392,7 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA,
                 N,
                 M,
                 K,
-                &alpha,
+                a,
                 B,
                 fp,
                 ldb,
@@ -2367,13 +2401,13 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA,
                 fp,
                 lda,
                 strideA,
-                &beta,
+                b,
                 C,
                 fp,
                 ldc,
                 strideC,
                 batchCount,
-                fp,
+                compute_type,
                 algo));
     });
   } else {
diff --git a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py
index 3e06b69278d..f6f62045b19 100644
--- a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py
@@ -495,6 +495,58 @@ class TestMatMulV2API(unittest.TestCase):
                     y = paddle.to_tensor(input_y)
                     result = paddle.matmul(x, y)
 
+    def test_compute_type_fp32(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                with fluid.dygraph.guard(place):
+                    paddle.set_flags({
+                        'FLAGS_gemm_use_half_precision_compute_type': False
+                    })
+                    input_x = np.random.random([2, 8, 16]).astype("float16")
+                    input_y = np.random.random([2, 16, 8]).astype("float16")
+                    for i in range(0, 16, 2):
+                        input_x[:, :, i] += 60000
+                        input_x[:, :, i + 1] -= 60000
+                    input_y[:, :, :] = 1.5
+
+                    x = paddle.to_tensor(input_x)
+                    y = paddle.to_tensor(input_y)
+                    result = paddle.matmul(x, y)
+                    result_np = np.matmul(input_x, input_y)
+                    self.assertTrue(paddle.isfinite(result)[0, 0, 0])
+                    self.assertTrue(np.isfinite(result_np)[0, 0, 0])
+                    self.assertTrue(np.array_equal(result_np, result.numpy()))
+                    paddle.set_flags({
+                        'FLAGS_gemm_use_half_precision_compute_type': True
+                    })
+
+    def test_compute_type_fp16_nan(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                with fluid.dygraph.guard(place):
+                    paddle.set_flags({
+                        'FLAGS_gemm_use_half_precision_compute_type': True
+                    })
+                    input_x = np.random.random([2, 8, 16]).astype("float16")
+                    input_y = np.random.random([2, 16, 8]).astype("float16")
+                    for i in range(0, 16, 2):
+                        input_x[:, :, i] += 60000
+                        input_x[:, :, i + 1] -= 60000
+                    input_y[:, :, :] = 1.5
+
+                    x = paddle.to_tensor(input_x)
+                    y = paddle.to_tensor(input_y)
+                    result = paddle.matmul(x, y)
+                    result_np = np.matmul(input_x, input_y)
+                    self.assertFalse(
+                        paddle.isfinite(result)[0, 0, 0])  # contains nan/inf
+                    self.assertTrue(np.isfinite(result_np)[0, 0, 0])
+                    paddle.set_flags({
+                        'FLAGS_gemm_use_half_precision_compute_type': False
+                    })
+
     def test_api_eager_dygraph(self):
         with _test_eager_guard():
             self.test_dygraph()
-- 
GitLab
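
A minimal usage sketch of the flag introduced above, mirroring the unit test in the patch. It assumes a CUDA build of Paddle that includes this change (2.4+) with float16 support; the tensor shapes are illustrative only.

    import numpy as np
    import paddle

    # Request fp32 accumulation for fp16 batched matmul (the accuracy-preserving
    # path). The flag defaults to True, which selects the fp16 compute type.
    paddle.set_flags({'FLAGS_gemm_use_half_precision_compute_type': False})

    x = paddle.to_tensor(np.random.random([2, 8, 16]).astype("float16"))
    y = paddle.to_tensor(np.random.random([2, 16, 8]).astype("float16"))
    out = paddle.matmul(x, y)  # batched fp16 GEMM via cublasGemmStridedBatchedEx
    print(paddle.isfinite(out))

With the flag left at its default (True), the same matmul uses the fp16 compute type, which is faster but, as the second unit test shows, can overflow to inf/nan for inputs near the fp16 range limit.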