diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc
index 054a804e6b38e840ac0c9890a1c6f5ebcdb19341..600a4cbcc3ed9969e021beaa2bda1ff23c89bb3b 100644
--- a/paddle/fluid/platform/flags.cc
+++ b/paddle/fluid/platform/flags.cc
@@ -88,6 +88,21 @@ PADDLE_DEFINE_EXPORTED_bool(
     "input and output must be half precision) and recurrent neural networks "
     "(RNNs).");
 
+/**
+ * CUDA related FLAG
+ * Name: FLAGS_gemm_use_half_precision_compute_type
+ * Since Version: 2.4
+ * Value Range: bool, default=true
+ * Example:
+ * Note: whether to use fp16 as the compute type when both input and output
+ * are fp16, which is faster but may lose precision.
+ */
+PADDLE_DEFINE_EXPORTED_bool(
+    gemm_use_half_precision_compute_type, true,
+    "Whether to use fp16 as the compute type when both input and output are "
+    "fp16, which is faster but may lose precision in some cases. If false, "
+    "the compute type stays fp32 for better accuracy. Default is true.");
+
 /**
  * CUDA related FLAG
  * Name: FLAGS_selected_gpus
diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h
index cd7a24c6d242160ad0ac220c60df5b5bc12caf5a..e2b16a1eb7ff115d416aec351035d4288baf4bf7 100644
--- a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h
+++ b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h
@@ -21,6 +21,7 @@
 #include "paddle/phi/backends/gpu/gpu_context.h"
 
 DECLARE_bool(enable_cublas_tensor_op_math);
+DECLARE_bool(gemm_use_half_precision_compute_type);
 
 namespace phi {
 namespace funcs {
@@ -2255,8 +2256,25 @@ void Blas::BatchedGEMM(
     }
     VLOG(5) << "use_tensor_op_math: "
             << (use_tensor_op_math ? "True" : "False");
+    VLOG(4) << "use_half_precision_compute_type: "
+            << FLAGS_gemm_use_half_precision_compute_type;
 
     auto fp = std::is_same<T, float>::value ? CUDA_R_32F : CUDA_R_16F;
+    cudaDataType_t compute_type = CUDA_R_32F;
+
+    float h_alpha = static_cast<float>(alpha);
+    float h_beta = static_cast<float>(beta);
+    void *a = static_cast<void *>(&h_alpha);
+    void *b = static_cast<void *>(&h_beta);
+    // use the fp16 compute type only when the flag is enabled and T is fp16
+    if (FLAGS_gemm_use_half_precision_compute_type &&
+        std::is_same<T, phi::dtype::float16>::value) {
+      a = static_cast<void *>(&alpha);
+      b = static_cast<void *>(&beta);
+      compute_type = CUDA_R_16F;
+    }
+
+    // otherwise the compute type stays CUDA_R_32F, for better accuracy
     context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
       PADDLE_ENFORCE_GPU_SUCCESS(
           paddle::platform::dynload::cublasGemmStridedBatchedEx(handle,
@@ -2265,7 +2283,7 @@
                                                                 N,
                                                                 M,
                                                                 K,
-                                                                &alpha,
+                                                                a,
                                                                 B,
                                                                 fp,
                                                                 ldb,
@@ -2274,13 +2292,13 @@
                                                                 fp,
                                                                 lda,
                                                                 strideA,
-                                                                &beta,
+                                                                b,
                                                                 C,
                                                                 fp,
                                                                 ldc,
                                                                 strideC,
                                                                 batchCount,
-                                                                fp,
+                                                                compute_type,
                                                                 algo));
     });
   } else {
@@ -2348,8 +2366,24 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA,
     }
     VLOG(5) << "use_tensor_op_math: "
             << (use_tensor_op_math ? "True" : "False");
+    VLOG(4) << "use_half_precision_compute_type: "
+            << FLAGS_gemm_use_half_precision_compute_type;
 
     auto fp = std::is_same<T, float>::value ? CUDA_R_32F : CUDA_R_16F;
+    cudaDataType_t compute_type = CUDA_R_32F;
+
+    float h_alpha = static_cast<float>(alpha);
+    float h_beta = static_cast<float>(beta);
+    void *a = static_cast<void *>(&h_alpha);
+    void *b = static_cast<void *>(&h_beta);
+    // use the fp16 compute type only when the flag is enabled and T is fp16
+    if (FLAGS_gemm_use_half_precision_compute_type &&
+        std::is_same<T, phi::dtype::float16>::value) {
+      a = static_cast<void *>(&alpha);
+      b = static_cast<void *>(&beta);
+      compute_type = CUDA_R_16F;
+    }
+
     context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
       PADDLE_ENFORCE_GPU_SUCCESS(
           paddle::platform::dynload::cublasGemmStridedBatchedEx(handle,
@@ -2358,7 +2392,7 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA,
                                                                 N,
                                                                 M,
                                                                 K,
-                                                                &alpha,
+                                                                a,
                                                                 B,
                                                                 fp,
                                                                 ldb,
@@ -2367,13 +2401,13 @@
                                                                 fp,
                                                                 lda,
                                                                 strideA,
-                                                                &beta,
+                                                                b,
                                                                 C,
                                                                 fp,
                                                                 ldc,
                                                                 strideC,
                                                                 batchCount,
-                                                                fp,
+                                                                compute_type,
                                                                 algo));
     });
   } else {
diff --git a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py
index 3e06b69278d347c98ccc8e30fd53a8dfea211db7..f6f62045b19f9f62be5f71ac7637b0c3516fbe1b 100644
--- a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py
@@ -495,6 +495,58 @@ class TestMatMulV2API(unittest.TestCase):
                     y = paddle.to_tensor(input_y)
                     result = paddle.matmul(x, y)
 
+    def test_compute_type_fp32(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                with fluid.dygraph.guard(place):
+                    paddle.set_flags({
+                        'FLAGS_gemm_use_half_precision_compute_type': False
+                    })
+                    input_x = np.random.random([2, 8, 16]).astype("float16")
+                    input_y = np.random.random([2, 16, 8]).astype("float16")
+                    for i in range(0, 16, 2):
+                        input_x[:, :, i] += 60000
+                        input_x[:, :, i + 1] -= 60000
+                    input_y[:, :, :] = 1.5
+
+                    x = paddle.to_tensor(input_x)
+                    y = paddle.to_tensor(input_y)
+                    result = paddle.matmul(x, y)
+                    result_np = np.matmul(input_x, input_y)
+                    self.assertTrue(paddle.isfinite(result)[0, 0, 0])
+                    self.assertTrue(np.isfinite(result_np)[0, 0, 0])
+                    self.assertTrue(np.array_equal(result_np, result.numpy()))
+                    paddle.set_flags({
+                        'FLAGS_gemm_use_half_precision_compute_type': True
+                    })
+
+    def test_compute_type_fp16_nan(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                with fluid.dygraph.guard(place):
+                    paddle.set_flags({
+                        'FLAGS_gemm_use_half_precision_compute_type': True
+                    })
+                    input_x = np.random.random([2, 8, 16]).astype("float16")
+                    input_y = np.random.random([2, 16, 8]).astype("float16")
+                    for i in range(0, 16, 2):
+                        input_x[:, :, i] += 60000
+                        input_x[:, :, i + 1] -= 60000
+                    input_y[:, :, :] = 1.5
+
+                    x = paddle.to_tensor(input_x)
+                    y = paddle.to_tensor(input_y)
+                    result = paddle.matmul(x, y)
+                    result_np = np.matmul(input_x, input_y)
+                    self.assertFalse(
+                        paddle.isfinite(result)[0, 0, 0])  # contains nan/inf
+                    self.assertTrue(np.isfinite(result_np)[0, 0, 0])
+                    paddle.set_flags({
+                        'FLAGS_gemm_use_half_precision_compute_type': False
+                    })
+
     def test_api_eager_dygraph(self):
         with _test_eager_guard():
             self.test_dygraph()
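
Usage note (not part of the patch): a minimal sketch of how the new flag can be toggled from Python, assuming a CUDA build where float16 is supported; it mirrors the unit tests above.

    import numpy as np
    import paddle

    # With the flag off, fp16 GEMMs accumulate in fp32 (slower, more accurate).
    paddle.set_flags({'FLAGS_gemm_use_half_precision_compute_type': False})

    x = paddle.to_tensor(np.random.random([2, 8, 16]).astype("float16"))
    y = paddle.to_tensor(np.random.random([2, 16, 8]).astype("float16"))
    out = paddle.matmul(x, y)  # computed with the fp32 compute type

    # Restore the default (fp16 compute type: faster, may overflow to inf/nan).
    paddle.set_flags({'FLAGS_gemm_use_half_precision_compute_type': True})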