From bf4d17924477733ddb4bf233732901d681d19561 Mon Sep 17 00:00:00 2001
From: lzy <569782149@qq.com>
Date: Tue, 29 Nov 2022 20:52:01 +0800
Subject: [PATCH] fix mma_tensorcore (#48386)

* fix mma_tensorcore (__CUDA_ARCH__)

* disable tensorcore by default.

Disable Tensor Cores by default, because the __CUDA_ARCH__ check can cause
undefined behavior in some environments; the feature can be enabled manually
on a machine that supports Tensor Cores.
---
 .../operators/fused/fused_multi_transformer_op.cu.h | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h
index c36ee69723..3c3a59b219 100644
--- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h
+++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h
@@ -95,7 +95,7 @@ using float16 = plat::float16;
 #define MMHA_USE_FP32_ACUM_FOR_LOGITS
 #define MMHA_USE_FP32_ACUM_FOR_OUT
 #define MMHA_USE_FP32_ACUM_FOR_FMA
-#define MMHA_USE_HMMA_FOR_REDUCTION
+// #define MMHA_USE_HMMA_FOR_REDUCTION
 
 template <typename T>
 class PDDataTypeTraits;
@@ -601,7 +601,8 @@ template <int N>
 inline __device__ float qk_hmma_dot_(const uint32_t (&q)[N],
                                      const uint32_t (&k)[N],
                                      float inv_sqrt_dh) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 750
+#if defined(MMHA_USE_HMMA_FOR_REDUCTION) && defined(__CUDA_ARCH__) && \
+    __CUDA_ARCH__ >= 750
 #ifdef MMHA_USE_FP32_ACUM_FOR_FMA
   using K_vec_acum = typename K_vec_acum_fp32_<uint32_t>::Type;
 #else
@@ -641,7 +642,8 @@ struct Qk_dot<float16, 4> {
   static inline __device__ float dot(const uint32_t (&q)[N],
                                      const uint32_t (&k)[N],
                                      float inv_sqrt_dh) {
-#if defined(MMHA_USE_HMMA_FOR_REDUCTION) && __CUDA_ARCH__ >= 750
+#if defined(MMHA_USE_HMMA_FOR_REDUCTION) && defined(__CUDA_ARCH__) && \
+    __CUDA_ARCH__ >= 750
     return qk_hmma_dot_(q, k, inv_sqrt_dh);
 #else
     return qk_dot_<4>(q, k, inv_sqrt_dh);
@@ -1104,7 +1106,8 @@ void fmha_launch_kernel(const Masked_multihead_attention_params<T> &params,
   if (params.timestep < 32) {
     MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, 4, THREADS_PER_VALUE, 64, stream);
   } else if (params.timestep < 2048) {
-#if defined(MMHA_USE_HMMA_FOR_REDUCTION) && __CUDA_ARCH__ >= 750
+#if defined(MMHA_USE_HMMA_FOR_REDUCTION) && defined(__CUDA_ARCH__) && \
+    __CUDA_ARCH__ >= 750
     MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, 4, THREADS_PER_VALUE, 256, stream);
 #else
     MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, 2, THREADS_PER_VALUE, 128, stream);
-- 
GitLab
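
Editor's note (not part of the patch): the change applies a common CUDA guard pattern. The feature macro is tested first, then defined(__CUDA_ARCH__), and only then the architecture value, so the Tensor Core (HMMA) path is selected only during the device-compilation pass for sm_75 or newer and never in the host pass, where __CUDA_ARCH__ is undefined. The sketch below is a minimal, self-contained illustration of that pattern; the function and kernel names are hypothetical, and only the MMHA_USE_HMMA_FOR_REDUCTION macro name is taken from the file above.

    // guard_pattern_demo.cu: illustrative sketch only (hypothetical names).
    // Compile with:   nvcc -arch=sm_75 guard_pattern_demo.cu
    // Optionally:     nvcc -arch=sm_75 -DMMHA_USE_HMMA_FOR_REDUCTION guard_pattern_demo.cu
    #include <cstdio>

    __device__ float dispatch_reduction(float x) {
    // Check the feature macro, then defined(__CUDA_ARCH__), then the value.
    // The host pass and pre-Turing device passes take the fallback branch.
    #if defined(MMHA_USE_HMMA_FOR_REDUCTION) && defined(__CUDA_ARCH__) && \
        __CUDA_ARCH__ >= 750
      return x * 2.0f;  // stand-in for a Tensor Core (HMMA) reduction
    #else
      return x + x;     // stand-in for the plain fallback reduction
    #endif
    }

    __global__ void demo_kernel(float *out) { *out = dispatch_reduction(1.0f); }

    int main() {
      float *d_out = nullptr;
      float h_out = 0.0f;
      cudaMalloc(&d_out, sizeof(float));
      demo_kernel<<<1, 1>>>(d_out);
      cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
      printf("result = %f\n", h_out);
      cudaFree(d_out);
      return 0;
    }

As the commit message notes, the HMMA reduction is now off by default. Re-enabling it on hardware that supports Tensor Cores would mean uncommenting the #define MMHA_USE_HMMA_FOR_REDUCTION line in the header (or, assuming the build can pass extra flags to nvcc, defining the macro on the command line with -DMMHA_USE_HMMA_FOR_REDUCTION).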