From 8a717a3e98aed104816b47eb30364602bb1832f7 Mon Sep 17 00:00:00 2001
From: ZZK <359521840@qq.com>
Date: Wed, 30 Nov 2022 13:30:02 +0800
Subject: [PATCH] Support more activation in fused multi transformer (#48371)

* add activation support

* fix cublasLt bug

* remove useless code and fix test random range
---
 .../fused/fused_multi_transformer_op.cc    |  12 +-
 .../fused/fused_multi_transformer_op.cu    |  50 +++---
 .../fused/fused_multi_transformer_op.cu.h  | 142 ++++++++++--------
 .../test_fused_multi_transformer_op.py     |  53 +++++--
 4 files changed, 157 insertions(+), 100 deletions(-)

diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cc b/paddle/fluid/operators/fused/fused_multi_transformer_op.cc
index 09c3dfe24c..6a4c3890e5 100644
--- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cc
+++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cc
@@ -270,7 +270,17 @@ class FusedMultiTransformerOpOpMaker
               "dropout_implementation can only be downgrade_in_infer or "
               "upscale_in_train"));
         });
-    AddAttr<std::string>("act_method", "act_method").SetDefault("gelu");
+    AddAttr<std::string>("act_method", "act_method")
+        .SetDefault("gelu")
+        .AddCustomChecker([](const std::string &act_type) {
+          PADDLE_ENFORCE_EQ(
+              act_type == "gelu" || act_type == "relu" || act_type == "none",
+              true,
+              platform::errors::InvalidArgument(
+                  "Only support `gelu`, `relu`, `none` activation in "
+                  "FusedMultiTransformer. "));
+        });
+
     AddAttr<bool>(
         "trans_qkvw",
         "Whether the weights of qkv should be transposed. If true,"
diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu
index f56baef1d2..aeb00a7947 100644
--- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu
+++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu
@@ -31,6 +31,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel<T> {
     int seq_len = input_x_dims[1];
     int dim_embed = input_x_dims[2];
     int bsz_seq = bsz * seq_len;
+    const std::string act_method = ctx.Attr<std::string>("act_method");

     // 1. layer norm
     const auto pre_layer_norm = ctx.Attr<bool>("pre_layer_norm");
@@ -61,7 +62,6 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel<T> {
     bool compute_bias = qkv_biases.size() > 0 && time_step == nullptr;
     // (transA, transB, compute_bias) = (false, trans_qkvw, false)
-
     // Since we fused QKVBias into QKVBiasAddTransposeSplit kernel, here we set
    // compute_bias as false.
     auto qkv_compute = AttnMatMul<T>(dev_ctx,
@@ -191,24 +191,23 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel<T> {
     auto *dropout_mask_out_data = dev_ctx.Alloc<uint8_t>(
         &dropout_mask_out, dropout_mask_out.numel() * sizeof(uint8_t));

-    // 6. ffn1 matmul + bias_add + gelu.
+    // 6. ffn1 matmul + act + bias
     auto ffn1_weights = ctx.MultiInput<phi::DenseTensor>("FFN1Weight");
     auto ffn1_biases = ctx.MultiInput<phi::DenseTensor>("FFN1Bias");
     auto ffn1_weight_dim = ffn1_weights[0]->dims();

     int dim_ffn = ffn1_weight_dim[1];

+    auto ffn1_cublas_linear = CublasFusedMLP<T>(dev_ctx);
+    const phi::DDim ffn1_input_shape({bsz_seq, dim_embed});
+    ffn1_cublas_linear.Setup(ffn1_input_shape, ffn1_weight_dim, false, false);
+
     Tensor ffn1_out;
     ffn1_out.Resize({{bsz_seq, dim_ffn}});
     auto *ffn1_out_data =
         dev_ctx.Alloc<T>(&ffn1_out, ffn1_out.numel() * sizeof(T));

-    auto ffn1_linear_bias_gelu = CublasFusedMLP<T>(dev_ctx);
-    const phi::DDim ffn1_input_shape({bsz_seq, dim_ffn});
-    ffn1_linear_bias_gelu.Setup(
-        ffn1_input_shape, ffn1_weight_dim, false, false);
-
-    // 8. ffn2 matmul + bias_add + residual.
+    // 7. ffn2 matmul + bias + residual.
     auto ffn2_weights = ctx.MultiInput<phi::DenseTensor>("FFN2Weight");
     auto ffn2_biases = ctx.MultiInput<phi::DenseTensor>("FFN2Bias");
@@ -216,7 +215,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel<T> {
     ffn2_linear_bias_residual.Setup(
         ffn1_out.dims(), ffn2_weights[0]->dims(), false, false);

-    // 9. ffn2 residual bias
+    // 8. ffn2 Layernorm
     DropoutParam ffn2_dropout_param(true, 0, true, true, 0.0, nullptr, 0);
     FusedDropoutLayerNormHelper<T, uint8_t> ffn2_fused_dropout_helper(
         dev_ctx, bsz_seq, dim_embed, ffn2_dropout_param, epsilon);
@@ -333,7 +332,6 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel<T> {
                                      &attn_dropout_out,
                                      &qktv_out,
                                      &fmha_out);
-
       const T *k_ptr = nullptr;
       const T *v_ptr = nullptr;

@@ -450,20 +448,23 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel<T> {
                                                  ln_mean_data,
                                                  ln_var_data);
       }
-
 #ifdef _DEBUG_FUSED_MULTI_TRANSFORMER
       VLOG(0) << "step5";
 #endif

-      // step6. ffn1 matmul + bias_add + gelu.
-      ffn1_linear_bias_gelu.ComputeForward(
-          buf1, ffn1_weights[i], ffn1_biases[i], nullptr, &ffn1_out, "gelu");
+      // step6. ffn matmul1
+      ffn1_cublas_linear.ComputeForward(buf1,
+                                        ffn1_weights[i],
+                                        ffn1_biases[i],
+                                        nullptr,
+                                        &ffn1_out,
+                                        act_method);

 #ifdef _DEBUG_FUSED_MULTI_TRANSFORMER
       VLOG(0) << "step6";
 #endif

-      // step7. ffn2 matmul + bias_add + residual.
+      // step7. ffn2 matmul
       if (pre_layer_norm) {
         ffn2_linear_bias_residual.ComputeForward(&ffn1_out,
                                                  ffn2_weights[i],
@@ -477,18 +478,21 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel<T> {
             &ffn1_out, ffn2_weights[i], ffn2_biases[i], buf1, buf0, "none");
       }

+#ifdef _DEBUG_FUSED_MULTI_TRANSFORMER
+      VLOG(0) << "step7";
+#endif
+
       if (pre_layer_norm) {
         AllReduce<T>(*buf1, ring_id, buf1->numel(), dev_ctx);
       } else {
         AllReduce<T>(*buf0, ring_id, buf0->numel(), dev_ctx);
       }
-
 #ifdef _DEBUG_FUSED_MULTI_TRANSFORMER
-      VLOG(0) << "step7";
+      VLOG(0) << "step7.1";
 #endif

-      // step8. layer norm or do nothing(because bias_add + residual has been
-      // fused into cublasFusedMLP. )
+      // step8. layer norm or do nothing
+      // because bias_add + residual has been fused into cublasFusedMLP
       if (pre_layer_norm) {
         if (i < layers - 1) {
           auto *ln_scale_data = ln_scales[i + 1]->data<U>();
@@ -512,6 +516,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel<T> {
                                                  ln_mean_data,
                                                  ln_var_data);
       }
+
 #ifdef _DEBUG_FUSED_MULTI_TRANSFORMER
       VLOG(0) << "step8";
 #endif
@@ -540,6 +545,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel<T> {
     int seq_len = input_x_dims[1];
     int dim_embed = input_x_dims[2];
     int bsz_seq = bsz * seq_len;
+    const std::string act_method = ctx.Attr<std::string>("act_method");

     // 1. layer norm
     const auto pre_layer_norm = ctx.Attr<bool>("pre_layer_norm");
@@ -570,8 +576,8 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel<T> {
     bool compute_bias = qkv_biases.size() > 0 && time_step == nullptr;
     // (transA, transB, compute_bias) = (false, trans_qkvw, false)
-    // Since we fused QKVBias into QKVBiasAddTransposeSplit kernel, here we set
-    // compute_bias as false.
+    // Since we fused QKVBias into QKVBiasAddTransposeSplit kernel, here we
+    // set compute_bias as false.
     auto qkv_compute = AttnMatMul<T>(dev_ctx,
                                      false,
                                      trans_qkvw,
@@ -979,7 +985,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel<T> {
       fused_act_dropout_helper.DropoutActBias(dev_ctx,
                                               ffn1_out_data,
                                               ffn1_biases[i]->data<T>(),
-                                              "gelu",
+                                              act_method,
                                               ffn1_dropout_out_data,
                                               ffn1_dropout_mask_data);
 #ifdef _DEBUG_FUSED_MULTI_TRANSFORMER
diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h
index 3c3a59b219..69ac06206c 100644
--- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h
+++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h
@@ -1414,14 +1414,15 @@ class CublasFusedMLP {
  public:
  // (m, n, k) = bsz_seq, hidden_feature, in_feature
  explicit CublasFusedMLP(const phi::GPUContext &dev_ctx) : dev_ctx_(dev_ctx) {
-    // Set Math Type
    cudaDataType_t mat_type = CUDA_R_32F;
    cudaDataType_t scale_type = CUDA_R_32F;
    cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F;
-
    if (std::is_same<T, platform::float16>::value) {
      mat_type = CUDA_R_16F;
      if (FLAGS_gemm_use_half_precision_compute_type) {
+        // This flag defaults to true. It tends to produce NaN, but gives
+        // better inference speed. You can turn it off with `export
+        // FLAGS_gemm_use_half_precision_compute_type=0`.
        compute_type = CUBLAS_COMPUTE_16F;
        scale_type = CUDA_R_16F;
      }
@@ -1435,7 +1436,6 @@ class CublasFusedMLP {
      compute_type = CUBLAS_COMPUTE_64F;
    }
-
-    // Just for init.
    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmulDescCreate(
        &operation_desc_, compute_type, scale_type));
    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate(
        &x_desc_, mat_type, 1, 1, 1));
    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate(
        &out_desc_, mat_type, 1, 1, 1));
  }
-
  ~CublasFusedMLP() {
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cublasLtMatmulDescDestroy(operation_desc_));
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cublasLtMatrixLayoutDestroy(x_desc_));
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cublasLtMatrixLayoutDestroy(out_desc_));
  }

-  // Change to use tensor's shape.
  void Setup(const phi::DDim &x_shape,
             const phi::DDim &w_shape,
             bool trans_x,
@@ -1481,39 +1479,34 @@ class CublasFusedMLP {
        &cublas_transB,
        sizeof(cublas_transB)));

-    /*
-    cublas use col major: x(M, K) matmul w(K, N) = out(M, N) equals to w_t(N, K)
-    * x_t(K, M) = out(N, M)
-    */
-    SetCublasMatrixLayout_(x_desc_, cublas_transA, K, M);
-    SetCublasMatrixLayout_(w_desc_, cublas_transB, N, K);
-    SetCublasMatrixLayout_(out_desc_, CUBLAS_OP_N, N, M);
+    SetCublasMatrixLayout(x_desc_, trans_x, M, K);
+    SetCublasMatrixLayout(w_desc_, trans_w, K, N);
+    SetCublasMatrixLayout(out_desc_, false, M, N);
  }

-  void ComputeForward(const phi::DenseTensor *input,
+  void ComputeForward(const phi::DenseTensor *x,
                      const phi::DenseTensor *weight,
                      const phi::DenseTensor *bias,
                      phi::DenseTensor *residual,
                      phi::DenseTensor *output,
                      const std::string &activation) {
-    // here: (transa, transb): nt, input * weight.
-    // (M * K) * (K * N)
-    cublasLtHandle_t lt_handle = dev_ctx_.cublaslt_handle();
-    size_t workspace_size = static_cast<size_t>(16) * 1024 * 1024;
-    cudaStream_t stream = dev_ctx_.stream();
-    memory::allocation::AllocationPtr workspace =
-        memory::Alloc(dev_ctx_.GetPlace(),
-                      workspace_size,
-                      phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
+    T *out_data = output->data<T>();

    const bool add_residual = (residual == nullptr) ? false : true;
    const bool add_bias = (bias == nullptr) ? false : true;
+
+    const T *bias_data = nullptr;
    if (add_bias) {
-      SetCublasBiasPtr_(bias);
+      bias_data = bias->data<T>();
    }
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        platform::dynload::cublasLtMatmulDescSetAttribute(
+            operation_desc_,
+            CUBLASLT_MATMUL_DESC_BIAS_POINTER,
+            &bias_data,
+            sizeof(bias_data)));

-    // Set cublasLt epilogue.
-    cublasLtEpilogue_t epiloque_func = GetEpilogueType_(activation, add_bias);
+    cublasLtEpilogue_t epiloque_func = GetEpilogueType(activation, add_bias);
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cublasLtMatmulDescSetAttribute(
            operation_desc_,
@@ -1521,25 +1514,44 @@ class CublasFusedMLP {
            &epiloque_func,
            sizeof(epiloque_func)));

-    const auto *x_data = input->data<T>();
-    const auto *w_data = weight->data<T>();
-    auto *residual_data =
-        add_residual ? residual->data<T>() : output->data<T>();
-    auto *out_data = output->data<T>();
+    T *residual_data = add_residual ? residual->data<T>() : out_data;
+
+    cublasLtHandle_t lt_handle = dev_ctx_.cublaslt_handle();
+    size_t workspace_size = static_cast<size_t>(4) * 1024 * 1024;
+    cudaStream_t stream = dev_ctx_.stream();
+    memory::allocation::AllocationPtr workspace = memory::Alloc(
+        dev_ctx_.GetPlace(),
+        workspace_size,
+        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx_.stream())));

-    // if add_residual, we compute result + 1.0 * residual, else result + 0.0 *
-    // out.
+    // if add_residual, we compute result + 1.0 * residual,
+    // else result + 0.0 * out.
    double alpha64 = 1.0, beta64 = add_residual ? 1.0 : 0.0;
    float alpha32 = 1.0f, beta32 = add_residual ? 1.0f : 0.0f;
+    half alpha16 = static_cast<half>(1.0),
+         beta16 =
+             add_residual ? static_cast<half>(1.0) : static_cast<half>(0.0);
+
    void *alpha = nullptr, *beta = nullptr;
    if (std::is_same<T, double>::value) {
      alpha = &alpha64;
      beta = &beta64;
+    } else if (std::is_same<T, float>::value) {
+      alpha = &alpha64;
+      beta = &beta64;
+    } else if (std::is_same<T, platform::float16>::value) {
+      alpha = &alpha16;
+      beta = &beta16;
    } else {
-      alpha = &alpha32;
-      beta = &beta32;
+      PADDLE_ENFORCE_EQ(true,
+                        false,
+                        platform::errors::InvalidArgument(
+                            "Only support double, float, half data type. "));
    }
")); } + const auto *x_data = x->data(); + const auto *w_data = weight->data(); + auto algo = GemmEpilogueAlgoCache::Instance().GetGemmAlgo(lt_handle, operation_desc_, w_desc_, @@ -1567,15 +1579,15 @@ class CublasFusedMLP { out_desc_, out_data, out_desc_, - algo /*algo*/, - workspace->ptr() /*workspace*/, + algo, + workspace->ptr(), workspace_size, stream)); } private: - static cublasLtEpilogue_t GetEpilogueType_(const std::string &activation, - const bool add_bias) { + cublasLtEpilogue_t GetEpilogueType(const std::string &activation, + const bool add_bias) { if (activation == "relu") { if (add_bias) { return CUBLASLT_EPILOGUE_RELU_BIAS; @@ -1606,23 +1618,41 @@ class CublasFusedMLP { } } - void SetCublasMatrixLayout_(cublasLtMatrixLayout_t layout_desc, - cublasOperation_t cublas_trans, - const size_t cublas_m, - const size_t cublas_n) { + void SetCublasMatrixLayout(cublasLtMatrixLayout_t layout_desc, + const bool transpose, + const uint64_t cublas_row, + const uint64_t cublas_col) { + cudaDataType_t mat_type = CUDA_R_32F; + if (std::is_same::value) { + mat_type = CUDA_R_16F; + } + if (std::is_same::value) { + mat_type = CUDA_R_16BF; + } + if (std::is_same::value) { + mat_type = CUDA_R_64F; + } + + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatrixLayoutSetAttribute( + layout_desc, + CUBLASLT_MATRIX_LAYOUT_TYPE, + &mat_type, + sizeof(mat_type))); + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cublasLtMatrixLayoutSetAttribute( layout_desc, CUBLASLT_MATRIX_LAYOUT_ROWS, - cublas_trans == CUBLAS_OP_N ? &cublas_m : &cublas_n, - sizeof(cublas_m))); + transpose ? &cublas_row : &cublas_col, + sizeof(cublas_row))); PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cublasLtMatrixLayoutSetAttribute( layout_desc, CUBLASLT_MATRIX_LAYOUT_COLS, - cublas_trans == CUBLAS_OP_N ? &cublas_n : &cublas_m, - sizeof(cublas_m))); - const size_t cublas_ld = cublas_trans == CUBLAS_OP_N ? cublas_m : cublas_n; + transpose ? &cublas_col : &cublas_row, + sizeof(cublas_col))); + int64_t cublas_ld = transpose ? 
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cublasLtMatrixLayoutSetAttribute(
            layout_desc,
            CUBLASLT_MATRIX_LAYOUT_LD,
            &cublas_ld,
            sizeof(cublas_ld)));
  }

-  void SetCublasBiasPtr_(const phi::DenseTensor *bias) {
-    const T *bias_data = bias->data<T>();
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        platform::dynload::cublasLtMatmulDescSetAttribute(
-            operation_desc_,
-            CUBLASLT_MATMUL_DESC_BIAS_POINTER,
-            &bias_data,
-            sizeof(bias_data)));
-  }
-
  const phi::GPUContext &dev_ctx_;
-  cublasLtMatmulDesc_t operation_desc_;
-  cublasLtMatrixLayout_t x_desc_;
-  cublasLtMatrixLayout_t w_desc_;
-  cublasLtMatrixLayout_t out_desc_;
+  cublasLtMatmulDesc_t operation_desc_ = NULL;
+  cublasLtMatrixLayout_t x_desc_ = NULL;
+  cublasLtMatrixLayout_t w_desc_ = NULL;
+  cublasLtMatrixLayout_t out_desc_ = NULL;
};

#endif  // PADDLE_FLUID_OPERATORS_FUSED_FUSED_MULTI_TRANSFORMER_OP_CU_H_
diff --git a/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py b/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py
index 199c1e48bb..8aadeba437 100644
--- a/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py
@@ -124,6 +124,7 @@ class TestFusedMultiTransformerOp(OpTest):
         self.training = False

         self.layers = 4
+
         self.batch_size = 8
         self.query_length = 128
         self.cache_length = 128
@@ -144,21 +145,27 @@ class TestFusedMultiTransformerOp(OpTest):
         )

     def generate_input_data(self):
-        self.query = np.random.rand(
-            self.batch_size, self.query_length, self.embed_dim
+        self.query = np.random.uniform(
+            -1, 1, (self.batch_size, self.query_length, self.embed_dim)
         ).astype(self.x_type)
+
         out_seq_len = self.key_length
         if self.has_cache_kv:
             assert self.training is False, ValueError(
                 'cache_kv can only used in inference'
             )
-            self.cache_kv = np.random.rand(
-                2,
-                self.batch_size,
-                self.num_heads,
-                self.cache_length,
-                self.head_dim,
+            self.cache_kv = np.random.uniform(
+                -1,
+                1,
+                (
+                    2,
+                    self.batch_size,
+                    self.num_heads,
+                    self.cache_length,
+                    self.head_dim,
+                ),
             ).astype(self.x_type)
+
             if self.gen_cache_kv:
                 self.cache_kv[:] = 0
             else:
@@ -168,12 +175,16 @@ class TestFusedMultiTransformerOp(OpTest):

         if self.has_pre_cache:
             out_seq_len += self.pre_cache_num
-            self.pre_cache_kv = np.random.rand(
-                2,
-                self.batch_size,
-                self.num_heads,
-                self.pre_cache_num,
-                self.head_dim,
+            self.pre_cache_kv = np.random.uniform(
+                -1,
+                1,
+                (
+                    2,
+                    self.batch_size,
+                    self.num_heads,
+                    self.pre_cache_num,
+                    self.head_dim,
+                ),
             ).astype(self.x_type)

         if self.has_attn_mask:
@@ -204,8 +215,8 @@ class TestFusedMultiTransformerOp(OpTest):
             self.attn_mask = None
         self.key, self.value = self.query, self.query

-        self.dout = np.random.random(
-            (self.batch_size, self.query_length, self.embed_dim)
+        self.dout = np.random.uniform(
+            -1, 1, (self.batch_size, self.query_length, self.embed_dim)
         ).astype(self.x_type)

     def GetBaselineOut(self):
@@ -544,6 +555,7 @@ class TestFusedMultiTransformerOp(OpTest):
                 time_step=time_step,
                 attn_mask=attn_mask,
                 dropout_rate=self.dropout_prob,
+                activation=self.act_method,
                 training=self.training,
             )

@@ -668,6 +680,7 @@ class TestFusedMultiTransformerOp(OpTest):
                 self.num_heads,
                 4 * self.embed_dim,
                 self.dropout_prob,
+                activation=self.act_method,
                 normalize_before=self.pre_layer_norm,
                 ln_scale_attrs=ln_scales_attr,
                 ln_bias_attrs=ln_biases_attr,
@@ -797,6 +810,14 @@ class TestFusedMultiTransformerOpFp16(TestFusedMultiTransformerOp):
         self.layers = 3  # odd layers


+class TestFusedMultiTransformerOpActReluFp16(TestFusedMultiTransformerOp):
+    def config(self):
+        super().config()
+        self.x_type = np.float16
+        self.act_method = "relu"
+        self.layers = 3  # odd layers
+
+
 class TestFusedMultiTransformerOpCacheKV(TestFusedMultiTransformerOp):
     def config(self):
         super().config()
-- 
GitLab