diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index 0a478a2d2c8ae69678c750643c42b9bd2c4bb106..c5b2cd6e201a58cda7c22c16cc99f340e55b49ff 100755
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -171,8 +171,9 @@ const std::vector<std::string> kGpuLowerPrecisionPasses{
     "multi_devices_fused_multi_transformer_decoder_fuse_qkv_pass",
     "gpu_cpu_map_matmul_v2_to_mul_pass",
     "gpu_cpu_map_matmul_v2_to_matmul_pass",
+    "gpu_cpu_map_matmul_to_mul_pass",
     "fc_fuse_pass",
-    "fc_elementwise_layernorm_fuse_pass",
+    // "fc_elementwise_layernorm_fuse_pass",
     "embedding_eltwise_layernorm_fuse_pass",
     "runtime_context_cache_pass",
 };
diff --git a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu
index 87811b61306d9eaf565c6e39e8a5dfa94163b49b..c2cb6f46010daf521893c71e629ce672e87dcc3f 100644
--- a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu
+++ b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu
@@ -276,9 +276,9 @@ __global__ void InplaceAddReluAddLayerNormKernel(const float16* y_data,
       half tmp_0 = __hdiv(__hsub(save_ptr[save_index], mean_i), std_i);
       half tmp_1 = scale ? __hmul(scale[j], tmp_0) : tmp_0;
 #else
-      half tmp_0 = static_cast<half>(static_cast<float>(save_ptr[save_index]) -
-                                     static_cast<float>(mean_i) /
-                                     static_cast<float>(std_i));
+      half tmp_0 = static_cast<half>((static_cast<float>(save_ptr[save_index]) -
+                                      static_cast<float>(mean_i)) /
+                                     static_cast<float>(std_i));
       half tmp_1 = scale ? static_cast<half>(static_cast<float>(scale[j]) *
                                              static_cast<float>(tmp_0))
                          : tmp_0;
@@ -394,19 +394,16 @@ class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel<T> {
     auto* out_data = dev_ctx.template Alloc<T>(out, out->numel() * sizeof(T));
 
     auto blas = phi::funcs::GetBlas<phi::GPUContext, T>(dev_ctx);
-    blas.GEMM(false,
-              false,
+    blas.GEMM(CblasNoTrans,
+              CblasNoTrans,
               M,
               N,
               K,
               static_cast<T>(1.0),
               x_data,
-              K,
               w_data,
-              N,
               static_cast<T>(0.0),
-              out_data,
-              N);
+              out_data);
     auto* y = ctx.Input<phi::DenseTensor>("Y");
     auto* bias_0 = ctx.Input<phi::DenseTensor>("Bias0");
     auto* bias_1 = ctx.Input<phi::DenseTensor>("Bias1");
diff --git a/paddle/phi/kernels/funcs/fc_functor.cu b/paddle/phi/kernels/funcs/fc_functor.cu
index a7d4535d6df1adc603790a27ecb4f91ea479cd93..6fa9b640f1265272b4939bafaa9b2bd85e24c4cc 100644
--- a/paddle/phi/kernels/funcs/fc_functor.cu
+++ b/paddle/phi/kernels/funcs/fc_functor.cu
@@ -292,19 +292,16 @@ void FCFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
       errors::PermissionDenied(
           "Weight padding in fc can not be used in GPU scope."));
   auto blas = phi::funcs::GetBlas<DeviceContext, T>(context);
-  blas.GEMM(false,
-            false,
+  blas.GEMM(CblasNoTrans,
+            CblasNoTrans,
             M,
             N,
             K,
             static_cast<T>(1.0),
             X,
-            K,
             W,
-            N,
             static_cast<T>(0.0),
-            Y,
-            N);
+            Y);
   if (B == NULL) {
     return;
   }
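
--
Note (not part of the patch): the fused_fc_elementwise_layernorm_op.cu hunk fixes an
operator-precedence bug in the fp16 fallback path. Because / binds tighter than -,
the old expression computed x - (mean / std) instead of the intended layer-norm
normalization (x - mean) / std. A standalone C++ sketch of the difference, with
values chosen purely for illustration:

#include <cstdio>

int main() {
  float x = 10.0f, mean = 4.0f, std_dev = 2.0f;

  // Old fallback: division binds tighter, so mean / std_dev happens first.
  float wrong = x - mean / std_dev;    // 10 - (4 / 2) = 8
  // Patched fallback: parenthesized numerator, the intended normalization.
  float right = (x - mean) / std_dev;  // (10 - 4) / 2 = 3

  std::printf("wrong = %g, right = %g\n", wrong, right);  // wrong = 8, right = 3
  return 0;
}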
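Note (not part of the patch): the two GEMM hunks switch from the bool-flag
Blas::GEMM overload, which takes explicit leading dimensions, to the
CBLAS_TRANSPOSE overload, which derives them. For a row-major, non-transposed
C[MxN] = A[MxK] * B[KxN], the leading dimension of each matrix is its number of
columns, i.e. lda = K, ldb = N, ldc = N -- exactly the values the old calls
spelled out by hand -- so dropping them should be behavior-preserving. A minimal
sketch of the same call in plain CBLAS (assuming an OpenBLAS-style cblas.h; this
is not Paddle's wrapper):

#include <cblas.h>

// C[MxN] = A[MxK] * B[KxN], row-major, no transposes. The leading dimensions
// fall out of the shapes: lda = K, ldb = N, ldc = N.
void gemm_no_trans(int M, int N, int K,
                   const float* A, const float* B, float* C) {
  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
              M, N, K,
              /*alpha=*/1.0f, A, /*lda=*/K,
              B, /*ldb=*/N,
              /*beta=*/0.0f, C, /*ldc=*/N);
}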