diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu
index 7fad2871827a38274b760052a30e780e2cdf208c..d516d54e4d2a21d0d1c347221903d2394d5c5125 100644
--- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu
+++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu
@@ -52,8 +52,8 @@ phi::funcs::MatmulFusedType GetFwdFusedEpilogueType(
     }
   } else {
     PADDLE_THROW(platform::errors::InvalidArgument(
-        "Fued linear epilogue type should be one of {none, relu, gelu}."
-        "But received activation is %s, please check",
+        "fused_gemm_epilogue's activation should be one of {none, relu, gelu},"
+        " but received %s, please check",
         activation));
   }
 }
diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc
index e35bce723ca168d5fa4c5f281ea726418ed9c250..5c9c010d365e4e6256713fe9db8d57417172b794 100644
--- a/paddle/phi/backends/gpu/gpu_context.cc
+++ b/paddle/phi/backends/gpu/gpu_context.cc
@@ -25,10 +25,10 @@ limitations under the License. */
 
 #include "glog/logging.h"
 #include "paddle/phi/api/ext/exception.h"
+#include "paddle/phi/backends/context_pool.h"
 #include "paddle/phi/backends/gpu/gpu_decls.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
 #include "paddle/phi/backends/gpu/gpu_resources.h"
-#include "paddle/phi/common/float16.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/allocator.h"
 #include "paddle/phi/core/cuda_stream.h"
@@ -601,7 +601,7 @@ struct GPUContext::Impl {
 #endif
 #endif
     });
-    if (blas_tf32_tensor_core_handle_ != nullptr) {
+    if (blas_tf32_tensor_core_handle_ && phi::AllowTF32Cublas()) {
       std::lock_guard<std::mutex> guard(blas_tf32_mtx_);
       callback(blas_tf32_tensor_core_handle_);
     } else {
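// NOTE (illustration only, not part of the patch): the gpu_context.cc hunk
// above gates the TF32 tensor-core cuBLAS handle on a runtime switch, so the
// plain FP32 handle is used unless the handle exists *and* the process-wide
// flag allows TF32. A minimal standalone sketch of that gating pattern; the
// boolean g_allow_tf32 is a stand-in for phi::AllowTF32Cublas(), and Handle
// is a hypothetical placeholder for blasHandle_t:

#include <functional>
#include <iostream>
#include <mutex>

struct Handle {
  const char* name;
};

static bool g_allow_tf32 = false;  // stand-in for the runtime TF32 switch
static Handle g_fp32_handle{"fp32"};
static Handle g_tf32_handle{"tf32"};
static std::mutex g_tf32_mtx;

void TF32Call(const std::function<void(Handle*)>& callback) {
  // Dispatch to the TF32 handle only when the switch allows it; the real code
  // additionally requires blas_tf32_tensor_core_handle_ to be non-null.
  if (g_allow_tf32) {
    std::lock_guard<std::mutex> guard(g_tf32_mtx);
    callback(&g_tf32_handle);
  } else {
    callback(&g_fp32_handle);
  }
}

int main() {
  TF32Call([](Handle* h) { std::cout << h->name << "\n"; });  // prints "fp32"
  g_allow_tf32 = true;
  TF32Call([](Handle* h) { std::cout << h->name << "\n"; });  // prints "tf32"
  return 0;
}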
diff --git a/paddle/phi/kernels/autotune/auto_tune_base.h b/paddle/phi/kernels/autotune/auto_tune_base.h
index fa96ed67a29fd525228d5e9b4b06749f31624ab8..a6a37272840afb06128ee9d51bcee96f49eb9fde 100644
--- a/paddle/phi/kernels/autotune/auto_tune_base.h
+++ b/paddle/phi/kernels/autotune/auto_tune_base.h
@@ -29,7 +29,7 @@ class KernelCallback {
   using FuncType = ReturnType (*)(Args...);
 
   KernelCallback() {}
-  explicit KernelCallback(FuncType func_) : func(func_) {}
+  explicit KernelCallback(FuncType f) : func(f) {}
   virtual ~KernelCallback() {}
 
   ReturnType Run(Args... args) { return func(args...); }
@@ -50,8 +50,8 @@ class AutoTuneBase {
   AutoTuneBase() {}
   virtual ~AutoTuneBase() {}
 
-  explicit AutoTuneBase(KernelType kernel) {
-    kernels_.push_back(/*default=*/kernel);
+  explicit AutoTuneBase(KernelType default_kernel) {
+    kernels_.push_back(default_kernel);
   }
 
   template <typename Type>
@@ -121,7 +121,7 @@ class AutoTuneBase {
   float RunAndMeasureKernel(const Context& ctx, const int idx, Args&&... args) {
     // Regard 1st run as warmup, judge the compare result by the time cost
     // of rest cycles.
-    constexpr int repeats = 6;
+    constexpr int repeats = 11;
     phi::GpuTimer timer;
     float time_cost = 0;
     const auto& stream = ctx.stream();
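// NOTE (illustration only, not part of the patch): RunAndMeasureKernel above
// treats the first of `repeats` runs as a warmup and averages only the rest,
// so raising repeats from 6 to 11 averages 10 timed runs instead of 5. A
// host-side analogue of that measurement loop using std::chrono (the real
// code times CUDA streams with phi::GpuTimer); RunAndMeasure is hypothetical:

#include <chrono>
#include <iostream>

template <typename F>
double RunAndMeasure(F&& kernel, int repeats) {
  double time_cost_ms = 0.0;
  for (int i = 0; i < repeats; ++i) {
    auto start = std::chrono::steady_clock::now();
    kernel();
    auto stop = std::chrono::steady_clock::now();
    double ms =
        std::chrono::duration<double, std::milli>(stop - start).count();
    if (i > 0) {  // Regard the 1st run as warmup and exclude it.
      time_cost_ms += ms;
    }
  }
  return time_cost_ms / (repeats - 1);
}

int main() {
  volatile long sink = 0;
  double avg = RunAndMeasure(
      [&] {
        for (long i = 0; i < 1000000; ++i) sink = sink + i;
      },
      /*repeats=*/11);
  std::cout << "avg: " << avg << " ms\n";
  return 0;
}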
diff --git a/paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h b/paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h
index ee3709e967d893be6091d6fe1c442e9d559751e5..b46608e91b74aafc70100dab7a285b60647ed4c0 100644
--- a/paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h
+++ b/paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h
@@ -25,8 +25,11 @@ limitations under the License. */
 #include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/common/memory_utils.h"
+#include "paddle/phi/core/flags.h"
 #include "paddle/phi/kernels/autotune/gpu_timer.h"
 #include "paddle/phi/kernels/autotune/switch_autotune.h"
+
+PHI_DECLARE_int64(cublaslt_exhaustive_search_times);
 #endif
 
 namespace phi {
@@ -41,21 +44,42 @@ namespace funcs {
 // no matter forward or backward, they could share the same descriptor
 // cache, in that the descriptor is for description of matmul operation.
 enum MatmulFusedType {
-  kMatmul = CUBLASLT_EPILOGUE_DEFAULT,
-  kMatmulGrad = CUBLASLT_EPILOGUE_DEFAULT,
-  kMatmulGradWithoutBias = CUBLASLT_EPILOGUE_DEFAULT,
-  kMatmulBias = CUBLASLT_EPILOGUE_BIAS,
-  kMatmulRelu = CUBLASLT_EPILOGUE_RELU,
-  kMatmulBiasRelu = CUBLASLT_EPILOGUE_RELU_BIAS,
-  kMatmulBiasGelu = CUBLASLT_EPILOGUE_GELU_BIAS,
-  kMatmulBiasReluWithReservedData = CUBLASLT_EPILOGUE_RELU_AUX_BIAS,
-  kMatmulBiasGeluWithReservedData = CUBLASLT_EPILOGUE_GELU_AUX_BIAS,
-  kMatmulReluGrad = CUBLASLT_EPILOGUE_DRELU,
-  kMatmulGeluGrad = CUBLASLT_EPILOGUE_DGELU,
-  kMatmulBiasGradToA = CUBLASLT_EPILOGUE_BGRADA,
-  kMatmulBiasGradToB = CUBLASLT_EPILOGUE_BGRADB
+  kMatmul = 0,
+  kMatmulGrad = 1,
+  kMatmulGradWithoutBias = 2,
+  kMatmulBias = 3,
+  kMatmulRelu = 4,
+  kMatmulBiasRelu = 5,
+  kMatmulBiasGelu = 6,
+  kMatmulBiasReluWithReservedData = 7,
+  kMatmulBiasGeluWithReservedData = 8,
+  kMatmulReluGrad = 9,
+  kMatmulGeluGrad = 10,
+  kMatmulBiasGradToA = 11,
+  kMatmulBiasGradToB = 12
 };
 
+static cublasLtEpilogue_t ConvertFusedType(MatmulFusedType fused_type) {
+  static std::map<MatmulFusedType, cublasLtEpilogue_t> fused_type_map = {
+      {MatmulFusedType::kMatmul, CUBLASLT_EPILOGUE_DEFAULT},
+      {MatmulFusedType::kMatmulGrad, CUBLASLT_EPILOGUE_DEFAULT},
+      {MatmulFusedType::kMatmulGradWithoutBias, CUBLASLT_EPILOGUE_DEFAULT},
+      {MatmulFusedType::kMatmulBias, CUBLASLT_EPILOGUE_BIAS},
+      {MatmulFusedType::kMatmulRelu, CUBLASLT_EPILOGUE_RELU},
+      {MatmulFusedType::kMatmulBiasRelu, CUBLASLT_EPILOGUE_RELU_BIAS},
+      {MatmulFusedType::kMatmulBiasGelu, CUBLASLT_EPILOGUE_GELU_BIAS},
+      {MatmulFusedType::kMatmulBiasReluWithReservedData,
+       CUBLASLT_EPILOGUE_RELU_AUX_BIAS},
+      {MatmulFusedType::kMatmulBiasGeluWithReservedData,
+       CUBLASLT_EPILOGUE_GELU_AUX_BIAS},
+      {MatmulFusedType::kMatmulReluGrad, CUBLASLT_EPILOGUE_DRELU},
+      {MatmulFusedType::kMatmulGeluGrad, CUBLASLT_EPILOGUE_DGELU},
+      {MatmulFusedType::kMatmulBiasGradToA, CUBLASLT_EPILOGUE_BGRADA},
+      {MatmulFusedType::kMatmulBiasGradToB, CUBLASLT_EPILOGUE_BGRADB}};
+
+  return fused_type_map[fused_type];
+}
+
 enum FusedGEMMGradInType { kDX = 0, kDY = 1, kDZ = 2 };
 
 template <bool TransX, bool TransY>
@@ -125,31 +149,31 @@ struct MatmulPlanner {
                 const bool trans_x,
                 const bool trans_y,
                 phi::DataType dtype,
-                MatmulFusedType impl_type,
+                MatmulFusedType fused_type,
                 const void* bias_data = nullptr,
                 void* reserve_data = nullptr,  // Commonly for ReLu bit-mask.
                 bool use_addto = false,
                 bool no_exchange = true)
-      : bias(bias_data), aux_data(reserve_data), impl_type_(impl_type) {
+      : bias(bias_data), aux_data(reserve_data), fused_type_(fused_type) {
     use_addto_ = use_addto;
     key_ = phi::autotune::GenKey(x_dims,
                                  y_dims,
                                  static_cast<int64_t>(trans_x),
                                  static_cast<int64_t>(trans_y),
                                  static_cast<int64_t>(dtype),
+                                 static_cast<int64_t>(fused_type_),
+                                 static_cast<int64_t>(use_addto_),
                                  static_cast<int64_t>(no_exchange));
   }
 
   bool UseAddTo() const { return use_addto_; }
   size_t GetKey() const { return key_; }
-  MatmulFusedType ImplType() const { return impl_type_; }
+  MatmulFusedType GetFusedType() const { return fused_type_; }
 
-  size_t GenSubKey(int idx) const {
-    return phi::autotune::GenKey(key_, static_cast<int64_t>(use_addto_), idx);
-  }
+  size_t GenSubKey() const { return key_; }
 
  private:
-  MatmulFusedType impl_type_;
+  MatmulFusedType fused_type_;
   bool use_addto_;
   size_t key_;
 };
@@ -265,23 +289,28 @@ struct MatmulDescriptor {
                                  bool has_algo = true) const {
     std::ostringstream out;
     out << prefix << " \n";
-#define GET_DESC_DATA_INFO(src)                      \
+#define GET_DESC_DATA_STRING(src)                    \
   do {                                               \
-    out << #src << "= [";                            \
+    out << " " << #src << " = [";                    \
     int num = sizeof((*src)) / sizeof(src->data[0]); \
     for (int i = 0; i < num; ++i) {                  \
-      out << src->data[i] << ", ";                   \
+      if (i == 0) {                                  \
+        out << src->data[i];                         \
+      } else {                                       \
+        out << ", " << src->data[i];                 \
+      }                                              \
     }                                                \
     out << "]\n";                                    \
   } while (0);
 
   if (has_algo) {
-    GET_DESC_DATA_INFO(&algo);
+    GET_DESC_DATA_STRING(algo);
   }
-  GET_DESC_DATA_INFO(x_desc);
-  GET_DESC_DATA_INFO(y_desc);
-  GET_DESC_DATA_INFO(out_desc);
-  GET_DESC_DATA_INFO(op_desc);
+  GET_DESC_DATA_STRING(x_desc);
+  GET_DESC_DATA_STRING(y_desc);
+  GET_DESC_DATA_STRING(out_desc);
+  GET_DESC_DATA_STRING(op_desc);
+#undef GET_DESC_DATA_STRING
   return out.str();
 }
@@ -304,12 +333,13 @@ struct MatmulDescriptor {
                               CUBLASLT_MATMUL_DESC_TRANSA,
                               &cublas_trans_y,
                               sizeof(cublas_trans_y)));
-    if (planner->ImplType() != kMatmul) {
-      auto fused_type = static_cast<cublasLtEpilogue_t>(planner->ImplType());
+    MatmulFusedType fused_type = planner->GetFusedType();
+    if (fused_type != MatmulFusedType::kMatmul) {
+      cublasLtEpilogue_t cublaslt_fused_type = ConvertFusedType(fused_type);
       PADDLE_ENFORCE_GPU_SUCCESS(
           dynload::cublasLtMatmulDescSetAttribute(op_desc,
                                                   CUBLASLT_MATMUL_DESC_EPILOGUE,
-                                                  &fused_type,
+                                                  &cublaslt_fused_type,
                                                   sizeof(fused_type)));
     }
     if (planner->aux_data) {
@@ -452,7 +482,7 @@ struct CublasLtBase {
       }
     }
 
-    VLOG(6) << desc->GetDescResultString("[Impl CublasltDescriptor] ");
+    VLOG(7) << desc->GetDescResultString("[Impl CublasltDescriptor] ");
     PADDLE_ENFORCE_GPU_SUCCESS(
         dynload::cublasLtMatmul(cublaslt_handle,
                                 desc->op_desc,
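// NOTE (illustration only, not part of the patch): MatmulPlanner above now
// folds every property that selects a distinct cuBLASLt plan (dims, transpose
// flags, dtype, fused epilogue, use_addto, exchange) into a single hash key,
// which is why GenSubKey() can simply return key_. A minimal hash-combine in
// the spirit of phi::autotune::GenKey; HashCombine and this GenKey are
// illustrative only, and the real implementation may differ:

#include <cstddef>
#include <cstdint>
#include <functional>
#include <iostream>
#include <vector>

inline void HashCombine(std::size_t* seed, std::size_t v) {
  *seed ^= v + 0x9e3779b9 + (*seed << 6) + (*seed >> 2);
}

template <typename... Args>
std::size_t GenKey(const std::vector<int64_t>& x_dims,
                   const std::vector<int64_t>& y_dims,
                   Args... args) {
  std::size_t seed = 0;
  for (auto d : x_dims) HashCombine(&seed, std::hash<int64_t>{}(d));
  for (auto d : y_dims) HashCombine(&seed, std::hash<int64_t>{}(d));
  // Fold every scalar property into the same seed (C++17 fold expression).
  (HashCombine(&seed, std::hash<int64_t>{}(static_cast<int64_t>(args))), ...);
  return seed;
}

int main() {
  // Two plans differing only in the fused-type field get distinct keys.
  auto k1 = GenKey({8, 64}, {64, 32}, /*trans_x=*/0, /*fused_type=*/0);
  auto k2 = GenKey({8, 64}, {64, 32}, /*trans_x=*/0, /*fused_type=*/3);
  std::cout << (k1 != k2) << "\n";  // almost certainly prints 1
  return 0;
}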
@@ -482,10 +512,6 @@ struct CublasLtBase {
                              void* out_data,
                              void* workspace_ptr,
                              size_t workspace_size) {
-    cublasLtMatmulAlgo_t* best_algo = desc->SetAlgo();
-    const auto& stream = ctx.stream();
-    int returned_results = 0;
-    constexpr int requested_algo_count = 10;
     cublasLtMatmulPreference_t preference;
     PADDLE_ENFORCE_GPU_SUCCESS(
         dynload::cublasLtMatmulPreferenceCreate(&preference));
@@ -494,6 +520,9 @@ struct CublasLtBase {
         CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
         &workspace_size,
         sizeof(workspace_size)));
+
+    int returned_results = 0;
+    constexpr int requested_algo_count = 10;
     std::vector<cublasLtMatmulHeuristicResult_t> heuristic_results(
         requested_algo_count);
     PADDLE_ENFORCE_GPU_SUCCESS(
@@ -510,52 +539,90 @@ struct CublasLtBase {
     PADDLE_ENFORCE_GT(returned_results,
                       0,
                       phi::errors::Unavailable("No GEMM algorithm avaliable."));
-    phi::GpuTimer timer;
     int best_algo_idx = -1;
-    constexpr int repeats = 6;
-    float min_time_cost = std::numeric_limits<float>::max();
-    for (int algo_idx = 0; algo_idx < returned_results; ++algo_idx) {
-      ctx.Wait();
-      float cur_time = 0.f;
-      for (int i = 0; i < repeats; ++i) {
-        timer.Start(stream);
-        PADDLE_ENFORCE_GPU_SUCCESS(
-            dynload::cublasLtMatmul(lt_handle,
-                                    desc->op_desc,
-                                    alpha,
-                                    y_data,
-                                    desc->y_desc,
-                                    x_data,
-                                    desc->x_desc,
-                                    beta,
-                                    out_data,
-                                    desc->out_desc,
-                                    out_data,
-                                    desc->out_desc,
-                                    &(heuristic_results[algo_idx].algo),
-                                    workspace_ptr,
-                                    workspace_size,
-                                    stream));
-        timer.Stop(stream);
-        auto time = timer.ElapsedTime();
-        if (i > 0) {
-          cur_time += time;
+    if (returned_results == 1 || FLAGS_cublaslt_exhaustive_search_times <= 0) {
+      best_algo_idx = 0;
+    } else {
+      float min_time_cost = std::numeric_limits<float>::max();
+      for (int algo_idx = 0; algo_idx < returned_results; ++algo_idx) {
+        float cur_time_cost =
+            RunAndMeasureAlgo(ctx,
+                              lt_handle,
+                              desc,
+                              alpha,
+                              beta,
+                              y_data,
+                              x_data,
+                              out_data,
+                              workspace_ptr,
+                              workspace_size,
+                              &(heuristic_results[algo_idx].algo));
+        VLOG(6) << "[MatmulWithCublaslt] algo[" << algo_idx
+                << "] time: " << cur_time_cost << " ms";
+
+        if ((best_algo_idx == 0 && (1.05 * cur_time_cost < min_time_cost)) ||
+            (cur_time_cost < min_time_cost)) {
+          best_algo_idx = algo_idx;
+          min_time_cost = cur_time_cost;
         }
       }
-      float time_cnt = (cur_time / (repeats - 1));
-      VLOG(6) << "Time cost in MatmulWithCublaslt algo[" << algo_idx << "]"
-              << "is : " << time_cnt << " s";
-
-      if (cur_time < min_time_cost) {
-        best_algo_idx = algo_idx;
-        min_time_cost = cur_time;
-      }
     }
-    VLOG(6) << "Best_algo_idx in MatmulWithCublaslt is : " << best_algo_idx;
+    VLOG(6) << "[MatmulWithCublaslt] best_algo_idx: " << best_algo_idx;
+
+    cublasLtMatmulAlgo_t* best_algo = desc->SetAlgo();
     *best_algo = heuristic_results[best_algo_idx].algo;
     PADDLE_ENFORCE_GPU_SUCCESS(
         dynload::cublasLtMatmulPreferenceDestroy(preference));
   }
+
+  static float RunAndMeasureAlgo(const phi::GPUContext& ctx,
+                                 const cublasLtHandle_t& lt_handle,
+                                 MatmulDescT* desc,
+                                 const void* alpha,
+                                 const void* beta,
+                                 const void* y_data,
+                                 const void* x_data,
+                                 void* out_data,
+                                 void* workspace_ptr,
+                                 size_t workspace_size,
+                                 cublasLtMatmulAlgo_t* algo) {
+    int repeats = FLAGS_cublaslt_exhaustive_search_times;
+    if (repeats <= 0) {
+      return std::numeric_limits<float>::max();
+    }
+
+    phi::GpuTimer timer;
+    float time_cost = 0.f;
+    const auto& stream = ctx.stream();
+
+    for (int i = 0; i < repeats; ++i) {
+      timer.Start(stream);
+      PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmul(lt_handle,
+                                                         desc->op_desc,
+                                                         alpha,
+                                                         y_data,
+                                                         desc->y_desc,
+                                                         x_data,
+                                                         desc->x_desc,
+                                                         beta,
+                                                         out_data,
+                                                         desc->out_desc,
+                                                         out_data,
+                                                         desc->out_desc,
+                                                         algo,
+                                                         workspace_ptr,
+                                                         workspace_size,
+                                                         stream));
+      timer.Stop(stream);
+      ctx.Wait();
+      auto time = timer.ElapsedTime();
+      if (i > 0) {
+        // Exclude the warmup runtime.
+        time_cost += time;
+      }
+    }
+    return (time_cost / (repeats - 1));
+  }
 };
 
 // To judge if desc is cached or not.
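// NOTE (illustration only, not part of the patch): with
// FLAGS_cublaslt_exhaustive_search_times <= 0 (or a single heuristic result)
// the hunk above takes the first candidate as-is; otherwise every candidate
// is timed via RunAndMeasureAlgo and the fastest wins. A host-side analogue
// of just the selection logic; SelectBestAlgo and measure_ms are hypothetical
// stand-ins, and the patch's extra 1.05x tolerance term is dropped here for
// clarity since a plain minimum gives the same winner:

#include <functional>
#include <iostream>
#include <limits>
#include <vector>

int SelectBestAlgo(int returned_results,
                   int exhaustive_search_times,
                   const std::function<float(int)>& measure_ms) {
  if (returned_results == 1 || exhaustive_search_times <= 0) {
    return 0;  // trust the heuristic order, skip measurement entirely
  }
  int best_algo_idx = -1;
  float min_time_cost = std::numeric_limits<float>::max();
  for (int algo_idx = 0; algo_idx < returned_results; ++algo_idx) {
    float cur = measure_ms(algo_idx);
    if (cur < min_time_cost) {
      best_algo_idx = algo_idx;
      min_time_cost = cur;
    }
  }
  return best_algo_idx;
}

int main() {
  std::vector<float> fake_times_ms = {0.9f, 0.4f, 0.7f};
  int best = SelectBestAlgo(3, /*exhaustive_search_times=*/5,
                            [&](int i) { return fake_times_ms[i]; });
  std::cout << "best algo: " << best << "\n";  // prints "best algo: 1"
  return 0;
}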
@@ -583,14 +650,14 @@ struct DescriptorSetter {
                    const bool no_exchange = true,
                    bool grad_for_dx = true) {
     if (planner != nullptr) {
-      sub_key = planner->GenSubKey(static_cast<int>(planner->ImplType()));
+      sub_key = planner->GenSubKey();
     }
 
     auto& mamtul_cache = phi::autotune::AutoTuneCache::Instance().GetMatmul();
     if (mamtul_cache.FindSubKey(sub_key)) {
       desc = *(reinterpret_cast<MatmulDescT*>(mamtul_cache.GetSubKey(sub_key)));
       desc.template SetFusedEpiloguePtr<DYT>(planner);
-      VLOG(6) << desc.GetDescResultString("[Heap CublasltDescriptor] ");
+      VLOG(7) << desc.GetDescResultString("[Heap CublasltDescriptor] ");
     } else {
       desc.template Create<T, DXT, DYT>(M,
                                         N,
@@ -607,7 +674,7 @@ struct DescriptorSetter {
       if (planner != nullptr) {
         desc.template SetFusedEpiloguePtr<DYT>(planner);
       }
-      VLOG(6) << desc.GetDescResultString("[Stack CublasltDescriptor] ", false);
+      VLOG(7) << desc.GetDescResultString("[Stack CublasltDescriptor] ", false);
     }
   }
 };
diff --git a/paddle/phi/kernels/funcs/fused_gemm_epilogue.h b/paddle/phi/kernels/funcs/fused_gemm_epilogue.h
index ab0758e2e3ff4792dac3c2c46de7fb0b7b21bcd0..6f4eb46bf4eb740e1cf8b671a9cbdbcc9cdab672 100644
--- a/paddle/phi/kernels/funcs/fused_gemm_epilogue.h
+++ b/paddle/phi/kernels/funcs/fused_gemm_epilogue.h
@@ -945,75 +945,37 @@ void ComputeFusedGemmEpilogueBackward(const phi::GPUContext& dev_ctx,
           << ", trans_y=" << trans_y
           << ", activation_grad=" << activation_grad;
 
+#define CALL_FUSED_GRAD_IMPL(TransX, TransY)               \
+  ComputeFusedGemmEpilogueBackwardImpl<T, TransX, TransY>( \
+      dev_ctx,                                             \
+      dout,                                                \
+      x,                                                   \
+      y,                                                   \
+      reserve_space,                                       \
+      M,                                                   \
+      N,                                                   \
+      K,                                                   \
+      activation_grad,                                     \
+      dx,                                                  \
+      dy,                                                  \
+      dbias,                                               \
+      use_addto_dx,                                        \
+      use_addto_dy)
+
   if (trans_x) {
     if (trans_y) {
-      ComputeFusedGemmEpilogueBackwardImpl<T, true, true>(
-          dev_ctx,
-          dout,
-          x,
-          y,
-          reserve_space,
-          M,
-          N,
-          K,
-          activation_grad,
-          dx,
-          dy,
-          dbias,
-          use_addto_dx,
-          use_addto_dy);
+      CALL_FUSED_GRAD_IMPL(true, true);
     } else {
-      ComputeFusedGemmEpilogueBackwardImpl<T, true, false>(
-          dev_ctx,
-          dout,
-          x,
-          y,
-          reserve_space,
-          M,
-          N,
-          K,
-          activation_grad,
-          dx,
-          dy,
-          dbias,
-          use_addto_dx,
-          use_addto_dy);
+      CALL_FUSED_GRAD_IMPL(true, false);
     }
   } else {
     if (trans_y) {
-      ComputeFusedGemmEpilogueBackwardImpl<T, false, true>(
-          dev_ctx,
-          dout,
-          x,
-          y,
-          reserve_space,
-          M,
-          N,
-          K,
-          activation_grad,
-          dx,
-          dy,
-          dbias,
-          use_addto_dx,
-          use_addto_dy);
+      CALL_FUSED_GRAD_IMPL(false, true);
     } else {
-      ComputeFusedGemmEpilogueBackwardImpl<T, false, false>(
-          dev_ctx,
-          dout,
-          x,
-          y,
-          reserve_space,
-          M,
-          N,
-          K,
-          activation_grad,
-          dx,
-          dy,
-          dbias,
-          use_addto_dx,
-          use_addto_dy);
+      CALL_FUSED_GRAD_IMPL(false, false);
     }
   }
+#undef CALL_FUSED_GRAD_IMPL
 }
 
 }  // namespace funcs
diff --git a/paddle/phi/kernels/impl/matmul_kernel_impl.h b/paddle/phi/kernels/impl/matmul_kernel_impl.h
index 14f50786d0c86405152e48b378ec3f28477cbaba..786d97e38b96ec472adc970a6b7bd99ab217ddbd 100644
--- a/paddle/phi/kernels/impl/matmul_kernel_impl.h
+++ b/paddle/phi/kernels/impl/matmul_kernel_impl.h
@@ -925,7 +925,11 @@ struct MatMulDispatcher {
                                        trans_x,
                                        trans_y,
                                        phi::CppTypeToDataType<T>::Type(),
-                                       funcs::MatmulFusedType::kMatmul);
+                                       funcs::MatmulFusedType::kMatmul,
+                                       /* bias_data */ nullptr,
+                                       /* reserve_data */ nullptr,
+                                       /* use_addto */ flag,
+                                       /* no_exchange */ true);
     tuner->Run(ctx,
                matmul_planner.GetKey(),
                ctx,
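// NOTE (illustration only, not part of the patch): CALL_FUSED_GRAD_IMPL in
// the fused_gemm_epilogue.h hunk above collapses four nearly identical call
// sites into one macro whose literal true/false arguments become compile-time
// template parameters. A standalone sketch of that dispatch pattern;
// BackwardImpl, CALL_IMPL, and Dispatch are hypothetical names:

#include <iostream>

template <bool TransX, bool TransY>
void BackwardImpl(int m, int n) {
  std::cout << "TransX=" << TransX << " TransY=" << TransY << " m=" << m
            << " n=" << n << "\n";
}

// The macro arguments are pasted into the template argument list, turning the
// runtime branch below into four distinct instantiations.
#define CALL_IMPL(TransX, TransY) BackwardImpl<TransX, TransY>(m, n)

void Dispatch(bool trans_x, bool trans_y, int m, int n) {
  if (trans_x) {
    if (trans_y) {
      CALL_IMPL(true, true);
    } else {
      CALL_IMPL(true, false);
    }
  } else {
    if (trans_y) {
      CALL_IMPL(false, true);
    } else {
      CALL_IMPL(false, false);
    }
  }
}
#undef CALL_IMPL

int main() {
  Dispatch(true, false, 8, 4);  // prints "TransX=1 TransY=0 m=8 n=4"
  return 0;
}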