From 35acfeda36caada80464051043a3f86ae2b76779 Mon Sep 17 00:00:00 2001 From: limingshu <61349199+JamesLim-sy@users.noreply.github.com> Date: Fri, 15 Apr 2022 10:57:42 +0800 Subject: [PATCH] Change cuDNN Conv kernel for auto tune feature (#41313) * change cudnn helper for auto-tune * Add FLAGS_use_autotune to set the global status of autotune and change the order of choosing algorithm. * Fix the bug in calculating and printing current step cache hit rate. * Improve the autotune cache and fix unittest. * Change the key from AlgorithmType to int64_t. * Fix unittest for cpu-only env. * change ChooseAlgoByWorkspace for heuristic mode Co-authored-by: Liu Yiqun --- paddle/fluid/eager/CMakeLists.txt | 2 +- paddle/fluid/imperative/CMakeLists.txt | 4 +- paddle/fluid/operators/conv_base_helper.h | 32 +- paddle/fluid/operators/conv_cudnn_helper.h | 727 ++++++++++-------- paddle/fluid/platform/flags.cc | 9 + paddle/fluid/pybind/pybind.cc | 6 +- paddle/phi/kernels/CMakeLists.txt | 14 +- paddle/phi/kernels/autotune/CMakeLists.txt | 9 +- paddle/phi/kernels/autotune/cache.cc | 37 + paddle/phi/kernels/autotune/cache.h | 96 ++- paddle/phi/kernels/autotune/cache_test.cc | 2 +- .../phi/kernels/autotune/switch_autotune.cc | 74 ++ paddle/phi/kernels/autotune/switch_autotune.h | 94 +-- .../tests/unittests/test_switch_autotune.py | 127 ++- 14 files changed, 731 insertions(+), 502 deletions(-) create mode 100644 paddle/phi/kernels/autotune/switch_autotune.cc diff --git a/paddle/fluid/eager/CMakeLists.txt b/paddle/fluid/eager/CMakeLists.txt index da326ff7d7..53ac895bfb 100644 --- a/paddle/fluid/eager/CMakeLists.txt +++ b/paddle/fluid/eager/CMakeLists.txt @@ -15,7 +15,7 @@ if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) add_subdirectory(pylayer) cc_library(grad_tensor_holder SRCS grad_tensor_holder.cc DEPS grad_node_info gradient_accumulator) add_dependencies(grad_tensor_holder eager_final_state_codegen) - cc_library(backward SRCS backward.cc DEPS grad_tensor_holder utils autograd_meta grad_node_info) + cc_library(backward SRCS backward.cc DEPS grad_tensor_holder utils autograd_meta grad_node_info switch_autotune) endif() cc_library(grad_node_info SRCS grad_node_info.cc DEPS phi_api phi_tensor) diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 3d8a5ab21f..69cd45222c 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -9,8 +9,8 @@ cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_f add_subdirectory(jit) cc_library(amp SRCS amp_auto_cast.cc DEPS layer var_helper) cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp denormal garbage_collector var_helper) -cc_library(basic_engine SRCS basic_engine.cc DEPS layer gradient_accumulator) -cc_library(engine SRCS basic_engine.cc partial_grad_engine.cc DEPS layer gradient_accumulator) +cc_library(basic_engine SRCS basic_engine.cc DEPS layer gradient_accumulator switch_autotune) +cc_library(engine SRCS basic_engine.cc partial_grad_engine.cc DEPS layer gradient_accumulator switch_autotune) cc_library(imperative_profiler SRCS profiler.cc DEPS flags) if(NOT WIN32) if(WITH_NCCL OR WITH_RCCL) diff --git a/paddle/fluid/operators/conv_base_helper.h b/paddle/fluid/operators/conv_base_helper.h index c664d1935f..9e1a323fc9 100644 --- a/paddle/fluid/operators/conv_base_helper.h +++ b/paddle/fluid/operators/conv_base_helper.h @@ -22,6 +22,7 @@ limitations under the License. */ #include "paddle/fluid/framework/conv_search_cache.h" #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/kernels/autotune/cache.h" namespace paddle { namespace operators { @@ -41,12 +42,22 @@ struct SearchAlgorithm {}; // As the container of searchAlgorithm::Find() result. template struct SearchResult { - public: + SearchResult() {} + explicit SearchResult(AlgoT a) : algo(a) {} + AlgoT algo = static_cast(0); float time = -1.f; size_t workspace_size = 0; }; +template +static std::ostream& operator<<(std::ostream& out, const std::vector& v) { + out << "["; + for (auto const& tmp : v) out << tmp << ","; + out << "]"; + return out; +} + // As the container of conv relevant descriptors. template struct ConvArgsBase { @@ -68,6 +79,17 @@ struct ConvArgsBase { const framework::Tensor* o, const std::vector s, const std::vector p, const std::vector d, DataT dtype) : x(x), w(w), o(o), s(s), p(p), d(d), cudnn_dtype(dtype) {} + + template + size_t GetCacheKey() const { + auto x_shape = phi::vectorize(x->dims()); + auto w_shape = phi::vectorize(w->dims()); + VLOG(10) << "[ConvArgs] x_dims=" << x_shape << ", w_dims=" << w_shape + << ", strides=" << s << ", paddings=" << p << ", dilations=" << d; + return phi::autotune::ConvKey( + x_shape, w_shape, p, s, d, + paddle::experimental::CppTypeToDataType::Type()); + } }; static inline void GetNCDHW(const framework::DDim& dims, @@ -87,13 +109,5 @@ static inline void GetNCDHW(const framework::DDim& dims, } } -template -static std::ostream& operator<<(std::ostream& out, const std::vector& v) { - out << "["; - for (auto const& tmp : v) out << tmp << ","; - out << "]"; - return out; -} - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index 1311f812be..419fb8a4ca 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -17,6 +17,8 @@ limitations under the License. */ #include "paddle/fluid/operators/conv_base_helper.h" #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/profiler.h" +#include "paddle/phi/kernels/autotune/switch_autotune.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" namespace paddle { @@ -67,20 +69,16 @@ static inline bool UseFixedWorkspace() { return FLAGS_conv_workspace_size_limit >= 0; } -static size_t CaclWorkspaceLimitInBytes(const phi::GPUContext& ctx) { - if (!UseFixedWorkspace()) { +static size_t CalcWorkspaceLimitInBytes(bool use_fixed_workspace) { + if (!use_fixed_workspace) { int device_id = platform::GetCurrentDeviceId(); int64_t allocated = memory::StatGetCurrentValue("Allocated", device_id); int64_t reserved = memory::StatGetCurrentValue("Reserved", device_id); int64_t availble = platform::GpuAvailableMemToAlloc(); - int64_t cur_workspace_size = ctx.cudnn_workspace_handle().WorkspaceSize(); VLOG(3) << "[memory] allocated=" << ToMegaBytes(allocated) << " MB, reserved=" << ToMegaBytes(reserved) - << " MB, available_to_alloc=" << ToMegaBytes(availble) - << " MB, current_workspace_size=" << ToMegaBytes(cur_workspace_size) - << " MB."; - return std::max(std::max(availble, cur_workspace_size), - reserved - allocated); + << " MB, available_to_alloc=" << ToMegaBytes(availble) << " MB."; + return std::max(availble, reserved - allocated); } else { return FLAGS_conv_workspace_size_limit * 1024 * 1024; } @@ -104,26 +102,44 @@ std::string GetPerfResultString(std::string prefix, return out.str(); } +// Choose an algorithm which has the minimize time cost and less memory. +// NOTE: perf_results is ordered by time. template void ChooseAlgoByWorkspace(const std::vector& perf_results, size_t workspace_limit, - SearchResult* algo_result) { + SearchResult* search_result) { + int best_algo_idx = -1; for (size_t i = 0; i < perf_results.size(); ++i) { auto result = perf_results[i]; if (result.status == CUDNN_STATUS_SUCCESS && result.memory < workspace_limit) { - algo_result->algo = result.algo; - algo_result->time = result.time; - algo_result->workspace_size = result.memory; - VLOG(3) << " algo=" << result.algo << ", time=" << result.time - << " ms, memory=" << ToMegaBytes(result.memory) - << " MB (limit=" << ToMegaBytes(workspace_limit) - << " MB), status=" << result.status; - return; + if (best_algo_idx == -1) { + // The algorithm which has minimize time cost and need a workspace_size + // fitting the workspace_limit constraint. + best_algo_idx = i; + // Each perf_results[i].time is set to be -1 in heuristic search. + if (perf_results[best_algo_idx].time < 0) { + break; + } + } else { + float best_algo_time = perf_results[best_algo_idx].time; + if ((result.time - best_algo_time) / best_algo_time < 0.01) { + best_algo_idx = (result.memory < perf_results[best_algo_idx].memory) + ? i + : best_algo_idx; + break; + } + } } } - VLOG(3) << "Can not find an algorithm that requires memory < " - << ToMegaBytes(workspace_limit) << " MB"; + if (best_algo_idx != -1) { + search_result->algo = perf_results[best_algo_idx].algo; + search_result->time = perf_results[best_algo_idx].time; + search_result->workspace_size = perf_results[best_algo_idx].memory; + } else { + VLOG(3) << "Can not find an algorithm that requires memory < " + << ToMegaBytes(workspace_limit) << " MB"; + } } static void SetConvMathType(const phi::GPUContext& ctx, cudnnDataType_t dtype, @@ -151,6 +167,10 @@ static void SetConvMathType(const phi::GPUContext& ctx, cudnnDataType_t dtype, #endif } +// cuDNN convolution forward algorithm searcher, consisted of three searching +// modes, namely: deterministic, heuristic and exhaustive_search mode. +// As well as one workspace size acquirsition function with respect to +// the chosen alogrithm. template <> struct SearchAlgorithm { using PerfT = cudnnConvolutionFwdAlgoPerf_t; @@ -162,90 +182,30 @@ struct SearchAlgorithm { const phi::GPUContext& ctx) { SearchResult result; auto dtype = platform::CudnnDataType::type; - size_t workspace_size_limit = CaclWorkspaceLimitInBytes(ctx); SetConvMathType(ctx, dtype, args.cdesc); - if (!exhaustive_search && !deterministic) { -#if CUDNN_VERSION >= 7001 - int actual_perf_count; - int best_algo_idx = 0; - std::vector perf_results(kNUM_CUDNN_FWD_ALGS); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7( - args.handle, args.idesc.desc(), args.wdesc.desc(), - args.cdesc.desc(), args.odesc.desc(), kNUM_CUDNN_FWD_ALGS, - &actual_perf_count, perf_results.data())); - result.algo = perf_results[best_algo_idx].algo; - result.workspace_size = perf_results[best_algo_idx].memory; - - if (result.workspace_size > workspace_size_limit) { -#if CUDNN_VERSION >= 8000 - // cudnnGetConvolutionForwardAlgorithm is removed in CUDNN-8 - ChooseAlgoByWorkspace(perf_results, workspace_size_limit, - &result); -#else - VLOG(3) << "Fallback to non-v7 method to find conv algorithm " - "becasue the workspace size request(" - << result.workspace_size << ") exceeds the limit(" - << workspace_size_limit << ")"; - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetConvolutionForwardAlgorithm( - args.handle, args.idesc.desc(), args.wdesc.desc(), - args.cdesc.desc(), args.odesc.desc(), - CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &(result.algo))); -#endif - } -#else - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetConvolutionForwardAlgorithm( - args.handle, args.idesc.desc(), args.wdesc.desc(), - args.cdesc.desc(), args.odesc.desc(), - CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &(result.algo))); -#endif - } else if (deterministic) { - result.algo = static_cast(1); + if (deterministic) { + result = FindAlgoDeterministic(); } else { - auto workspace_handle = ctx.cudnn_workspace_handle(); - auto x_dims = phi::vectorize(args.x->dims()); - auto w_dims = phi::vectorize(args.w->dims()); - VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t:" - << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s" - << args.s << ", args.p" << args.p << ", args.d" << args.d; - - AlgorithmsCache& algo_cache = - *(framework::ConvSearchCache::Instance().GetForward()); - - result.algo = algo_cache.GetAlgorithm( - x_dims, w_dims, args.s, args.p, args.d, 0, - static_cast(args.cudnn_dtype), [&]() { - int returned_algo_count; - std::vector perf_results(kNUM_CUDNN_FWD_ALGS); - size_t max_workspace_size = - FindMaxWorkspaceSize(args, workspace_size_limit); - VLOG(4) << "max_workspace_size=" << ToMegaBytes(max_workspace_size) - << " MB"; - - auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( - args.handle, args.idesc.desc(), args.x->data(), - args.wdesc.desc(), args.w->data(), args.cdesc.desc(), - args.odesc.desc(), const_cast(args.o->data()), - kNUM_CUDNN_FWD_ALGS, &returned_algo_count, - perf_results.data(), cudnn_workspace_ptr, - max_workspace_size)); - }; - workspace_handle.RunFuncSync(cudnn_find_func, max_workspace_size, - UseFixedWorkspace()); - - VLOG(4) << GetPerfResultString( - "[Exhaustive Search] FwdAlgo Perf result", perf_results, - returned_algo_count, workspace_size_limit); - result.time = perf_results[0].time; - return perf_results[0].algo; - }); + // 1. Once turning on exhaustive FLAGS, always get exhaustive_search. + // 2. Once turning on auto-tune, runn heuristic search(default) before + // auto-tune process, run exhaustive_search during mentioned process. + // 3. After auto-tune process, run cached algorithm if cached, run + // default mode for the rest. + size_t key = args.GetCacheKey(); + auto& cache = phi::autotune::AutoTuneCache::Instance().GetConvForward(); + if (cache.Find(key)) { + result.algo = static_cast(cache.Get(key)); + } else { + bool use_autotune = + phi::autotune::AutoTuneStatus::Instance().UseAutoTune(); + if (exhaustive_search || use_autotune) { + result = FindAlgoExhaustiveSearch(args, ctx); + cache.Set(key, static_cast(result.algo)); + } else { + result = FindAlgoHeuristic(args, ctx); + } + } } VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search << ", deterministic=" << deterministic @@ -265,8 +225,95 @@ struct SearchAlgorithm { } private: - static size_t FindMaxWorkspaceSize(const ConvArgs& args, - size_t workspace_size_limit) { + static SearchResult FindAlgoDeterministic() { + return SearchResult(static_cast(1)); + } + + // Heuristic search mode, calling the cudnnGetXxxAlgorithm. + static SearchResult FindAlgoHeuristic(const ConvArgs& args, + const phi::GPUContext& ctx) { + SearchResult result; + size_t workspace_size_limit = + CalcWorkspaceLimitInBytes(UseFixedWorkspace()); + +#if CUDNN_VERSION >= 7001 + int actual_perf_count; + int best_algo_idx = 0; + std::vector perf_results(kNUM_CUDNN_FWD_ALGS); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7( + args.handle, args.idesc.desc(), args.wdesc.desc(), + args.cdesc.desc(), args.odesc.desc(), kNUM_CUDNN_FWD_ALGS, + &actual_perf_count, perf_results.data())); + result.algo = perf_results[best_algo_idx].algo; + result.workspace_size = perf_results[best_algo_idx].memory; + + if (result.workspace_size > workspace_size_limit) { +#if CUDNN_VERSION >= 8000 + // cudnnGetConvolutionForwardAlgorithm is removed in CUDNN-8 + ChooseAlgoByWorkspace(perf_results, workspace_size_limit, + &result); +#else + VLOG(3) << "Fallback to non-v7 method to find conv algorithm " + "becasue the workspace size request(" + << result.workspace_size << ") exceeds the limit(" + << workspace_size_limit << ")"; + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cudnnGetConvolutionForwardAlgorithm( + args.handle, args.idesc.desc(), args.wdesc.desc(), + args.cdesc.desc(), args.odesc.desc(), + CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &(result.algo))); +#endif + } +#else + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cudnnGetConvolutionForwardAlgorithm( + args.handle, args.idesc.desc(), args.wdesc.desc(), + args.cdesc.desc(), args.odesc.desc(), + CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, workspace_size_limit, + &(result.algo))); +#endif + return result; + } + + template + static SearchResult FindAlgoExhaustiveSearch( + const ConvArgs& args, const phi::GPUContext& ctx) { + SearchResult result; + size_t workspace_size_limit = + CalcWorkspaceLimitInBytes(UseFixedWorkspace()); + size_t max_workspace_size = GetMaxWorkspaceSize(args, workspace_size_limit); + VLOG(4) << "max_workspace_size=" << ToMegaBytes(max_workspace_size) + << " MB"; + + int returned_algo_count; + std::vector perf_results(kNUM_CUDNN_FWD_ALGS); + auto cudnn_find_func = [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( + args.handle, args.idesc.desc(), args.x->data(), + args.wdesc.desc(), args.w->data(), args.cdesc.desc(), + args.odesc.desc(), const_cast(args.o->data()), + kNUM_CUDNN_FWD_ALGS, &returned_algo_count, perf_results.data(), + workspace_ptr, max_workspace_size)); + }; + + auto workspace_handle = ctx.cudnn_workspace_handle(); + workspace_handle.RunFuncSync(cudnn_find_func, max_workspace_size, + UseFixedWorkspace()); + + VLOG(4) << GetPerfResultString( + "[Exhaustive Search] FwdAlgo Perf result", perf_results, + returned_algo_count, workspace_size_limit); + ChooseAlgoByWorkspace(perf_results, workspace_size_limit, + &result); + + return result; + } + + static size_t GetMaxWorkspaceSize(const ConvArgs& args, + size_t workspace_size_limit) { if (!UseFixedWorkspace()) { size_t max_workspace_size = 0; for (size_t algo = 0; algo < kNUM_CUDNN_FWD_ALGS; ++algo) { @@ -288,6 +335,12 @@ struct SearchAlgorithm { } }; +// cuDNN convolution backward data-algorithm searcher, consisting of three +// searching modes, namely: deterministic, heuristic, and exhaustive_search +// mode. Specially, there are 2 pattens of exhaustive search mode, one for +// HALF precision only, one for the rest. +// As well as one workspace size acquirsition function with +// respect to the chosen alogrithm. template <> struct SearchAlgorithm { using PerfT = cudnnConvolutionBwdDataAlgoPerf_t; @@ -299,102 +352,31 @@ struct SearchAlgorithm { const phi::GPUContext& ctx) { SearchResult result; auto dtype = platform::CudnnDataType::type; - size_t workspace_size_limit = CaclWorkspaceLimitInBytes(ctx); SetConvMathType(ctx, dtype, args.cdesc); - if (!exhaustive_search && !deterministic) { -#if CUDNN_VERSION >= 7001 - int actual_perf_count; - int best_algo_idx = 0; - std::vector perf_results(kNUM_CUDNN_BWD_DATA_ALGS); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm_v7( - args.handle, args.wdesc.desc(), args.odesc.desc(), - args.cdesc.desc(), args.idesc.desc(), kNUM_CUDNN_BWD_DATA_ALGS, - &actual_perf_count, perf_results.data())); - result.algo = perf_results[best_algo_idx].algo; - -#if CUDNN_VERSION < 7500 - int stride_dim = args.x->dims().size() - 2; - bool blacklist = std::any_of(args.s.begin(), args.s.begin() + stride_dim, - [=](int n) { return n != 1; }); - if (blacklist && (perf_results[best_algo_idx].algo == - CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING || - perf_results[best_algo_idx].algo == - CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)) { - result.algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; - } -#endif - result.workspace_size = GetWorkspaceSize(args, result.algo); - if (result.workspace_size > workspace_size_limit) { -#if CUDNN_VERSION >= 8000 - // cudnnGetConvolutionBackwardDataAlgorithm is removed in CUDNN-8 - ChooseAlgoByWorkspace(perf_results, workspace_size_limit, - &result); -#else - VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue " - "the workspace size request(" - << result.workspace_size << ") exceeds the limit(" - << workspace_size_limit << ")"; - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( - args.handle, args.wdesc.desc(), args.odesc.desc(), - args.cdesc.desc(), args.idesc.desc(), - CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &(result.algo))); -#endif - } -#else - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( - args.handle, args.wdesc.desc(), args.odesc.desc(), - args.cdesc.desc(), args.idesc.desc(), - CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &(result.algo))); -#endif - } else if (deterministic) { - result.algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; + if (deterministic) { + result = FindAlgoDeterministic(); } else { - auto workspace_handle = ctx.cudnn_workspace_handle(); - auto x_dims = phi::vectorize(args.x->dims()); - auto w_dims = phi::vectorize(args.w->dims()); - VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t" - << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s" - << args.s << ", args.p" << args.p << ", args.d" << args.d; - - AlgorithmsCache& algo_cache = - *(framework::ConvSearchCache::Instance().GetBackwardData()); - result.algo = algo_cache.GetAlgorithm( - x_dims, w_dims, args.s, args.p, args.d, 0, - static_cast(args.cudnn_dtype), [&]() { - int returned_algo_count; - std::vector perf_results(kNUM_CUDNN_BWD_DATA_ALGS); - size_t max_workspace_size = - FindMaxWorkspaceSize(args, workspace_size_limit); - VLOG(3) << "max_workspace_size=" << ToMegaBytes(max_workspace_size) - << " MB"; - - auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload:: - cudnnFindConvolutionBackwardDataAlgorithmEx( - args.handle, args.wdesc.desc(), args.w->data(), - args.odesc.desc(), args.o->data(), - args.cdesc.desc(), args.idesc.desc(), - const_cast(args.x->data()), - kNUM_CUDNN_BWD_DATA_ALGS, &returned_algo_count, - perf_results.data(), cudnn_workspace_ptr, - max_workspace_size)); - }; - workspace_handle.RunFuncSync(cudnn_find_func, max_workspace_size, - UseFixedWorkspace()); - - VLOG(3) << GetPerfResultString( - "[Exhaustive Search] BwdDataAlgo Perf result", perf_results, - returned_algo_count, workspace_size_limit); - result.time = perf_results[0].time; - return perf_results[0].algo; - }); + // 1. Once turning on exhaustive FLAGS, always get exhaustive_search. + // 2. Once turning on auto-tune, runn heuristic search(default) before + // auto-tune process, run exhaustive_search during mentioned process. + // 3. After auto-tune process, run cached algorithm if cached, run + // default mode for the rest. + size_t key = args.GetCacheKey(); + auto& cache = + phi::autotune::AutoTuneCache::Instance().GetConvBackwardData(); + if (cache.Find(key)) { + result.algo = static_cast(cache.Get(key)); + } else { + bool use_autotune = + phi::autotune::AutoTuneStatus::Instance().UseAutoTune(); + if (exhaustive_search || use_autotune) { + result = FindAlgoExhaustiveSearch(args, ctx); + cache.Set(key, static_cast(result.algo)); + } else { + result = FindAlgoHeuristic(args, ctx); + } + } } VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search << ", deterministic=" << deterministic @@ -414,8 +396,106 @@ struct SearchAlgorithm { } private: - static size_t FindMaxWorkspaceSize(const ConvArgs& args, - size_t workspace_size_limit) { + static SearchResult FindAlgoDeterministic() { + return SearchResult(CUDNN_CONVOLUTION_BWD_DATA_ALGO_1); + } + + static SearchResult FindAlgoHeuristic(const ConvArgs& args, + const phi::GPUContext& ctx) { + SearchResult result; + size_t workspace_size_limit = + CalcWorkspaceLimitInBytes(UseFixedWorkspace()); + +#if CUDNN_VERSION >= 7001 + int actual_perf_count; + int best_algo_idx = 0; + std::vector perf_results(kNUM_CUDNN_BWD_DATA_ALGS); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm_v7( + args.handle, args.wdesc.desc(), args.odesc.desc(), + args.cdesc.desc(), args.idesc.desc(), kNUM_CUDNN_BWD_DATA_ALGS, + &actual_perf_count, perf_results.data())); + result.algo = perf_results[best_algo_idx].algo; + +#if CUDNN_VERSION < 7500 + int stride_dim = args.x->dims().size() - 2; + bool blacklist = std::any_of(args.s.begin(), args.s.begin() + stride_dim, + [=](int n) { return n != 1; }); + if (blacklist && (perf_results[best_algo_idx].algo == + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING || + perf_results[best_algo_idx].algo == + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)) { + result.algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; + } +#endif + result.workspace_size = GetWorkspaceSize(args, result.algo); + if (result.workspace_size > workspace_size_limit) { +#if CUDNN_VERSION >= 8000 + // cudnnGetConvolutionBackwardDataAlgorithm is removed in CUDNN-8 + ChooseAlgoByWorkspace(perf_results, workspace_size_limit, + &result); +#else + VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue " + "the workspace size request(" + << result.workspace_size << ") exceeds the limit(" + << workspace_size_limit << ")"; + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( + args.handle, args.wdesc.desc(), args.odesc.desc(), + args.cdesc.desc(), args.idesc.desc(), + CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &(result.algo))); +#endif + } +#else + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( + args.handle, args.wdesc.desc(), args.odesc.desc(), + args.cdesc.desc(), args.idesc.desc(), + CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &(result.algo))); +#endif + + return result; + } + + template + static SearchResult FindAlgoExhaustiveSearch( + const ConvArgs& args, const phi::GPUContext& ctx) { + SearchResult result; + size_t workspace_size_limit = + CalcWorkspaceLimitInBytes(UseFixedWorkspace()); + size_t max_workspace_size = GetMaxWorkspaceSize(args, workspace_size_limit); + VLOG(3) << "max_workspace_size=" << ToMegaBytes(max_workspace_size) + << " MB"; + + int returned_algo_count; + std::vector perf_results(kNUM_CUDNN_BWD_DATA_ALGS); + auto cudnn_find_func = [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cudnnFindConvolutionBackwardDataAlgorithmEx( + args.handle, args.wdesc.desc(), args.w->data(), + args.odesc.desc(), args.o->data(), args.cdesc.desc(), + args.idesc.desc(), const_cast(args.x->data()), + kNUM_CUDNN_BWD_DATA_ALGS, &returned_algo_count, + perf_results.data(), workspace_ptr, max_workspace_size)); + }; + + auto workspace_handle = ctx.cudnn_workspace_handle(); + workspace_handle.RunFuncSync(cudnn_find_func, max_workspace_size, + UseFixedWorkspace()); + + VLOG(4) << GetPerfResultString( + "[Exhaustive Search] BwdDataAlgo Perf result", perf_results, + returned_algo_count, workspace_size_limit); + ChooseAlgoByWorkspace(perf_results, workspace_size_limit, + &result); + + return result; + } + + static size_t GetMaxWorkspaceSize(const ConvArgs& args, + size_t workspace_size_limit) { if (!UseFixedWorkspace()) { size_t max_workspace_size = 0; for (size_t algo = 0; algo < kNUM_CUDNN_BWD_DATA_ALGS; ++algo) { @@ -438,6 +518,10 @@ struct SearchAlgorithm { } }; +// cuDNN convution backward filter-algorithm searcher, consisted of three +// algorithm searching modes, namely: deterministic, heuristic, and +// exhaustive_search mode. As well as one workspace size acquirsition function +// with respect to the chosen alogrithm. template <> struct SearchAlgorithm { using PerfT = cudnnConvolutionBwdFilterAlgoPerf_t; @@ -450,113 +534,30 @@ struct SearchAlgorithm { platform::CUDAGraphCaptureModeGuard guard; SearchResult result; auto dtype = platform::CudnnDataType::type; - size_t workspace_size_limit = CaclWorkspaceLimitInBytes(ctx); SetConvMathType(ctx, dtype, args.cdesc); - if (!exhaustive_search && !deterministic) { -#if CUDNN_VERSION >= 7001 - int actual_perf_count; - int best_algo_idx = 0; - std::vector perf_results(kNUM_CUDNN_BWD_FILTER_ALGS); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm_v7( - args.handle, args.idesc.desc(), args.odesc.desc(), - args.cdesc.desc(), args.wdesc.desc(), kNUM_CUDNN_BWD_FILTER_ALGS, - &actual_perf_count, perf_results.data())); - result.algo = perf_results[best_algo_idx].algo; - result.workspace_size = perf_results[best_algo_idx].memory; - - if (result.workspace_size > workspace_size_limit) { -#if CUDNN_VERSION >= 8000 - // cudnnGetConvolutionBackwardFilterAlgorithm is removed in CUDNN-8 - ChooseAlgoByWorkspace(perf_results, workspace_size_limit, - &result); -#else - VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue " - "the workspace size request(" - << result.workspace_size << ") exceeds the limit(" - << workspace_size_limit << ")"; - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( - args.handle, args.idesc.desc(), args.odesc.desc(), - args.cdesc.desc(), args.wdesc.desc(), - CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &(result.algo))); -#endif - } -#else - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( - args.handle, args.idesc.desc(), args.odesc.desc(), - args.cdesc.desc(), args.wdesc.desc(), - CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &(result.algo))); -#endif - } else if (deterministic) { - result.algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; + if (deterministic) { + result = FindAlgoDeterministic(); } else { - auto workspace_handle = ctx.cudnn_workspace_handle(); - auto x_dims = phi::vectorize(args.x->dims()); - auto w_dims = phi::vectorize(args.w->dims()); - VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t:" - << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s" - << args.s << ", args.p" << args.p << ", args.d" << args.d; - - AlgorithmsCache& algo_cache = - *(framework::ConvSearchCache::Instance().GetBackwardFilter()); - - if (dtype != CUDNN_DATA_HALF) { - result.algo = algo_cache.GetAlgorithm( - x_dims, w_dims, args.s, args.p, args.d, 0, - static_cast(args.cudnn_dtype), [&]() { - int returned_algo_count; - std::vector perf_results(kNUM_CUDNN_BWD_FILTER_ALGS); - size_t max_workspace_size = - FindMaxWorkspaceSize(args, workspace_size_limit); - VLOG(3) << "max_workspace_size=" - << ToMegaBytes(max_workspace_size) << " MB"; - - auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload:: - cudnnFindConvolutionBackwardFilterAlgorithmEx( - args.handle, args.idesc.desc(), args.x->data(), - args.odesc.desc(), args.o->data(), - args.cdesc.desc(), args.wdesc.desc(), - const_cast(args.w->data()), - kNUM_CUDNN_BWD_FILTER_ALGS, &returned_algo_count, - perf_results.data(), cudnn_workspace_ptr, - max_workspace_size)); - }; - workspace_handle.RunFuncSync(cudnn_find_func, max_workspace_size, - UseFixedWorkspace()); - - VLOG(3) << GetPerfResultString( - "[Exhaustive Search] BwdFilterAlgo Perf result", perf_results, - returned_algo_count, workspace_size_limit); - result.time = perf_results[0].time; - return perf_results[0].algo; - }); + // 1. Once turning on exhaustive FLAGS, always get exhaustive_search. + // 2. Once turning on auto-tune, runn heuristic search(default) before + // auto-tune process, run exhaustive_search during mentioned process. + // 3. After auto-tune process, run cached algorithm if cached, run + // default mode for the rest. + size_t key = args.GetCacheKey(); + auto& cache = + phi::autotune::AutoTuneCache::Instance().GetConvBackwardFilter(); + if (cache.Find(key)) { + result.algo = static_cast(cache.Get(key)); } else { - result.algo = algo_cache.GetAlgorithm( - x_dims, w_dims, args.s, args.p, args.d, 0, - static_cast(args.cudnn_dtype), [&]() { - SearchResult algo_result; - int actual_algos = 0; - std::vector perf_results(kNUM_CUDNN_BWD_FILTER_ALGS); - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload:: - cudnnFindConvolutionBackwardFilterAlgorithm( - args.handle, args.idesc.desc(), args.odesc.desc(), - args.cdesc.desc(), args.wdesc.desc(), - perf_results.size(), &actual_algos, - perf_results.data())); - perf_results.resize(actual_algos); - ChooseAlgo(perf_results, workspace_size_limit, &algo_result); - result.time = algo_result.time; - return algo_result.algo; - }); + bool use_autotune = + phi::autotune::AutoTuneStatus::Instance().UseAutoTune(); + if (exhaustive_search || use_autotune) { + result = FindAlgoExhaustiveSearch(args, ctx); + cache.Set(key, static_cast(result.algo)); + } else { + result = FindAlgoHeuristic(args, ctx); + } } } VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search @@ -578,8 +579,126 @@ struct SearchAlgorithm { } private: - static size_t FindMaxWorkspaceSize(const ConvArgs& args, - size_t workspace_size_limit) { + static SearchResult FindAlgoDeterministic() { + return SearchResult(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1); + } + + static SearchResult FindAlgoHeuristic(const ConvArgs& args, + const phi::GPUContext& ctx) { + SearchResult result; + size_t workspace_size_limit = + CalcWorkspaceLimitInBytes(UseFixedWorkspace()); + +#if CUDNN_VERSION >= 7001 + int actual_perf_count; + int best_algo_idx = 0; + std::vector perf_results(kNUM_CUDNN_BWD_FILTER_ALGS); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm_v7( + args.handle, args.idesc.desc(), args.odesc.desc(), + args.cdesc.desc(), args.wdesc.desc(), kNUM_CUDNN_BWD_FILTER_ALGS, + &actual_perf_count, perf_results.data())); + result.algo = perf_results[best_algo_idx].algo; + result.workspace_size = perf_results[best_algo_idx].memory; + + if (result.workspace_size > workspace_size_limit) { +#if CUDNN_VERSION >= 8000 + // cudnnGetConvolutionBackwardFilterAlgorithm is removed in CUDNN-8 + ChooseAlgoByWorkspace(perf_results, workspace_size_limit, + &result); +#else + VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue " + "the workspace size request(" + << result.workspace_size << ") exceeds the limit(" + << workspace_size_limit << ")"; + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( + args.handle, args.idesc.desc(), args.odesc.desc(), + args.cdesc.desc(), args.wdesc.desc(), + CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &(result.algo))); +#endif + } +#else + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( + args.handle, args.idesc.desc(), args.odesc.desc(), + args.cdesc.desc(), args.wdesc.desc(), + CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &(result.algo))); +#endif + + return result; + } + + template + static SearchResult FindAlgoExhaustiveSearch( + const ConvArgs& args, const phi::GPUContext& ctx) { + SearchResult result; + int returned_algo_count = 0; + std::vector perf_results(kNUM_CUDNN_BWD_FILTER_ALGS); + size_t workspace_size_limit = + CalcWorkspaceLimitInBytes(UseFixedWorkspace()); + auto workspace_handle = ctx.cudnn_workspace_handle(); + if (platform::CudnnDataType::type != CUDNN_DATA_HALF) { + size_t max_workspace_size = + GetMaxWorkspaceSize(args, workspace_size_limit); + VLOG(3) << "max_workspace_size=" << ToMegaBytes(max_workspace_size) + << " MB"; + + auto cudnn_find_func = [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cudnnFindConvolutionBackwardFilterAlgorithmEx( + args.handle, args.idesc.desc(), args.x->data(), + args.odesc.desc(), args.o->data(), args.cdesc.desc(), + args.wdesc.desc(), const_cast(args.w->data()), + kNUM_CUDNN_BWD_FILTER_ALGS, &returned_algo_count, + perf_results.data(), workspace_ptr, max_workspace_size)); + }; + workspace_handle.RunFuncSync(cudnn_find_func, max_workspace_size, + UseFixedWorkspace()); + + VLOG(4) << GetPerfResultString( + "[Exhaustive Search] BwdFilterAlgo Perf result", perf_results, + returned_algo_count, workspace_size_limit); + ChooseAlgoByWorkspace(perf_results, workspace_size_limit, + &result); + } else { + int max_algos = GetAlgorithmMaxCount(args.handle); + std::vector perf_results(max_algos); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cudnnFindConvolutionBackwardFilterAlgorithm( + args.handle, args.idesc.desc(), args.odesc.desc(), + args.cdesc.desc(), args.wdesc.desc(), perf_results.size(), + &returned_algo_count, perf_results.data())); + perf_results.resize(returned_algo_count); + + VLOG(4) << GetPerfResultString( + "[Exhaustive Search] BwdFilterAlgo Perf result", perf_results, + perf_results.size(), workspace_size_limit); + ChooseAlgo(perf_results, workspace_size_limit, &result); + } + + return result; + } + + static int GetAlgorithmMaxCount(cudnnHandle_t handle) { +#if CUDNN_VERSION_MIN(7, 0, 1) + int max_algos = 0; + auto status = + platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithmMaxCount( + handle, &max_algos); + if (status == gpuSuccess) { + VLOG(5) << "[BackwardFilter] max_algos: predefined=" + << kNUM_CUDNN_BWD_FILTER_ALGS << ", actual=" << max_algos; + return max_algos; + } +#endif + return kNUM_CUDNN_BWD_FILTER_ALGS; + } + + static size_t GetMaxWorkspaceSize(const ConvArgs& args, + size_t workspace_size_limit) { if (!UseFixedWorkspace()) { size_t max_workspace_size = 0; for (size_t algo = 0; algo < kNUM_CUDNN_BWD_FILTER_ALGS; ++algo) { @@ -604,10 +723,6 @@ struct SearchAlgorithm { static void ChooseAlgo(const std::vector& perf_results, size_t workspace_limit, SearchResult* algo_result) { - VLOG(3) << GetPerfResultString( - "[Exhaustive Search] BwdFilterAlgo Perf result", perf_results, - perf_results.size(), workspace_limit); - for (size_t i = 0; i != perf_results.size(); ++i) { const auto& result = perf_results[i]; if (result.status == CUDNN_STATUS_SUCCESS && diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 18b53563cd..a43eaa41cf 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -774,3 +774,12 @@ DEFINE_bool(enable_ins_parser_file, false, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_DEFINE_EXPORTED_bool(nccl_blocking_wait, false, "nccl blocking wait"); #endif + +/** + * Autotune related FLAG + * Name: FLAGS_use_autotune + * Since Version: 2.3.0 + * Value Range: bool, default=false + * Example: + */ +PADDLE_DEFINE_EXPORTED_bool(use_autotune, false, "Whether enable autotune."); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 982bf76461..45fcd2fad9 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -4469,7 +4469,7 @@ All parameter, weight, gradient are variables in Paddle. return phi::autotune::AutoTuneStatus::Instance().DisableAutoTune(); }); - m.def("autotune_range", [](int64_t start, int64_t stop) { + m.def("set_autotune_range", [](int64_t start, int64_t stop) { return phi::autotune::AutoTuneStatus::Instance().SetAutoTuneRange(start, stop); }); @@ -4478,10 +4478,8 @@ All parameter, weight, gradient are variables in Paddle. [] { return phi::autotune::AutoTuneStatus::Instance().Update(); }); m.def("autotune_status", [] { - phi::autotune::AutoTuneCache::Instance().UpdateStatus(); py::dict res; - res["use_autotune"] = - phi::autotune::AutoTuneStatus::Instance().UseAutoTune(); + phi::autotune::AutoTuneCache::Instance().UpdateStatus(); res["step_id"] = phi::autotune::AutoTuneStatus::Instance().StepID(); res["cache_size"] = phi::autotune::AutoTuneCache::Instance().Size(); res["cache_hit_rate"] = diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 937024d450..eec83a1ed8 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -6,12 +6,15 @@ file(APPEND ${kernel_declare_file} "#include \"paddle/phi/core/kernel_registry.h # phi functors and functions called by kernels add_subdirectory(funcs) +# kernel autotune +add_subdirectory(autotune) + # phi depends all phi kernel targets set_property(GLOBAL PROPERTY PHI_KERNELS "") # [ 1. Common kernel compilation dependencies ] set(COMMON_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils custom_kernel) -set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col vol2col concat_and_split_functor selected_rows_functor ) +set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col vol2col concat_and_split_functor selected_rows_functor) # remove this dep after removing fluid deps on tensor creation set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} phi_api_utils) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta) @@ -27,12 +30,16 @@ kernel_library(full_kernel DEPS ${COMMON_KERNEL_DEPS} empty_kernel) # Some kernels depend on some targets that are not commonly used. # These targets are not suitable for common dependencies. # In this case, you need to manually generate them here. -set(MANUAL_BUILD_KERNELS cross_entropy_kernel adam_kernel adamw_kernel deformable_conv_kernel deformable_conv_grad_kernel eigh_kernel +set(AUTOTUNE_KERNELS conv_kernel conv_grad_kernel conv_grad_grad_kernel conv_transpose_kernel conv_transpose_grad_kernel) +set(MANUAL_BUILD_KERNELS ${AUTOTUNE_KERNELS} cross_entropy_kernel adam_kernel adamw_kernel deformable_conv_kernel deformable_conv_grad_kernel eigh_kernel gumbel_softmax_kernel gumbel_softmax_grad_kernel hierarchical_sigmoid_kernel hierarchical_sigmoid_grad_kernel matrix_power_kernel matrix_power_grad_kernel maxout_kernel maxout_grad_kernel pool_kernel put_along_axis_kernel put_along_axis_grad_kernel segment_pool_kernel segment_pool_grad_kernel softmax_kernel softmax_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel triangular_solve_grad_kernel determinant_grad_kernel reduce_kernel rnn_kernel rnn_grad_kernel warpctc_kernel warpctc_grad_kernel) +foreach(src ${AUTOTUNE_KERNELS}) + kernel_library(${src} DEPS ${COMMON_KERNEL_DEPS} switch_autotune) +endforeach() kernel_library(adam_kernel DEPS gflags glog flags ${COMMON_KERNEL_DEPS} selected_rows_functor threadpool jit_kernel_helper) kernel_library(adamw_kernel DEPS ${COMMON_KERNEL_DEPS} adam_kernel) kernel_library(cross_entropy_kernel DEPS ${COMMON_KERNEL_DEPS} softmax cross_entropy) @@ -75,6 +82,3 @@ add_subdirectory(selected_rows) copy_if_different(${kernel_declare_file} ${kernel_declare_file_final}) # For strings kernels add_subdirectory(strings) - -# 5. kernel autotune -add_subdirectory(autotune) diff --git a/paddle/phi/kernels/autotune/CMakeLists.txt b/paddle/phi/kernels/autotune/CMakeLists.txt index f1702d883b..63dc224594 100644 --- a/paddle/phi/kernels/autotune/CMakeLists.txt +++ b/paddle/phi/kernels/autotune/CMakeLists.txt @@ -1,11 +1,12 @@ if (WITH_GPU) - nv_test(gpu_timer_test SRCS gpu_timer_test.cu DEPS gtest) - nv_test(auto_tune_test SRCS auto_tune_test.cu DEPS gtest) + nv_test(gpu_timer_test SRCS gpu_timer_test.cu DEPS gtest) + nv_test(auto_tune_test SRCS auto_tune_test.cu DEPS gtest) elseif (WITH_ROCM) - hip_test(gpu_timer_test SRCS gpu_timer_test.cu DEPS gtest) - hip_test(auto_tune_test SRCS auto_tune_test.cu DEPS gtest) + hip_test(gpu_timer_test SRCS gpu_timer_test.cu DEPS gtest) + hip_test(auto_tune_test SRCS auto_tune_test.cu DEPS gtest) endif() cc_library(cache SRCS cache.cc DEPS boost) +cc_library(switch_autotune SRCS switch_autotune.cc DEPS cache flags) cc_test(cache_test SRCS cache_test.cc DEPS gtest cache) diff --git a/paddle/phi/kernels/autotune/cache.cc b/paddle/phi/kernels/autotune/cache.cc index bf68e20101..ef2cbe633d 100644 --- a/paddle/phi/kernels/autotune/cache.cc +++ b/paddle/phi/kernels/autotune/cache.cc @@ -13,6 +13,8 @@ // limitations under the License. #include "paddle/phi/kernels/autotune/cache.h" +#include +#include "glog/logging.h" namespace phi { namespace autotune { @@ -32,5 +34,40 @@ size_t ConvKey(const std::vector& x_dims, static_cast(dtype)); } +std::string AlgorithmTypeString(int64_t algo_type) { + if (algo_type == static_cast(AlgorithmType::kConvForward)) { + return "conv_forward"; + } else if (algo_type == + static_cast(AlgorithmType::kConvBackwardData)) { + return "conv_backward_data"; + } else if (algo_type == + static_cast(AlgorithmType::kConvBackwardFilter)) { + return "conv_backward_filter"; + } + return std::to_string(algo_type); +} + +void AutoTuneCache::UpdateStatus() { + int64_t size = 0; + int64_t cache_hits = 0; + int64_t cache_misses = 0; + int name_width = 24; + std::cout.setf(std::ios::left); + for (auto& v : auto_tune_map_) { + VLOG(4) << "AlgoType: " << std::setfill(' ') << std::setw(name_width) + << AlgorithmTypeString(v.first) + << " Cache Size: " << v.second.Size() + << " Hits: " << v.second.CacheHits() + << " Misses: " << v.second.CacheMisses() + << " Hit Rate: " << v.second.CacheHitRate(); + size += v.second.Size(); + cache_hits += v.second.CacheHits(); + cache_misses += v.second.CacheMisses(); + } + total_size_ = size; + total_cache_hits_ = cache_hits; + total_cache_misses_ = cache_misses; +} + } // namespace autotune } // namespace phi diff --git a/paddle/phi/kernels/autotune/cache.h b/paddle/phi/kernels/autotune/cache.h index d492e7c151..37c5d134e8 100644 --- a/paddle/phi/kernels/autotune/cache.h +++ b/paddle/phi/kernels/autotune/cache.h @@ -13,11 +13,12 @@ // limitations under the License. #pragma once + #include #include +#include #include #include -#include "glog/logging.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/errors.h" @@ -92,6 +93,13 @@ class AlgorithmsCache { return ret; } + void Clean() { + std::lock_guard lock(*cache_mutex_); + hash_.clear(); + cache_hits_ = 0; + cache_misses_ = 0; + } + void Set(size_t key, AlgorithmT algo) { std::lock_guard lock(*cache_mutex_); hash_[key] = algo; @@ -116,15 +124,22 @@ class AlgorithmsCache { private: std::unordered_map hash_; std::shared_ptr cache_mutex_; - int64_t cache_hits_ = 0; - int64_t cache_misses_ = 0; + + int64_t cache_hits_{0}; + int64_t cache_misses_{0}; +}; + +enum class AlgorithmType { + kConvForward = 1, + kConvBackwardData = 2, + kConvBackwardFilter = 3, + kAlgorithmCount = 4 }; // AlgorithmsConfigKey -> AlgorithmsID -using AlgorithmsConfigKeyMap = AlgorithmsCache; -// AlgorithmsType -> AlgorithmsCache -using AlgorithmsTypeMap = - std::unordered_map; +using AlgorithmsCacheMap = AlgorithmsCache; +// AlgorithmType -> AlgorithmsCache +using AlgorithmsTypeMap = std::unordered_map; class AutoTuneCache { public: @@ -133,42 +148,30 @@ class AutoTuneCache { return autotune_cache; } - AlgorithmsConfigKeyMap& RegisterOrGet(const std::string& algo_type) { - std::lock_guard lock(*autotune_cache_mutex_); - if (auto_tune_map_.find(algo_type) == auto_tune_map_.end()) { - AlgorithmsConfigKeyMap cache; - auto_tune_map_[algo_type] = cache; - } - return auto_tune_map_[algo_type]; + AlgorithmsCacheMap& Get(const AlgorithmType& algo_type) { + return auto_tune_map_[static_cast(algo_type)]; } - void Clean(float miss_rate) { - std::lock_guard lock(*autotune_cache_mutex_); - // Set a small tolerance to avoid performance degradation - // due to large cache size under dynamic shape. - if (miss_rate > 0.01) { - auto_tune_map_.clear(); - } + AlgorithmsCacheMap& GetConvForward() { + return Get(AlgorithmType::kConvForward); + } + + AlgorithmsCacheMap& GetConvBackwardData() { + return Get(AlgorithmType::kConvBackwardData); + } + + AlgorithmsCacheMap& GetConvBackwardFilter() { + return Get(AlgorithmType::kConvBackwardFilter); } - void UpdateStatus() { - int64_t size = 0; - int64_t cache_hits = 0; - int64_t cache_misses = 0; + void Clean() { for (auto& v : auto_tune_map_) { - VLOG(4) << "AlgoType: " << v.first << " Cache Size: " << v.second.Size() - << " Hits: " << v.second.CacheHits() - << " Misses: " << v.second.CacheMisses() - << " Hit Rate: " << v.second.CacheHitRate(); - size += v.second.Size(); - cache_hits += v.second.CacheHits(); - cache_misses += v.second.CacheMisses(); + v.second.Clean(); } - total_size_ = size; - total_cache_hits_ = cache_hits; - total_cache_misses_ = cache_misses; } + void UpdateStatus(); + // The number of total config cached int64_t Size() const { return total_size_; } @@ -183,17 +186,30 @@ class AutoTuneCache { total_cache_hit_rate = static_cast(total_cache_hits_) / static_cast(total_num_accesses); } - return total_cache_hit_rate; } private: - AutoTuneCache() : autotune_cache_mutex_(new std::mutex()) {} + AutoTuneCache() : autotune_cache_mutex_(new std::mutex()) { + for (int i = 1; i < static_cast(AlgorithmType::kAlgorithmCount); ++i) { + Register(static_cast(i)); + } + } + + void Register(const AlgorithmType& algo_type) { + std::lock_guard lock(*autotune_cache_mutex_); + int64_t key = static_cast(algo_type); + if (auto_tune_map_.find(key) == auto_tune_map_.end()) { + AlgorithmsCacheMap cache; + auto_tune_map_[key] = cache; + } + } + AlgorithmsTypeMap auto_tune_map_; std::shared_ptr autotune_cache_mutex_; - int64_t total_cache_hits_ = 0; - int64_t total_cache_misses_ = 0; - int64_t total_size_ = 0; + int64_t total_cache_hits_{0}; + int64_t total_cache_misses_{0}; + int64_t total_size_{0}; }; } // namespace autotune diff --git a/paddle/phi/kernels/autotune/cache_test.cc b/paddle/phi/kernels/autotune/cache_test.cc index 92ba411624..f99f8bfc8b 100644 --- a/paddle/phi/kernels/autotune/cache_test.cc +++ b/paddle/phi/kernels/autotune/cache_test.cc @@ -22,7 +22,7 @@ enum ConvAlgos { GEMMKernel = 0, CuDNNKernel_1 = 1, CuDNNKernel_2 = 2 }; TEST(AlgosCache, AlgosCache) { auto autotune_cache = phi::autotune::AutoTuneCache::Instance(); - auto& cache = autotune_cache.RegisterOrGet("conv_fw"); + auto& cache = autotune_cache.GetConvForward(); std::vector x_shape = {4, 224, 224, 3}; std::vector w_shape = {32, 3, 3, 3}; diff --git a/paddle/phi/kernels/autotune/switch_autotune.cc b/paddle/phi/kernels/autotune/switch_autotune.cc new file mode 100644 index 0000000000..6fda24ef3c --- /dev/null +++ b/paddle/phi/kernels/autotune/switch_autotune.cc @@ -0,0 +1,74 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/autotune/switch_autotune.h" + +#include "gflags/gflags.h" +#include "glog/logging.h" + +DECLARE_bool(use_autotune); + +namespace phi { +namespace autotune { + +void AutoTuneStatus::EnableAutoTune() { + FLAGS_use_autotune = true; + Init(); +} + +void AutoTuneStatus::DisableAutoTune() { + FLAGS_use_autotune = false; + Init(); +} + +void AutoTuneStatus::Update() { + current_steps_id_ += 1; + if (!FLAGS_use_autotune) { + return; + } + + // This fuction is called when each iter finished. + if (current_steps_id_ + 1 < start_step_id_) { + use_autotune_ = false; + } else if (current_steps_id_ + 1 >= start_step_id_ && + current_steps_id_ + 1 < stop_step_id_) { + use_autotune_ = true; + AutoTuneCache::Instance().UpdateStatus(); + step_hit_rates_.push_back(StepHitRate()); + VLOG(3) << "Step ID: " << current_steps_id_ + << ", Accumulative Cache Hit Rate: " + << static_cast(AutoTuneCache::Instance().CacheHitRate() * 100) + << "%, Cache Size: " << AutoTuneCache::Instance().Size() + << ", Current Step Hit Rate: " + << static_cast(StepHitRate() * 100) << "%"; + } else { + use_autotune_ = false; + // Set a small tolerance to avoid performance degradation + // due to large cache size under dynamic shape. + // TODO(limingshu): Currently works for conv op only, this + // method shall be opimized when more ops involved in. + // float miss_rate = static_cast(1) - RecentHitRate(); + // if (current_steps_id_ == stop_step_id_) { + // AutoTuneCache::Instance().Clean(miss_rate); + // } + if (VLOG_IS_ON(4)) { + AutoTuneCache::Instance().UpdateStatus(); + VLOG(4) << "Step ID: " << current_steps_id_ << ", Current Step Hit Rate: " + << static_cast(StepHitRate() * 100) << "%"; + } + } +} + +} // namespace autotune +} // namespace phi diff --git a/paddle/phi/kernels/autotune/switch_autotune.h b/paddle/phi/kernels/autotune/switch_autotune.h index 2f9621ed20..1793940542 100644 --- a/paddle/phi/kernels/autotune/switch_autotune.h +++ b/paddle/phi/kernels/autotune/switch_autotune.h @@ -13,10 +13,8 @@ // limitations under the License. #pragma once + #include -#include -#include -#include "glog/logging.h" #include "paddle/phi/kernels/autotune/cache.h" namespace phi { @@ -31,45 +29,11 @@ class AutoTuneStatus { bool UseAutoTune() { return use_autotune_; } - // EnableAutoTune and DisableAutoTune Should be used for debug only. - void EnableAutoTune() { - use_autotune_ = true; - Init(); - } - - void DisableAutoTune() { - use_autotune_ = false; - Init(); - } + // EnableAutoTune and DisableAutoTune should be used for debug only. + void EnableAutoTune(); + void DisableAutoTune(); - void Update() { - current_steps_id_ += 1; - - if (!use_autotune_ && !update_use_autotune_) { - return; - } - - if (current_steps_id_ < start_step_id_) { - use_autotune_ = false; - } else if (current_steps_id_ >= start_step_id_ && - current_steps_id_ < stop_step_id_) { - use_autotune_ = true; - AutoTuneCache::Instance().UpdateStatus(); - step_hit_rates_.push_back(StepHitRate()); - VLOG(3) << "Step ID " << current_steps_id_ - << ", Accumulative Cache Hit Rate: " - << AutoTuneCache::Instance().CacheHitRate() - << ", Cache Size: " << AutoTuneCache::Instance().Size() - << ", Current Step Hit Rate: " << StepHitRate(); - } else if (current_steps_id_ == stop_step_id_) { - use_autotune_ = false; - update_use_autotune_ = false; - // clean cache according miss rate - float miss_rate = static_cast(1) - RecentHitRate(); - AutoTuneCache::Instance().Clean(miss_rate); - VLOG(3) << "Recent Miss Rate: " << miss_rate; - } - } + void Update(); int64_t StepID() { return current_steps_id_; } @@ -84,19 +48,25 @@ class AutoTuneStatus { // Hit Rate of Current Step float StepHitRate() { - int64_t current_hits = AutoTuneCache::Instance().CacheHits(); - int64_t current_misses = AutoTuneCache::Instance().CacheMisses(); - int64_t step_hits_ = current_hits - previous_hits_; - int64_t step_misses_ = current_misses - previous_misses_; - float step_hit_rate = 0.; - int64_t step_num_accesses = step_hits_ + step_misses_; - if (step_num_accesses != 0) { - step_hit_rate = static_cast(step_hits_) / - static_cast(step_num_accesses); + static int64_t last_step_id = -2; + + if (last_step_id != current_steps_id_) { + int64_t current_hits = AutoTuneCache::Instance().CacheHits(); + int64_t current_misses = AutoTuneCache::Instance().CacheMisses(); + int64_t step_hits_ = current_hits - previous_hits_; + int64_t step_misses_ = current_misses - previous_misses_; + float step_hit_rate = 0.; + int64_t step_num_accesses = step_hits_ + step_misses_; + if (step_num_accesses != 0) { + step_hit_rate = static_cast(step_hits_) / + static_cast(step_num_accesses); + } + previous_hits_ = current_hits; + previous_misses_ = current_misses; + current_step_hit_rate_ = step_hit_rate; + last_step_id = current_steps_id_; } - previous_hits_ = current_hits; - previous_misses_ = current_misses; - return step_hit_rate; + return current_step_hit_rate_; } void SetAutoTuneRange(int64_t start, int64_t stop) { @@ -108,21 +78,21 @@ class AutoTuneStatus { AutoTuneStatus() = default; void Init() { - update_use_autotune_ = use_autotune_; + use_autotune_ = false; current_steps_id_ = -1; previous_hits_ = 0; previous_misses_ = 0; step_hit_rates_.clear(); - AutoTuneCache::Instance().Clean(1.0); + AutoTuneCache::Instance().Clean(); } - int64_t start_step_id_ = 0; - int64_t stop_step_id_ = 10; - int64_t current_steps_id_ = -1; - bool use_autotune_ = false; - bool update_use_autotune_ = false; - int64_t previous_hits_ = 0; - int64_t previous_misses_ = 0; + bool use_autotune_{false}; + int64_t start_step_id_{1}; + int64_t stop_step_id_{10}; + int64_t current_steps_id_{-1}; + int64_t previous_hits_{0}; + int64_t previous_misses_{0}; + float current_step_hit_rate_{0.f}; std::vector step_hit_rates_; }; diff --git a/python/paddle/fluid/tests/unittests/test_switch_autotune.py b/python/paddle/fluid/tests/unittests/test_switch_autotune.py index 1c08811d4b..1775272aac 100644 --- a/python/paddle/fluid/tests/unittests/test_switch_autotune.py +++ b/python/paddle/fluid/tests/unittests/test_switch_autotune.py @@ -14,7 +14,7 @@ import paddle import unittest -import numpy +import numpy as np class SimpleNet(paddle.nn.Layer): @@ -27,6 +27,7 @@ class SimpleNet(paddle.nn.Layer): def train_dygraph(net, data): + data.stop_gradient = False out = net(data) loss = paddle.mean(out) adam = paddle.optimizer.Adam(parameters=net.parameters()) @@ -36,6 +37,7 @@ def train_dygraph(net, data): def static_program(net, data): + data.stop_gradient = False out = net(data) loss = paddle.mean(out) adam = paddle.optimizer.Adam() @@ -43,60 +45,64 @@ def static_program(net, data): return loss -def set_flags(enable_autotune): - if paddle.is_compiled_with_cuda(): - if enable_autotune: - paddle.set_flags({'FLAGS_conv_workspace_size_limit': -1}) - paddle.set_flags({'FLAGS_cudnn_exhaustive_search': 1}) - else: - paddle.set_flags({'FLAGS_conv_workspace_size_limit': 512}) - paddle.set_flags({'FLAGS_cudnn_exhaustive_search': 0}) - - class TestAutoTune(unittest.TestCase): + def set_flags(self, enable_autotune): + if paddle.is_compiled_with_cuda(): + if enable_autotune: + paddle.set_flags({'FLAGS_conv_workspace_size_limit': -1}) + else: + paddle.set_flags({'FLAGS_conv_workspace_size_limit': 512}) + + def get_flags(self, name): + res = paddle.get_flags(name) + return res[name] + + def get_expected_res(self, step_id, enable_autotune): + expected_res = { + "step_id": step_id, + "cache_size": 0, + "cache_hit_rate": 0 + } + if paddle.is_compiled_with_cuda(): + # Total 3 * num_iters cache accesses, only iter 2 hits the cache. + if enable_autotune and step_id >= 1: + expected_res["cache_size"] = 3 + if enable_autotune and step_id == 2: + expected_res["cache_hit_rate"] = np.round( + float(3) / float(9), 5) + return expected_res + def test_autotune(self): paddle.fluid.core.disable_autotune() - status = paddle.fluid.core.autotune_status() - self.assertEqual(status["use_autotune"], False) + self.assertEqual(self.get_flags("FLAGS_use_autotune"), False) paddle.fluid.core.enable_autotune() - status = paddle.fluid.core.autotune_status() - self.assertEqual(status["use_autotune"], True) + self.assertEqual(self.get_flags("FLAGS_use_autotune"), True) def check_status(self, expected_res): status = paddle.fluid.core.autotune_status() for key in status.keys(): - self.assertEqual(status[key], expected_res[key]) + if key == "cache_hit_rate": + v = np.round(status[key], 5) + else: + v = status[key] + self.assertEqual(v, expected_res[key]) class TestDygraphAutoTuneStatus(TestAutoTune): def run_program(self, enable_autotune): - set_flags(enable_autotune) + self.set_flags(enable_autotune) if enable_autotune: paddle.fluid.core.enable_autotune() else: paddle.fluid.core.disable_autotune() - paddle.fluid.core.autotune_range(1, 2) + paddle.fluid.core.set_autotune_range(1, 2) x_var = paddle.uniform((1, 1, 8, 8), dtype='float32', min=-1., max=1.) net = SimpleNet() for i in range(3): train_dygraph(net, x_var) - if i >= 1 and i < 2: - expected_res = { - "step_id": i, - "use_autotune": enable_autotune, - "cache_size": 0, - "cache_hit_rate": 0 - } - self.check_status(expected_res) - else: - expected_res = { - "step_id": i, - "use_autotune": False, - "cache_size": 0, - "cache_hit_rate": 0 - } - self.check_status(expected_res) + expected_res = self.get_expected_res(i, enable_autotune) + self.check_status(expected_res) def func_enable_autotune(self): self.run_program(enable_autotune=True) @@ -118,60 +124,45 @@ class TestDygraphAutoTuneStatus(TestAutoTune): class TestStaticAutoTuneStatus(TestAutoTune): def run_program(self, enable_autotune): paddle.enable_static() - set_flags(enable_autotune) - if enable_autotune: - paddle.fluid.core.enable_autotune() - else: - paddle.fluid.core.disable_autotune() - paddle.fluid.core.autotune_range(1, 2) data_shape = [1, 1, 8, 8] - data = paddle.static.data(name='X', shape=data_shape, dtype='float32') - net = SimpleNet() - loss = static_program(net, data) + main_program = paddle.static.Program() + startup_program = paddle.static.Program() + with paddle.static.program_guard(main_program, startup_program): + data = paddle.static.data( + name='X', shape=data_shape, dtype='float32') + net = SimpleNet() + loss = static_program(net, data) place = paddle.CUDAPlace(0) if paddle.fluid.core.is_compiled_with_cuda( ) else paddle.CPUPlace() exe = paddle.static.Executor(place) - exe.run(paddle.static.default_startup_program()) - x = numpy.random.random(size=data_shape).astype('float32') + exe.run(startup_program) + x = np.random.random(size=data_shape).astype('float32') + + self.set_flags(enable_autotune) + if enable_autotune: + paddle.fluid.core.enable_autotune() + else: + paddle.fluid.core.disable_autotune() + paddle.fluid.core.set_autotune_range(1, 2) for i in range(3): - exe.run(feed={'X': x}, fetch_list=[loss]) + exe.run(program=main_program, feed={'X': x}, fetch_list=[loss]) status = paddle.fluid.core.autotune_status() - # In static mode, the startup_program will run at first. - # The expected step_id will be increased by 1. - if i >= 0 and i < 1: - expected_res = { - "step_id": i + 1, - "use_autotune": enable_autotune, - "cache_size": 0, - "cache_hit_rate": 0 - } - self.check_status(expected_res) - else: - expected_res = { - "step_id": i + 1, - "use_autotune": False, - "cache_size": 0, - "cache_hit_rate": 0 - } - self.check_status(expected_res) + expected_res = self.get_expected_res(i, enable_autotune) + self.check_status(expected_res) paddle.disable_static() def func_enable_autotune(self): self.run_program(enable_autotune=True) def test_enable_autotune(self): - with paddle.fluid.framework._test_eager_guard(): - self.func_enable_autotune() self.func_enable_autotune() def func_disable_autotune(self): self.run_program(enable_autotune=False) def test_disable_autotune(self): - with paddle.fluid.framework._test_eager_guard(): - self.func_disable_autotune() self.func_disable_autotune() -- GitLab