From 3a5b5048e6c904191a9324b70e759ad1f3e621e7 Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Wed, 14 Sep 2022 10:48:55 +0800 Subject: [PATCH] Simplify the codes of conv. (#45966) --- paddle/fluid/operators/conv_base_helper.h | 4 - paddle/fluid/operators/conv_cudnn_helper.h | 260 ++++++++------------ paddle/fluid/operators/conv_miopen_helper.h | 3 + paddle/phi/kernels/autotune/cache.h | 4 + 4 files changed, 104 insertions(+), 167 deletions(-) diff --git a/paddle/fluid/operators/conv_base_helper.h b/paddle/fluid/operators/conv_base_helper.h index b52936c1972..285dc8fddb7 100644 --- a/paddle/fluid/operators/conv_base_helper.h +++ b/paddle/fluid/operators/conv_base_helper.h @@ -36,10 +36,6 @@ using framework::ConvSearchCache; template using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; -// As the basic for SearchAlgorithm struct. -template -struct SearchAlgorithm {}; - // As the container of searchAlgorithm::Find() result. template struct SearchResult { diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index e6fcf2be286..d912149eace 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -146,83 +146,19 @@ void ChooseAlgoByWorkspace(const std::vector& perf_results, } } -static void SetConvMathType(const phi::GPUContext& ctx, - cudnnDataType_t dtype, - const platform::ConvolutionDescriptor& cdesc) { -#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) - if (ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( - cdesc.desc(), CUDNN_TENSOR_OP_MATH)); - VLOG(5) << "use cudnn_tensor_op_math"; -#if CUDA_VERSION >= 11000 -#if CUDNN_VERSION_MIN(8, 1, 0) - } else if (ctx.GetComputeCapability() >= 80 && dtype == CUDNN_DATA_BFLOAT16) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( - cdesc.desc(), CUDNN_TENSOR_OP_MATH)); -#endif // CUDNN_VERSION_MIN(8, 1, 0) - } else if (dtype == CUDNN_DATA_FLOAT && !cdesc.allow_tf32_) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( - cdesc.desc(), CUDNN_FMA_MATH)); -#endif // CUDA_VERSION >= 11000 - } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( - cdesc.desc(), CUDNN_DEFAULT_MATH)); - VLOG(5) << "NOT use cudnn_tensor_op_math"; - } -#endif -} +template +struct SearchAlgorithmBase {}; // cuDNN convolution forward algorithm searcher, consisted of three searching // modes, namely: deterministic, heuristic and exhaustive_search mode. // As well as one workspace size acquirsition function with respect to // the chosen alogrithm. template <> -struct SearchAlgorithm { +struct SearchAlgorithmBase { using PerfT = cudnnConvolutionFwdAlgoPerf_t; using AlgoT = cudnnConvolutionFwdAlgo_t; - - template - static SearchResult Find(const ConvArgs& args, - bool exhaustive_search, - bool deterministic, - const phi::GPUContext& ctx) { - SearchResult result; - auto dtype = platform::CudnnDataType::type; - SetConvMathType(ctx, dtype, args.cdesc); - - if (deterministic) { - result = FindAlgoDeterministic(args); - } else { - // 1. Once turning on exhaustive FLAGS, always get exhaustive_search. - // 2. Once turning on auto-tune, runn heuristic search(default) before - // auto-tune process, run exhaustive_search during mentioned process. - // 3. After auto-tune process, run cached algorithm if cached, run - // default mode for the rest. - auto key = args.Convert2ConvCacheKey(); - auto& cache = phi::autotune::AutoTuneCache::Instance().GetConvForward(); - if (cache.Find(key)) { - auto t = cache.Get(key); - result.algo = static_cast(t.algo); - result.workspace_size = t.workspace_size; - } else { - bool use_autotune = - phi::autotune::AutoTuneStatus::Instance().UseAutoTune(); - if (exhaustive_search || use_autotune) { - result = FindAlgoExhaustiveSearch(args, ctx); - } else { - result = FindAlgoHeuristic(args, ctx); - } - phi::autotune::DnnNode node(static_cast(result.algo), - result.workspace_size); - cache.Set(key, node); - } - } - VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search - << ", deterministic=" << deterministic - << ", choose algo=" << result.algo - << ", workspace=" << ToMegaBytes(result.workspace_size) << " MB"; - return result; - } + constexpr static phi::autotune::AlgorithmType kAlgoType = + phi::autotune::AlgorithmType::kConvForward; static size_t GetWorkspaceSize(const ConvArgs& args, cudnnConvolutionFwdAlgo_t algo) { @@ -239,7 +175,7 @@ struct SearchAlgorithm { return workspace_size; } - private: + protected: static SearchResult FindAlgoDeterministic(const ConvArgs& args) { auto workspace_size = GetWorkspaceSize(args, static_cast(1)); return SearchResult(static_cast(1), -1.0, workspace_size); @@ -271,6 +207,10 @@ struct SearchAlgorithm { if (result.workspace_size > workspace_size_limit) { #if CUDNN_VERSION >= 8000 + VLOG(4) << GetPerfResultString("[Heuristic] FwdAlgo Perf result", + perf_results, + actual_perf_count, + workspace_size_limit); // cudnnGetConvolutionForwardAlgorithm is removed in CUDNN-8 ChooseAlgoByWorkspace( perf_results, workspace_size_limit, &result); @@ -387,53 +327,11 @@ struct SearchAlgorithm { // As well as one workspace size acquirsition function with // respect to the chosen alogrithm. template <> -struct SearchAlgorithm { +struct SearchAlgorithmBase { using PerfT = cudnnConvolutionBwdDataAlgoPerf_t; using AlgoT = cudnnConvolutionBwdDataAlgo_t; - - template - static SearchResult Find(const ConvArgs& args, - bool exhaustive_search, - bool deterministic, - const phi::GPUContext& ctx) { - SearchResult result; - auto dtype = platform::CudnnDataType::type; - SetConvMathType(ctx, dtype, args.cdesc); - - if (deterministic) { - result = FindAlgoDeterministic(args); - } else { - // 1. Once turning on exhaustive FLAGS, always get exhaustive_search. - // 2. Once turning on auto-tune, runn heuristic search(default) before - // auto-tune process, run exhaustive_search during mentioned process. - // 3. After auto-tune process, run cached algorithm if cached, run - // default mode for the rest. - auto key = args.Convert2ConvCacheKey(); - auto& cache = - phi::autotune::AutoTuneCache::Instance().GetConvBackwardData(); - if (cache.Find(key)) { - auto t = cache.Get(key); - result.algo = static_cast(t.algo); - result.workspace_size = t.workspace_size; - } else { - bool use_autotune = - phi::autotune::AutoTuneStatus::Instance().UseAutoTune(); - if (exhaustive_search || use_autotune) { - result = FindAlgoExhaustiveSearch(args, ctx); - } else { - result = FindAlgoHeuristic(args, ctx); - } - phi::autotune::DnnNode node(static_cast(result.algo), - result.workspace_size); - cache.Set(key, node); - } - } - VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search - << ", deterministic=" << deterministic - << ", choose algo=" << result.algo - << ", workspace=" << ToMegaBytes(result.workspace_size) << " MB"; - return result; - } + constexpr static phi::autotune::AlgorithmType kAlgoType = + phi::autotune::AlgorithmType::kConvBackwardData; static size_t GetWorkspaceSize(const ConvArgs& args, cudnnConvolutionBwdDataAlgo_t algo) { @@ -450,7 +348,7 @@ struct SearchAlgorithm { return workspace_size; } - private: + protected: static SearchResult FindAlgoDeterministic(const ConvArgs& args) { auto workspace_size = GetWorkspaceSize(args, CUDNN_CONVOLUTION_BWD_DATA_ALGO_1); @@ -609,54 +507,11 @@ struct SearchAlgorithm { // exhaustive_search mode. As well as one workspace size acquirsition function // with respect to the chosen alogrithm. template <> -struct SearchAlgorithm { +struct SearchAlgorithmBase { using PerfT = cudnnConvolutionBwdFilterAlgoPerf_t; using AlgoT = cudnnConvolutionBwdFilterAlgo_t; - - template - static SearchResult Find(const ConvArgs& args, - bool exhaustive_search, - bool deterministic, - const phi::GPUContext& ctx) { - platform::CUDAGraphCaptureModeGuard guard; - SearchResult result; - auto dtype = platform::CudnnDataType::type; - SetConvMathType(ctx, dtype, args.cdesc); - - if (deterministic) { - result = FindAlgoDeterministic(args); - } else { - // 1. Once turning on exhaustive FLAGS, always get exhaustive_search. - // 2. Once turning on auto-tune, runn heuristic search(default) before - // auto-tune process, run exhaustive_search during mentioned process. - // 3. After auto-tune process, run cached algorithm if cached, run - // default mode for the rest. - auto key = args.Convert2ConvCacheKey(); - auto& cache = - phi::autotune::AutoTuneCache::Instance().GetConvBackwardFilter(); - if (cache.Find(key)) { - auto t = cache.Get(key); - result.algo = static_cast(t.algo); - result.workspace_size = t.workspace_size; - } else { - bool use_autotune = - phi::autotune::AutoTuneStatus::Instance().UseAutoTune(); - if (exhaustive_search || use_autotune) { - result = FindAlgoExhaustiveSearch(args, ctx); - } else { - result = FindAlgoHeuristic(args, ctx); - } - phi::autotune::DnnNode node(static_cast(result.algo), - result.workspace_size); - cache.Set(key, node); - } - } - VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search - << ", deterministic=" << deterministic - << ", choose algo=" << result.algo - << ", workspace=" << ToMegaBytes(result.workspace_size) << " MB"; - return result; - } + constexpr static phi::autotune::AlgorithmType kAlgoType = + phi::autotune::AlgorithmType::kConvBackwardFilter; static size_t GetWorkspaceSize(const ConvArgs& args, cudnnConvolutionBwdFilterAlgo_t algo) { @@ -674,7 +529,7 @@ struct SearchAlgorithm { return workspace_size; } - private: + protected: static SearchResult FindAlgoDeterministic(const ConvArgs& args) { auto workspace_size = GetWorkspaceSize(args, CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1); @@ -891,5 +746,84 @@ struct SearchAlgorithm { } }; +template +struct SearchAlgorithm : public SearchAlgorithmBase { + using AlgoT = typename SearchAlgorithmBase::AlgoT; + + template + static SearchResult Find(const ConvArgs& args, + bool exhaustive_search, + bool deterministic, + const phi::GPUContext& ctx) { + SearchResult result; + auto dtype = platform::CudnnDataType::type; + SetConvMathType(ctx, dtype, args.cdesc); + + if (deterministic) { + result = SearchAlgorithmBase::FindAlgoDeterministic(args); + } else { + // 1. Once turning on exhaustive FLAGS, always get exhaustive_search. + // 2. Once turning on auto-tune, runn heuristic search(default) before + // auto-tune process, run exhaustive_search during mentioned process. + // 3. After auto-tune process, run cached algorithm if cached, run + // default mode for the rest. + auto key = args.Convert2ConvCacheKey(); + auto& cache = phi::autotune::AutoTuneCache::Instance().GetConv( + SearchAlgorithmBase::kAlgoType); + if (cache.Find(key)) { + auto t = cache.Get(key); + result.algo = static_cast(t.algo); + result.workspace_size = t.workspace_size; + } else { + bool use_autotune = + phi::autotune::AutoTuneStatus::Instance().UseAutoTune(); + if (exhaustive_search || use_autotune) { + result = + SearchAlgorithmBase::template FindAlgoExhaustiveSearch( + args, ctx); + } else { + result = SearchAlgorithmBase::FindAlgoHeuristic(args, ctx); + } + phi::autotune::DnnNode node(static_cast(result.algo), + result.workspace_size); + cache.Set(key, node); + } + } + VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search + << ", deterministic=" << deterministic + << ", choose algo=" << result.algo + << ", workspace=" << ToMegaBytes(result.workspace_size) << " MB"; + return result; + } + + static void SetConvMathType(const phi::GPUContext& ctx, + cudnnDataType_t dtype, + const platform::ConvolutionDescriptor& cdesc) { +#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) + if (ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + cdesc.desc(), CUDNN_TENSOR_OP_MATH)); + VLOG(5) << "Enable Tensor Core for FLOAT16"; +#if CUDA_VERSION >= 11000 +#if CUDNN_VERSION_MIN(8, 1, 0) + } else if (ctx.GetComputeCapability() >= 80 && + dtype == CUDNN_DATA_BFLOAT16) { + VLOG(5) << "Enable Tensor Core for BFLOAT16"; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + cdesc.desc(), CUDNN_TENSOR_OP_MATH)); +#endif // CUDNN_VERSION_MIN(8, 1, 0) + } else if (dtype == CUDNN_DATA_FLOAT && !cdesc.allow_tf32_) { + VLOG(5) << "Disable TensorFloat (Tensor Core) for FLOAT"; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + cdesc.desc(), CUDNN_FMA_MATH)); +#endif // CUDA_VERSION >= 11000 + } else { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + cdesc.desc(), CUDNN_DEFAULT_MATH)); + } +#endif + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/conv_miopen_helper.h b/paddle/fluid/operators/conv_miopen_helper.h index 1e78dcb6b73..dbe03886b5d 100644 --- a/paddle/fluid/operators/conv_miopen_helper.h +++ b/paddle/fluid/operators/conv_miopen_helper.h @@ -55,6 +55,9 @@ static void RemovePaddingSlice(const phi::GPUContext& context, out_t.device(place) = in_t.slice(offsets, extents); } +template +struct SearchAlgorithm {}; + template <> struct SearchAlgorithm { using perf_t = miopenConvAlgoPerf_t; diff --git a/paddle/phi/kernels/autotune/cache.h b/paddle/phi/kernels/autotune/cache.h index aacebc66570..949cae1532b 100644 --- a/paddle/phi/kernels/autotune/cache.h +++ b/paddle/phi/kernels/autotune/cache.h @@ -289,6 +289,10 @@ class AutoTuneCache { return auto_tune_map_[static_cast(algo_type)]; } + CudnnAlgorithmsCacheMap& GetConv(const AlgorithmType& algo_type) { + return cudnn_auto_tune_map_[static_cast(algo_type)]; + } + CudnnAlgorithmsCacheMap& GetConvForward() { return cudnn_auto_tune_map_[static_cast( AlgorithmType::kConvForward)]; -- GitLab