From c0ed87296c3a09a1afc61effae736597dd70d350 Mon Sep 17 00:00:00 2001
From: Yiqun Liu
Date: Thu, 20 Oct 2022 21:24:02 +0800
Subject: [PATCH] [Cherry-pick] Simplify conv codes and fix cache and autotune
 bugs. (#47197)

* Simplify the codes of conv. (#45966)

* Enable to record whether the conv algo is got by exhaustive search to fix autotune cache bug. (#47065)

---
 paddle/fluid/operators/conv_base_helper.h     |  20 +-
 paddle/fluid/operators/conv_cudnn_helper.h    | 290 ++++++++----------
 paddle/fluid/operators/conv_miopen_helper.h   |   3 +
 paddle/phi/kernels/autotune/cache.h           | 113 ++++---
 paddle/phi/kernels/autotune/cache_test.cc     |  10 +-
 .../kernels/gpudnn/conv_grad_grad_kernel.cu   |   8 +-
 paddle/phi/kernels/gpudnn/conv_grad_kernel.cu |   4 +-
 paddle/phi/kernels/gpudnn/conv_kernel.cu      |   6 +-
 .../gpudnn/conv_transpose_grad_kernel.cu      |  12 +-
 .../kernels/gpudnn/conv_transpose_kernel.cu   |   2 +-
 tools/check_op_benchmark_result.py            |  14 +-
 11 files changed, 222 insertions(+), 260 deletions(-)

diff --git a/paddle/fluid/operators/conv_base_helper.h b/paddle/fluid/operators/conv_base_helper.h
index b52936c1972..4dc83c9717a 100644
--- a/paddle/fluid/operators/conv_base_helper.h
+++ b/paddle/fluid/operators/conv_base_helper.h
@@ -36,17 +36,10 @@ using framework::ConvSearchCache;
 template <typename T>
 using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
 
-// As the basic for SearchAlgorithm struct.
-template <typename PerfT>
-struct SearchAlgorithm {};
-
 // As the container of searchAlgorithm::Find() result.
 template <typename AlgoT>
 struct SearchResult {
   SearchResult() {}
-  explicit SearchResult(const phi::autotune::DnnNode& node)
-      : algo(static_cast<AlgoT>(node.algo)),
-        workspace_size(node.workspace_size) {}
 
   explicit SearchResult(AlgoT a) : algo(a) {}
   explicit SearchResult(AlgoT a, float t, size_t size)
@@ -55,12 +48,21 @@ struct SearchResult {
   AlgoT algo = static_cast<AlgoT>(0);
   float time = -1.f;
   size_t workspace_size = 0;
+  bool exhaustive_search = false;
 };
 
 template <typename T>
 static std::ostream& operator<<(std::ostream& out, const std::vector<T>& v) {
   out << "[";
-  for (auto const& tmp : v) out << tmp << ",";
+  bool is_first = true;
+  for (auto const& tmp : v) {
+    if (is_first) {
+      out << tmp;
+      is_first = false;
+    } else {
+      out << ", " << tmp;
+    }
+  }
   out << "]";
   return out;
 }
@@ -113,7 +115,7 @@ struct ConvArgsBase {
     auto w_shape = phi::vectorize(w->dims());
     VLOG(10) << "[ConvArgs] x_dims=" << x_shape << ", w_dims=" << w_shape
              << ", strides=" << s << ", paddings=" << p << ", dilations=" << d
-             << ",data= " << paddle::experimental::CppTypeToDataType<T>::Type()
+             << ", data=" << paddle::experimental::CppTypeToDataType<T>::Type()
              << ", group=" << group
              << ", data layout=" << static_cast<int64_t>(data_layout);
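The rewritten operator<< separates vector elements with ", " and drops the trailing comma that the old loop emitted, so dimension vectors in the VLOG output read naturally. A minimal standalone sketch of the same formatting logic (the main function and dims variable are illustrative, not part of the patch):

#include <cstdint>
#include <iostream>
#include <vector>

// Same idea as the patched operator<<: prints "[4, 224, 224, 3]"
// instead of the old "[4,224,224,3,]" with a trailing comma.
template <typename T>
std::ostream& operator<<(std::ostream& out, const std::vector<T>& v) {
  out << "[";
  bool is_first = true;
  for (auto const& tmp : v) {
    if (is_first) {
      out << tmp;
      is_first = false;
    } else {
      out << ", " << tmp;
    }
  }
  out << "]";
  return out;
}

int main() {
  std::vector<int64_t> dims = {4, 224, 224, 3};
  std::cout << dims << std::endl;  // prints: [4, 224, 224, 3]
  return 0;
}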
diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h
index 1df5df6ac50..52c530d71f0 100644
--- a/paddle/fluid/operators/conv_cudnn_helper.h
+++ b/paddle/fluid/operators/conv_cudnn_helper.h
@@ -14,12 +14,11 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/operators/conv_base_helper.h"
 #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#include "paddle/fluid/platform/profiler.h"
 #include "paddle/phi/kernels/autotune/switch_autotune.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
 
 namespace paddle {
 namespace operators {
@@ -53,11 +52,9 @@ static void RemovePaddingSlice(const phi::GPUContext& context,
   }
 
   auto in_t =
-      framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
-          *input);
-  auto out_t =
-      framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
-          *out, new_out_dims);
+      phi::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(*input);
+  auto out_t = phi::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
+      *out, new_out_dims);
 
   phi::funcs::EigenSlice<std::decay_t<decltype(place)>, T, D>::Eval(
       place, out_t, in_t, offsets, extents);
@@ -147,83 +144,21 @@ void ChooseAlgoByWorkspace(const std::vector<PerfT>& perf_results,
   }
 }
 
-static void SetConvMathType(const phi::GPUContext& ctx,
-                            cudnnDataType_t dtype,
-                            const platform::ConvolutionDescriptor& cdesc) {
-#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
-  if (ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
-    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
-        cdesc.desc(), CUDNN_TENSOR_OP_MATH));
-    VLOG(5) << "use cudnn_tensor_op_math";
-#if CUDA_VERSION >= 11000
-#if CUDNN_VERSION_MIN(8, 1, 0)
-  } else if (ctx.GetComputeCapability() >= 80 && dtype == CUDNN_DATA_BFLOAT16) {
-    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
-        cdesc.desc(), CUDNN_TENSOR_OP_MATH));
-#endif  // CUDNN_VERSION_MIN(8, 1, 0)
-  } else if (dtype == CUDNN_DATA_FLOAT && !cdesc.allow_tf32_) {
-    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
-        cdesc.desc(), CUDNN_FMA_MATH));
-#endif  // CUDA_VERSION >= 11000
-  } else {
-    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
-        cdesc.desc(), CUDNN_DEFAULT_MATH));
-    VLOG(5) << "NOT use cudnn_tensor_op_math";
-  }
-#endif
-}
+template <typename PerfT>
+struct SearchAlgorithmBase {};
 
 // cuDNN convolution forward algorithm searcher, consisting of three searching
 // modes, namely: deterministic, heuristic and exhaustive_search mode.
 // As well as one workspace size acquisition function with respect to
 // the chosen algorithm.
 template <>
-struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
+struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> {
   using PerfT = cudnnConvolutionFwdAlgoPerf_t;
   using AlgoT = cudnnConvolutionFwdAlgo_t;
+  constexpr static phi::autotune::AlgorithmType kAlgoType =
+      phi::autotune::AlgorithmType::kConvForward;
 
-  template <typename T>
-  static SearchResult<AlgoT> Find(const ConvArgs& args,
-                                  bool exhaustive_search,
-                                  bool deterministic,
-                                  const phi::GPUContext& ctx) {
-    SearchResult<AlgoT> result;
-    auto dtype = platform::CudnnDataType<T>::type;
-    SetConvMathType(ctx, dtype, args.cdesc);
-
-    if (deterministic) {
-      result = FindAlgoDeterministic(args);
-    } else {
-      // 1. Once turning on exhaustive FLAGS, always get exhaustive_search.
-      // 2. Once turning on auto-tune, runn heuristic search(default) before
-      //    auto-tune process, run exhaustive_search during mentioned process.
-      // 3. After auto-tune process, run cached algorithm if cached, run
-      //    default mode for the rest.
-      auto key = args.Convert2ConvCacheKey<T>();
-      auto& cache = phi::autotune::AutoTuneCache::Instance().GetConvForward();
-      if (cache.Find(key)) {
-        auto t = cache.Get(key);
-        result.algo = static_cast<AlgoT>(t.algo);
-        result.workspace_size = t.workspace_size;
-      } else {
-        bool use_autotune =
-            phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
-        if (exhaustive_search || use_autotune) {
-          result = FindAlgoExhaustiveSearch<T>(args, ctx);
-        } else {
-          result = FindAlgoHeuristic(args, ctx);
-        }
-        phi::autotune::DnnNode node(static_cast<int64_t>(result.algo),
-                                    result.workspace_size);
-        cache.Set(key, node);
-      }
-    }
-    VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search
-            << ", deterministic=" << deterministic
-            << ", choose algo=" << result.algo
-            << ", workspace=" << ToMegaBytes(result.workspace_size) << " MB";
-    return result;
-  }
+  static const std::string GetPerfName() { return "ConvForward"; }
 
   static size_t GetWorkspaceSize(const ConvArgs& args,
                                  cudnnConvolutionFwdAlgo_t algo) {
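This hunk is the heart of the simplification: each per-direction SearchAlgorithm specialization becomes a SearchAlgorithmBase specialization that supplies only the direction-specific pieces (algorithm types, workspace query, the Find* strategies), while a single derived SearchAlgorithm implements the shared Find() driver once. A minimal sketch of the pattern with placeholder types (FwdPerf, the int AlgoT, and the constant return values are stand-ins, not the real cuDNN types):

#include <iostream>

struct FwdPerf {};  // stand-in for cudnnConvolutionFwdAlgoPerf_t

// Empty primary template; only the specializations are usable.
template <typename PerfT>
struct SearchAlgorithmBase {};

template <>
struct SearchAlgorithmBase<FwdPerf> {
  using AlgoT = int;  // stand-in for cudnnConvolutionFwdAlgo_t

 protected:
  // Direction-specific strategies supplied by each specialization.
  static AlgoT FindAlgoDeterministic() { return 1; }
  static AlgoT FindAlgoHeuristic() { return 2; }
  static AlgoT FindAlgoExhaustiveSearch() { return 3; }
};

// The shared driver is written once and inherits the strategies,
// instead of duplicating Find() in three specializations.
template <typename PerfT>
struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
  using Base = SearchAlgorithmBase<PerfT>;
  using AlgoT = typename Base::AlgoT;

  static AlgoT Find(bool exhaustive, bool deterministic) {
    if (deterministic) return Base::FindAlgoDeterministic();
    return exhaustive ? Base::FindAlgoExhaustiveSearch()
                      : Base::FindAlgoHeuristic();
  }
};

int main() {
  std::cout << SearchAlgorithm<FwdPerf>::Find(false, false) << "\n";  // 2
  return 0;
}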
@@ -240,7 +175,7 @@ struct SearchAlgorithm {
     return workspace_size;
   }
 
- private:
+ protected:
   static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) {
     auto workspace_size = GetWorkspaceSize(args, static_cast<AlgoT>(1));
     return SearchResult<AlgoT>(static_cast<AlgoT>(1), -1.0, workspace_size);
@@ -272,6 +207,10 @@
     if (result.workspace_size > workspace_size_limit) {
 #if CUDNN_VERSION >= 8000
+      VLOG(4) << GetPerfResultString<PerfT>("[Heuristic] FwdAlgo Perf result",
+                                            perf_results,
+                                            actual_perf_count,
+                                            workspace_size_limit);
       // cudnnGetConvolutionForwardAlgorithm is removed in CUDNN-8
       ChooseAlgoByWorkspace<PerfT, AlgoT>(
           perf_results, workspace_size_limit, &result);
@@ -388,53 +327,13 @@
 // As well as one workspace size acquisition function with
 // respect to the chosen algorithm.
 template <>
-struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
+struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> {
   using PerfT = cudnnConvolutionBwdDataAlgoPerf_t;
   using AlgoT = cudnnConvolutionBwdDataAlgo_t;
+  constexpr static phi::autotune::AlgorithmType kAlgoType =
+      phi::autotune::AlgorithmType::kConvBackwardData;
 
-  template <typename T>
-  static SearchResult<AlgoT> Find(const ConvArgs& args,
-                                  bool exhaustive_search,
-                                  bool deterministic,
-                                  const phi::GPUContext& ctx) {
-    SearchResult<AlgoT> result;
-    auto dtype = platform::CudnnDataType<T>::type;
-    SetConvMathType(ctx, dtype, args.cdesc);
-
-    if (deterministic) {
-      result = FindAlgoDeterministic(args);
-    } else {
-      // 1. Once turning on exhaustive FLAGS, always get exhaustive_search.
-      // 2. Once turning on auto-tune, runn heuristic search(default) before
-      //    auto-tune process, run exhaustive_search during mentioned process.
-      // 3. After auto-tune process, run cached algorithm if cached, run
-      //    default mode for the rest.
-      auto key = args.Convert2ConvCacheKey<T>();
-      auto& cache =
-          phi::autotune::AutoTuneCache::Instance().GetConvBackwardData();
-      if (cache.Find(key)) {
-        auto t = cache.Get(key);
-        result.algo = static_cast<AlgoT>(t.algo);
-        result.workspace_size = t.workspace_size;
-      } else {
-        bool use_autotune =
-            phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
-        if (exhaustive_search || use_autotune) {
-          result = FindAlgoExhaustiveSearch<T>(args, ctx);
-        } else {
-          result = FindAlgoHeuristic(args, ctx);
-        }
-        phi::autotune::DnnNode node(static_cast<int64_t>(result.algo),
-                                    result.workspace_size);
-        cache.Set(key, node);
-      }
-    }
-    VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search
-            << ", deterministic=" << deterministic
-            << ", choose algo=" << result.algo
-            << ", workspace=" << ToMegaBytes(result.workspace_size) << " MB";
-    return result;
-  }
+  static const std::string GetPerfName() { return "ConvBackwardData"; }
 
   static size_t GetWorkspaceSize(const ConvArgs& args,
                                  cudnnConvolutionBwdDataAlgo_t algo) {
@@ -451,7 +350,7 @@
     return workspace_size;
   }
 
- private:
+ protected:
   static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) {
     auto workspace_size =
         GetWorkspaceSize(args, CUDNN_CONVOLUTION_BWD_DATA_ALGO_1);
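The new VLOG(4) above makes the heuristic fallback path observable: when the heuristic pick needs more workspace than the limit, the perf list is logged and ChooseAlgoByWorkspace selects a legal algorithm instead. Assuming the usual cuDNN convention that perf results arrive sorted by runtime, that choice reduces to scanning for the first entry whose workspace fits, roughly as in this illustrative stand-in (Perf and ChooseByWorkspace are sketches, not the real helpers):

#include <cstddef>
#include <iostream>
#include <vector>

struct Perf {  // stand-in for a cudnn*AlgoPerf_t record
  int algo;
  float time;
  std::size_t memory;
};

// Sketch of the ChooseAlgoByWorkspace idea: with perf_results sorted by
// time, the first entry whose workspace fits the limit is the fastest
// algorithm that is actually allowed to run.
int ChooseByWorkspace(const std::vector<Perf>& perf_results,
                      std::size_t workspace_limit) {
  for (const auto& perf : perf_results) {
    if (perf.memory <= workspace_limit) return perf.algo;
  }
  return -1;  // nothing fits; the caller falls back to a default algorithm
}

int main() {
  // Fastest candidate needs 1 GiB; the 16 MiB limit forces the second pick.
  std::vector<Perf> perfs = {{7, 0.8f, 1u << 30}, {2, 1.1f, 1u << 20}};
  std::cout << ChooseByWorkspace(perfs, 1u << 24) << "\n";  // prints 2
  return 0;
}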
@@ -610,54 +509,13 @@
 // exhaustive_search mode. As well as one workspace size acquisition function
 // with respect to the chosen algorithm.
 template <>
-struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
+struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
   using PerfT = cudnnConvolutionBwdFilterAlgoPerf_t;
   using AlgoT = cudnnConvolutionBwdFilterAlgo_t;
+  constexpr static phi::autotune::AlgorithmType kAlgoType =
+      phi::autotune::AlgorithmType::kConvBackwardFilter;
 
-  template <typename T>
-  static SearchResult<AlgoT> Find(const ConvArgs& args,
-                                  bool exhaustive_search,
-                                  bool deterministic,
-                                  const phi::GPUContext& ctx) {
-    platform::CUDAGraphCaptureModeGuard guard;
-    SearchResult<AlgoT> result;
-    auto dtype = platform::CudnnDataType<T>::type;
-    SetConvMathType(ctx, dtype, args.cdesc);
-
-    if (deterministic) {
-      result = FindAlgoDeterministic(args);
-    } else {
-      // 1. Once turning on exhaustive FLAGS, always get exhaustive_search.
-      // 2. Once turning on auto-tune, runn heuristic search(default) before
-      //    auto-tune process, run exhaustive_search during mentioned process.
-      // 3. After auto-tune process, run cached algorithm if cached, run
-      //    default mode for the rest.
-      auto key = args.Convert2ConvCacheKey<T>();
-      auto& cache =
-          phi::autotune::AutoTuneCache::Instance().GetConvBackwardFilter();
-      if (cache.Find(key)) {
-        auto t = cache.Get(key);
-        result.algo = static_cast<AlgoT>(t.algo);
-        result.workspace_size = t.workspace_size;
-      } else {
-        bool use_autotune =
-            phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
-        if (exhaustive_search || use_autotune) {
-          result = FindAlgoExhaustiveSearch<T>(args, ctx);
-        } else {
-          result = FindAlgoHeuristic(args, ctx);
-        }
-        phi::autotune::DnnNode node(static_cast<int64_t>(result.algo),
-                                    result.workspace_size);
-        cache.Set(key, node);
-      }
-    }
-    VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search
-            << ", deterministic=" << deterministic
-            << ", choose algo=" << result.algo
-            << ", workspace=" << ToMegaBytes(result.workspace_size) << " MB";
-    return result;
-  }
+  static const std::string GetPerfName() { return "ConvBackwardFilter"; }
 
   static size_t GetWorkspaceSize(const ConvArgs& args,
                                  cudnnConvolutionBwdFilterAlgo_t algo) {
@@ -675,7 +533,7 @@
     return workspace_size;
   }
 
- private:
+ protected:
   static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) {
     auto workspace_size =
         GetWorkspaceSize(args, CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1);
@@ -892,5 +750,103 @@
   }
 };
 
+template <typename PerfT>
+struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
+  using AlgoT = typename SearchAlgorithmBase<PerfT>::AlgoT;
+
+  template <typename T>
+  static SearchResult<AlgoT> Find(const phi::GPUContext& ctx,
+                                  const ConvArgs& args,
+                                  bool exhaustive_search,
+                                  bool deterministic,
+                                  bool enable_autotune = true) {
+    SearchResult<AlgoT> result;
+    bool use_autotune = false;
+    auto dtype = platform::CudnnDataType<T>::type;
+    SetConvMathType(ctx, dtype, args.cdesc);
+
+    if (deterministic) {
+      result = SearchAlgorithmBase<PerfT>::FindAlgoDeterministic(args);
+    } else {
+      // 1. Once turning on exhaustive FLAGS, always get exhaustive_search.
+      // 2. Once turning on auto-tune, run heuristic (default) before
+      //    auto-tune process, run exhaustive_search during mentioned process.
+      //    Auto-tune is only enabled within the specified range.
+      // 3. After auto-tune process, run cached algorithm if cached, run
+      //    default mode for the rest.
+      auto key = args.Convert2ConvCacheKey<T>();
+      auto& cache = phi::autotune::AutoTuneCache::Instance().GetConv(
+          SearchAlgorithmBase<PerfT>::kAlgoType);
+      bool find_in_cache = cache.Find(key);
+      if (find_in_cache) {
+        auto t = cache.Get(key);
+        result.algo = static_cast<AlgoT>(t.algo);
+        result.workspace_size = t.workspace_size;
+        result.exhaustive_search = t.exhaustive_search;
+      }
+      if (!result.exhaustive_search) {
+        bool need_update_cache = false;
+        // In conv2d_transpose, enable_autotune is set to false because some
+        // algorithms picked by the exhaustive search method produce wrong
+        // results.
+        use_autotune = enable_autotune &&
+                       phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
+        if (exhaustive_search || use_autotune) {
+          // Once autotune is enabled, the autotuned result can rewrite the
+          // previous result in cache found by heuristic method.
+          result =
+              SearchAlgorithmBase<PerfT>::template FindAlgoExhaustiveSearch<T>(
+                  args, ctx);
+          need_update_cache = true;
+        } else if (!find_in_cache) {
+          result = SearchAlgorithmBase<PerfT>::FindAlgoHeuristic(args, ctx);
+          need_update_cache = true;
+        }
+        if (need_update_cache) {
+          phi::autotune::ConvAutoTuneResult node(
+              static_cast<int64_t>(result.algo),
+              result.workspace_size,
+              exhaustive_search || use_autotune);
+          cache.Set(key, node);
+        }
+      }
+    }
+    VLOG(3) << "[cuDNN " << SearchAlgorithmBase<PerfT>::GetPerfName()
+            << "] exhaustive_search=" << exhaustive_search
+            << ", use_autotune=" << use_autotune
+            << ", deterministic=" << deterministic
+            << ", choose algo=" << result.algo
+            << ", workspace=" << ToMegaBytes(result.workspace_size) << " MB";
+    return result;
+  }
+
+  static void SetConvMathType(const phi::GPUContext& ctx,
+                              cudnnDataType_t dtype,
+                              const platform::ConvolutionDescriptor& cdesc) {
+#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
+    if (ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
+          cdesc.desc(), CUDNN_TENSOR_OP_MATH));
+      VLOG(5) << "Enable Tensor Core for FLOAT16";
+#if CUDA_VERSION >= 11000
+#if CUDNN_VERSION_MIN(8, 1, 0)
+    } else if (ctx.GetComputeCapability() >= 80 &&
+               dtype == CUDNN_DATA_BFLOAT16) {
+      VLOG(5) << "Enable Tensor Core for BFLOAT16";
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
+          cdesc.desc(), CUDNN_TENSOR_OP_MATH));
+#endif  // CUDNN_VERSION_MIN(8, 1, 0)
+    } else if (dtype == CUDNN_DATA_FLOAT && !cdesc.allow_tf32_) {
+      VLOG(5) << "Disable TensorFloat (Tensor Core) for FLOAT";
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
+          cdesc.desc(), CUDNN_FMA_MATH));
+#endif  // CUDA_VERSION >= 11000
+    } else {
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
+          cdesc.desc(), CUDNN_DEFAULT_MATH));
+    }
+#endif
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
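The exhaustive_search bit stored with each cache entry is what fixes the autotune cache bug: a heuristic result may later be overwritten by an exhaustive or autotuned one, but an exhaustive result is final and is never re-searched or downgraded. A compressed sketch of that state machine (Entry, the int key, and the std::map are stand-ins for ConvAutoTuneResult and the real AutoTuneCache):

#include <iostream>
#include <map>

struct Entry {
  int algo = 0;
  bool exhaustive_search = false;  // how this entry was obtained
};

std::map<int, Entry> cache;  // stand-in for the conv autotune cache

// Mirrors the decision order in the new SearchAlgorithm::Find().
Entry Lookup(int key, bool exhaustive_or_autotune) {
  Entry result;
  bool found = cache.count(key) > 0;
  if (found) result = cache[key];
  if (!result.exhaustive_search) {  // exhaustive results are final
    if (exhaustive_or_autotune) {
      result = {3, true};   // exhaustive search; may overwrite a cached
      cache[key] = result;  // heuristic entry
    } else if (!found) {
      result = {2, false};  // heuristic; cached only on a miss
      cache[key] = result;
    }
  }
  return result;
}

int main() {
  std::cout << Lookup(0, false).algo << "\n";  // 2: heuristic, cached
  std::cout << Lookup(0, true).algo << "\n";   // 3: autotune upgrades entry
  std::cout << Lookup(0, false).algo << "\n";  // 3: exhaustive entry sticks
  return 0;
}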
diff --git a/paddle/fluid/operators/conv_miopen_helper.h b/paddle/fluid/operators/conv_miopen_helper.h
index 719a28cf32f..648116647b0 100644
--- a/paddle/fluid/operators/conv_miopen_helper.h
+++ b/paddle/fluid/operators/conv_miopen_helper.h
@@ -56,6 +56,9 @@ static void RemovePaddingSlice(const phi::GPUContext& context,
   out_t.device(place) = in_t.slice(offsets, extents);
 }
 
+template <typename PerfT>
+struct SearchAlgorithm {};
+
 template <>
 struct SearchAlgorithm<miopenConvAlgoPerf_t> {
   using perf_t = miopenConvAlgoPerf_t;

diff --git a/paddle/phi/kernels/autotune/cache.h b/paddle/phi/kernels/autotune/cache.h
index aacebc66570..dc639e9f21e 100644
--- a/paddle/phi/kernels/autotune/cache.h
+++ b/paddle/phi/kernels/autotune/cache.h
@@ -56,12 +56,14 @@ struct hash<std::vector<T>> {
 namespace phi {
 namespace autotune {
 
-struct DnnNode {
-  DnnNode() {}
-  explicit DnnNode(int64_t a, size_t size) : algo(a), workspace_size(size) {}
+struct ConvAutoTuneResult {
+  ConvAutoTuneResult() {}
+  ConvAutoTuneResult(int64_t a, size_t size, bool search)
+      : algo(a), workspace_size(size), exhaustive_search(search) {}
 
   int64_t algo;
   size_t workspace_size = 0;
+  bool exhaustive_search = false;
 };
 
 template <typename... Args>
 size_t GetKey(Args&&... args) {

 struct ConvCacheKey {
   ConvCacheKey() {}
-  explicit ConvCacheKey(const std::vector<int64_t>& x_dims,
-                        const std::vector<int64_t>& w_dims,
-                        const std::vector<int>& strides,
-                        const std::vector<int>& paddings,
-                        const std::vector<int>& dilations,
-                        phi::DataType dtype,
-                        int groups,
-                        int64_t data_layout)
-      : x_dims_(x_dims),
-        w_dims_(w_dims),
-        strides_(strides),
-        paddings_(paddings),
-        dilations_(dilations),
-        dtype_(dtype),
-        groups_(groups),
-        data_layout_(data_layout) {}
+  ConvCacheKey(const std::vector<int64_t>& arg_x_dims,
+               const std::vector<int64_t>& arg_w_dims,
+               const std::vector<int>& arg_strides,
+               const std::vector<int>& arg_paddings,
+               const std::vector<int>& arg_dilations,
+               phi::DataType arg_dtype,
+               int arg_groups,
+               int64_t arg_data_layout)
+      : x_dims(arg_x_dims),
+        w_dims(arg_w_dims),
+        strides(arg_strides),
+        paddings(arg_paddings),
+        dilations(arg_dilations),
+        dtype(arg_dtype),
+        groups(arg_groups),
+        data_layout(arg_data_layout) {}
   size_t hash_value() const {
-    return GetKey(x_dims_,
-                  w_dims_,
-                  strides_,
-                  paddings_,
-                  dilations_,
-                  static_cast<int64_t>(dtype_),
-                  groups_,
-                  data_layout_);
+    return GetKey(x_dims,
+                  w_dims,
+                  strides,
+                  paddings,
+                  dilations,
+                  static_cast<int64_t>(dtype),
+                  groups,
+                  data_layout);
   }
-  std::vector<int64_t> x_dims_;
-  std::vector<int64_t> w_dims_;
-  std::vector<int> strides_;
-  std::vector<int> paddings_;
-  std::vector<int> dilations_;
-  phi::DataType dtype_;
-  int groups_;
-  int64_t data_layout_;
+
+  std::vector<int64_t> x_dims;
+  std::vector<int64_t> w_dims;
+  std::vector<int> strides;
+  std::vector<int> paddings;
+  std::vector<int> dilations;
+  phi::DataType dtype;
+  int groups;
+  int64_t data_layout;
 };
 
 struct ConvCacheKeyHash {
@@ -118,14 +121,14 @@ struct ConvCacheKeyHash {
 struct ConvCacheKeyEqual {
   size_t operator()(const ConvCacheKey& first,
                     const ConvCacheKey& second) const {
-    if (first.x_dims_ != second.x_dims_) return false;
-    if (first.w_dims_ != second.w_dims_) return false;
-    if (first.strides_ != second.strides_) return false;
-    if (first.paddings_ != second.paddings_) return false;
-    if (first.dilations_ != second.dilations_) return false;
-    if (first.dtype_ != second.dtype_) return false;
-    if (first.groups_ != second.groups_) return false;
-    if (first.data_layout_ != second.data_layout_) return false;
+    if (first.x_dims != second.x_dims) return false;
+    if (first.w_dims != second.w_dims) return false;
+    if (first.strides != second.strides) return false;
+    if (first.paddings != second.paddings) return false;
+    if (first.dilations != second.dilations) return false;
+    if (first.dtype != second.dtype) return false;
+    if (first.groups != second.groups) return false;
+    if (first.data_layout != second.data_layout) return false;
     return true;
   }
 
@@ -135,7 +138,7 @@ class CudnnAlgorithmsCacheMap {
  public:
   CudnnAlgorithmsCacheMap() : cache_mutex_(new std::mutex()) { hash_.clear(); }
 
-  DnnNode Get(const ConvCacheKey& key) {
+  ConvAutoTuneResult Get(const ConvCacheKey& key) {
     std::lock_guard<std::mutex> lock(*cache_mutex_);
     PADDLE_ENFORCE_NE(
         hash_.find(key),
@@ -163,7 +166,7 @@
     cache_misses_ = 0;
   }
 
-  void Set(const ConvCacheKey& key, DnnNode algo) {
+  void Set(const ConvCacheKey& key, ConvAutoTuneResult algo) {
     std::lock_guard<std::mutex> lock(*cache_mutex_);
     if (hash_.size() > static_cast<size_t>(FLAGS_search_cache_max_number)) {
       hash_.clear();
@@ -188,7 +191,10 @@
   int64_t Size() const { return hash_.size(); }
 
  private:
-  std::unordered_map<ConvCacheKey, DnnNode, ConvCacheKeyHash, ConvCacheKeyEqual>
+  std::unordered_map<ConvCacheKey,
+                     ConvAutoTuneResult,
+                     ConvCacheKeyHash,
+                     ConvCacheKeyEqual>
       hash_;
   std::shared_ptr<std::mutex> cache_mutex_;
@@ -289,19 +295,8 @@ class AutoTuneCache {
     return auto_tune_map_[static_cast<int64_t>(algo_type)];
   }
 
-  CudnnAlgorithmsCacheMap& GetConvForward() {
-    return cudnn_auto_tune_map_[static_cast<int64_t>(
-        AlgorithmType::kConvForward)];
-  }
-
-  CudnnAlgorithmsCacheMap& GetConvBackwardData() {
-    return cudnn_auto_tune_map_[static_cast<int64_t>(
-        AlgorithmType::kConvBackwardData)];
-  }
-
-  CudnnAlgorithmsCacheMap& GetConvBackwardFilter() {
-    return cudnn_auto_tune_map_[static_cast<int64_t>(
-        AlgorithmType::kConvBackwardFilter)];
+  CudnnAlgorithmsCacheMap& GetConv(const AlgorithmType& algo_type) {
+    return cudnn_auto_tune_map_[static_cast<int64_t>(algo_type)];
   }
 
   AlgorithmsCacheMap& GetTranspose() { return Get(AlgorithmType::kTranspose); }
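ConvCacheKey::hash_value() funnels every field of the key into the variadic GetKey helper. The body of GetKey is not part of this diff; a typical boost-style hash-combine of that shape (purely illustrative, with hypothetical HashCombine and integer arguments) looks like:

#include <cstddef>
#include <functional>
#include <iostream>
#include <utility>

// Hypothetical stand-in for the helper behind phi::autotune::GetKey:
// fold each field's hash into one seed.
inline void HashCombine(std::size_t* seed) {}

template <typename T, typename... Rest>
void HashCombine(std::size_t* seed, const T& v, Rest... rest) {
  std::hash<T> hasher;
  *seed ^= hasher(v) + 0x9e3779b9 + (*seed << 6) + (*seed >> 2);
  HashCombine(seed, rest...);
}

template <typename... Args>
std::size_t GetKey(Args&&... args) {
  std::size_t seed = 0;
  HashCombine(&seed, std::forward<Args>(args)...);
  return seed;
}

int main() {
  // Two conv configurations differing only in data_layout must hash
  // differently, which is why data_layout is part of the key.
  std::cout << (GetKey(4, 224, 3, /*layout=*/0) !=
                GetKey(4, 224, 3, /*layout=*/1))
            << "\n";  // prints 1
  return 0;
}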
diff --git a/paddle/phi/kernels/autotune/cache_test.cc b/paddle/phi/kernels/autotune/cache_test.cc
index 29affd45f0f..18454ad3e19 100644
--- a/paddle/phi/kernels/autotune/cache_test.cc
+++ b/paddle/phi/kernels/autotune/cache_test.cc
@@ -25,7 +25,8 @@ enum ConvAlgos { GEMMKernel = 0, CuDNNKernel_1 = 1, CuDNNKernel_2 = 2 };
 
 TEST(AlgosCache, AlgosCache) {
   auto autotune_cache = phi::autotune::AutoTuneCache::Instance();
-  auto& cache = autotune_cache.GetConvForward();
+  auto& cache =
+      autotune_cache.GetConv(phi::autotune::AlgorithmType::kConvForward);
 
   std::vector<int64_t> x_shape = {4, 224, 224, 3};
   std::vector<int64_t> w_shape = {32, 3, 3, 3};
@@ -37,7 +38,8 @@
   phi::autotune::ConvCacheKey key(
       x_shape, w_shape, paddings, strides, dilations, dtype, 0, 0);
   EXPECT_EQ(cache.Find(key), false);
-  phi::autotune::DnnNode node(static_cast<int64_t>(ConvAlgos::GEMMKernel), 0);
+  phi::autotune::ConvAutoTuneResult node(
+      static_cast<int64_t>(ConvAlgos::GEMMKernel), 0, false);
   cache.Set(key, node);
   EXPECT_EQ(cache.Size(), 1);
   EXPECT_EQ(cache.Find(key), true);
@@ -48,8 +50,8 @@
   phi::autotune::ConvCacheKey key1(
       x_shape, w_shape, paddings, strides, dilations, dtype, 0, 1);
   EXPECT_EQ(cache.Find(key1), false);
-  phi::autotune::DnnNode node1(static_cast<int64_t>(ConvAlgos::CuDNNKernel_1),
-                               0);
+  phi::autotune::ConvAutoTuneResult node1(
+      static_cast<int64_t>(ConvAlgos::CuDNNKernel_1), 0, false);
   cache.Set(key1, node1);
   EXPECT_EQ(cache.Size(), 2);
   EXPECT_EQ(cache.CacheHits(), 1);
diff --git a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu
index fb9580427e1..e61f58450b3 100644
--- a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu
+++ b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu
@@ -336,7 +336,7 @@ void ConvCudnnGradGradKernel(
 #else
       using search1 =
           paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
-      fwd_result1 = search1::Find<T>(args1, exhaustive_search, false, ctx);
+      fwd_result1 = search1::Find<T>(ctx, args1, exhaustive_search, false);
       workspace_size = search1::GetWorkspaceSize(args1, fwd_result1.algo);
 #endif
     }
@@ -364,7 +364,7 @@
 #else
       using search2 =
           paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
-      fwd_result2 = search2::Find<T>(args2, exhaustive_search, false, ctx);
+      fwd_result2 = search2::Find<T>(ctx, args2, exhaustive_search, false);
       workspace_size = std::max(
           workspace_size, search2::GetWorkspaceSize(args2, fwd_result2.algo));
 #endif
@@ -394,7 +394,7 @@
       using search3 = paddle::operators::SearchAlgorithm<
           cudnnConvolutionBwdFilterAlgoPerf_t>;
       filter_result =
-          search3::Find<T>(args3, exhaustive_search, deterministic, ctx);
+          search3::Find<T>(ctx, args3, exhaustive_search, deterministic);
       workspace_size = std::max(
           workspace_size, search3::GetWorkspaceSize(args3, filter_result.algo));
 #endif
@@ -424,7 +424,7 @@
       using search4 =
          paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
       data_result =
-          search4::Find<T>(args4, exhaustive_search, deterministic, ctx);
+          search4::Find<T>(ctx, args4, exhaustive_search, deterministic);
       workspace_size = std::max(
           workspace_size, search4::GetWorkspaceSize(args4, data_result.algo));
 #endif

diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
index bc7a8b4f378..2d61ec6e62c 100644
--- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
@@ -373,7 +373,7 @@ void ConvCudnnGradKernel(const Context& ctx,
 #else
     using search1 =
         paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
-    bwd_result = search1::Find<T>(args1, exhaustive_search, deterministic, ctx);
+    bwd_result = search1::Find<T>(ctx, args1, exhaustive_search, deterministic);
     workspace_size_d = std::max(workspace_size_d, bwd_result.workspace_size);
 #endif
   }
@@ -402,7 +402,7 @@
     using search2 =
         paddle::operators::SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
     filter_result =
-        search2::Find<T>(args2, exhaustive_search, deterministic, ctx);
+        search2::Find<T>(ctx, args2, exhaustive_search, deterministic);
     VLOG(3) << "filter algo: " << filter_result.algo << ", time "
             << filter_result.time;
     workspace_size_w = std::max(workspace_size_w, filter_result.workspace_size);

diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu
index aa591a34a43..7a6e8d8148f 100644
--- a/paddle/phi/kernels/gpudnn/conv_kernel.cu
+++ b/paddle/phi/kernels/gpudnn/conv_kernel.cu
@@ -25,7 +25,6 @@
 #endif
 
 #include "paddle/fluid/platform/cudnn_workspace_helper.h"
-#include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/profiler.h"
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/float16.h"
@@ -56,8 +55,7 @@ void ConvCudnnKernel(const Context& ctx,
   bool exhaustive_search =
       FLAGS_cudnn_exhaustive_search || exhaustive_search_t;
   bool deterministic = FLAGS_cudnn_deterministic;
-  auto exhaustive_deterministic = exhaustive_search && deterministic;
-  PADDLE_ENFORCE_EQ(exhaustive_deterministic,
+  PADDLE_ENFORCE_EQ(exhaustive_search && deterministic,
                     false,
                     phi::errors::InvalidArgument(
                         "Can't set exhaustive_search True and "
@@ -315,7 +313,7 @@
     paddle::operators::SearchResult<cudnnConvolutionFwdAlgo_t> fwd_result;
     using search =
        paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
-    fwd_result = search::Find<T>(args, exhaustive_search, deterministic, ctx);
+    fwd_result = search::Find<T>(ctx, args, exhaustive_search, deterministic);
    workspace_size = fwd_result.workspace_size;
 #endif
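All call sites migrate to the new Find signature: the device context now comes first and the ConvArgs second, the exhaustive/deterministic flags keep their order, and a new trailing enable_autotune parameter defaults to true so ordinary conv kernels only reorder their arguments. A compilable sketch of the two calling conventions (Ctx, Args, and Search are stand-ins, not the real Paddle types):

#include <iostream>

struct Ctx {};
struct Args {};

// Stand-in showing the new parameter order of SearchAlgorithm::Find.
struct Search {
  static int Find(const Ctx&, const Args&, bool exhaustive,
                  bool deterministic, bool enable_autotune = true) {
    // ctx leads; enable_autotune defaults to true, so only callers that
    // must opt out of autotune (the transpose kernels) pass it at all.
    return deterministic ? 1 : (exhaustive ? 3 : 2);
  }
};

int main() {
  Ctx ctx;
  Args args;
  // conv kernels: rely on the default enable_autotune = true.
  std::cout << Search::Find(ctx, args, false, false) << "\n";         // 2
  // conv_transpose kernels: pass false to opt out of autotune.
  std::cout << Search::Find(ctx, args, false, false, false) << "\n";  // 2
  return 0;
}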
diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu
index 3acb1604f4a..d05bd58e330 100644
--- a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu
+++ b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu
@@ -230,7 +230,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
 #else
     using search1 =
         paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
-    fwd_result = search1::Find<T>(args1, false, deterministic, ctx);
+    fwd_result = search1::Find<T>(ctx, args1, false, deterministic, false);
     workspace_size = std::max(
         workspace_size, search1::GetWorkspaceSize(args1, fwd_result.algo));
 #endif
@@ -257,7 +257,7 @@
 #else
     using search2 =
         paddle::operators::SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
-    filter_result = search2::Find<T>(args2, false, deterministic, ctx);
+    filter_result = search2::Find<T>(ctx, args2, false, deterministic, false);
     workspace_size = std::max(
         workspace_size, search2::GetWorkspaceSize(args2, filter_result.algo));
 #endif
@@ -710,7 +710,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
 #else
     using search1 =
        paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
-    bwd_result1 = search1::Find<T>(args1, false, deterministic, ctx);
+    bwd_result1 = search1::Find<T>(ctx, args1, false, deterministic, false);
     workspace_size = search1::GetWorkspaceSize(args1, bwd_result1.algo);
 #endif
@@ -734,7 +734,7 @@
 #else
     using search2 =
        paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
-    bwd_result2 = search2::Find<T>(args2, false, deterministic, ctx);
+    bwd_result2 = search2::Find<T>(ctx, args2, false, deterministic, false);
     workspace_size = std::max(
         workspace_size, search2::GetWorkspaceSize(args2, bwd_result2.algo));
 #endif
@@ -761,7 +761,7 @@
 #else
     using search3 =
        paddle::operators::SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
-    filter_result = search3::Find<T>(args3, false, deterministic, ctx);
+    filter_result = search3::Find<T>(ctx, args3, false, deterministic, false);
     workspace_size = std::max(
         workspace_size, search3::GetWorkspaceSize(args3, filter_result.algo));
 #endif
@@ -789,7 +789,7 @@
 #else
     using search4 =
         paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
-    fwd_result = search4::Find<T>(args4, false, deterministic, ctx);
+    fwd_result = search4::Find<T>(ctx, args4, false, deterministic, false);
     workspace_size = std::max(
         workspace_size, search4::GetWorkspaceSize(args4, fwd_result.algo));
 #endif
diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu
index 6fc1e2eff13..84332f0ccb8 100644
--- a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu
+++ b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu
@@ -230,7 +230,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
   paddle::operators::SearchResult<cudnnConvolutionBwdDataAlgo_t> bwd_result;
   using search =
       paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
-  bwd_result = search::Find<T>(args, false, deterministic, ctx);
+  bwd_result = search::Find<T>(ctx, args, false, deterministic, false);
   workspace_size =
       std::max(workspace_size, search::GetWorkspaceSize(args, bwd_result.algo));
 #endif
diff --git a/tools/check_op_benchmark_result.py b/tools/check_op_benchmark_result.py
index 73075125ac4..aaf194ff95e 100644
--- a/tools/check_op_benchmark_result.py
+++ b/tools/check_op_benchmark_result.py
@@ -72,15 +72,20 @@ def check_speed_result(case_name, develop_data, pr_data, pr_result):
     """
     pr_gpu_time = pr_data.get("gpu_time")
     develop_gpu_time = develop_data.get("gpu_time")
-    gpu_time_diff = (pr_gpu_time - develop_gpu_time) / develop_gpu_time
+    if develop_gpu_time != 0.0:
+        gpu_time_diff = (pr_gpu_time - develop_gpu_time) / develop_gpu_time
+        gpu_time_diff_str = "{:.5f}".format(gpu_time_diff * 100)
+    else:
+        gpu_time_diff = None
+        gpu_time_diff_str = ""
 
     pr_total_time = pr_data.get("total")
     develop_total_time = develop_data.get("total")
     total_time_diff = (pr_total_time - develop_total_time) / develop_total_time
 
     logging.info("------ OP: %s ------" % case_name)
-    logging.info("GPU time change: %.5f%% (develop: %.7f -> PR: %.7f)" %
-                 (gpu_time_diff * 100, develop_gpu_time, pr_gpu_time))
+    logging.info("GPU time change: %s (develop: %.7f -> PR: %.7f)" %
+                 (gpu_time_diff_str, develop_gpu_time, pr_gpu_time))
     logging.info("Total time change: %.5f%% (develop: %.7f -> PR: %.7f)" %
                  (total_time_diff * 100, develop_total_time, pr_total_time))
     logging.info("backward: %s" % pr_result.get("backward"))
@@ -196,7 +201,8 @@ if __name__ == "__main__":
                                          args.develop_logs_dir)
     check_path_exists(args.pr_logs_dir)
 
-    for log_file in os.listdir(args.pr_logs_dir):
+    pr_log_files = os.listdir(args.pr_logs_dir)
+    for log_file in sorted(pr_log_files):
         develop_result = develop_result_dict.get(log_file)
         pr_result = parse_log_file(os.path.join(args.pr_logs_dir, log_file))
         if develop_result is None or pr_result is None:
-- 
GitLab
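A worked reading of the guarded computation above, with hypothetical numbers: if the develop gpu_time is 0.2000 ms and the PR's is 0.1500 ms, then gpu_time_diff = (0.1500 - 0.2000) / 0.2000 = -0.25, and the log prints "-25.00000" as the percentage. If the develop gpu_time is 0.0 (a missing or failed baseline measurement), the old code raised ZeroDivisionError; the new branch instead leaves the percentage string empty and still logs both absolute times, and sorting the PR log files makes the report order deterministic across runs.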