From b937cdc51794ee5112f9ec948c4518b9931b72c9 Mon Sep 17 00:00:00 2001 From: limingshu <61349199+JamesLim-sy@users.noreply.github.com> Date: Sat, 9 Apr 2022 13:19:42 +0800 Subject: [PATCH] Autotune the workspace_size_limit in conv. (#40338) * Using the maximum workspace_size of all alogirhms to limit the workspace size in exhaustive search mode. * Use the system cudaMalloc and cudaFree to allocate workspace during searching. * Enable switch of two kind of workspace setting methods. Co-authored-by: Liu Yiqun --- paddle/fluid/framework/conv_search_cache.h | 1 - paddle/fluid/operators/conv_base_helper.h | 99 +++ paddle/fluid/operators/conv_cudnn_helper.h | 613 +++++++++--------- paddle/fluid/operators/conv_cudnn_op_cache.h | 2 +- paddle/fluid/operators/conv_miopen_helper.h | 72 +- .../fused/fusion_conv_inception_op.cu | 2 - paddle/fluid/platform/device/gpu/gpu_info.cc | 4 + paddle/fluid/platform/device_context.cc | 7 +- paddle/fluid/platform/flags.cc | 7 +- paddle/phi/backends/gpu/gpu_context.cc | 40 +- paddle/phi/backends/gpu/gpu_context.h | 16 +- paddle/phi/kernels/autotune/CMakeLists.txt | 4 +- .../kernels/gpudnn/conv_grad_grad_kernel.cu | 71 +- paddle/phi/kernels/gpudnn/conv_grad_kernel.cu | 43 +- paddle/phi/kernels/gpudnn/conv_kernel.cu | 18 +- .../gpudnn/conv_transpose_grad_kernel.cu | 107 ++- .../kernels/gpudnn/conv_transpose_kernel.cu | 13 +- paddle/phi/kernels/impl/conv_cudnn_impl.h | 2 +- .../tests/unittests/test_switch_autotune.py | 12 + 19 files changed, 620 insertions(+), 513 deletions(-) create mode 100644 paddle/fluid/operators/conv_base_helper.h diff --git a/paddle/fluid/framework/conv_search_cache.h b/paddle/fluid/framework/conv_search_cache.h index 51446f287e..4da2aeb4d0 100644 --- a/paddle/fluid/framework/conv_search_cache.h +++ b/paddle/fluid/framework/conv_search_cache.h @@ -16,7 +16,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator_kernel_configs.h" - #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { diff --git a/paddle/fluid/operators/conv_base_helper.h b/paddle/fluid/operators/conv_base_helper.h new file mode 100644 index 0000000000..c664d1935f --- /dev/null +++ b/paddle/fluid/operators/conv_base_helper.h @@ -0,0 +1,99 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/conv_search_cache.h" +#include "paddle/fluid/operators/conv_cudnn_op_cache.h" +#include "paddle/phi/backends/gpu/gpu_context.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DataLayout = platform::DataLayout; +using framework::AlgorithmsCache; +using framework::ConvSearchCache; + +template +using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; + +// As the basic for SearchAlgorithm struct. +template +struct SearchAlgorithm {}; + +// As the container of searchAlgorithm::Find() result. +template +struct SearchResult { + public: + AlgoT algo = static_cast(0); + float time = -1.f; + size_t workspace_size = 0; +}; + +// As the container of conv relevant descriptors. +template +struct ConvArgsBase { + HandleT handle; + platform::TensorDescriptor idesc, odesc; + platform::FilterDescriptor wdesc; + platform::ConvolutionDescriptor cdesc; + const framework::Tensor *x, *w, *o; + DataT cudnn_dtype; + + // strides + std::vector s; + // paddings + std::vector p; + // dilations + std::vector d; + + ConvArgsBase(const framework::Tensor* x, const framework::Tensor* w, + const framework::Tensor* o, const std::vector s, + const std::vector p, const std::vector d, DataT dtype) + : x(x), w(w), o(o), s(s), p(p), d(d), cudnn_dtype(dtype) {} +}; + +static inline void GetNCDHW(const framework::DDim& dims, + const DataLayout& layout, int* N, int* C, int* D, + int* H, int* W) { + *N = dims[0]; + *C = layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1]; + int i = layout == DataLayout::kNCHW ? 0 : 1; + if (dims.size() == 5) { + *D = dims[2 - i]; + *H = dims[3 - i]; + *W = dims[4 - i]; + } else { + *D = 1; + *H = dims[2 - i]; + *W = dims[3 - i]; + } +} + +template +static std::ostream& operator<<(std::ostream& out, const std::vector& v) { + out << "["; + for (auto const& tmp : v) out << tmp << ","; + out << "]"; + return out; +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index 4e6fda3d09..3c29c60b21 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -14,44 +14,15 @@ limitations under the License. */ #pragma once -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/conv_search_cache.h" -#include "paddle/fluid/framework/operator_kernel_configs.h" -#include "paddle/fluid/operators/conv_cudnn_op_cache.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/operators/conv_base_helper.h" #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" -#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" -#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" namespace paddle { namespace operators { -using Tensor = framework::Tensor; -using DataLayout = platform::DataLayout; -template -using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; -using framework::AlgorithmsCache; -static inline void GetNCDHW(const framework::DDim& dims, - const DataLayout& layout, int* N, int* C, int* D, - int* H, int* W) { - *N = dims[0]; - *C = layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1]; - int i = layout == DataLayout::kNCHW ? 0 : 1; - if (dims.size() == 5) { - *D = dims[2 - i]; - *H = dims[3 - i]; - *W = dims[4 - i]; - } else { - *D = 1; - *H = dims[2 - i]; - *W = dims[3 - i]; - } -} +using ConvArgs = ConvArgsBase; template static void RemovePaddingSlice(const phi::GPUContext& context, @@ -68,121 +39,103 @@ static void RemovePaddingSlice(const phi::GPUContext& context, extents[i] = new_out_dims[i]; } - int start; for (size_t i = 0; i < axes.size(); ++i) { - start = starts[i]; + int start = starts[i]; if (start < 0) { start = (start + in_dims[axes[i]]); } start = std::max(start, 0); offsets[axes[i]] = start; } + auto in_t = framework::EigenTensor::From( *input); - auto out_t = framework::EigenTensor::From( *out, new_out_dims); - EigenSlice, T, D>::Eval(place, out_t, in_t, - offsets, extents); + + phi::funcs::EigenSlice, T, D>::Eval( + place, out_t, in_t, offsets, extents); } -template -std::ostream& operator<<(std::ostream& out, const std::vector& v) { - out << "["; - for (auto const& tmp : v) out << tmp << ","; - out << "]"; - return out; +static inline double ToMegaBytes(size_t bytes) { + return static_cast(bytes) / (1 << 20); } -inline int MaxBwdFilterAlgos(cudnnHandle_t cudnn_handle) { - int max_algos = 0; -#if CUDNN_VERSION_MIN(7, 0, 1) - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithmMaxCount( - cudnn_handle, &max_algos)); -#endif - return max_algos; +static inline bool UseFixedWorkspace() { + return FLAGS_conv_workspace_size_limit >= 0; } -template -void ChooseAlgoByWorkspace(PerfType* perf_results, size_t perf_num, - size_t workspace_byte, AlgoType* algo) { - for (size_t i = 0; i < perf_num; ++i) { - auto result = perf_results[i]; - if (result.status == CUDNN_STATUS_SUCCESS && - result.memory < workspace_byte) { - *algo = result.algo; - VLOG(3) << " algo: " << result.algo << ", time: " << result.time - << " ms, wksp = " << result.memory - << ", status = " << result.status; - return; - } +static size_t CaclWorkspaceLimitInBytes(const phi::GPUContext& ctx) { + if (!UseFixedWorkspace()) { + int device_id = platform::GetCurrentDeviceId(); + int64_t allocated = memory::StatGetCurrentValue("Allocated", device_id); + int64_t reserved = memory::StatGetCurrentValue("Reserved", device_id); + int64_t availble = platform::GpuAvailableMemToAlloc(); + int64_t cur_workspace_size = ctx.cudnn_workspace_handle().WorkspaceSize(); + VLOG(3) << "[memory] allocated=" << ToMegaBytes(allocated) + << " MB, reserved=" << ToMegaBytes(reserved) + << " MB, available_to_alloc=" << ToMegaBytes(availble) + << " MB, current_workspace_size=" << ToMegaBytes(cur_workspace_size) + << " MB."; + return std::max(std::max(availble, cur_workspace_size), + reserved - allocated); + } else { + return FLAGS_conv_workspace_size_limit * 1024 * 1024; } - VLOG(3) << "Can not find alog that requires memory < " - << static_cast(workspace_byte) / (1 << 20) << " MB"; } -template -void ChooseAlgo(const std::vector& perf_results, - size_t workspace_byte, AlgoType* algo) { - VLOG(3) << "=========BwdFilterAlgo Perf result========="; - for (const auto& result : perf_results) { - auto math_type_str = "False"; - if (result.mathType == CUDNN_TENSOR_OP_MATH) { - math_type_str = "True"; - } - VLOG(3) << " algo: " << result.algo << ", TensorCore: " << math_type_str - << ", time: " << result.time << " ms" - << ", wksp = " << result.memory << ", status = " << result.status; +template +std::string GetPerfResultString(std::string prefix, + const std::vector& perf_results, + int actual_algo_count, size_t workspace_limit) { + std::ostringstream out; + out << prefix << " (workspace limit=" << ToMegaBytes(workspace_limit) + << " MB):\n"; + for (int i = 0; i < actual_algo_count; ++i) { + const auto& result = perf_results[i]; + auto math_type_str = (result.mathType == CUDNN_TENSOR_OP_MATH) ? "T" : "F"; + out << " algo=" << result.algo << ": tensor_core=" << math_type_str + << ", time=" << result.time + << " ms, memory=" << ToMegaBytes(result.memory) + << " MB, status=" << result.status << "\n"; } + return out.str(); +} - for (size_t i = 0; i != perf_results.size(); ++i) { - const auto& result = perf_results[i]; +template +void ChooseAlgoByWorkspace(const std::vector& perf_results, + size_t workspace_limit, + SearchResult* algo_result) { + for (size_t i = 0; i < perf_results.size(); ++i) { + auto result = perf_results[i]; if (result.status == CUDNN_STATUS_SUCCESS && - (result.memory <= workspace_byte)) { - if ((result.mathType == CUDNN_TENSOR_OP_MATH) && - (i != perf_results.size() - 1)) { - const auto& next_result = perf_results[i + 1]; - if (next_result.status == CUDNN_STATUS_SUCCESS && - next_result.algo == result.algo && - next_result.memory == result.memory && - next_result.mathType != CUDNN_TENSOR_OP_MATH && - next_result.time < 1.01 * result.time) { - // Skip over this result- it's not really a Tensor Core algo. - // Because it is only 1% performance difference. - // Prefer to choose the next equivalent non-Tensor Core algo. - continue; - } - } - *algo = result.algo; - auto math_type_str = "0"; - if (result.mathType == CUDNN_TENSOR_OP_MATH) { - math_type_str = "1"; - } - VLOG(3) << " choose algo: " << result.algo << ", TC: " << math_type_str - << ", time: " << result.time << " ms" - << ", wksp = " << result.memory << ", status = " << result.status; - break; + result.memory < workspace_limit) { + algo_result->algo = result.algo; + algo_result->time = result.time; + algo_result->workspace_size = result.memory; + VLOG(3) << " algo=" << result.algo << ", time=" << result.time + << " ms, memory=" << ToMegaBytes(result.memory) + << " MB (limit=" << ToMegaBytes(workspace_limit) + << " MB), status=" << result.status; + return; } } + VLOG(3) << "Can not find an algorithm that requires memory < " + << ToMegaBytes(workspace_limit) << " MB"; } -using framework::ConvSearchCache; - static void SetConvMathType(const phi::GPUContext& ctx, cudnnDataType_t dtype, const platform::ConvolutionDescriptor& cdesc) { #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) - auto& dev_ctx = ctx; - if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) { + if (ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( cdesc.desc(), CUDNN_TENSOR_OP_MATH)); VLOG(5) << "use cudnn_tensor_op_math"; #if CUDA_VERSION >= 11000 #if CUDNN_VERSION_MIN(8, 1, 0) - } else if (dev_ctx.GetComputeCapability() >= 80 && - dtype == CUDNN_DATA_BFLOAT16) { + } else if (ctx.GetComputeCapability() >= 80 && dtype == CUDNN_DATA_BFLOAT16) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( cdesc.desc(), CUDNN_TENSOR_OP_MATH)); #endif // CUDNN_VERSION_MIN(8, 1, 0) @@ -198,76 +151,49 @@ static void SetConvMathType(const phi::GPUContext& ctx, cudnnDataType_t dtype, #endif } -struct ConvArgs { - cudnnHandle_t handle; - platform::TensorDescriptor idesc, odesc; - platform::FilterDescriptor wdesc; - platform::ConvolutionDescriptor cdesc; - const framework::Tensor *x, *w, *o; - cudnnDataType_t cudnn_dtype; - - // strides - std::vector s; - // paddings - std::vector p; - // dilations - std::vector d; - - ConvArgs(const framework::Tensor* x, const framework::Tensor* w, - const framework::Tensor* o, const std::vector s, - const std::vector p, const std::vector d, - cudnnDataType_t dtype) - : x(x), w(w), o(o), s(s), p(p), d(d), cudnn_dtype(dtype) {} -}; - -template -struct SearchAlgorithm {}; - template <> struct SearchAlgorithm { - using perf_t = cudnnConvolutionFwdAlgoPerf_t; - using algo_t = cudnnConvolutionFwdAlgo_t; + using PerfT = cudnnConvolutionFwdAlgoPerf_t; + using AlgoT = cudnnConvolutionFwdAlgo_t; template - static algo_t Find(const ConvArgs& args, bool exhaustive_search, - bool deterministic, const phi::GPUContext& ctx) { + static SearchResult Find(const ConvArgs& args, bool exhaustive_search, + bool deterministic, + const phi::GPUContext& ctx) { + SearchResult result; auto dtype = platform::CudnnDataType::type; - bool has_got_workspace_size = true; - size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; - size_t workspace_size = 0; - algo_t algo; + size_t workspace_size_limit = CaclWorkspaceLimitInBytes(ctx); SetConvMathType(ctx, dtype, args.cdesc); if (!exhaustive_search && !deterministic) { #if CUDNN_VERSION >= 7001 - int perf_count; + int actual_perf_count; int best_algo_idx = 0; - std::unique_ptr perf_results(new perf_t[kNUM_CUDNN_FWD_ALGS]); + std::vector perf_results(kNUM_CUDNN_FWD_ALGS); PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7( args.handle, args.idesc.desc(), args.wdesc.desc(), args.cdesc.desc(), args.odesc.desc(), kNUM_CUDNN_FWD_ALGS, - &perf_count, perf_results.get())); - algo = (perf_results.get())[best_algo_idx].algo; - workspace_size = (perf_results.get())[best_algo_idx].memory; + &actual_perf_count, perf_results.data())); + result.algo = perf_results[best_algo_idx].algo; + result.workspace_size = perf_results[best_algo_idx].memory; - if (workspace_size > workspace_size_limit) { + if (result.workspace_size > workspace_size_limit) { #if CUDNN_VERSION >= 8000 // cudnnGetConvolutionForwardAlgorithm is removed in CUDNN-8 - ChooseAlgoByWorkspace(perf_results.get(), - kNUM_CUDNN_FWD_ALGS, - workspace_size_limit, &algo); + ChooseAlgoByWorkspace(perf_results, workspace_size_limit, + &result); #else - VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue " - "the workspace size request(" - << workspace_size << ") exceeds the limit(" + VLOG(3) << "Fallback to non-v7 method to find conv algorithm " + "becasue the workspace size request(" + << result.workspace_size << ") exceeds the limit(" << workspace_size_limit << ")"; PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionForwardAlgorithm( args.handle, args.idesc.desc(), args.wdesc.desc(), args.cdesc.desc(), args.odesc.desc(), CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &algo)); + workspace_size_limit, &(result.algo))); #endif } #else @@ -276,30 +202,30 @@ struct SearchAlgorithm { args.handle, args.idesc.desc(), args.wdesc.desc(), args.cdesc.desc(), args.odesc.desc(), CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &algo)); + workspace_size_limit, &(result.algo))); #endif - VLOG(3) << "choose algo " << algo; } else if (deterministic) { - algo = static_cast(1); + result.algo = static_cast(1); } else { - auto& dev_ctx = ctx; - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - - AlgorithmsCache& algo_cache = - *(framework::ConvSearchCache::Instance().GetForward()); - + auto workspace_handle = ctx.cudnn_workspace_handle(); auto x_dims = phi::vectorize(args.x->dims()); auto w_dims = phi::vectorize(args.w->dims()); - VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t:" << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s" << args.s << ", args.p" << args.p << ", args.d" << args.d; - algo = algo_cache.GetAlgorithm( + AlgorithmsCache& algo_cache = + *(framework::ConvSearchCache::Instance().GetForward()); + + result.algo = algo_cache.GetAlgorithm( x_dims, w_dims, args.s, args.p, args.d, 0, static_cast(args.cudnn_dtype), [&]() { int returned_algo_count; - std::array perf_stat; + std::vector perf_results(kNUM_CUDNN_FWD_ALGS); + size_t max_workspace_size = + FindMaxWorkspaceSize(args, workspace_size_limit); + VLOG(4) << "max_workspace_size=" << ToMegaBytes(max_workspace_size) + << " MB"; auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( @@ -308,25 +234,28 @@ struct SearchAlgorithm { args.wdesc.desc(), args.w->data(), args.cdesc.desc(), args.odesc.desc(), const_cast(args.o->data()), kNUM_CUDNN_FWD_ALGS, &returned_algo_count, - perf_stat.data(), cudnn_workspace_ptr, - workspace_size_limit)); + perf_results.data(), cudnn_workspace_ptr, + max_workspace_size)); }; - workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit); - - VLOG(3) << "FwdAlgo Perf result: (algo: stat, time, memory)"; - for (int i = 0; i < returned_algo_count; ++i) { - const auto& stat = perf_stat[i]; - VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time - << " " << stat.memory; - } - return perf_stat[0].algo; + workspace_handle.RunFuncSync(cudnn_find_func, max_workspace_size, + UseFixedWorkspace()); + + VLOG(4) << GetPerfResultString( + "[Exhaustive Search] FwdAlgo Perf result", perf_results, + returned_algo_count, workspace_size_limit); + result.time = perf_results[0].time; + return perf_results[0].algo; }); } - VLOG(3) << "choose algo " << algo; - return algo; + VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search + << ", deterministic=" << deterministic + << ", choose algo=" << result.algo << ", workspace=" + << ToMegaBytes(GetWorkspaceSize(args, result.algo)) << " MB"; + return result; } - static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) { + static size_t GetWorkspaceSize(const ConvArgs& args, + cudnnConvolutionFwdAlgo_t algo) { size_t workspace_size = 0; PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( @@ -334,68 +263,84 @@ struct SearchAlgorithm { args.cdesc.desc(), args.odesc.desc(), algo, &workspace_size)); return workspace_size; } + + private: + static size_t FindMaxWorkspaceSize(const ConvArgs& args, + size_t workspace_size_limit) { + if (!UseFixedWorkspace()) { + size_t max_workspace_size = 0; + for (size_t algo = 0; algo < kNUM_CUDNN_FWD_ALGS; ++algo) { + size_t workspace_size = 0; + auto status = + platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( + args.handle, args.idesc.desc(), args.wdesc.desc(), + args.cdesc.desc(), args.odesc.desc(), + static_cast(algo), &workspace_size); + if (status == CUDNN_STATUS_SUCCESS) { + max_workspace_size = std::max(workspace_size, max_workspace_size); + } + } + return std::min(max_workspace_size, workspace_size_limit); + } else { + return workspace_size_limit; + } + } }; template <> struct SearchAlgorithm { - using perf_t = cudnnConvolutionBwdDataAlgoPerf_t; - using algo_t = cudnnConvolutionBwdDataAlgo_t; + using PerfT = cudnnConvolutionBwdDataAlgoPerf_t; + using AlgoT = cudnnConvolutionBwdDataAlgo_t; template - static algo_t Find(const ConvArgs& args, bool exhaustive_search, - bool deterministic, const phi::GPUContext& ctx) { + static SearchResult Find(const ConvArgs& args, bool exhaustive_search, + bool deterministic, + const phi::GPUContext& ctx) { + SearchResult result; auto dtype = platform::CudnnDataType::type; - size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; - size_t workspace_size = 0; - bool has_got_workspace_size = true; - algo_t algo; + size_t workspace_size_limit = CaclWorkspaceLimitInBytes(ctx); SetConvMathType(ctx, dtype, args.cdesc); if (!exhaustive_search && !deterministic) { #if CUDNN_VERSION >= 7001 - int perf_count; + int actual_perf_count; int best_algo_idx = 0; - std::unique_ptr perf_results( - new perf_t[kNUM_CUDNN_BWD_DATA_ALGS]); + std::vector perf_results(kNUM_CUDNN_BWD_DATA_ALGS); PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm_v7( args.handle, args.wdesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.idesc.desc(), kNUM_CUDNN_BWD_DATA_ALGS, - &perf_count, perf_results.get())); - algo = (perf_results.get())[best_algo_idx].algo; + &actual_perf_count, perf_results.data())); + result.algo = perf_results[best_algo_idx].algo; #if CUDNN_VERSION < 7500 int stride_dim = args.x->dims().size() - 2; bool blacklist = std::any_of(args.s.begin(), args.s.begin() + stride_dim, [=](int n) { return n != 1; }); - if (blacklist && (static_cast( - perf_results[best_algo_idx].algo) == + if (blacklist && (perf_results[best_algo_idx].algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING || - static_cast( - perf_results[best_algo_idx].algo) == + perf_results[best_algo_idx].algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)) { - algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; + result.algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; } #endif - workspace_size = GetWorkspaceSize(args, algo); - if (workspace_size > workspace_size_limit) { - has_got_workspace_size = false; + result.workspace_size = GetWorkspaceSize(args, result.algo); + if (result.workspace_size > workspace_size_limit) { #if CUDNN_VERSION >= 8000 // cudnnGetConvolutionBackwardDataAlgorithm is removed in CUDNN-8 - ChooseAlgoByWorkspace(perf_results.get(), - kNUM_CUDNN_BWD_DATA_ALGS, - workspace_size_limit, &algo); + ChooseAlgoByWorkspace(perf_results, workspace_size_limit, + &result); #else VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue " "the workspace size request(" - << workspace_size << ") exceeds the limit(" + << result.workspace_size << ") exceeds the limit(" << workspace_size_limit << ")"; PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( args.handle, args.wdesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.idesc.desc(), CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &algo)); + workspace_size_limit, &(result.algo))); #endif } #else @@ -404,29 +349,29 @@ struct SearchAlgorithm { args.handle, args.wdesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.idesc.desc(), CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &algo)); + workspace_size_limit, &(result.algo))); #endif } else if (deterministic) { - return CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; + result.algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; } else { - auto& dev_ctx = ctx; - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - - AlgorithmsCache& algo_cache = - *(framework::ConvSearchCache::Instance().GetBackwardData()); - + auto workspace_handle = ctx.cudnn_workspace_handle(); auto x_dims = phi::vectorize(args.x->dims()); auto w_dims = phi::vectorize(args.w->dims()); - VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t" << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s" << args.s << ", args.p" << args.p << ", args.d" << args.d; - algo = algo_cache.GetAlgorithm( + AlgorithmsCache& algo_cache = + *(framework::ConvSearchCache::Instance().GetBackwardData()); + result.algo = algo_cache.GetAlgorithm( x_dims, w_dims, args.s, args.p, args.d, 0, static_cast(args.cudnn_dtype), [&]() { int returned_algo_count; - std::array perf_stat; + std::vector perf_results(kNUM_CUDNN_BWD_DATA_ALGS); + size_t max_workspace_size = + FindMaxWorkspaceSize(args, workspace_size_limit); + VLOG(3) << "max_workspace_size=" << ToMegaBytes(max_workspace_size) + << " MB"; auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( @@ -437,26 +382,28 @@ struct SearchAlgorithm { args.cdesc.desc(), args.idesc.desc(), const_cast(args.x->data()), kNUM_CUDNN_BWD_DATA_ALGS, &returned_algo_count, - perf_stat.data(), cudnn_workspace_ptr, - workspace_size_limit)); + perf_results.data(), cudnn_workspace_ptr, + max_workspace_size)); }; - workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit); - - VLOG(3) << "BwdDataAlgo Perf result: (algo: stat, time, memory)"; - for (int i = 0; i < returned_algo_count; ++i) { - const auto& stat = perf_stat[i]; - VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time - << " " << stat.memory; - } - - return perf_stat[0].algo; + workspace_handle.RunFuncSync(cudnn_find_func, max_workspace_size, + UseFixedWorkspace()); + + VLOG(3) << GetPerfResultString( + "[Exhaustive Search] BwdDataAlgo Perf result", perf_results, + returned_algo_count, workspace_size_limit); + result.time = perf_results[0].time; + return perf_results[0].algo; }); } - VLOG(3) << "choose algo " << algo; - return algo; + VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search + << ", deterministic=" << deterministic + << ", choose algo=" << result.algo << ", workspace=" + << ToMegaBytes(GetWorkspaceSize(args, result.algo)) << " MB"; + return result; } - static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) { + static size_t GetWorkspaceSize(const ConvArgs& args, + cudnnConvolutionBwdDataAlgo_t algo) { size_t workspace_size = 0; PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( @@ -464,57 +411,75 @@ struct SearchAlgorithm { args.cdesc.desc(), args.idesc.desc(), algo, &workspace_size)); return workspace_size; } + + private: + static size_t FindMaxWorkspaceSize(const ConvArgs& args, + size_t workspace_size_limit) { + if (!UseFixedWorkspace()) { + size_t max_workspace_size = 0; + for (size_t algo = 0; algo < kNUM_CUDNN_BWD_DATA_ALGS; ++algo) { + size_t workspace_size = 0; + auto status = + platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( + args.handle, args.wdesc.desc(), args.odesc.desc(), + args.cdesc.desc(), args.idesc.desc(), + static_cast(algo), + &workspace_size); + if (status == CUDNN_STATUS_SUCCESS) { + max_workspace_size = std::max(workspace_size, max_workspace_size); + } + } + return std::min(max_workspace_size, workspace_size_limit); + } else { + return workspace_size_limit; + } + } }; template <> struct SearchAlgorithm { - using perf_t = cudnnConvolutionBwdFilterAlgoPerf_t; - using algo_t = cudnnConvolutionBwdFilterAlgo_t; + using PerfT = cudnnConvolutionBwdFilterAlgoPerf_t; + using AlgoT = cudnnConvolutionBwdFilterAlgo_t; template - static algo_t Find(const ConvArgs& args, bool exhaustive_search, - bool deterministic, const phi::GPUContext& ctx) { + static SearchResult Find(const ConvArgs& args, bool exhaustive_search, + bool deterministic, + const phi::GPUContext& ctx) { platform::CUDAGraphCaptureModeGuard guard; + SearchResult result; auto dtype = platform::CudnnDataType::type; - size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; - size_t workspace_size = 0; - bool has_got_workspace_size = true; + size_t workspace_size_limit = CaclWorkspaceLimitInBytes(ctx); SetConvMathType(ctx, dtype, args.cdesc); - algo_t algo; if (!exhaustive_search && !deterministic) { #if CUDNN_VERSION >= 7001 - using perf_t = cudnnConvolutionBwdFilterAlgoPerf_t; - int perf_count; + int actual_perf_count; int best_algo_idx = 0; - std::unique_ptr perf_results( - new perf_t[kNUM_CUDNN_BWD_FILTER_ALGS]); + std::vector perf_results(kNUM_CUDNN_BWD_FILTER_ALGS); PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm_v7( args.handle, args.idesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.wdesc.desc(), kNUM_CUDNN_BWD_FILTER_ALGS, - &perf_count, perf_results.get())); - algo = (perf_results.get())[best_algo_idx].algo; - workspace_size = (perf_results.get())[best_algo_idx].memory; + &actual_perf_count, perf_results.data())); + result.algo = perf_results[best_algo_idx].algo; + result.workspace_size = perf_results[best_algo_idx].memory; - if (workspace_size > workspace_size_limit) { - workspace_size = workspace_size_limit; + if (result.workspace_size > workspace_size_limit) { #if CUDNN_VERSION >= 8000 // cudnnGetConvolutionBackwardFilterAlgorithm is removed in CUDNN-8 - ChooseAlgoByWorkspace(perf_results.get(), - kNUM_CUDNN_BWD_FILTER_ALGS, - workspace_size_limit, &algo); + ChooseAlgoByWorkspace(perf_results, workspace_size_limit, + &result); #else VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue " "the workspace size request(" - << workspace_size << ") exceeds the limit(" + << result.workspace_size << ") exceeds the limit(" << workspace_size_limit << ")"; PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( args.handle, args.idesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.wdesc.desc(), CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &algo)); + workspace_size_limit, &(result.algo))); #endif } #else @@ -523,28 +488,32 @@ struct SearchAlgorithm { args.handle, args.idesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.wdesc.desc(), CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &algo)); + workspace_size_limit, &(result.algo))); #endif } else if (deterministic) { - return CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; + result.algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; } else { - auto& dev_ctx = ctx; - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - AlgorithmsCache& algo_cache = - *(framework::ConvSearchCache::Instance().GetBackwardFilter()); - + auto workspace_handle = ctx.cudnn_workspace_handle(); auto x_dims = phi::vectorize(args.x->dims()); auto w_dims = phi::vectorize(args.w->dims()); - VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t:" << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s" << args.s << ", args.p" << args.p << ", args.d" << args.d; + + AlgorithmsCache& algo_cache = + *(framework::ConvSearchCache::Instance().GetBackwardFilter()); + if (dtype != CUDNN_DATA_HALF) { - algo = algo_cache.GetAlgorithm( + result.algo = algo_cache.GetAlgorithm( x_dims, w_dims, args.s, args.p, args.d, 0, static_cast(args.cudnn_dtype), [&]() { int returned_algo_count; - std::array perf_stat; + std::vector perf_results(kNUM_CUDNN_BWD_FILTER_ALGS); + size_t max_workspace_size = + FindMaxWorkspaceSize(args, workspace_size_limit); + VLOG(3) << "max_workspace_size=" + << ToMegaBytes(max_workspace_size) << " MB"; + auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload:: @@ -554,29 +523,26 @@ struct SearchAlgorithm { args.cdesc.desc(), args.wdesc.desc(), const_cast(args.w->data()), kNUM_CUDNN_BWD_FILTER_ALGS, &returned_algo_count, - perf_stat.data(), cudnn_workspace_ptr, - workspace_size_limit)); + perf_results.data(), cudnn_workspace_ptr, + max_workspace_size)); }; - workspace_handle.RunFuncSync(cudnn_find_func, - workspace_size_limit); - - VLOG(3) - << "BwdFilterAlgo Perf result: (algo: stat, time, memory)"; - for (int i = 0; i < returned_algo_count; ++i) { - const auto& stat = perf_stat[i]; - VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time - << " " << stat.memory; - } - return perf_stat[0].algo; + workspace_handle.RunFuncSync(cudnn_find_func, max_workspace_size, + UseFixedWorkspace()); + + VLOG(3) << GetPerfResultString( + "[Exhaustive Search] BwdFilterAlgo Perf result", perf_results, + returned_algo_count, workspace_size_limit); + result.time = perf_results[0].time; + return perf_results[0].algo; }); } else { - auto max_algos = MaxBwdFilterAlgos(args.handle); - algo = algo_cache.GetAlgorithm( + result.algo = algo_cache.GetAlgorithm( x_dims, w_dims, args.s, args.p, args.d, 0, static_cast(args.cudnn_dtype), [&]() { - algo_t chosen_algo; - std::vector perf_results(max_algos); + SearchResult algo_result; int actual_algos = 0; + std::vector perf_results(kNUM_CUDNN_BWD_FILTER_ALGS); + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload:: cudnnFindConvolutionBackwardFilterAlgorithm( @@ -585,17 +551,21 @@ struct SearchAlgorithm { perf_results.size(), &actual_algos, perf_results.data())); perf_results.resize(actual_algos); - ChooseAlgo(perf_results, workspace_size_limit, - &chosen_algo); - return chosen_algo; + ChooseAlgo(perf_results, workspace_size_limit, &algo_result); + result.time = algo_result.time; + return algo_result.algo; }); } } - VLOG(3) << "choose algo " << algo; - return algo; + VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search + << ", deterministic=" << deterministic + << ", choose algo=" << result.algo << ", workspace=" + << ToMegaBytes(GetWorkspaceSize(args, result.algo)) << " MB"; + return result; } - static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) { + static size_t GetWorkspaceSize(const ConvArgs& args, + cudnnConvolutionBwdFilterAlgo_t algo) { platform::CUDAGraphCaptureModeGuard guard; size_t workspace_size = 0; PADDLE_ENFORCE_GPU_SUCCESS( @@ -604,6 +574,69 @@ struct SearchAlgorithm { args.cdesc.desc(), args.wdesc.desc(), algo, &workspace_size)); return workspace_size; } + + private: + static size_t FindMaxWorkspaceSize(const ConvArgs& args, + size_t workspace_size_limit) { + if (!UseFixedWorkspace()) { + size_t max_workspace_size = 0; + for (size_t algo = 0; algo < kNUM_CUDNN_BWD_FILTER_ALGS; ++algo) { + size_t workspace_size = 0; + auto status = + platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( + args.handle, args.idesc.desc(), args.odesc.desc(), + args.cdesc.desc(), args.wdesc.desc(), + static_cast(algo), + &workspace_size); + if (status == CUDNN_STATUS_SUCCESS) { + max_workspace_size = std::max(workspace_size, max_workspace_size); + } + } + return std::min(max_workspace_size, workspace_size_limit); + } else { + return workspace_size_limit; + } + } + + static void ChooseAlgo(const std::vector& perf_results, + size_t workspace_limit, + SearchResult* algo_result) { + VLOG(3) << GetPerfResultString( + "[Exhaustive Search] BwdFilterAlgo Perf result", perf_results, + perf_results.size(), workspace_limit); + + for (size_t i = 0; i != perf_results.size(); ++i) { + const auto& result = perf_results[i]; + if (result.status == CUDNN_STATUS_SUCCESS && + (result.memory <= workspace_limit)) { + if ((result.mathType == CUDNN_TENSOR_OP_MATH) && + (i != perf_results.size() - 1)) { + const auto& next_result = perf_results[i + 1]; + if (next_result.status == CUDNN_STATUS_SUCCESS && + next_result.algo == result.algo && + next_result.memory == result.memory && + next_result.mathType != CUDNN_TENSOR_OP_MATH && + next_result.time < 1.01 * result.time) { + // Skip over this result- it's not really a Tensor Core algo. + // Because it is only 1% performance difference. + // Prefer to choose the next equivalent non-Tensor Core algo. + continue; + } + } + algo_result->algo = result.algo; + algo_result->time = result.time; + auto math_type_str = "0"; + if (result.mathType == CUDNN_TENSOR_OP_MATH) { + math_type_str = "1"; + } + VLOG(3) << " choose algo: " << result.algo + << ", TC: " << math_type_str << ", time: " << result.time + << " ms, wksp = " << result.memory + << ", status = " << result.status; + break; + } + } + } }; } // namespace operators diff --git a/paddle/fluid/operators/conv_cudnn_op_cache.h b/paddle/fluid/operators/conv_cudnn_op_cache.h index 291e5f92f3..af67d857e0 100644 --- a/paddle/fluid/operators/conv_cudnn_op_cache.h +++ b/paddle/fluid/operators/conv_cudnn_op_cache.h @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" -DECLARE_uint64(conv_workspace_size_limit); +DECLARE_int64(conv_workspace_size_limit); DECLARE_bool(cudnn_exhaustive_search); DECLARE_int64(cudnn_exhaustive_search_times); diff --git a/paddle/fluid/operators/conv_miopen_helper.h b/paddle/fluid/operators/conv_miopen_helper.h index 66f7186938..abc7be7fb8 100644 --- a/paddle/fluid/operators/conv_miopen_helper.h +++ b/paddle/fluid/operators/conv_miopen_helper.h @@ -14,42 +14,12 @@ limitations under the License. */ #pragma once -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/conv_search_cache.h" -#include "paddle/fluid/framework/operator_kernel_configs.h" -#include "paddle/fluid/operators/conv_cudnn_op_cache.h" -#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" -#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/fluid/operators/conv_base_helper.h" namespace paddle { namespace operators { -using Tensor = framework::Tensor; -using DataLayout = platform::DataLayout; -template -using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; -using framework::AlgorithmsCache; -static inline void GetNCDHW(const framework::DDim& dims, - const DataLayout& layout, int* N, int* C, int* D, - int* H, int* W) { - *N = dims[0]; - *C = layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1]; - int i = layout == DataLayout::kNCHW ? 0 : 1; - if (dims.size() == 5) { - *D = dims[2 - i]; - *H = dims[3 - i]; - *W = dims[4 - i]; - } else { - *D = 1; - *H = dims[2 - i]; - *W = dims[3 - i]; - } -} +using ConvArgs = ConvArgsBase; template static void RemovePaddingSlice(const phi::GPUContext& context, @@ -66,9 +36,8 @@ static void RemovePaddingSlice(const phi::GPUContext& context, extents[i] = new_out_dims[i]; } - int start; for (size_t i = 0; i < axes.size(); ++i) { - start = starts[i]; + int start = starts[i]; if (start < 0) { start = (start + in_dims[axes[i]]); } @@ -85,41 +54,6 @@ static void RemovePaddingSlice(const phi::GPUContext& context, out_t.device(place) = in_t.slice(offsets, extents); } -template -std::ostream& operator<<(std::ostream& out, const std::vector& v) { - out << "["; - for (auto const& tmp : v) out << tmp << ","; - out << "]"; - return out; -} - -using framework::ConvSearchCache; - -struct ConvArgs { - miopenHandle_t handle; - platform::TensorDescriptor idesc, odesc; - platform::FilterDescriptor wdesc; - platform::ConvolutionDescriptor cdesc; - const framework::Tensor *x, *w, *o; - miopenDataType_t cudnn_dtype; - - // strides - std::vector s; - // paddings - std::vector p; - // dilations - std::vector d; - - ConvArgs(const framework::Tensor* x, const framework::Tensor* w, - const framework::Tensor* o, const std::vector s, - const std::vector p, const std::vector d, - miopenDataType_t dtype) - : x(x), w(w), o(o), s(s), p(p), d(d), cudnn_dtype(dtype) {} -}; - -template -struct SearchAlgorithm {}; - template <> struct SearchAlgorithm { using perf_t = miopenConvAlgoPerf_t; diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu index 39b42ec194..bd7134f2f3 100644 --- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu +++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu @@ -16,8 +16,6 @@ limitations under the License. */ #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" -DECLARE_uint64(conv_workspace_size_limit); - namespace paddle { namespace operators { diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index a671381d07..89e3b74bb3 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -188,6 +188,8 @@ class RecordedGpuMallocHelper { if (UNLIKELY(malloc_managed_memory)) { result = cudaMallocManaged(ptr, size); } else { + VLOG(10) << "[cudaMalloc] size=" << static_cast(size) / (1 << 20) + << " MB"; result = cudaMalloc(ptr, size); } #endif @@ -226,6 +228,8 @@ class RecordedGpuMallocHelper { if (err != hipErrorDeinitialized) { #else auto err = cudaFree(ptr); + VLOG(10) << "[cudaFree] size=" << static_cast(size) / (1 << 20) + << " MB"; if (err != cudaErrorCudartUnloading) { #endif PADDLE_ENFORCE_GPU_SUCCESS(err); diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index f3934c7d87..904e4854ba 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -522,8 +522,8 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : phi::GPUContext(place) { cuda_stream_.reset(new stream::CUDAStream(phi::GPUContext::stream(), place)); auto& instance = memory::allocation::AllocatorFacade::Instance(); instance.SetDefaultStream(place, phi::GPUContext::stream()); - workspace_.reset( - new phi::DnnWorkspaceHandle(instance.GetAllocator(place).get())); + workspace_.reset(new phi::DnnWorkspaceHandle( + instance.GetAllocator(place).get(), stream())); } CUDADeviceContext::~CUDADeviceContext() = default; @@ -623,7 +623,8 @@ phi::DnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const { return phi::DnnWorkspaceHandle( memory::allocation::AllocatorFacade::Instance() .GetAllocator(GetPlace()) - .get()); + .get(), + stream()); } return phi::GPUContext::cudnn_workspace_handle(); } diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index c3d3f6a4f6..8209c0a5d6 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -161,10 +161,9 @@ PADDLE_DEFINE_EXPORTED_bool( * increased. * Users need to balance memory and speed. */ -PADDLE_DEFINE_EXPORTED_uint64( - conv_workspace_size_limit, - paddle::platform::kDefaultConvWorkspaceSizeLimitMB, - "cuDNN convolution workspace limit in MB unit."); +PADDLE_DEFINE_EXPORTED_int64(conv_workspace_size_limit, + paddle::platform::kDefaultConvWorkspaceSizeLimitMB, + "cuDNN convolution workspace limit in MB unit."); /** * CUDNN related FLAG diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index 0394835aa8..ff238b7997 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -12,6 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + #include "paddle/phi/backends/gpu/gpu_context.h" #include #include @@ -155,6 +156,39 @@ static void StreamCallbackFunc(gpuStream_t stream, } // namespace internal +void DnnWorkspaceHandle::RunFuncSync( + const std::function& cudnn_func, + size_t required_workspace_bytes, + bool use_cached_allocation) { + bool need_realloc = required_workspace_bytes > WorkspaceSize(); + if (need_realloc && !use_cached_allocation) { + void* workspace_ptr = nullptr; + size_t size = ((required_workspace_bytes + 255) >> 8) << 8; + std::lock_guard guard(*mtx_); +#ifdef PADDLE_WITH_HIP + auto status = hipMalloc(&workspace_ptr, size); +#else + auto status = cudaMalloc(&workspace_ptr, size); +#endif + if (status == gpuSuccess) { + cudnn_func(workspace_ptr); + phi::backends::gpu::GpuStreamSync(stream_); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipFree(workspace_ptr)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(cudaFree(workspace_ptr)); +#endif + return; + } + } + + RunFunc(cudnn_func, required_workspace_bytes); + if (need_realloc) { + // Release the workspace allocated in this running. + ResetWorkspace(); + } +} + void DnnWorkspaceHandle::ResetWorkspace() { allocation_ = nullptr; } void DnnWorkspaceHandle::ReallocWorkspace(size_t required_workspace_bytes) { @@ -295,13 +329,13 @@ struct GPUContext::Impl { void InitDnnWorkspace() { PD_CHECK(allocator_ != nullptr, "the device allocator for gpu context is nullptr."); - workspace_ = new DnnWorkspaceHandle(allocator_); + workspace_ = new DnnWorkspaceHandle(allocator_, stream_); } void DestoryInternalWorkspace() { if (owned_ && workspace_ != nullptr) { delete workspace_; - stream_ = nullptr; + workspace_ = nullptr; } } @@ -313,7 +347,7 @@ struct GPUContext::Impl { DnnWorkspaceHandle GetDnnWorkspace() { PD_CHECK(allocator_ != nullptr, "the device allocator for gpu context is nullptr."); - return DnnWorkspaceHandle(allocator_); + return DnnWorkspaceHandle(allocator_, stream_); } void InitStream() { diff --git a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h index cd08da1c0f..ffae1f1f13 100644 --- a/paddle/phi/backends/gpu/gpu_context.h +++ b/paddle/phi/backends/gpu/gpu_context.h @@ -21,6 +21,7 @@ limitations under the License. */ #include "paddle/phi/backends/gpu/forwards.h" #include "paddle/phi/backends/gpu/gpu_decls.h" #include "paddle/phi/backends/gpu/gpu_helper.h" +#include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/device_context.h" @@ -28,8 +29,8 @@ namespace phi { class DnnWorkspaceHandle { public: - explicit inline DnnWorkspaceHandle(Allocator* allocator) - : allocator_(allocator) { + inline DnnWorkspaceHandle(Allocator* allocator, gpuStream_t stream) + : allocator_(allocator), stream_(stream) { mtx_.reset(new std::mutex()); } @@ -48,11 +49,9 @@ class DnnWorkspaceHandle { * running the function. Currently this function is only used when cudnn * exhaustive searching and callers have to guarantee that the input function * is host blocking */ - inline void RunFuncSync(const std::function& cudnn_func, - size_t required_workspace_bytes) { - RunFunc(cudnn_func, required_workspace_bytes); - ResetWorkspace(); - } + void RunFuncSync(const std::function& cudnn_func, + size_t required_workspace_bytes, + bool use_cached_allocation = true); inline size_t WorkspaceSize() { if (allocation_ == nullptr) { @@ -70,7 +69,8 @@ class DnnWorkspaceHandle { private: Allocator::AllocationPtr allocation_{nullptr}; - Allocator* allocator_{nullptr}; + Allocator* allocator_{nullptr}; // Not owned + gpuStream_t stream_{nullptr}; // Not owned std::unique_ptr mtx_; }; diff --git a/paddle/phi/kernels/autotune/CMakeLists.txt b/paddle/phi/kernels/autotune/CMakeLists.txt index b933e0993d..f1702d883b 100644 --- a/paddle/phi/kernels/autotune/CMakeLists.txt +++ b/paddle/phi/kernels/autotune/CMakeLists.txt @@ -1,6 +1,6 @@ if (WITH_GPU) - nv_test(gpu_timer_test SRCS gpu_timer_test.cu DEPS gtest) - nv_test(auto_tune_test SRCS auto_tune_test.cu DEPS gtest) + nv_test(gpu_timer_test SRCS gpu_timer_test.cu DEPS gtest) + nv_test(auto_tune_test SRCS auto_tune_test.cu DEPS gtest) elseif (WITH_ROCM) hip_test(gpu_timer_test SRCS gpu_timer_test.cu DEPS gtest) hip_test(auto_tune_test SRCS auto_tune_test.cu DEPS gtest) diff --git a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu index 9c5e77d5fd..74525e63f4 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu @@ -289,21 +289,17 @@ void ConvCudnnGradGradKernel( dtype}; #ifdef PADDLE_WITH_HIP - miopenConvFwdAlgorithm_t fwd_algo1 = static_cast(0); - miopenConvFwdAlgorithm_t fwd_algo2 = static_cast(0); - miopenConvBwdDataAlgorithm_t data_algo = - static_cast(0); - miopenConvBwdWeightsAlgorithm_t filter_algo = - static_cast(0); + paddle::operators::SearchResult fwd_result1; + paddle::operators::SearchResult fwd_result2; + paddle::operators::SearchResult data_result; + paddle::operators::SearchResult + filter_result; #else - cudnnConvolutionFwdAlgo_t fwd_algo1 = - static_cast(0); - cudnnConvolutionFwdAlgo_t fwd_algo2 = - static_cast(0); - cudnnConvolutionBwdDataAlgo_t data_algo = - static_cast(0); - cudnnConvolutionBwdFilterAlgo_t filter_algo = - static_cast(0); + paddle::operators::SearchResult fwd_result1; + paddle::operators::SearchResult fwd_result2; + paddle::operators::SearchResult data_result; + paddle::operators::SearchResult + filter_result; #endif auto layout = paddle::platform::GetCudnnTensorFormat( @@ -332,13 +328,13 @@ void ConvCudnnGradGradKernel( using search1 = paddle::operators::SearchAlgorithm; workspace_size = search1::GetWorkspaceSize(args1); - fwd_algo1 = search1::Find( + fwd_result1.algo = search1::Find( args1, exhaustive_search, false, workspace_size, ctx); #else using search1 = paddle::operators::SearchAlgorithm; - fwd_algo1 = search1::Find(args1, exhaustive_search, false, ctx); - workspace_size = search1::GetWorkspaceSize(args1, fwd_algo1); + fwd_result1 = search1::Find(args1, exhaustive_search, false, ctx); + workspace_size = search1::GetWorkspaceSize(args1, fwd_result1.algo); #endif } @@ -360,14 +356,14 @@ void ConvCudnnGradGradKernel( paddle::operators::SearchAlgorithm; workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); - fwd_algo2 = search2::Find( + fwd_result2.algo = search2::Find( args2, exhaustive_search, false, workspace_size, ctx); #else using search2 = paddle::operators::SearchAlgorithm; - fwd_algo2 = search2::Find(args2, exhaustive_search, false, ctx); - workspace_size = - std::max(workspace_size, search2::GetWorkspaceSize(args2, fwd_algo2)); + fwd_result2 = search2::Find(args2, exhaustive_search, false, ctx); + workspace_size = std::max( + workspace_size, search2::GetWorkspaceSize(args2, fwd_result2.algo)); #endif } } @@ -389,15 +385,15 @@ void ConvCudnnGradGradKernel( using search3 = paddle::operators::SearchAlgorithm; workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3)); - filter_algo = search3::Find( + filter_result.algo = search3::Find( args3, exhaustive_search, deterministic, workspace_size, ctx); #else using search3 = paddle::operators::SearchAlgorithm; - filter_algo = + filter_result = search3::Find(args3, exhaustive_search, deterministic, ctx); - workspace_size = - std::max(workspace_size, search3::GetWorkspaceSize(args3, filter_algo)); + workspace_size = std::max( + workspace_size, search3::GetWorkspaceSize(args3, filter_result.algo)); #endif } @@ -419,14 +415,15 @@ void ConvCudnnGradGradKernel( using search4 = paddle::operators::SearchAlgorithm; workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4)); - data_algo = search4::Find( + data_result.algo = search4::Find( args4, exhaustive_search, deterministic, workspace_size, ctx); #else using search4 = paddle::operators::SearchAlgorithm; - data_algo = search4::Find(args4, exhaustive_search, deterministic, ctx); - workspace_size = - std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo)); + data_result = + search4::Find(args4, exhaustive_search, deterministic, ctx); + workspace_size = std::max( + workspace_size, search4::GetWorkspaceSize(args4, data_result.algo)); #endif } @@ -471,7 +468,7 @@ void ConvCudnnGradGradKernel( args1.wdesc.desc(), w, args1.cdesc.desc(), - fwd_algo1, + fwd_result1.algo, &beta, args1.odesc.desc(), transformed_ddy_channel, @@ -492,7 +489,7 @@ void ConvCudnnGradGradKernel( args1.wdesc.desc(), w + i * group_offset_filter, args1.cdesc.desc(), - fwd_algo1, + fwd_result1.algo, workspace_ptr, workspace_size, &beta, @@ -517,7 +514,7 @@ void ConvCudnnGradGradKernel( args2.wdesc.desc(), ddw, args2.cdesc.desc(), - fwd_algo2, + fwd_result2.algo, &beta, args2.odesc.desc(), transformed_ddy_channel, @@ -538,7 +535,7 @@ void ConvCudnnGradGradKernel( args2.wdesc.desc(), ddw + i * group_offset_filter, args2.cdesc.desc(), - fwd_algo2, + fwd_result2.algo, workspace_ptr, workspace_size, &alpha, @@ -568,7 +565,7 @@ void ConvCudnnGradGradKernel( args3.idesc.desc(), ddx, args3.cdesc.desc(), - filter_algo, + filter_result.algo, &beta, args3.wdesc.desc(), dw, @@ -589,7 +586,7 @@ void ConvCudnnGradGradKernel( args3.odesc.desc(), transformed_dy_channel + i * group_offset_out, args3.cdesc.desc(), - filter_algo, + filter_result.algo, workspace_ptr, workspace_size, &beta, @@ -615,7 +612,7 @@ void ConvCudnnGradGradKernel( args4.wdesc.desc(), ddw, args4.cdesc.desc(), - data_algo, + data_result.algo, &beta, args4.idesc.desc(), transformed_dx, @@ -636,7 +633,7 @@ void ConvCudnnGradGradKernel( args4.odesc.desc(), transformed_dy_channel + i * group_offset_out, args4.cdesc.desc(), - data_algo, + data_result.algo, workspace_ptr, workspace_size, &beta, diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu index e09c33380b..985371ede9 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu @@ -322,17 +322,16 @@ void ConvCudnnGradKernel(const Context& ctx, int group_offset_in = i_c / groups * i_h * i_w * i_d; int group_offset_out = o_c / groups * o_h * o_w * o_d; int group_offset_filter = transformed_filter_channel.numel() / groups; + // ------------------- cudnn backward algorithm --------------------- #ifdef PADDLE_WITH_HIP - miopenConvBwdDataAlgorithm_t data_algo = - static_cast(0); - miopenConvBwdWeightsAlgorithm_t filter_algo = - static_cast(0); + paddle::operators::SearchResult bwd_result; + paddle::operators::SearchResult + filter_result; #else - cudnnConvolutionBwdDataAlgo_t data_algo = - static_cast(0); - cudnnConvolutionBwdFilterAlgo_t filter_algo = - static_cast(0); + paddle::operators::SearchResult bwd_result; + paddle::operators::SearchResult + filter_result; #endif // input data workspace_size size_t workspace_size_d = 0; @@ -368,14 +367,14 @@ void ConvCudnnGradKernel(const Context& ctx, paddle::operators::SearchAlgorithm; workspace_size_d = std::max(workspace_size_d, search1::GetWorkspaceSize(args1)); - data_algo = search1::Find( + bwd_result.algo = search1::Find( args1, exhaustive_search, deterministic, workspace_size_d, ctx); #else using search1 = paddle::operators::SearchAlgorithm; - data_algo = search1::Find(args1, exhaustive_search, deterministic, ctx); - workspace_size_d = - std::max(workspace_size_d, search1::GetWorkspaceSize(args1, data_algo)); + bwd_result = search1::Find(args1, exhaustive_search, deterministic, ctx); + workspace_size_d = std::max( + workspace_size_d, search1::GetWorkspaceSize(args1, bwd_result.algo)); #endif } @@ -397,15 +396,17 @@ void ConvCudnnGradKernel(const Context& ctx, paddle::operators::SearchAlgorithm; workspace_size_w = std::max(workspace_size_w, search2::GetWorkspaceSize(args2)); - filter_algo = search2::Find( + filter_result.algo = search2::Find( args2, exhaustive_search, deterministic, workspace_size_w, ctx); #else using search2 = paddle::operators::SearchAlgorithm; - filter_algo = + filter_result = search2::Find(args2, exhaustive_search, deterministic, ctx); - workspace_size_w = std::max(workspace_size_w, - search2::GetWorkspaceSize(args2, filter_algo)); + VLOG(3) << "filter algo: " << filter_result.algo << ", time " + << filter_result.time; + workspace_size_w = std::max( + workspace_size_w, search2::GetWorkspaceSize(args2, filter_result.algo)); #endif } @@ -439,7 +440,7 @@ void ConvCudnnGradKernel(const Context& ctx, args1.wdesc.desc(), filter_data, args1.cdesc.desc(), - data_algo, + bwd_result.algo, &beta, args1.idesc.desc(), temp_tensor_data, @@ -471,7 +472,7 @@ void ConvCudnnGradKernel(const Context& ctx, args1.wdesc.desc(), filter_data, args1.cdesc.desc(), - data_algo, + bwd_result.algo, &beta, args1.idesc.desc(), transformed_input_grad_data, @@ -494,7 +495,7 @@ void ConvCudnnGradKernel(const Context& ctx, args1.odesc.desc(), output_grad_data + i * group_offset_out, args1.cdesc.desc(), - data_algo, + bwd_result.algo, cudnn_workspace_ptr, workspace_size_d, &beta, @@ -554,7 +555,7 @@ void ConvCudnnGradKernel(const Context& ctx, args2.idesc.desc(), input_data, args2.cdesc.desc(), - filter_algo, + filter_result.algo, &beta, args2.wdesc.desc(), filter_grad_data, @@ -575,7 +576,7 @@ void ConvCudnnGradKernel(const Context& ctx, args2.odesc.desc(), output_grad_data + i * group_offset_out, args2.cdesc.desc(), - filter_algo, + filter_result.algo, cudnn_workspace_ptr, workspace_size_w, &beta_filter, diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu index c2970cc8cd..37f66e0b25 100644 --- a/paddle/phi/kernels/gpudnn/conv_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_kernel.cu @@ -18,7 +18,6 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/fluid/framework/eigen.h" #ifdef PADDLE_WITH_HIP #include "paddle/fluid/operators/conv_miopen_helper.h" #else @@ -68,7 +67,6 @@ void ConvCudnnKernel(const Context& ctx, "FLAGS_cudnn_deterministic True at same time.")); const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - auto dtype = paddle::platform::CudnnDataType::type; #ifdef PADDLE_WITH_HIP @@ -309,17 +307,17 @@ void ConvCudnnKernel(const Context& ctx, size_t workspace_size = 0; // final workspace to allocate. // ------------------- cudnn conv algorithm --------------------- #ifdef PADDLE_WITH_HIP - miopenConvFwdAlgorithm_t algo{}; + paddle::operators::SearchResult fwd_result; using search = paddle::operators::SearchAlgorithm; workspace_size = search::GetWorkspaceSize(args); - algo = search::Find( + fwd_result.algo = search::Find( args, exhaustive_search, deterministic, workspace_size, ctx); #else - cudnnConvolutionFwdAlgo_t algo{}; + paddle::operators::SearchResult fwd_result; using search = paddle::operators::SearchAlgorithm; - algo = search::Find(args, exhaustive_search, deterministic, ctx); - workspace_size = search::GetWorkspaceSize(args, algo); + fwd_result = search::Find(args, exhaustive_search, deterministic, ctx); + workspace_size = search::GetWorkspaceSize(args, fwd_result.algo); #endif #if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1) @@ -328,7 +326,7 @@ void ConvCudnnKernel(const Context& ctx, // in forward computation, so change the algorithm to CUDNN_CONVOLUTION_\ // FWD_ALGO_IMPLICIT_GEMM manually. if (groups > 1) { - algo = static_cast(0); + fwd_result.algo = static_cast(0); } #endif @@ -352,7 +350,7 @@ void ConvCudnnKernel(const Context& ctx, args.wdesc.desc(), filter_data, args.cdesc.desc(), - algo, + fwd_result.algo, &beta, args.odesc.desc(), output_data, @@ -373,7 +371,7 @@ void ConvCudnnKernel(const Context& ctx, args.wdesc.desc(), filter_data + i * group_offset_filter, args.cdesc.desc(), - algo, + fwd_result.algo, workspace_ptr, workspace_size, &beta, diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu index 2893bd74b1..601ac43eee 100644 --- a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu @@ -188,11 +188,13 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, dtype}; #ifdef PADDLE_WITH_HIP - miopenConvFwdAlgorithm_t data_algo{}; - miopenConvBwdWeightsAlgorithm_t filter_algo{}; + paddle::operators::SearchResult fwd_result; + paddle::operators::SearchResult + filter_result; #else - cudnnConvolutionFwdAlgo_t data_algo{}; - cudnnConvolutionBwdFilterAlgo_t filter_algo{}; + paddle::operators::SearchResult fwd_result; + paddle::operators::SearchResult + filter_result; #endif auto layout_tensor = paddle::platform::GetCudnnTensorFormat(layout); @@ -218,14 +220,14 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, using search1 = paddle::operators::SearchAlgorithm; workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1)); - data_algo = + fwd_result.algo = search1::Find(args1, false, deterministic, workspace_size, ctx); #else using search1 = paddle::operators::SearchAlgorithm; - data_algo = search1::Find(args1, false, deterministic, ctx); - workspace_size = - std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo)); + fwd_result = search1::Find(args1, false, deterministic, ctx); + workspace_size = std::max( + workspace_size, search1::GetWorkspaceSize(args1, fwd_result.algo)); #endif } @@ -245,14 +247,14 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, using search2 = paddle::operators::SearchAlgorithm; workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); - filter_algo = + filter_result.algo = search2::Find(args2, false, deterministic, workspace_size, ctx); #else using search2 = paddle::operators::SearchAlgorithm; - filter_algo = search2::Find(args2, false, deterministic, ctx); - workspace_size = - std::max(workspace_size, search2::GetWorkspaceSize(args2, filter_algo)); + filter_result = search2::Find(args2, false, deterministic, ctx); + workspace_size = std::max( + workspace_size, search2::GetWorkspaceSize(args2, filter_result.algo)); #endif } @@ -278,7 +280,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, args1.wdesc.desc(), filter_data + filter_offset * g, args1.cdesc.desc(), - data_algo, + fwd_result.algo, &beta, args1.odesc.desc(), dx_data + x_offset * g, @@ -295,7 +297,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, args1.wdesc.desc(), filter_data + filter_offset * g, args1.cdesc.desc(), - data_algo, + fwd_result.algo, cudnn_workspace, workspace_size, &beta, @@ -338,7 +340,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, args2.idesc.desc(), dout_data + dout_offset * g, args2.cdesc.desc(), - filter_algo, + filter_result.algo, &beta, args2.wdesc.desc(), dfilter_data + filter_offset * g, @@ -355,7 +357,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, args2.odesc.desc(), x_data + x_offset * g, args2.cdesc.desc(), - filter_algo, + filter_result.algo, cudnn_workspace, workspace_size, &beta, @@ -653,22 +655,17 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( dilations_, dtype}; #ifdef PADDLE_WITH_HIP - miopenConvBwdDataAlgorithm_t bwd_algo1 = - static_cast(0); - miopenConvBwdDataAlgorithm_t bwd_algo2 = - static_cast(0); - miopenConvFwdAlgorithm_t data_algo = static_cast(0); - miopenConvBwdWeightsAlgorithm_t filter_algo = - static_cast(0); + paddle::operators::SearchResult bwd_result1; + paddle::operators::SearchResult bwd_result2; + paddle::operators::SearchResult + filter_result; + paddle::operators::SearchResult fwd_result; #else - cudnnConvolutionBwdDataAlgo_t bwd_algo1 = - static_cast(0); - cudnnConvolutionBwdDataAlgo_t bwd_algo2 = - static_cast(0); - cudnnConvolutionFwdAlgo_t data_algo = - static_cast(0); - cudnnConvolutionBwdFilterAlgo_t filter_algo = - static_cast(0); + paddle::operators::SearchResult bwd_result1; + paddle::operators::SearchResult bwd_result2; + paddle::operators::SearchResult + filter_result; + paddle::operators::SearchResult fwd_result; #endif auto layout = paddle::platform::GetCudnnTensorFormat(GPUDNNDataLayout::kNCHW); @@ -696,13 +693,13 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( using search1 = paddle::operators::SearchAlgorithm; workspace_size = search1::GetWorkspaceSize(args1); - bwd_algo1 = + bwd_result1.algo = search1::Find(args1, false, deterministic, workspace_size, ctx); #else using search1 = paddle::operators::SearchAlgorithm; - bwd_algo1 = search1::Find(args1, false, deterministic, ctx); - workspace_size = search1::GetWorkspaceSize(args1, bwd_algo1); + bwd_result1 = search1::Find(args1, false, deterministic, ctx); + workspace_size = search1::GetWorkspaceSize(args1, bwd_result1.algo); #endif ddfilter_ = ddfilter.data(); @@ -720,14 +717,14 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( using search2 = paddle::operators::SearchAlgorithm; workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); - bwd_algo2 = + bwd_result2.algo = search2::Find(args2, false, deterministic, workspace_size, ctx); #else using search2 = paddle::operators::SearchAlgorithm; - bwd_algo2 = search2::Find(args2, false, deterministic, ctx); - workspace_size = - std::max(workspace_size, search2::GetWorkspaceSize(args2, bwd_algo2)); + bwd_result2 = search2::Find(args2, false, deterministic, ctx); + workspace_size = std::max( + workspace_size, search2::GetWorkspaceSize(args2, bwd_result2.algo)); #endif } @@ -736,9 +733,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( args3.handle = handle; args3.idesc.set(transformed_dout, iwo_group); args3.wdesc.set(*dfilter, layout, iwo_group); - args3.odesc.set(transformed_ddx_channel, iwo_group); - args3.cdesc.set(dtype, padding_common, strides, @@ -749,14 +744,14 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( using search3 = paddle::operators::SearchAlgorithm; workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3)); - filter_algo = + filter_result.algo = search3::Find(args3, false, deterministic, workspace_size, ctx); #else using search3 = paddle::operators::SearchAlgorithm; - filter_algo = search3::Find(args3, false, deterministic, ctx); - workspace_size = - std::max(workspace_size, search3::GetWorkspaceSize(args3, filter_algo)); + filter_result = search3::Find(args3, false, deterministic, ctx); + workspace_size = std::max( + workspace_size, search3::GetWorkspaceSize(args3, filter_result.algo)); #endif } @@ -777,14 +772,14 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( using search4 = paddle::operators::SearchAlgorithm; workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4)); - data_algo = + fwd_result.algo = search4::Find(args4, false, deterministic, workspace_size, ctx); #else using search4 = paddle::operators::SearchAlgorithm; - data_algo = search4::Find(args4, false, deterministic, ctx); - workspace_size = - std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo)); + fwd_result = search4::Find(args4, false, deterministic, ctx); + workspace_size = std::max( + workspace_size, search4::GetWorkspaceSize(args4, fwd_result.algo)); #endif } @@ -831,7 +826,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( args1.wdesc.desc(), filter_ + i * group_offset_filter, args1.cdesc.desc(), - bwd_algo1, + bwd_result1.algo, &beta, args1.idesc.desc(), transformed_ddout_channel_ + i * group_offset_out, @@ -850,7 +845,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( args1.odesc.desc(), ddx_ + i * group_offset_in, args1.cdesc.desc(), - bwd_algo1, + bwd_result1.algo, workspace_ptr, workspace_size, &beta, @@ -877,7 +872,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( args2.wdesc.desc(), ddfilter_ + i * group_offset_filter, args2.cdesc.desc(), - bwd_algo2, + bwd_result2.algo, &beta, args2.idesc.desc(), conv_x_ddfilter_data + i * group_offset_out, @@ -908,7 +903,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( args2.odesc.desc(), x_ + i * group_offset_in, args2.cdesc.desc(), - bwd_algo2, + bwd_result2.algo, workspace_ptr, workspace_size, &alpha, @@ -964,7 +959,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( args3.idesc.desc(), transformed_dout_channel_ + i * group_offset_out, args3.cdesc.desc(), - filter_algo, + filter_result.algo, &beta, args3.wdesc.desc(), dfilter_ + i * group_offset_filter, @@ -983,7 +978,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( args3.odesc.desc(), ddx_ + i * group_offset_in, args3.cdesc.desc(), - filter_algo, + filter_result.algo, workspace_ptr, workspace_size, &beta, @@ -1009,7 +1004,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( args4.wdesc.desc(), ddfilter_ + i * group_offset_filter, args4.cdesc.desc(), - data_algo, + fwd_result.algo, &beta, args4.odesc.desc(), transformed_dx_ + i * group_offset_in, @@ -1028,7 +1023,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( args4.wdesc.desc(), ddfilter_ + i * group_offset_filter, args4.cdesc.desc(), - data_algo, + fwd_result.algo, workspace_ptr, workspace_size, &beta, diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu index 5de2df4a70..ce02a00162 100644 --- a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu @@ -217,16 +217,19 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, c_groups); #ifdef PADDLE_WITH_HIP + paddle::operators::SearchResult bwd_result; using search = paddle::operators::SearchAlgorithm; workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args)); - algo = search::Find(args, false, deterministic, workspace_size, ctx); + bwd_result.algo = + search::Find(args, false, deterministic, workspace_size, ctx); #else + paddle::operators::SearchResult bwd_result; using search = paddle::operators::SearchAlgorithm; - algo = search::Find(args, false, deterministic, ctx); + bwd_result = search::Find(args, false, deterministic, ctx); workspace_size = - std::max(workspace_size, search::GetWorkspaceSize(args, algo)); + std::max(workspace_size, search::GetWorkspaceSize(args, bwd_result.algo)); #endif // ------------------- cudnn conv transpose forward --------------------- @@ -247,7 +250,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, args.wdesc.desc(), filter_data + filter_offset * g, args.cdesc.desc(), - algo, + bwd_result.algo, &beta, args.idesc.desc(), transformed_out_data + out_offset * g, @@ -264,7 +267,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, args.odesc.desc(), x_data + x_offset * g, args.cdesc.desc(), - algo, + bwd_result.algo, cudnn_workspace, workspace_size, &beta, diff --git a/paddle/phi/kernels/impl/conv_cudnn_impl.h b/paddle/phi/kernels/impl/conv_cudnn_impl.h index 93bc5b64ad..5cf59fe019 100644 --- a/paddle/phi/kernels/impl/conv_cudnn_impl.h +++ b/paddle/phi/kernels/impl/conv_cudnn_impl.h @@ -36,7 +36,7 @@ #include "paddle/phi/kernels/funcs/batch_norm_utils.h" DECLARE_bool(cudnn_deterministic); -DECLARE_uint64(conv_workspace_size_limit); +DECLARE_int64(conv_workspace_size_limit); DECLARE_bool(cudnn_exhaustive_search); namespace phi { diff --git a/python/paddle/fluid/tests/unittests/test_switch_autotune.py b/python/paddle/fluid/tests/unittests/test_switch_autotune.py index 9fad1eeb5c..1c08811d4b 100644 --- a/python/paddle/fluid/tests/unittests/test_switch_autotune.py +++ b/python/paddle/fluid/tests/unittests/test_switch_autotune.py @@ -43,6 +43,16 @@ def static_program(net, data): return loss +def set_flags(enable_autotune): + if paddle.is_compiled_with_cuda(): + if enable_autotune: + paddle.set_flags({'FLAGS_conv_workspace_size_limit': -1}) + paddle.set_flags({'FLAGS_cudnn_exhaustive_search': 1}) + else: + paddle.set_flags({'FLAGS_conv_workspace_size_limit': 512}) + paddle.set_flags({'FLAGS_cudnn_exhaustive_search': 0}) + + class TestAutoTune(unittest.TestCase): def test_autotune(self): paddle.fluid.core.disable_autotune() @@ -61,6 +71,7 @@ class TestAutoTune(unittest.TestCase): class TestDygraphAutoTuneStatus(TestAutoTune): def run_program(self, enable_autotune): + set_flags(enable_autotune) if enable_autotune: paddle.fluid.core.enable_autotune() else: @@ -107,6 +118,7 @@ class TestDygraphAutoTuneStatus(TestAutoTune): class TestStaticAutoTuneStatus(TestAutoTune): def run_program(self, enable_autotune): paddle.enable_static() + set_flags(enable_autotune) if enable_autotune: paddle.fluid.core.enable_autotune() else: -- GitLab