diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc
index 63088d05a54a4c8859ccbae139c680e1f1ac539b..ffcf8a5800ea11ae98bfa321b36af87952a516d9 100644
--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
@@ -139,9 +139,8 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
 
     // ------------------- cudnn conv algorithm ---------------------
     cudnnConvolutionFwdAlgo_t algo;
-    auto handle = dev_ctx.cudnn_handle();
-
     bool half_float = false;
+
 #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
     // Tensor core is supported since the volta GPU and
     // is only enabled when input and filter data are float16
@@ -160,9 +159,9 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
       VLOG(5) << "NOT use cudnn_tensor_op_math";
     }
 #endif
-    Tensor cudnn_workspace;
-    void* cudnn_workspace_ptr = nullptr;
 
+    auto handle = dev_ctx.cudnn_handle();
+    auto workspace_handle = dev_ctx.cudnn_workspace_handle();
     auto x_dims = framework::vectorize(input->dims());
     auto f_dims = framework::vectorize(filter->dims());
     if ((!exhaustive_search) && (!half_float)) {
@@ -174,12 +173,6 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     } else if (exhaustive_search && (!half_float)) {
       AlgorithmsCache<cudnnConvolutionFwdAlgo_t>& algo_cache =
           ctx.GetKernelConfig<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>(0);
-      cudnn_workspace =
-          ctx.AllocateTmpTensor<int8_t, platform::CUDADeviceContext>(
-              framework::make_ddim(
-                  {static_cast<int64_t>(workspace_size_limit)}),
-              dev_ctx);
-      cudnn_workspace_ptr = static_cast<void*>(cudnn_workspace.data<int8_t>());
 
       algo = algo_cache.GetAlgorithm(
           x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
@@ -187,13 +180,16 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
             std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
                 fwd_perf_stat;
 
-            CUDNN_ENFORCE(
-                platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
-                    handle, cudnn_input_desc, input_data, cudnn_filter_desc,
-                    filter_data, cudnn_conv_desc, cudnn_output_desc,
-                    output_data, kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
-                    fwd_perf_stat.data(), cudnn_workspace_ptr,
-                    workspace_size_limit));
+            auto cudnn_find_func = [&](void* cudnn_workspace) {
+              CUDNN_ENFORCE(
+                  platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
+                      handle, cudnn_input_desc, input_data, cudnn_filter_desc,
+                      filter_data, cudnn_conv_desc, cudnn_output_desc,
+                      output_data, kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
+                      fwd_perf_stat.data(), cudnn_workspace,
+                      workspace_size_limit));
+            };
+            workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit);
 
             VLOG(3) << "Perf result: (algo: stat, time, memory)";
             for (int i = 0; i < returned_algo_count; ++i) {
@@ -219,14 +215,13 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
                       "workspace_size to be allocated exceeds the limit");
 
     // Allocate on GPU memory
-    if (!cudnn_workspace_ptr) {
-      cudnn_workspace =
-          ctx.AllocateTmpTensor<int8_t, platform::CUDADeviceContext>(
-              framework::make_ddim(
-                  {static_cast<int64_t>(workspace_size_in_bytes)}),
-              dev_ctx);
-      cudnn_workspace_ptr = static_cast<void*>(cudnn_workspace.data<int8_t>());
-    }
+    Tensor cudnn_workspace =
+        ctx.AllocateTmpTensor<int8_t, platform::CUDADeviceContext>(
+            framework::make_ddim(
+                {static_cast<int64_t>(workspace_size_in_bytes)}),
+            dev_ctx);
+    void* cudnn_workspace_ptr =
+        static_cast<void*>(cudnn_workspace.data<int8_t>());
     // ------------------- cudnn conv forward ---------------------
     ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
     for (int i = 0; i < groups; i++) {
diff --git a/paddle/fluid/operators/conv_fusion_op.cu.cc b/paddle/fluid/operators/conv_fusion_op.cu.cc
index ad24e6682b2274c1b352e2778e7784ac62d57720..87b656d8a990f5bfcbe174b05b32cbf94db21fec 100644
--- a/paddle/fluid/operators/conv_fusion_op.cu.cc
+++ b/paddle/fluid/operators/conv_fusion_op.cu.cc
@@ -18,7 +18,7 @@ limitations under the License. */
 
 DEFINE_int64(cudnn_exhaustive_search_times, -1,
              "Exhaustive search times for cuDNN convolution, "
-             "defalut is 1, only search once.");
+             "default is -1, meaning no exhaustive search");
 
 namespace paddle {
 namespace operators {
@@ -132,7 +132,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
               kNUM_CUDNN_FWD_ALGS, &returned_algo_count, fwd_perf_stat.data(),
               cudnn_workspace, workspace_size_limit));
         };
-        workspace_handle.RunFunc(cudnn_find_func, workspace_size_limit);
+        workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit);
         VLOG(3) << "Perf result: (algo: stat, time, memory)";
         for (int i = 0; i < returned_algo_count; ++i) {
           const auto& stat = fwd_perf_stat[i];
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index a86fef33b4ca87ad411ed4844ee99ff17597e1a6..812181563e6e55455a5c08a0ba1b7ca343ebf851 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -163,6 +163,15 @@ class CudnnHolder {
     cudnn_func(WorkspacePtr());
   }
 
+  /*! \brief Reset the workspace, thus releasing the memory */
+  inline void ResetWorkspace() {
+    if (workspace_) {
+      // Maybe someone is using the current workspace
+      PADDLE_ENFORCE(cudaStreamSynchronize(*stream_));
+      workspace_ = nullptr;
+    }
+  }
+
   inline void* WorkspacePtr() {
     if (workspace_) {
       return workspace_->ptr();
@@ -207,6 +216,22 @@ class CudnnWorkspaceHandle {
                          required_workspace_len);
   }
 
+  /*! \brief A thread that calls RunFuncSync() acquires the lock first,
+   * then invokes the cudnn function, and releases the GPU memory after
+   * the function has run. Currently this is only used for cudnn
+   * exhaustive search, and callers must guarantee that the input function
+   * is host blocking */
+  template <typename Callback>
+  inline void RunFuncSync(Callback&& cudnn_func,
+                          size_t required_workspace_len) {
+    if (!guard_) {
+      guard_.reset(new std::lock_guard<std::mutex>(holder_->Mutex()));
+    }
+    holder_->RunFuncImpl(std::forward<Callback>(cudnn_func),
+                         required_workspace_len);
+    holder_->ResetWorkspace();
+  }
+
   CudnnWorkspaceHandle(CudnnWorkspaceHandle&&) = default;
   CudnnWorkspaceHandle& operator=(CudnnWorkspaceHandle&&) = delete;
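
For reviewers, the sketch below is a standalone illustration of the locking/allocation pattern that RunFuncSync() introduces: serialize callers, make sure a workspace of the required size exists, run a host-blocking callback against it, then release the workspace immediately instead of keeping it cached. The class and names here are simplified stand-ins, not Paddle's actual CudnnWorkspaceHandle/CudnnHolder; in the real patch the release path also calls cudaStreamSynchronize before dropping the allocation, since the workspace is device memory.

```cpp
// Simplified, host-only sketch of the RunFuncSync() pattern (assumed names,
// not the Paddle classes): lock out other users, grow the workspace if
// needed, run the blocking callback, then free the workspace right away so
// the memory is not held for the rest of the kernel's lifetime.
#include <cstddef>
#include <cstdlib>
#include <mutex>
#include <utility>

class WorkspaceHolder {
 public:
  ~WorkspaceHolder() { Reset(); }

  template <typename Callback>
  void RunFuncSync(Callback&& func, size_t required_len) {
    std::lock_guard<std::mutex> guard(mutex_);  // serialize all callers
    if (required_len > size_) {                 // reallocate if too small
      Reset();
      ptr_ = std::malloc(required_len);
      size_ = required_len;
    }
    std::forward<Callback>(func)(ptr_);  // callback must block until finished
    Reset();  // release immediately (the patch also syncs the CUDA stream here)
  }

 private:
  void Reset() {
    std::free(ptr_);
    ptr_ = nullptr;
    size_ = 0;
  }

  std::mutex mutex_;
  void* ptr_ = nullptr;
  size_t size_ = 0;
};

int main() {
  WorkspaceHolder holder;
  // e.g. an exhaustive-search routine that only returns once it is done
  holder.RunFuncSync([](void* workspace) { /* use workspace */ }, 1 << 20);
}
```

RunFunc(), by contrast, does not call ResetWorkspace(), so a workspace grown during exhaustive search would otherwise stay allocated inside the holder after the search finishes.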