未验证 提交 f61dfeed 编写于 作者: C chengduo 提交者: GitHub

Merge pull request #10263 from chengduoZH/add_FLAGS_use_deterministic_algo

Add FLAGS_cudnn_algo_use_autotune
......@@ -46,6 +46,7 @@ void ScaleLossGradOpHandle::RunImpl() {
->stream();
memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
platform::CPUPlace(), &coeff_, sizeof(float), stream);
VLOG(1) << place_ << "RUN Scale loss grad op";
});
#endif
}
......
......@@ -20,6 +20,11 @@ limitations under the License. */
#include "paddle/fluid/platform/cudnn_helper.h"
#include "paddle/fluid/platform/float16.h"
DEFINE_bool(cudnn_algo_use_autotune, true,
"Whether allow using an autotuning algorithm for convolution "
"operator. The autotuning algorithm may be non-deterministic. If "
"false, the algorithm is deterministic.");
namespace paddle {
namespace operators {
......@@ -267,10 +272,12 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle();
if (input_grad) {
if (FLAGS_cudnn_algo_use_autotune) {
PADDLE_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
handle, cudnn_filter_desc,
// dyDesc: Handle to the previously initialized input differential
// dyDesc: Handle to the previously initialized input
// differential
// tensor descriptor.
cudnn_output_grad_desc, cudnn_conv_desc,
// dxDesc: Handle to the previously initialized output tensor
......@@ -278,6 +285,10 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
cudnn_input_desc,
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &data_algo));
} else {
data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
}
PADDLE_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
handle, cudnn_filter_desc, cudnn_output_grad_desc,
......@@ -286,12 +297,16 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
}
if (filter_grad) {
if (FLAGS_cudnn_algo_use_autotune) {
PADDLE_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc,
cudnn_filter_desc,
handle, cudnn_input_desc, cudnn_output_grad_desc,
cudnn_conv_desc, cudnn_filter_desc,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &filter_algo));
} else {
filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
}
PADDLE_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
......
......@@ -111,7 +111,9 @@ def __bootstrap__():
'eager_delete_scope'
]
if core.is_compiled_with_cuda():
read_env_flags += ['fraction_of_gpu_memory_to_use']
read_env_flags += [
'fraction_of_gpu_memory_to_use', 'cudnn_algo_use_autotune'
]
core.init_gflags([sys.argv[0]] +
["--tryfromenv=" + ",".join(read_env_flags)])
core.init_glog(sys.argv[0])
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册