未验证 提交 358bc06c 编写于 作者: Z Zhaolong Xing 提交者: GitHub

[CUDNN8 support] : support CUDNN8 (#25664)

* cunn8 support
test=develop

* fix ci error
test=develop
上级 5970871a
......@@ -60,9 +60,8 @@ else()
set(CUDNN_FOUND OFF)
endif()
if(CUDNN_FOUND)
file(READ ${CUDNN_INCLUDE_DIR}/cudnn.h CUDNN_VERSION_FILE_CONTENTS)
macro(find_cudnn_version cudnn_header_file)
file(READ ${cudnn_header_file} CUDNN_VERSION_FILE_CONTENTS)
get_filename_component(CUDNN_LIB_PATH ${CUDNN_LIBRARY} DIRECTORY)
string(REGEX MATCH "define CUDNN_VERSION +([0-9]+)"
......@@ -93,10 +92,15 @@ if(CUDNN_FOUND)
math(EXPR CUDNN_VERSION
"${CUDNN_MAJOR_VERSION} * 1000 +
${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}")
endif()
message(STATUS "Current cuDNN header is ${CUDNN_INCLUDE_DIR}/cudnn.h. "
message(STATUS "Current cuDNN header is ${cudnn_header_file} "
"Current cuDNN version is v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}. ")
endif()
endif()
endmacro()
if(CUDNN_FOUND)
find_cudnn_version(${CUDNN_INCLUDE_DIR}/cudnn.h)
if (NOT CUDNN_MAJOR_VERSION)
find_cudnn_version(${CUDNN_INCLUDE_DIR}/cudnn_version.h)
endif()
endif()
......@@ -162,19 +162,7 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
workspace_size = GetWorkspaceSize(args, algo);
if (workspace_size > workspace_size_limit) {
has_got_workspace_size = false;
VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue "
"the workspace size request("
<< workspace_size << ") exceeds the limit("
<< workspace_size_limit << ")";
}
if (!has_got_workspace_size) {
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnGetConvolutionForwardAlgorithm(
args.handle, args.idesc.desc(), args.wdesc.desc(),
args.cdesc.desc(), args.odesc.desc(),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &algo));
workspace_size_limit = workspace_size;
}
#else
PADDLE_ENFORCE_CUDA_SUCCESS(
......@@ -303,19 +291,8 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
#endif
workspace_size = GetWorkspaceSize(args, algo);
if (workspace_size > workspace_size_limit) {
workspace_size_limit = workspace_size;
has_got_workspace_size = false;
VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue "
"the workspace size request("
<< workspace_size << ") exceeds the limit("
<< workspace_size_limit << ")";
}
if (!has_got_workspace_size) {
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
args.handle, args.wdesc.desc(), args.odesc.desc(),
args.cdesc.desc(), args.idesc.desc(),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &algo));
}
#else
PADDLE_ENFORCE_CUDA_SUCCESS(
......@@ -432,19 +409,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
algo = (perf_results.get())[best_algo_idx].algo;
workspace_size = GetWorkspaceSize(args, algo);
if (workspace_size > workspace_size_limit) {
has_got_workspace_size = false;
VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue "
"the workspace size request("
<< workspace_size << ") exceeds the limit("
<< workspace_size_limit << ")";
}
if (!has_got_workspace_size) {
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
args.handle, args.idesc.desc(), args.odesc.desc(),
args.cdesc.desc(), args.wdesc.desc(),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &algo));
workspace_size = workspace_size_limit;
}
#else
PADDLE_ENFORCE_CUDA_SUCCESS(
......
......@@ -204,11 +204,17 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
auto x_dims = framework::vectorize(transformed_input.dims());
auto f_dims = framework::vectorize(filter->dims());
if (!exhaustive_search) {
int perf_count;
int best_algo_idx = 0;
size_t tmp_size = 0;
std::unique_ptr<cudnnConvolutionFwdAlgoPerf_t[]> perf_results(
new cudnnConvolutionFwdAlgoPerf_t[kNUM_CUDNN_FWD_ALGS]);
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnGetConvolutionForwardAlgorithm(
platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7(
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &algo));
cudnn_output_desc, kNUM_CUDNN_FWD_ALGS, &perf_count,
perf_results.get()));
algo = (perf_results.get())[best_algo_idx].algo;
VLOG(3) << "cuDNN forward algo " << algo;
} else {
std::function<cudnnConvolutionFwdAlgo_t()> search_func =
......
......@@ -179,16 +179,23 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
out_desc[i], cudnn_dtype, 4, out_dims[i].data(),
out_strides[i].data()));
int perf_count;
int best_algo_idx = 0;
size_t tmp_size = 0;
std::unique_ptr<cudnnConvolutionFwdAlgoPerf_t[]> perf_results(
new cudnnConvolutionFwdAlgoPerf_t[kNUM_CUDNN_FWD_ALGS]);
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnGetConvolutionForwardAlgorithm(
platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7(
handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i],
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &algo[i]));
size_t tmp_size = 0;
kNUM_CUDNN_FWD_ALGS, &perf_count, perf_results.get()));
algo[i] = (perf_results.get())[best_algo_idx].algo;
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i],
algo[i], &tmp_size));
workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
}
cudnnActivationDescriptor_t cudnn_act_desc =
......
......@@ -54,7 +54,6 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
__macro(cudnnSetTensorNdDescriptor); \
__macro(cudnnGetTensorNdDescriptor); \
__macro(cudnnGetConvolutionNdForwardOutputDim); \
__macro(cudnnGetConvolutionForwardAlgorithm); \
__macro(cudnnCreateTensorDescriptor); \
__macro(cudnnDestroyTensorDescriptor); \
__macro(cudnnCreateFilterDescriptor); \
......@@ -102,7 +101,6 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
__macro(cudnnDropoutGetStatesSize); \
__macro(cudnnSetDropoutDescriptor); \
__macro(cudnnCreateRNNDescriptor); \
__macro(cudnnSetRNNDescriptor); \
__macro(cudnnGetRNNParamsSize); \
__macro(cudnnGetRNNWorkspaceSize); \
__macro(cudnnGetRNNTrainingReserveSize); \
......@@ -126,12 +124,19 @@ CUDNN_DNN_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#if CUDNN_VERSION >= 3000
#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3(__macro) \
__macro(cudnnGetConvolutionBackwardFilterWorkspaceSize); \
__macro(cudnnGetConvolutionBackwardDataAlgorithm); \
__macro(cudnnGetConvolutionBackwardFilterAlgorithm); \
__macro(cudnnGetConvolutionBackwardDataWorkspaceSize);
CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
// APIs available after R3:
#if CUDNN_VERSION >= 3000 && CUDNN_VERSION < 8000
#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8(__macro) \
__macro(cudnnGetConvolutionBackwardFilterAlgorithm); \
__macro(cudnnGetConvolutionForwardAlgorithm); \
__macro(cudnnSetRNNDescriptor);
CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
// APIs available after R4:
#if CUDNN_VERSION >= 4007
#define CUDNN_DNN_ROUTINE_EACH_AFTER_R4(__macro) \
......@@ -183,6 +188,12 @@ CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
__macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize);
CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
#if CUDNN_VERSION >= 8000
#define CUDNN_DNN_ROUTINE_EACH_R8(__macro) __macro(cudnnSetRNNDescriptor_v8);
CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
} // namespace dynload
} // namespace platform
} // namespace paddle
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册