提交 c63bce8a 编写于 作者: Z zhangting2020

tune algo only when dtype is float16

上级 62eab2dc
......@@ -91,7 +91,7 @@ std::ostream& operator<<(std::ostream& out, const std::vector<T>& v) {
return out;
}
inline int MaxBackwardFilterAlgos(cudnnHandle_t cudnn_handle) {
inline int MaxBwdFilterAlgos(cudnnHandle_t cudnn_handle) {
int max_algos = 0;
#if CUDNN_VERSION_MIN(7, 0, 1)
PADDLE_ENFORCE_CUDA_SUCCESS(
......@@ -102,38 +102,23 @@ inline int MaxBackwardFilterAlgos(cudnnHandle_t cudnn_handle) {
}
template <typename PerfType, typename AlgoType>
void AlgoFinalSelect(const std::vector<PerfType>& perf_results,
std::string kernel_name, int32_t algo_preference,
size_t workspace_byte,
cudnnConvolutionBwdFilterAlgo_t* algo,
bool deterministic) {
// Determine the fastest acceptable algo that matches the algo_preference (-1
// = any),
// regardless of mathType.
VLOG(3) << "=========Full results of algo=========" << kernel_name << ":";
void ChooseAlgo(const std::vector<PerfType>& perf_results,
size_t workspace_byte, AlgoType* algo) {
VLOG(3) << "=========BwdFilterAlgo Perf result=========";
for (const auto& result : perf_results) {
auto math_type_str = "-";
auto math_type_str = "0";
if (result.mathType == CUDNN_TENSOR_OP_MATH) {
math_type_str = "+";
math_type_str = "1";
}
VLOG(3) << " algo: " << result.algo << ", TC" << math_type_str
VLOG(3) << " algo: " << result.algo << ", TC: " << math_type_str
<< ", time: " << result.time << " ms"
<< ", wksp = " << result.memory << ", status = " << result.status;
}
for (decltype(perf_results.size()) i = 0; i != perf_results.size(); ++i) {
for (size_t i = 0; i != perf_results.size(); ++i) {
const auto& result = perf_results[i];
bool algo_is_tensor_core = false;
algo_is_tensor_core = result.mathType == CUDNN_TENSOR_OP_MATH;
bool algo_exclusion = 0;
if (result.status == CUDNN_STATUS_SUCCESS &&
(!deterministic ||
result.determinism == cudnnDeterminism_t::CUDNN_DETERMINISTIC) &&
(result.memory <= workspace_byte) &&
(algo_preference == -1 || algo_preference == result.algo) &&
!algo_exclusion) {
(result.memory <= workspace_byte)) {
if ((result.mathType == CUDNN_TENSOR_OP_MATH) &&
(i != perf_results.size() - 1)) {
const auto& next_result = perf_results[i + 1];
......@@ -143,16 +128,17 @@ void AlgoFinalSelect(const std::vector<PerfType>& perf_results,
next_result.mathType != CUDNN_TENSOR_OP_MATH &&
next_result.time < 1.01 * result.time) {
// Skip over this result- it's not really a Tensor Core algo.
// Prefer instead the next equivalent non-Tensor Core algo.
// Because it is only 1% performance difference.
// Prefer to choose the next equivalent non-Tensor Core algo.
continue;
}
}
*algo = result.algo;
auto math_type_str = "-";
auto math_type_str = "0";
if (result.mathType == CUDNN_TENSOR_OP_MATH) {
math_type_str = "+";
math_type_str = "1";
}
VLOG(3) << " choose algo: " << result.algo << ", TC" << math_type_str
VLOG(3) << " choose algo: " << result.algo << ", TC: " << math_type_str
<< ", time: " << result.time << " ms"
<< ", wksp = " << result.memory << ", status = " << result.status;
return;
......@@ -443,8 +429,6 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
bool deterministic,
const framework::ExecutionContext& ctx) {
auto dtype = platform::CudnnDataType<T>::type;
// bool exhaustive = (exhaustive_search) & (dtype != CUDNN_DATA_HALF);
bool exhaustive = exhaustive_search;
size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024;
size_t workspace_size = 0;
bool has_got_workspace_size = true;
......@@ -465,9 +449,8 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
#endif
algo_t algo;
if (!exhaustive && !deterministic) {
if (!exhaustive_search && !deterministic) {
#if CUDNN_VERSION >= 7001
VLOG(3) << "=====Not exhaustive=====";
using perf_t = cudnnConvolutionBwdFilterAlgoPerf_t;
int perf_count;
int best_algo_idx = 0;
......@@ -494,7 +477,6 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
} else if (deterministic) {
return CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
} else {
VLOG(3) << "=======exhaustive=======: " << exhaustive;
auto& dev_ctx =
ctx.template device_context<platform::CUDADeviceContext>();
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
......@@ -507,62 +489,58 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t:"
<< ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s"
<< args.s << ", args.p" << args.p << ", args.d" << args.d;
/*
algo = algo_cache.GetAlgorithm(
x_dims, w_dims, args.s, args.p, args.d, 0,
static_cast<int64_t>(args.cudnn_dtype), [&]() {
int returned_algo_count;
std::array<perf_t, kNUM_CUDNN_FWD_ALGS> perf_stat;
auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
if (dtype != CUDNN_DATA_HALF) {
algo = algo_cache.GetAlgorithm(
x_dims, w_dims, args.s, args.p, args.d, 0,
static_cast<int64_t>(args.cudnn_dtype), [&]() {
int returned_algo_count;
std::array<perf_t, kNUM_CUDNN_FWD_ALGS> perf_stat;
auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::
cudnnFindConvolutionBackwardFilterAlgorithmEx(
args.handle, args.idesc.desc(), args.x->data<T>(),
args.odesc.desc(), args.o->data<T>(),
args.cdesc.desc(), args.wdesc.desc(),
const_cast<T*>(args.w->data<T>()),
kNUM_CUDNN_BWD_FILTER_ALGS, &returned_algo_count,
perf_stat.data(), cudnn_workspace_ptr,
workspace_size_limit));
};
workspace_handle.RunFuncSync(cudnn_find_func,
workspace_size_limit);
VLOG(3)
<< "BwdFilterAlgo Perf result: (algo: stat, time, memory)";
for (int i = 0; i < returned_algo_count; ++i) {
const auto& stat = perf_stat[i];
VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time
<< " " << stat.memory;
}
return perf_stat[0].algo;
});
} else {
auto max_algos = MaxBwdFilterAlgos(args.handle);
algo = algo_cache.GetAlgorithm(
x_dims, w_dims, args.s, args.p, args.d, 0,
static_cast<int64_t>(args.cudnn_dtype), [&]() {
algo_t chosen_algo;
std::vector<perf_t> perf_results(max_algos);
int actual_algos = 0;
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::
cudnnFindConvolutionBackwardFilterAlgorithmEx(
args.handle, args.idesc.desc(), args.x->data<T>(),
args.odesc.desc(), args.o->data<T>(),
cudnnFindConvolutionBackwardFilterAlgorithm(
args.handle, args.idesc.desc(), args.odesc.desc(),
args.cdesc.desc(), args.wdesc.desc(),
const_cast<T*>(args.w->data<T>()),
kNUM_CUDNN_BWD_FILTER_ALGS, &returned_algo_count,
perf_stat.data(), cudnn_workspace_ptr,
workspace_size_limit));
};
workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit);
VLOG(3) << "BwdFilterAlgo Perf result: (algo: stat, time, memory)";
for (int i = 0; i < returned_algo_count; ++i) {
const auto& stat = perf_stat[i];
VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time
<< " " << stat.memory;
}
return perf_stat[0].algo;
});
*/
algo = algo_cache.GetAlgorithm(
x_dims, w_dims, args.s, args.p, args.d, 0,
static_cast<int64_t>(args.cudnn_dtype), [&]() {
algo_t sel_algo;
auto max_bwd_filt_algos = MaxBackwardFilterAlgos(args.handle);
std::vector<cudnnConvolutionBwdFilterAlgoPerf_t> bwd_filt_results(
max_bwd_filt_algos);
int actual_bwd_filter_algos = 0;
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnFindConvolutionBackwardFilterAlgorithm(
args.handle, args.idesc.desc(), args.odesc.desc(),
args.cdesc.desc(), args.wdesc.desc(),
bwd_filt_results.size(), &actual_bwd_filter_algos,
bwd_filt_results.data()));
bwd_filt_results.resize(actual_bwd_filter_algos);
AlgoFinalSelect<cudnnConvolutionBwdFilterAlgoPerf_t,
cudnnConvolutionBwdFilterAlgo_t>(
bwd_filt_results, "backprop-to-filter", -1,
workspace_size_limit, &sel_algo, deterministic);
workspace_size = GetWorkspaceSize(args, sel_algo);
if (workspace_size > workspace_size_limit) {
workspace_size = workspace_size_limit;
}
return sel_algo;
});
perf_results.size(), &actual_algos,
perf_results.data()));
perf_results.resize(actual_algos);
ChooseAlgo<perf_t, algo_t>(perf_results, workspace_size_limit,
&chosen_algo);
return chosen_algo;
});
}
}
VLOG(3) << "choose algo " << algo;
return algo;
}
......
......@@ -336,11 +336,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
int groups = ctx.Attr<int>("groups");
bool exhaustive_search =
FLAGS_cudnn_exhaustive_search || ctx.Attr<bool>("exhaustive_search");
VLOG(3) << "=====exhaustive_search====: " << exhaustive_search;
VLOG(3) << "====FLAGS_cudnn_exhaustive_search====: "
<< FLAGS_cudnn_exhaustive_search;
VLOG(3) << "====Attr: exhaustive_search====: "
<< ctx.Attr<bool>("exhaustive_search");
bool deterministic = FLAGS_cudnn_deterministic;
if (exhaustive_search && deterministic) {
PADDLE_THROW(
......
......@@ -185,7 +185,8 @@ CUDNN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
__macro(cudnnCTCLoss); \
__macro(cudnnGetConvolutionBackwardDataAlgorithm_v7); \
__macro(cudnnGetConvolutionBackwardFilterAlgorithm_v7); \
__macro(cudnnGetConvolutionForwardAlgorithm_v7);
__macro(cudnnGetConvolutionForwardAlgorithm_v7); \
__macro(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount);
CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
......@@ -195,8 +196,7 @@ CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
__macro(cudnnBatchNormalizationForwardTrainingEx); \
__macro(cudnnGetBatchNormalizationBackwardExWorkspaceSize); \
__macro(cudnnBatchNormalizationBackwardEx); \
__macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize); \
__macro(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount);
__macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize);
CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册