未验证 提交 d5cc144c 编写于 作者: Z Zhang Ting 提交者: GitHub

tune backward filter algorithm for float16 (#27529)

* use exhaustive_search for float16

* tune algo only when dtype is float16
上级 05677818
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include <array> #include <array>
#include <memory> #include <memory>
#include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/conv_search_cache.h" #include "paddle/fluid/framework/conv_search_cache.h"
#include "paddle/fluid/framework/operator_kernel_configs.h" #include "paddle/fluid/framework/operator_kernel_configs.h"
...@@ -90,6 +91,61 @@ std::ostream& operator<<(std::ostream& out, const std::vector<T>& v) { ...@@ -90,6 +91,61 @@ std::ostream& operator<<(std::ostream& out, const std::vector<T>& v) {
return out; return out;
} }
inline int MaxBwdFilterAlgos(cudnnHandle_t cudnn_handle) {
int max_algos = 0;
#if CUDNN_VERSION_MIN(7, 0, 1)
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(
cudnn_handle, &max_algos));
#endif
return max_algos;
}
template <typename PerfType, typename AlgoType>
void ChooseAlgo(const std::vector<PerfType>& perf_results,
size_t workspace_byte, AlgoType* algo) {
VLOG(3) << "=========BwdFilterAlgo Perf result=========";
for (const auto& result : perf_results) {
auto math_type_str = "False";
if (result.mathType == CUDNN_TENSOR_OP_MATH) {
math_type_str = "True";
}
VLOG(3) << " algo: " << result.algo << ", TensorCore: " << math_type_str
<< ", time: " << result.time << " ms"
<< ", wksp = " << result.memory << ", status = " << result.status;
}
for (size_t i = 0; i != perf_results.size(); ++i) {
const auto& result = perf_results[i];
if (result.status == CUDNN_STATUS_SUCCESS &&
(result.memory <= workspace_byte)) {
if ((result.mathType == CUDNN_TENSOR_OP_MATH) &&
(i != perf_results.size() - 1)) {
const auto& next_result = perf_results[i + 1];
if (next_result.status == CUDNN_STATUS_SUCCESS &&
next_result.algo == result.algo &&
next_result.memory == result.memory &&
next_result.mathType != CUDNN_TENSOR_OP_MATH &&
next_result.time < 1.01 * result.time) {
// Skip over this result- it's not really a Tensor Core algo.
// Because it is only 1% performance difference.
// Prefer to choose the next equivalent non-Tensor Core algo.
continue;
}
}
*algo = result.algo;
auto math_type_str = "0";
if (result.mathType == CUDNN_TENSOR_OP_MATH) {
math_type_str = "1";
}
VLOG(3) << " choose algo: " << result.algo << ", TC: " << math_type_str
<< ", time: " << result.time << " ms"
<< ", wksp = " << result.memory << ", status = " << result.status;
return;
}
}
}
using framework::ConvSearchCache; using framework::ConvSearchCache;
struct ConvArgs { struct ConvArgs {
...@@ -401,7 +457,6 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> { ...@@ -401,7 +457,6 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
bool deterministic, bool deterministic,
const framework::ExecutionContext& ctx) { const framework::ExecutionContext& ctx) {
auto dtype = platform::CudnnDataType<T>::type; auto dtype = platform::CudnnDataType<T>::type;
bool exhaustive = (exhaustive_search) & (dtype != CUDNN_DATA_HALF);
size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024;
size_t workspace_size = 0; size_t workspace_size = 0;
bool has_got_workspace_size = true; bool has_got_workspace_size = true;
...@@ -422,7 +477,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> { ...@@ -422,7 +477,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
#endif #endif
algo_t algo; algo_t algo;
if (!exhaustive && !deterministic) { if (!exhaustive_search && !deterministic) {
#if CUDNN_VERSION >= 7001 #if CUDNN_VERSION >= 7001
using perf_t = cudnnConvolutionBwdFilterAlgoPerf_t; using perf_t = cudnnConvolutionBwdFilterAlgoPerf_t;
int perf_count; int perf_count;
...@@ -462,34 +517,57 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> { ...@@ -462,34 +517,57 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t:" VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t:"
<< ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s" << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s"
<< args.s << ", args.p" << args.p << ", args.d" << args.d; << args.s << ", args.p" << args.p << ", args.d" << args.d;
if (dtype != CUDNN_DATA_HALF) {
algo = algo_cache.GetAlgorithm( algo = algo_cache.GetAlgorithm(
x_dims, w_dims, args.s, args.p, args.d, 0, x_dims, w_dims, args.s, args.p, args.d, 0,
static_cast<int64_t>(args.cudnn_dtype), [&]() { static_cast<int64_t>(args.cudnn_dtype), [&]() {
int returned_algo_count; int returned_algo_count;
std::array<perf_t, kNUM_CUDNN_FWD_ALGS> perf_stat; std::array<perf_t, kNUM_CUDNN_FWD_ALGS> perf_stat;
auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::
cudnnFindConvolutionBackwardFilterAlgorithmEx(
args.handle, args.idesc.desc(), args.x->data<T>(),
args.odesc.desc(), args.o->data<T>(),
args.cdesc.desc(), args.wdesc.desc(),
const_cast<T*>(args.w->data<T>()),
kNUM_CUDNN_BWD_FILTER_ALGS, &returned_algo_count,
perf_stat.data(), cudnn_workspace_ptr,
workspace_size_limit));
};
workspace_handle.RunFuncSync(cudnn_find_func,
workspace_size_limit);
VLOG(3)
<< "BwdFilterAlgo Perf result: (algo: stat, time, memory)";
for (int i = 0; i < returned_algo_count; ++i) {
const auto& stat = perf_stat[i];
VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time
<< " " << stat.memory;
}
return perf_stat[0].algo;
});
} else {
auto max_algos = MaxBwdFilterAlgos(args.handle);
algo = algo_cache.GetAlgorithm(
x_dims, w_dims, args.s, args.p, args.d, 0,
static_cast<int64_t>(args.cudnn_dtype), [&]() {
algo_t chosen_algo;
std::vector<perf_t> perf_results(max_algos);
int actual_algos = 0;
PADDLE_ENFORCE_CUDA_SUCCESS( PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload:: platform::dynload::
cudnnFindConvolutionBackwardFilterAlgorithmEx( cudnnFindConvolutionBackwardFilterAlgorithm(
args.handle, args.idesc.desc(), args.x->data<T>(), args.handle, args.idesc.desc(), args.odesc.desc(),
args.odesc.desc(), args.o->data<T>(),
args.cdesc.desc(), args.wdesc.desc(), args.cdesc.desc(), args.wdesc.desc(),
const_cast<T*>(args.w->data<T>()), perf_results.size(), &actual_algos,
kNUM_CUDNN_BWD_FILTER_ALGS, &returned_algo_count, perf_results.data()));
perf_stat.data(), cudnn_workspace_ptr, perf_results.resize(actual_algos);
workspace_size_limit)); ChooseAlgo<perf_t, algo_t>(perf_results, workspace_size_limit,
}; &chosen_algo);
workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit); return chosen_algo;
});
VLOG(3) << "BwdFilterAlgo Perf result: (algo: stat, time, memory)"; }
for (int i = 0; i < returned_algo_count; ++i) {
const auto& stat = perf_stat[i];
VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time
<< " " << stat.memory;
}
return perf_stat[0].algo;
});
} }
VLOG(3) << "choose algo " << algo; VLOG(3) << "choose algo " << algo;
return algo; return algo;
......
...@@ -95,6 +95,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name); ...@@ -95,6 +95,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
__macro(cudnnGetVersion); \ __macro(cudnnGetVersion); \
__macro(cudnnFindConvolutionForwardAlgorithmEx); \ __macro(cudnnFindConvolutionForwardAlgorithmEx); \
__macro(cudnnFindConvolutionBackwardFilterAlgorithmEx); \ __macro(cudnnFindConvolutionBackwardFilterAlgorithmEx); \
__macro(cudnnFindConvolutionBackwardFilterAlgorithm); \
__macro(cudnnFindConvolutionBackwardDataAlgorithmEx); \ __macro(cudnnFindConvolutionBackwardDataAlgorithmEx); \
__macro(cudnnGetErrorString); \ __macro(cudnnGetErrorString); \
__macro(cudnnCreateDropoutDescriptor); \ __macro(cudnnCreateDropoutDescriptor); \
...@@ -177,7 +178,8 @@ CUDNN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) ...@@ -177,7 +178,8 @@ CUDNN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
__macro(cudnnCTCLoss); \ __macro(cudnnCTCLoss); \
__macro(cudnnGetConvolutionBackwardDataAlgorithm_v7); \ __macro(cudnnGetConvolutionBackwardDataAlgorithm_v7); \
__macro(cudnnGetConvolutionBackwardFilterAlgorithm_v7); \ __macro(cudnnGetConvolutionBackwardFilterAlgorithm_v7); \
__macro(cudnnGetConvolutionForwardAlgorithm_v7); __macro(cudnnGetConvolutionForwardAlgorithm_v7); \
__macro(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount);
CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif #endif
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册