未验证 提交 c331e2ce 编写于 作者: Y Yiqun Liu 提交者: GitHub

Define ConvRunner to wrapper the call of cudnn conv functions. (#47576)

* Define ConvRunner to wrapper the call of cudnn conv functions.

* Use ConvKind in SearchAlgorithm.
上级 fa874a46
......@@ -115,7 +115,7 @@ void ChooseAlgoByWorkspace(const std::vector<PerfT>& perf_results,
}
}
template <typename PerfT>
template <ConvKind CK>
struct SearchAlgorithmBase {};
// cuDNN convolution forward algorithm searcher, consisted of three searching
......@@ -123,9 +123,10 @@ struct SearchAlgorithmBase {};
// As well as one workspace size acquisition function with respect to
// the chosen algorithm.
template <>
struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> {
struct SearchAlgorithmBase<ConvKind::kForward> {
using PerfT = cudnnConvolutionFwdAlgoPerf_t;
using AlgoT = cudnnConvolutionFwdAlgo_t;
constexpr static phi::autotune::AlgorithmType kAlgoType =
phi::autotune::AlgorithmType::kConvForward;
......@@ -296,9 +297,10 @@ struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> {
// As well as one workspace size acquisition function with
// respect to the chosen algorithm.
template <>
struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> {
struct SearchAlgorithmBase<ConvKind::kBackwardData> {
using PerfT = cudnnConvolutionBwdDataAlgoPerf_t;
using AlgoT = cudnnConvolutionBwdDataAlgo_t;
constexpr static phi::autotune::AlgorithmType kAlgoType =
phi::autotune::AlgorithmType::kConvBackwardData;
......@@ -478,9 +480,10 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> {
// exhaustive_search mode. As well as one workspace size acquisition function
// with respect to the chosen algorithm.
template <>
struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
struct SearchAlgorithmBase<ConvKind::kBackwardFilter> {
using PerfT = cudnnConvolutionBwdFilterAlgoPerf_t;
using AlgoT = cudnnConvolutionBwdFilterAlgo_t;
constexpr static phi::autotune::AlgorithmType kAlgoType =
phi::autotune::AlgorithmType::kConvBackwardFilter;
......@@ -684,9 +687,9 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
}
};
template <typename PerfT>
struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
using AlgoT = typename SearchAlgorithmBase<PerfT>::AlgoT;
template <ConvKind CK>
struct SearchAlgorithm : public SearchAlgorithmBase<CK> {
using AlgoT = typename SearchAlgorithmBase<CK>::AlgoT;
template <typename T>
static SearchResult<AlgoT> Find(const phi::GPUContext& ctx,
......@@ -700,7 +703,7 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
SetConvMathType(ctx, dtype, args.cdesc);
if (deterministic) {
result = SearchAlgorithmBase<PerfT>::FindAlgoDeterministic(args);
result = SearchAlgorithmBase<CK>::FindAlgoDeterministic(args);
} else {
// 1. Once turning on exhaustive FLAGS, always get exhaustive_search.
// 2. Once turning on auto-tune, run heuristic (default) before
......@@ -710,7 +713,7 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
// default mode for the rest.
auto key = args.ConvertToConvCacheKey<T>();
auto& cache = phi::autotune::AutoTuneCache::Instance().GetConv(
SearchAlgorithmBase<PerfT>::kAlgoType);
SearchAlgorithmBase<CK>::kAlgoType);
bool find_in_cache = cache.Find(key);
if (find_in_cache) {
auto t = cache.Get(key);
......@@ -727,7 +730,7 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
// Once autotune is enabled, the autotuned result can rewrite the
// previous result in cache found by heuristic method.
result =
SearchAlgorithmBase<PerfT>::template FindAlgoExhaustiveSearch<T>(
SearchAlgorithmBase<CK>::template FindAlgoExhaustiveSearch<T>(
args, ctx);
cache.Set(key,
phi::autotune::ConvAutoTuneResult(
......@@ -735,7 +738,7 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
result.workspace_size,
true));
} else if (!find_in_cache) {
result = SearchAlgorithmBase<PerfT>::FindAlgoHeuristic(args, ctx);
result = SearchAlgorithmBase<CK>::FindAlgoHeuristic(args, ctx);
cache.Set(key,
phi::autotune::ConvAutoTuneResult(
static_cast<int64_t>(result.algo),
......@@ -744,7 +747,7 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
}
}
}
VLOG(3) << "[cuDNN " << SearchAlgorithmBase<PerfT>::GetPerfName()
VLOG(3) << "[cuDNN " << SearchAlgorithmBase<CK>::GetPerfName()
<< "] exhaustive_search=" << exhaustive_search
<< ", use_autotune=" << use_autotune
<< ", deterministic=" << deterministic
......@@ -783,4 +786,138 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
}
};
// Primary template: intentionally empty. Only the three specializations
// below (ConvKind::kForward, kBackwardData, kBackwardFilter) provide an
// Apply() that wraps the corresponding cuDNN convolution call.
template <typename T, ConvKind CK>
struct ConvRunner {};
// Executes cuDNN forward convolution (output = conv(input, filter)),
// issuing one cudnnConvolutionForward call per group with pointers
// advanced by the given per-group offsets.
template <typename T>
struct ConvRunner<T, ConvKind::kForward> {
  // ctx:               device context supplying the cuDNN handle.
  // args:              holds the input/filter/conv/output descriptors.
  // search_result:     the forward algorithm chosen by SearchAlgorithm.
  // use_addto:         when true, beta = 1 so results accumulate into
  //                    output_ptr instead of overwriting it.
  static void Apply(
      const phi::GPUContext& ctx,
      const ConvArgs& args,
      const SearchResult<cudnnConvolutionFwdAlgo_t>& search_result,
      const T* input_ptr,
      const T* filter_ptr,
      T* output_ptr,
      int groups,
      int group_offset_in,
      int group_offset_filter,
      int group_offset_out,
      size_t workspace_size,
      phi::DnnWorkspaceHandle* workspace_handle,
      bool use_addto = false) {
    ScalingParamType<T> alpha = 1.0f;
    ScalingParamType<T> beta = use_addto ? 1.0f : 0.0f;
    auto handle = ctx.cudnn_handle();
    for (int g = 0; g < groups; ++g) {
      // Per-group views into the grouped tensors.
      const T* x = input_ptr + g * group_offset_in;
      const T* w = filter_ptr + g * group_offset_filter;
      T* y = output_ptr + g * group_offset_out;
      auto conv_func = [&](void* workspace_ptr) {
        PADDLE_ENFORCE_GPU_SUCCESS(
            phi::dynload::cudnnConvolutionForward(handle,
                                                  &alpha,
                                                  args.idesc.desc(),
                                                  x,
                                                  args.wdesc.desc(),
                                                  w,
                                                  args.cdesc.desc(),
                                                  search_result.algo,
                                                  workspace_ptr,
                                                  workspace_size,
                                                  &beta,
                                                  args.odesc.desc(),
                                                  y));
      };
      workspace_handle->RunFunc(conv_func, workspace_size);
    }
  }
};
// Executes cuDNN backward-data convolution (input_grad = conv_bwd_data(
// output_grad, filter)), issuing one cudnnConvolutionBackwardData call
// per group with pointers advanced by the given per-group offsets.
template <typename T>
struct ConvRunner<T, ConvKind::kBackwardData> {
  // ctx:               device context supplying the cuDNN handle.
  // args:              holds the input/filter/conv/output descriptors.
  // search_result:     the backward-data algorithm chosen by SearchAlgorithm.
  // use_addto:         when true, beta = 1 so results accumulate into
  //                    input_grad_ptr instead of overwriting it.
  static void Apply(
      const phi::GPUContext& ctx,
      const ConvArgs& args,
      const SearchResult<cudnnConvolutionBwdDataAlgo_t>& search_result,
      const T* output_grad_ptr,
      const T* filter_ptr,
      T* input_grad_ptr,
      int groups,
      int group_offset_in,
      int group_offset_filter,
      int group_offset_out,
      size_t workspace_size,
      phi::DnnWorkspaceHandle* workspace_handle,
      bool use_addto = false) {
    ScalingParamType<T> alpha = 1.0f;
    ScalingParamType<T> beta = use_addto ? 1.0f : 0.0f;
    auto handle = ctx.cudnn_handle();
    for (int g = 0; g < groups; ++g) {
      // Per-group views into the grouped tensors.
      const T* w = filter_ptr + g * group_offset_filter;
      const T* dy = output_grad_ptr + g * group_offset_out;
      T* dx = input_grad_ptr + g * group_offset_in;
      auto conv_func = [&](void* workspace_ptr) {
        PADDLE_ENFORCE_GPU_SUCCESS(
            phi::dynload::cudnnConvolutionBackwardData(handle,
                                                       &alpha,
                                                       args.wdesc.desc(),
                                                       w,
                                                       args.odesc.desc(),
                                                       dy,
                                                       args.cdesc.desc(),
                                                       search_result.algo,
                                                       workspace_ptr,
                                                       workspace_size,
                                                       &beta,
                                                       args.idesc.desc(),
                                                       dx));
      };
      workspace_handle->RunFunc(conv_func, workspace_size);
    }
  }
};
// Executes cuDNN backward-filter convolution (filter_grad =
// conv_bwd_filter(input, output_grad)), issuing one
// cudnnConvolutionBackwardFilter call per group with pointers advanced
// by the given per-group offsets.
template <typename T>
struct ConvRunner<T, ConvKind::kBackwardFilter> {
  // ctx:               device context supplying the cuDNN handle.
  // args:              holds the input/filter/conv/output descriptors.
  // search_result:     the backward-filter algorithm chosen by
  //                    SearchAlgorithm.
  // use_addto:         when true, beta = 1 so results accumulate into
  //                    filter_grad_ptr instead of overwriting it.
  static void Apply(
      const phi::GPUContext& ctx,
      const ConvArgs& args,
      const SearchResult<cudnnConvolutionBwdFilterAlgo_t>& search_result,
      const T* output_grad_ptr,
      const T* input_ptr,
      T* filter_grad_ptr,
      int groups,
      int group_offset_in,
      int group_offset_filter,
      int group_offset_out,
      size_t workspace_size,
      phi::DnnWorkspaceHandle* workspace_handle,
      bool use_addto = false) {
    ScalingParamType<T> alpha = 1.0f;
    ScalingParamType<T> beta = use_addto ? 1.0f : 0.0f;
    auto handle = ctx.cudnn_handle();
    for (int g = 0; g < groups; ++g) {
      // Per-group views into the grouped tensors.
      const T* x = input_ptr + g * group_offset_in;
      const T* dy = output_grad_ptr + g * group_offset_out;
      T* dw = filter_grad_ptr + g * group_offset_filter;
      auto conv_func = [&](void* workspace_ptr) {
        PADDLE_ENFORCE_GPU_SUCCESS(
            phi::dynload::cudnnConvolutionBackwardFilter(handle,
                                                         &alpha,
                                                         args.idesc.desc(),
                                                         x,
                                                         args.odesc.desc(),
                                                         dy,
                                                         args.cdesc.desc(),
                                                         search_result.algo,
                                                         workspace_ptr,
                                                         workspace_size,
                                                         &beta,
                                                         args.wdesc.desc(),
                                                         dw));
      };
      workspace_handle->RunFunc(conv_func, workspace_size);
    }
  }
};
} // namespace phi
......@@ -34,7 +34,9 @@ template <typename T>
using ScalingParamType =
typename paddle::platform::CudnnDataType<T>::ScalingParamType;
// As the container of searchAlgorithm::Find() result.
enum class ConvKind { kForward = 1, kBackwardData = 2, kBackwardFilter = 3 };
// The container of SearchAlgorithm::Find() result.
template <typename AlgoT>
struct SearchResult {
SearchResult() {}
......
......@@ -376,7 +376,7 @@ void ConvCudnnGradKernel(const Context& ctx,
bwd_result.algo = search1::Find<T>(
args1, exhaustive_search, deterministic, workspace_size, ctx);
#else
using search1 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
using search1 = SearchAlgorithm<ConvKind::kBackwardData>;
bwd_result = search1::Find<T>(ctx, args1, exhaustive_search, deterministic);
workspace_size = std::max(workspace_size, bwd_result.workspace_size);
#endif
......@@ -401,7 +401,7 @@ void ConvCudnnGradKernel(const Context& ctx,
filter_result.algo = search2::Find<T>(
args2, exhaustive_search, deterministic, workspace_size, ctx);
#else
using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
using search2 = SearchAlgorithm<ConvKind::kBackwardFilter>;
filter_result =
search2::Find<T>(ctx, args2, exhaustive_search, deterministic);
VLOG(3) << "filter algo: " << filter_result.algo << ", time "
......@@ -481,30 +481,22 @@ void ConvCudnnGradKernel(const Context& ctx,
},
workspace_size);
}
#else
for (int i = 0; i < groups; i++) {
workspace_handle.RunFunc(
[&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnConvolutionBackwardData(
handle,
&alpha,
args1.wdesc.desc(),
filter_data + i * group_offset_filter,
args1.odesc.desc(),
output_grad_data + i * group_offset_out,
args1.cdesc.desc(),
bwd_result.algo,
cudnn_workspace_ptr,
ConvRunner<T, ConvKind::kBackwardData>::Apply(ctx,
args1,
bwd_result,
output_grad_data,
filter_data,
transformed_input_grad_data,
groups,
group_offset_in,
group_offset_filter,
group_offset_out,
workspace_size,
&beta,
args1.idesc.desc(),
transformed_input_grad_data + i * group_offset_in));
},
workspace_size);
}
&workspace_handle,
use_addto);
#endif
if (!is_sys_pad) {
std::vector<int> starts(transformed_input_channel.dims().size(), 0);
std::vector<int> axes(transformed_input_channel.dims().size(), 0);
......@@ -536,8 +528,6 @@ void ConvCudnnGradKernel(const Context& ctx,
}
}
// filter_grad do not use inplace addto.
ScalingParamType<T> beta_filter = 0.0f;
// ------------------- cudnn conv backward filter ---------------------
if (filter_grad) {
// Because beta is zero, it is unnecessary to reset filter_grad.
......@@ -562,27 +552,19 @@ void ConvCudnnGradKernel(const Context& ctx,
},
workspace_size);
#else
for (int i = 0; i < groups; i++) {
workspace_handle.RunFunc(
[&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnConvolutionBackwardFilter(
handle,
&alpha,
args2.idesc.desc(),
input_data + i * group_offset_in,
args2.odesc.desc(),
output_grad_data + i * group_offset_out,
args2.cdesc.desc(),
filter_result.algo,
cudnn_workspace_ptr,
ConvRunner<T, ConvKind::kBackwardFilter>::Apply(ctx,
args2,
filter_result,
output_grad_data,
input_data,
filter_grad_data,
groups,
group_offset_in,
group_offset_filter,
group_offset_out,
workspace_size,
&beta_filter,
args2.wdesc.desc(),
filter_grad_data + i * group_offset_filter));
},
workspace_size);
}
&workspace_handle,
false);
#endif
if (compute_format == paddle::platform::DataLayout::kNHWC) {
......@@ -952,7 +934,7 @@ void ConvCudnnGradGradKernel(
fwd_result1.algo = search1::Find<T>(
args1, exhaustive_search, false, workspace_size, ctx);
#else
using search1 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
using search1 = SearchAlgorithm<ConvKind::kForward>;
fwd_result1 = search1::Find<T>(ctx, args1, exhaustive_search, false);
workspace_size = search1::GetWorkspaceSize(args1, fwd_result1.algo);
#endif
......@@ -977,7 +959,7 @@ void ConvCudnnGradGradKernel(
fwd_result2.algo = search2::Find<T>(
args2, exhaustive_search, false, workspace_size, ctx);
#else
using search2 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
using search2 = SearchAlgorithm<ConvKind::kForward>;
fwd_result2 = search2::Find<T>(ctx, args2, exhaustive_search, false);
workspace_size = std::max(
workspace_size, search2::GetWorkspaceSize(args2, fwd_result2.algo));
......@@ -1003,7 +985,7 @@ void ConvCudnnGradGradKernel(
filter_result.algo = search3::Find<T>(
args3, exhaustive_search, deterministic, workspace_size, ctx);
#else
using search3 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
using search3 = SearchAlgorithm<ConvKind::kBackwardFilter>;
filter_result =
search3::Find<T>(ctx, args3, exhaustive_search, deterministic);
workspace_size = std::max(
......@@ -1030,7 +1012,7 @@ void ConvCudnnGradGradKernel(
data_result.algo = search4::Find<T>(
args4, exhaustive_search, deterministic, workspace_size, ctx);
#else
using search4 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
using search4 = SearchAlgorithm<ConvKind::kBackwardData>;
data_result =
search4::Find<T>(ctx, args4, exhaustive_search, deterministic);
workspace_size = std::max(
......@@ -1088,27 +1070,19 @@ void ConvCudnnGradGradKernel(
},
workspace_size);
#else
for (int i = 0; i < groups; i++) {
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnConvolutionForward(
handle,
&alpha,
args1.idesc.desc(),
ddx + i * group_offset_in,
args1.wdesc.desc(),
w + i * group_offset_filter,
args1.cdesc.desc(),
fwd_result1.algo,
workspace_ptr,
ConvRunner<T, ConvKind::kForward>::Apply(ctx,
args1,
fwd_result1,
ddx,
w,
transformed_ddy_channel,
groups,
group_offset_in,
group_offset_filter,
group_offset_out,
workspace_size,
&beta,
args1.odesc.desc(),
transformed_ddy_channel + i * group_offset_out));
},
workspace_size);
}
&workspace_handle,
false);
#endif
}
if (ddW) {
......@@ -1134,27 +1108,19 @@ void ConvCudnnGradGradKernel(
},
workspace_size);
#else
for (int i = 0; i < groups; i++) {
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnConvolutionForward(
handle,
&alpha,
args2.idesc.desc(),
x + i * group_offset_in,
args2.wdesc.desc(),
ddw + i * group_offset_filter,
args2.cdesc.desc(),
fwd_result2.algo,
workspace_ptr,
ConvRunner<T, ConvKind::kForward>::Apply(ctx,
args2,
fwd_result2,
x,
ddw,
transformed_ddy_channel,
groups,
group_offset_in,
group_offset_filter,
group_offset_out,
workspace_size,
&alpha,
args2.odesc.desc(),
transformed_ddy_channel + i * group_offset_out));
},
workspace_size);
}
&workspace_handle,
true);
#endif
}
if (channel_last) {
......@@ -1185,27 +1151,19 @@ void ConvCudnnGradGradKernel(
},
workspace_size);
#else
for (int i = 0; i < groups; i++) {
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnConvolutionBackwardFilter(
handle,
&alpha,
args3.idesc.desc(),
ddx + i * group_offset_in,
args3.odesc.desc(),
transformed_dy_channel + i * group_offset_out,
args3.cdesc.desc(),
filter_result.algo,
workspace_ptr,
ConvRunner<T, ConvKind::kBackwardFilter>::Apply(ctx,
args3,
filter_result,
transformed_dy_channel,
ddx,
dw,
groups,
group_offset_in,
group_offset_filter,
group_offset_out,
workspace_size,
&beta,
args3.wdesc.desc(),
dw + i * group_offset_filter));
},
workspace_size);
}
&workspace_handle,
false);
#endif
}
......@@ -1232,27 +1190,19 @@ void ConvCudnnGradGradKernel(
},
workspace_size);
#else
for (int i = 0; i < groups; i++) {
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnConvolutionBackwardData(
handle,
&alpha,
args4.wdesc.desc(),
ddw + i * group_offset_filter,
args4.odesc.desc(),
transformed_dy_channel + i * group_offset_out,
args4.cdesc.desc(),
data_result.algo,
workspace_ptr,
ConvRunner<T, ConvKind::kBackwardData>::Apply(ctx,
args4,
data_result,
transformed_dy_channel,
ddw,
transformed_dx,
groups,
group_offset_in,
group_offset_filter,
group_offset_out,
workspace_size,
&beta,
args4.idesc.desc(),
transformed_dx + i * group_offset_in));
},
workspace_size);
}
&workspace_handle,
false);
#endif
if (!is_sys_pad) {
......
......@@ -315,7 +315,7 @@ void ConvCudnnKernel(const Context& ctx,
args, exhaustive_search, deterministic, workspace_size, ctx);
#else
SearchResult<cudnnConvolutionFwdAlgo_t> fwd_result;
using search = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
using search = SearchAlgorithm<ConvKind::kForward>;
fwd_result = search::Find<T>(ctx, args, exhaustive_search, deterministic);
workspace_size = fwd_result.workspace_size;
#endif
......@@ -359,27 +359,19 @@ void ConvCudnnKernel(const Context& ctx,
},
workspace_size);
#else
for (int i = 0; i < groups; i++) {
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnConvolutionForward(
handle,
&alpha,
args.idesc.desc(),
input_data + i * group_offset_in,
args.wdesc.desc(),
filter_data + i * group_offset_filter,
args.cdesc.desc(),
fwd_result.algo,
workspace_ptr,
ConvRunner<T, ConvKind::kForward>::Apply(ctx,
args,
fwd_result,
input_data,
filter_data,
output_data,
groups,
group_offset_in,
group_offset_filter,
group_offset_out,
workspace_size,
&beta,
args.odesc.desc(),
output_data + i * group_offset_out));
},
workspace_size);
}
&workspace_handle,
false);
#endif
if (channel_last && compute_format == paddle::platform::DataLayout::kNCHW) {
......
......@@ -227,7 +227,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
fwd_result.algo =
search1::Find<T>(args1, false, deterministic, workspace_size, ctx);
#else
using search1 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
using search1 = SearchAlgorithm<ConvKind::kForward>;
fwd_result = search1::Find<T>(ctx, args1, false, deterministic, false);
workspace_size = std::max(
workspace_size, search1::GetWorkspaceSize(args1, fwd_result.algo));
......@@ -252,7 +252,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
filter_result.algo =
search2::Find<T>(args2, false, deterministic, workspace_size, ctx);
#else
using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
using search2 = SearchAlgorithm<ConvKind::kBackwardFilter>;
filter_result = search2::Find<T>(ctx, args2, false, deterministic, false);
workspace_size = std::max(
workspace_size, search2::GetWorkspaceSize(args2, filter_result.algo));
......@@ -269,9 +269,9 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
ScalingParamType<T> beta = 0.0f;
auto workspace_handle = ctx.cudnn_workspace_handle();
if (dx) {
#ifdef PADDLE_WITH_HIP
// Because beta is zero, it is unnecessary to reset dx.
for (int g = 0; g < groups; g++) {
#ifdef PADDLE_WITH_HIP
auto cudnn_func = [&](void* cudnn_workspace) {
PADDLE_ENFORCE_GPU_SUCCESS(
dynload::miopenConvolutionForward(handle,
......@@ -288,26 +288,23 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
cudnn_workspace,
workspace_size));
};
workspace_handle.RunFunc(cudnn_func, workspace_size);
}
#else // PADDLE_WITH_HIP
auto cudnn_func = [&](void* cudnn_workspace) {
PADDLE_ENFORCE_GPU_SUCCESS(
dynload::cudnnConvolutionForward(handle,
&alpha,
args1.idesc.desc(),
dout_data + dout_offset * g,
args1.wdesc.desc(),
filter_data + filter_offset * g,
args1.cdesc.desc(),
fwd_result.algo,
cudnn_workspace,
ConvRunner<T, ConvKind::kForward>::Apply(ctx,
args1,
fwd_result,
dout_data,
filter_data,
dx_data,
groups,
dout_offset,
filter_offset,
x_offset,
workspace_size,
&beta,
args1.odesc.desc(),
dx_data + x_offset * g));
};
&workspace_handle,
false);
#endif // PADDLE_WITH_HIP
workspace_handle.RunFunc(cudnn_func, workspace_size);
}
if (data_layout == GPUDNNDataLayout::kNHWC) {
DenseTensor dx_transpose;
......@@ -330,8 +327,8 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
if (dfilter) {
// Because beta is zero, it is unnecessary to reset dfilter.
// Gradient with respect to the filter
for (int g = 0; g < groups; g++) {
#ifdef PADDLE_WITH_HIP
for (int g = 0; g < groups; g++) {
auto cudnn_func = [&](void* cudnn_workspace) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardWeights(
handle,
......@@ -348,26 +345,23 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
cudnn_workspace,
workspace_size));
};
workspace_handle.RunFunc(cudnn_func, workspace_size);
}
#else // PADDLE_WITH_HIP
auto cudnn_func = [&](void* cudnn_workspace) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardFilter(
handle,
&alpha,
args2.idesc.desc(),
dout_data + dout_offset * g,
args2.odesc.desc(),
x_data + x_offset * g,
args2.cdesc.desc(),
filter_result.algo,
cudnn_workspace,
ConvRunner<T, ConvKind::kBackwardFilter>::Apply(ctx,
args2,
filter_result,
x_data,
dout_data,
dfilter_data,
groups,
dout_offset,
filter_offset,
x_offset,
workspace_size,
&beta,
args2.wdesc.desc(),
dfilter_data + filter_offset * g));
};
&workspace_handle,
false);
#endif // PADDLE_WITH_HIP
workspace_handle.RunFunc(cudnn_func, workspace_size);
}
}
}
......@@ -704,7 +698,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
bwd_result1.algo =
search1::Find<T>(args1, false, deterministic, workspace_size, ctx);
#else
using search1 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
using search1 = SearchAlgorithm<ConvKind::kBackwardData>;
bwd_result1 = search1::Find<T>(ctx, args1, false, deterministic, false);
workspace_size = search1::GetWorkspaceSize(args1, bwd_result1.algo);
#endif
......@@ -726,7 +720,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
bwd_result2.algo =
search2::Find<T>(args2, false, deterministic, workspace_size, ctx);
#else
using search2 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
using search2 = SearchAlgorithm<ConvKind::kBackwardData>;
bwd_result2 = search2::Find<T>(ctx, args2, false, deterministic, false);
workspace_size = std::max(
workspace_size, search2::GetWorkspaceSize(args2, bwd_result2.algo));
......@@ -751,7 +745,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
filter_result.algo =
search3::Find<T>(args3, false, deterministic, workspace_size, ctx);
#else
using search3 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
using search3 = SearchAlgorithm<ConvKind::kBackwardFilter>;
filter_result = search3::Find<T>(ctx, args3, false, deterministic, false);
workspace_size = std::max(
workspace_size, search3::GetWorkspaceSize(args3, filter_result.algo));
......@@ -777,7 +771,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
fwd_result.algo =
search4::Find<T>(args4, false, deterministic, workspace_size, ctx);
#else
using search4 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
using search4 = SearchAlgorithm<ConvKind::kForward>;
fwd_result = search4::Find<T>(ctx, args4, false, deterministic, false);
workspace_size = std::max(
workspace_size, search4::GetWorkspaceSize(args4, fwd_result.algo));
......@@ -815,8 +809,8 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
if (ddout) {
ddx_ = transformed_ddx.data<T>();
for (int i = 0; i < groups; i++) {
#ifdef PADDLE_WITH_HIP
for (int i = 0; i < groups; i++) {
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardData(
......@@ -835,30 +829,25 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
workspace_size));
},
workspace_size);
}
#else // PADDLE_WITH_HIP
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardData(
handle,
&alpha,
args1.wdesc.desc(),
filter_ + i * group_offset_filter,
args1.odesc.desc(),
ddx_ + i * group_offset_in,
args1.cdesc.desc(),
bwd_result1.algo,
workspace_ptr,
ConvRunner<T, ConvKind::kBackwardData>::Apply(ctx,
args1,
bwd_result1,
ddx_,
filter_,
transformed_ddout_channel_,
groups,
group_offset_out,
group_offset_filter,
group_offset_in,
workspace_size,
&beta,
args1.idesc.desc(),
transformed_ddout_channel_ + i * group_offset_out));
},
workspace_size);
&workspace_handle,
false);
#endif // PADDLE_WITH_HIP
}
for (int i = 0; i < groups; i++) {
#ifdef PADDLE_WITH_HIP
for (int i = 0; i < groups; i++) {
// MIOPEN ONLY support beta to be 0.0f
DenseTensor conv_x_ddfilter(dout.type());
conv_x_ddfilter.Resize(transformed_ddout_channel.dims());
......@@ -893,27 +882,22 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
&beta,
args2.idesc.desc(),
transformed_ddout_channel_ + i * group_offset_out));
}
#else // PADDLE_WITH_HIP
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardData(
handle,
&alpha,
args2.wdesc.desc(),
ddfilter_ + i * group_offset_filter,
args2.odesc.desc(),
x_ + i * group_offset_in,
args2.cdesc.desc(),
bwd_result2.algo,
workspace_ptr,
ConvRunner<T, ConvKind::kBackwardData>::Apply(ctx,
args2,
bwd_result2,
x_,
ddfilter_,
transformed_ddout_channel_,
groups,
group_offset_out,
group_offset_filter,
group_offset_in,
workspace_size,
&alpha,
args2.idesc.desc(),
transformed_ddout_channel_ + i * group_offset_out));
},
workspace_size);
&workspace_handle,
true);
#endif // PADDLE_WITH_HIP
}
if ((!is_sys_pad) && (!channel_last)) {
if (strides.size() == 2U) {
......@@ -947,8 +931,8 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
T* transformed_dout_channel_ = transformed_dout.data<T>();
if (dfilter) {
ddx_ = transformed_ddx_channel.data<T>();
for (int i = 0; i < groups; i++) {
#ifdef PADDLE_WITH_HIP
for (int i = 0; i < groups; i++) {
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
......@@ -968,33 +952,28 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
workspace_size));
},
workspace_size);
}
#else // PADDLE_WITH_HIP
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardFilter(
handle,
&alpha,
args3.idesc.desc(),
transformed_dout_channel_ + i * group_offset_out,
args3.odesc.desc(),
ddx_ + i * group_offset_in,
args3.cdesc.desc(),
filter_result.algo,
workspace_ptr,
ConvRunner<T, ConvKind::kBackwardFilter>::Apply(ctx,
args3,
filter_result,
ddx_,
transformed_dout_channel_,
dfilter_,
groups,
group_offset_out,
group_offset_filter,
group_offset_in,
workspace_size,
&beta,
args3.wdesc.desc(),
dfilter_ + i * group_offset_filter));
},
workspace_size);
&workspace_handle,
false);
#endif // PADDLE_WITH_HIP
}
}
if (dx) {
ddfilter_ = ddfilter.data<T>();
for (int i = 0; i < groups; i++) {
#ifdef PADDLE_WITH_HIP
for (int i = 0; i < groups; i++) {
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionForward(
......@@ -1013,27 +992,23 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
workspace_size));
},
workspace_size);
}
#else // PADDLE_WITH_HIP
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionForward(
handle,
&alpha,
args4.idesc.desc(),
transformed_dout_channel_ + i * group_offset_out,
args4.wdesc.desc(),
ddfilter_ + i * group_offset_filter,
args4.cdesc.desc(),
fwd_result.algo,
workspace_ptr,
ConvRunner<T, ConvKind::kForward>::Apply(ctx,
args4,
fwd_result,
transformed_dout_channel_,
ddfilter_,
transformed_dx_,
groups,
group_offset_out,
group_offset_filter,
group_offset_in,
workspace_size,
&beta,
args4.odesc.desc(),
transformed_dx_ + i * group_offset_in));
},
workspace_size);
&workspace_handle,
false);
#endif // PADDLE_WITH_HIP
}
if (channel_last) {
TransToChannelLast<Context, T>(ctx, &transformed_dx_channel, dx);
}
......
......@@ -227,7 +227,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
search::Find<T>(args, false, deterministic, workspace_size, ctx);
#else
SearchResult<cudnnConvolutionBwdDataAlgo_t> bwd_result;
using search = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
using search = SearchAlgorithm<ConvKind::kBackwardData>;
bwd_result = search::Find<T>(ctx, args, false, deterministic, false);
workspace_size =
std::max(workspace_size, search::GetWorkspaceSize(args, bwd_result.algo));
......@@ -240,8 +240,8 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
ScalingParamType<T> alpha = 1.0f;
ScalingParamType<T> beta = 0.0f;
auto workspace_handle = ctx.cudnn_workspace_handle();
for (int g = 0; g < groups; g++) {
#ifdef PADDLE_WITH_HIP
for (int g = 0; g < groups; g++) {
auto cudnn_func = [&](void* cudnn_workspace) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardData(
handle,
......@@ -258,26 +258,24 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
cudnn_workspace,
workspace_size));
};
workspace_handle.RunFunc(cudnn_func, workspace_size);
}
#else // PADDLE_WITH_HIP
auto cudnn_func = [&](void* cudnn_workspace) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardData(
handle,
&alpha,
args.wdesc.desc(),
filter_data + filter_offset * g,
args.odesc.desc(),
x_data + x_offset * g,
args.cdesc.desc(),
bwd_result.algo,
cudnn_workspace,
ConvRunner<T, ConvKind::kBackwardData>::Apply(ctx,
args,
bwd_result,
x_data,
filter_data,
transformed_out_data,
groups,
out_offset,
filter_offset,
x_offset,
workspace_size,
&beta,
args.idesc.desc(),
transformed_out_data + out_offset * g));
};
&workspace_handle,
false);
#endif // PADDLE_WITH_HIP
workspace_handle.RunFunc(cudnn_func, workspace_size);
}
if (!is_sys_pad && strides.size() == 2U) {
funcs::Slice<Context, T, 4>(ctx, &transformed_out, out, starts, ends, axes);
} else if (!is_sys_pad && strides.size() == 3U) {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册