未验证 提交 c331e2ce 编写于 作者: Y Yiqun Liu 提交者: GitHub

Define ConvRunner to wrap the calls of cudnn conv functions. (#47576)

* Define ConvRunner to wrap the calls of cudnn conv functions.

* Use ConvKind in SearchAlgorithm.
上级 fa874a46
...@@ -115,7 +115,7 @@ void ChooseAlgoByWorkspace(const std::vector<PerfT>& perf_results, ...@@ -115,7 +115,7 @@ void ChooseAlgoByWorkspace(const std::vector<PerfT>& perf_results,
} }
} }
template <typename PerfT> template <ConvKind CK>
struct SearchAlgorithmBase {}; struct SearchAlgorithmBase {};
// cuDNN convolution forward algorithm searcher, consisted of three searching // cuDNN convolution forward algorithm searcher, consisted of three searching
...@@ -123,9 +123,10 @@ struct SearchAlgorithmBase {}; ...@@ -123,9 +123,10 @@ struct SearchAlgorithmBase {};
// As well as one workspace size acquisition function with respect to // As well as one workspace size acquisition function with respect to
// the chosen algorithm. // the chosen algorithm.
template <> template <>
struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> { struct SearchAlgorithmBase<ConvKind::kForward> {
using PerfT = cudnnConvolutionFwdAlgoPerf_t; using PerfT = cudnnConvolutionFwdAlgoPerf_t;
using AlgoT = cudnnConvolutionFwdAlgo_t; using AlgoT = cudnnConvolutionFwdAlgo_t;
constexpr static phi::autotune::AlgorithmType kAlgoType = constexpr static phi::autotune::AlgorithmType kAlgoType =
phi::autotune::AlgorithmType::kConvForward; phi::autotune::AlgorithmType::kConvForward;
...@@ -296,9 +297,10 @@ struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> { ...@@ -296,9 +297,10 @@ struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> {
// As well as one workspace size acquisition function with // As well as one workspace size acquisition function with
// respect to the chosen algorithm. // respect to the chosen algorithm.
template <> template <>
struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> { struct SearchAlgorithmBase<ConvKind::kBackwardData> {
using PerfT = cudnnConvolutionBwdDataAlgoPerf_t; using PerfT = cudnnConvolutionBwdDataAlgoPerf_t;
using AlgoT = cudnnConvolutionBwdDataAlgo_t; using AlgoT = cudnnConvolutionBwdDataAlgo_t;
constexpr static phi::autotune::AlgorithmType kAlgoType = constexpr static phi::autotune::AlgorithmType kAlgoType =
phi::autotune::AlgorithmType::kConvBackwardData; phi::autotune::AlgorithmType::kConvBackwardData;
...@@ -478,9 +480,10 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> { ...@@ -478,9 +480,10 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> {
// exhaustive_search mode. As well as one workspace size acquirsition function // exhaustive_search mode. As well as one workspace size acquirsition function
// with respect to the chosen alogrithm. // with respect to the chosen alogrithm.
template <> template <>
struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> { struct SearchAlgorithmBase<ConvKind::kBackwardFilter> {
using PerfT = cudnnConvolutionBwdFilterAlgoPerf_t; using PerfT = cudnnConvolutionBwdFilterAlgoPerf_t;
using AlgoT = cudnnConvolutionBwdFilterAlgo_t; using AlgoT = cudnnConvolutionBwdFilterAlgo_t;
constexpr static phi::autotune::AlgorithmType kAlgoType = constexpr static phi::autotune::AlgorithmType kAlgoType =
phi::autotune::AlgorithmType::kConvBackwardFilter; phi::autotune::AlgorithmType::kConvBackwardFilter;
...@@ -684,9 +687,9 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> { ...@@ -684,9 +687,9 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
} }
}; };
template <typename PerfT> template <ConvKind CK>
struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> { struct SearchAlgorithm : public SearchAlgorithmBase<CK> {
using AlgoT = typename SearchAlgorithmBase<PerfT>::AlgoT; using AlgoT = typename SearchAlgorithmBase<CK>::AlgoT;
template <typename T> template <typename T>
static SearchResult<AlgoT> Find(const phi::GPUContext& ctx, static SearchResult<AlgoT> Find(const phi::GPUContext& ctx,
...@@ -700,7 +703,7 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> { ...@@ -700,7 +703,7 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
SetConvMathType(ctx, dtype, args.cdesc); SetConvMathType(ctx, dtype, args.cdesc);
if (deterministic) { if (deterministic) {
result = SearchAlgorithmBase<PerfT>::FindAlgoDeterministic(args); result = SearchAlgorithmBase<CK>::FindAlgoDeterministic(args);
} else { } else {
// 1. Once turning on exhaustive FLAGS, always get exhaustive_search. // 1. Once turning on exhaustive FLAGS, always get exhaustive_search.
// 2. Once turning on auto-tune, run heuristic (default) before // 2. Once turning on auto-tune, run heuristic (default) before
...@@ -710,7 +713,7 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> { ...@@ -710,7 +713,7 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
// default mode for the rest. // default mode for the rest.
auto key = args.ConvertToConvCacheKey<T>(); auto key = args.ConvertToConvCacheKey<T>();
auto& cache = phi::autotune::AutoTuneCache::Instance().GetConv( auto& cache = phi::autotune::AutoTuneCache::Instance().GetConv(
SearchAlgorithmBase<PerfT>::kAlgoType); SearchAlgorithmBase<CK>::kAlgoType);
bool find_in_cache = cache.Find(key); bool find_in_cache = cache.Find(key);
if (find_in_cache) { if (find_in_cache) {
auto t = cache.Get(key); auto t = cache.Get(key);
...@@ -727,7 +730,7 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> { ...@@ -727,7 +730,7 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
// Once autotune is enabled, the autotuned result can rewrite the // Once autotune is enabled, the autotuned result can rewrite the
// previous result in cache found by heuristic method. // previous result in cache found by heuristic method.
result = result =
SearchAlgorithmBase<PerfT>::template FindAlgoExhaustiveSearch<T>( SearchAlgorithmBase<CK>::template FindAlgoExhaustiveSearch<T>(
args, ctx); args, ctx);
cache.Set(key, cache.Set(key,
phi::autotune::ConvAutoTuneResult( phi::autotune::ConvAutoTuneResult(
...@@ -735,7 +738,7 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> { ...@@ -735,7 +738,7 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
result.workspace_size, result.workspace_size,
true)); true));
} else if (!find_in_cache) { } else if (!find_in_cache) {
result = SearchAlgorithmBase<PerfT>::FindAlgoHeuristic(args, ctx); result = SearchAlgorithmBase<CK>::FindAlgoHeuristic(args, ctx);
cache.Set(key, cache.Set(key,
phi::autotune::ConvAutoTuneResult( phi::autotune::ConvAutoTuneResult(
static_cast<int64_t>(result.algo), static_cast<int64_t>(result.algo),
...@@ -744,7 +747,7 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> { ...@@ -744,7 +747,7 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
} }
} }
} }
VLOG(3) << "[cuDNN " << SearchAlgorithmBase<PerfT>::GetPerfName() VLOG(3) << "[cuDNN " << SearchAlgorithmBase<CK>::GetPerfName()
<< "] exhaustive_search=" << exhaustive_search << "] exhaustive_search=" << exhaustive_search
<< ", use_autotune=" << use_autotune << ", use_autotune=" << use_autotune
<< ", deterministic=" << deterministic << ", deterministic=" << deterministic
...@@ -783,4 +786,138 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> { ...@@ -783,4 +786,138 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
} }
}; };
// Primary template: never instantiated directly. The per-kind behavior lives
// in the three explicit specializations below (kForward, kBackwardData,
// kBackwardFilter), selected by the ConvKind non-type template parameter.
template <typename T, ConvKind CK>
struct ConvRunner {};
template <typename T>
struct ConvRunner<T, ConvKind::kForward> {
  // Runs the cuDNN forward convolution once per group, using the algorithm
  // picked by SearchAlgorithm and a scratch buffer borrowed from
  // workspace_handle for each launch.
  //
  // The group_offset_* arguments are the element strides between consecutive
  // groups in the input, filter, and output buffers. When use_addto is true,
  // the result is accumulated into output_ptr (beta == 1) instead of
  // overwriting it (beta == 0).
  static void Apply(
      const phi::GPUContext& ctx,
      const ConvArgs& args,
      const SearchResult<cudnnConvolutionFwdAlgo_t>& search_result,
      const T* input_ptr,
      const T* filter_ptr,
      T* output_ptr,
      int groups,
      int group_offset_in,
      int group_offset_filter,
      int group_offset_out,
      size_t workspace_size,
      phi::DnnWorkspaceHandle* workspace_handle,
      bool use_addto = false) {
    const ScalingParamType<T> scale_in = 1.0f;
    const ScalingParamType<T> scale_out = use_addto ? 1.0f : 0.0f;
    auto handle = ctx.cudnn_handle();
    for (int g = 0; g < groups; ++g) {
      const T* group_input = input_ptr + g * group_offset_in;
      const T* group_filter = filter_ptr + g * group_offset_filter;
      T* group_output = output_ptr + g * group_offset_out;
      auto conv_func = [&](void* scratch) {
        PADDLE_ENFORCE_GPU_SUCCESS(
            phi::dynload::cudnnConvolutionForward(handle,
                                                  &scale_in,
                                                  args.idesc.desc(),
                                                  group_input,
                                                  args.wdesc.desc(),
                                                  group_filter,
                                                  args.cdesc.desc(),
                                                  search_result.algo,
                                                  scratch,
                                                  workspace_size,
                                                  &scale_out,
                                                  args.odesc.desc(),
                                                  group_output));
      };
      workspace_handle->RunFunc(conv_func, workspace_size);
    }
  }
};
template <typename T>
struct ConvRunner<T, ConvKind::kBackwardData> {
  // Runs the cuDNN backward-data convolution once per group, producing the
  // gradient with respect to the input from the output gradient and the
  // filter, with scratch memory borrowed from workspace_handle per launch.
  //
  // The group_offset_* arguments are the element strides between consecutive
  // groups in the corresponding buffers. When use_addto is true, the result
  // is accumulated into input_grad_ptr (beta == 1) rather than overwriting
  // it (beta == 0).
  static void Apply(
      const phi::GPUContext& ctx,
      const ConvArgs& args,
      const SearchResult<cudnnConvolutionBwdDataAlgo_t>& search_result,
      const T* output_grad_ptr,
      const T* filter_ptr,
      T* input_grad_ptr,
      int groups,
      int group_offset_in,
      int group_offset_filter,
      int group_offset_out,
      size_t workspace_size,
      phi::DnnWorkspaceHandle* workspace_handle,
      bool use_addto = false) {
    const ScalingParamType<T> scale_in = 1.0f;
    const ScalingParamType<T> scale_out = use_addto ? 1.0f : 0.0f;
    auto handle = ctx.cudnn_handle();
    for (int g = 0; g < groups; ++g) {
      const T* group_filter = filter_ptr + g * group_offset_filter;
      const T* group_output_grad = output_grad_ptr + g * group_offset_out;
      T* group_input_grad = input_grad_ptr + g * group_offset_in;
      auto conv_func = [&](void* scratch) {
        PADDLE_ENFORCE_GPU_SUCCESS(
            phi::dynload::cudnnConvolutionBackwardData(handle,
                                                       &scale_in,
                                                       args.wdesc.desc(),
                                                       group_filter,
                                                       args.odesc.desc(),
                                                       group_output_grad,
                                                       args.cdesc.desc(),
                                                       search_result.algo,
                                                       scratch,
                                                       workspace_size,
                                                       &scale_out,
                                                       args.idesc.desc(),
                                                       group_input_grad));
      };
      workspace_handle->RunFunc(conv_func, workspace_size);
    }
  }
};
template <typename T>
struct ConvRunner<T, ConvKind::kBackwardFilter> {
  // Runs the cuDNN backward-filter convolution once per group, producing the
  // gradient with respect to the filter from the input and the output
  // gradient, with scratch memory borrowed from workspace_handle per launch.
  //
  // The group_offset_* arguments are the element strides between consecutive
  // groups in the corresponding buffers. When use_addto is true, the result
  // is accumulated into filter_grad_ptr (beta == 1) rather than overwriting
  // it (beta == 0).
  static void Apply(
      const phi::GPUContext& ctx,
      const ConvArgs& args,
      const SearchResult<cudnnConvolutionBwdFilterAlgo_t>& search_result,
      const T* output_grad_ptr,
      const T* input_ptr,
      T* filter_grad_ptr,
      int groups,
      int group_offset_in,
      int group_offset_filter,
      int group_offset_out,
      size_t workspace_size,
      phi::DnnWorkspaceHandle* workspace_handle,
      bool use_addto = false) {
    const ScalingParamType<T> scale_in = 1.0f;
    const ScalingParamType<T> scale_out = use_addto ? 1.0f : 0.0f;
    auto handle = ctx.cudnn_handle();
    for (int g = 0; g < groups; ++g) {
      const T* group_input = input_ptr + g * group_offset_in;
      const T* group_output_grad = output_grad_ptr + g * group_offset_out;
      T* group_filter_grad = filter_grad_ptr + g * group_offset_filter;
      auto conv_func = [&](void* scratch) {
        PADDLE_ENFORCE_GPU_SUCCESS(
            phi::dynload::cudnnConvolutionBackwardFilter(handle,
                                                         &scale_in,
                                                         args.idesc.desc(),
                                                         group_input,
                                                         args.odesc.desc(),
                                                         group_output_grad,
                                                         args.cdesc.desc(),
                                                         search_result.algo,
                                                         scratch,
                                                         workspace_size,
                                                         &scale_out,
                                                         args.wdesc.desc(),
                                                         group_filter_grad));
      };
      workspace_handle->RunFunc(conv_func, workspace_size);
    }
  }
};
} // namespace phi } // namespace phi
...@@ -34,7 +34,9 @@ template <typename T> ...@@ -34,7 +34,9 @@ template <typename T>
using ScalingParamType = using ScalingParamType =
typename paddle::platform::CudnnDataType<T>::ScalingParamType; typename paddle::platform::CudnnDataType<T>::ScalingParamType;
// As the container of searchAlgorithm::Find() result. enum class ConvKind { kForward = 1, kBackwardData = 2, kBackwardFilter = 3 };
// The container of SearchAlgorithm::Find() result.
template <typename AlgoT> template <typename AlgoT>
struct SearchResult { struct SearchResult {
SearchResult() {} SearchResult() {}
......
...@@ -376,7 +376,7 @@ void ConvCudnnGradKernel(const Context& ctx, ...@@ -376,7 +376,7 @@ void ConvCudnnGradKernel(const Context& ctx,
bwd_result.algo = search1::Find<T>( bwd_result.algo = search1::Find<T>(
args1, exhaustive_search, deterministic, workspace_size, ctx); args1, exhaustive_search, deterministic, workspace_size, ctx);
#else #else
using search1 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>; using search1 = SearchAlgorithm<ConvKind::kBackwardData>;
bwd_result = search1::Find<T>(ctx, args1, exhaustive_search, deterministic); bwd_result = search1::Find<T>(ctx, args1, exhaustive_search, deterministic);
workspace_size = std::max(workspace_size, bwd_result.workspace_size); workspace_size = std::max(workspace_size, bwd_result.workspace_size);
#endif #endif
...@@ -401,7 +401,7 @@ void ConvCudnnGradKernel(const Context& ctx, ...@@ -401,7 +401,7 @@ void ConvCudnnGradKernel(const Context& ctx,
filter_result.algo = search2::Find<T>( filter_result.algo = search2::Find<T>(
args2, exhaustive_search, deterministic, workspace_size, ctx); args2, exhaustive_search, deterministic, workspace_size, ctx);
#else #else
using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>; using search2 = SearchAlgorithm<ConvKind::kBackwardFilter>;
filter_result = filter_result =
search2::Find<T>(ctx, args2, exhaustive_search, deterministic); search2::Find<T>(ctx, args2, exhaustive_search, deterministic);
VLOG(3) << "filter algo: " << filter_result.algo << ", time " VLOG(3) << "filter algo: " << filter_result.algo << ", time "
...@@ -481,30 +481,22 @@ void ConvCudnnGradKernel(const Context& ctx, ...@@ -481,30 +481,22 @@ void ConvCudnnGradKernel(const Context& ctx,
}, },
workspace_size); workspace_size);
} }
#else #else
for (int i = 0; i < groups; i++) { ConvRunner<T, ConvKind::kBackwardData>::Apply(ctx,
workspace_handle.RunFunc( args1,
[&](void* cudnn_workspace_ptr) { bwd_result,
PADDLE_ENFORCE_GPU_SUCCESS( output_grad_data,
paddle::platform::dynload::cudnnConvolutionBackwardData( filter_data,
handle, transformed_input_grad_data,
&alpha, groups,
args1.wdesc.desc(), group_offset_in,
filter_data + i * group_offset_filter, group_offset_filter,
args1.odesc.desc(), group_offset_out,
output_grad_data + i * group_offset_out, workspace_size,
args1.cdesc.desc(), &workspace_handle,
bwd_result.algo, use_addto);
cudnn_workspace_ptr,
workspace_size,
&beta,
args1.idesc.desc(),
transformed_input_grad_data + i * group_offset_in));
},
workspace_size);
}
#endif #endif
if (!is_sys_pad) { if (!is_sys_pad) {
std::vector<int> starts(transformed_input_channel.dims().size(), 0); std::vector<int> starts(transformed_input_channel.dims().size(), 0);
std::vector<int> axes(transformed_input_channel.dims().size(), 0); std::vector<int> axes(transformed_input_channel.dims().size(), 0);
...@@ -536,8 +528,6 @@ void ConvCudnnGradKernel(const Context& ctx, ...@@ -536,8 +528,6 @@ void ConvCudnnGradKernel(const Context& ctx,
} }
} }
// filter_grad do not use inplace addto.
ScalingParamType<T> beta_filter = 0.0f;
// ------------------- cudnn conv backward filter --------------------- // ------------------- cudnn conv backward filter ---------------------
if (filter_grad) { if (filter_grad) {
// Because beta is zero, it is unnecessary to reset filter_grad. // Because beta is zero, it is unnecessary to reset filter_grad.
...@@ -562,27 +552,19 @@ void ConvCudnnGradKernel(const Context& ctx, ...@@ -562,27 +552,19 @@ void ConvCudnnGradKernel(const Context& ctx,
}, },
workspace_size); workspace_size);
#else #else
for (int i = 0; i < groups; i++) { ConvRunner<T, ConvKind::kBackwardFilter>::Apply(ctx,
workspace_handle.RunFunc( args2,
[&](void* cudnn_workspace_ptr) { filter_result,
PADDLE_ENFORCE_GPU_SUCCESS( output_grad_data,
paddle::platform::dynload::cudnnConvolutionBackwardFilter( input_data,
handle, filter_grad_data,
&alpha, groups,
args2.idesc.desc(), group_offset_in,
input_data + i * group_offset_in, group_offset_filter,
args2.odesc.desc(), group_offset_out,
output_grad_data + i * group_offset_out, workspace_size,
args2.cdesc.desc(), &workspace_handle,
filter_result.algo, false);
cudnn_workspace_ptr,
workspace_size,
&beta_filter,
args2.wdesc.desc(),
filter_grad_data + i * group_offset_filter));
},
workspace_size);
}
#endif #endif
if (compute_format == paddle::platform::DataLayout::kNHWC) { if (compute_format == paddle::platform::DataLayout::kNHWC) {
...@@ -952,7 +934,7 @@ void ConvCudnnGradGradKernel( ...@@ -952,7 +934,7 @@ void ConvCudnnGradGradKernel(
fwd_result1.algo = search1::Find<T>( fwd_result1.algo = search1::Find<T>(
args1, exhaustive_search, false, workspace_size, ctx); args1, exhaustive_search, false, workspace_size, ctx);
#else #else
using search1 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>; using search1 = SearchAlgorithm<ConvKind::kForward>;
fwd_result1 = search1::Find<T>(ctx, args1, exhaustive_search, false); fwd_result1 = search1::Find<T>(ctx, args1, exhaustive_search, false);
workspace_size = search1::GetWorkspaceSize(args1, fwd_result1.algo); workspace_size = search1::GetWorkspaceSize(args1, fwd_result1.algo);
#endif #endif
...@@ -977,7 +959,7 @@ void ConvCudnnGradGradKernel( ...@@ -977,7 +959,7 @@ void ConvCudnnGradGradKernel(
fwd_result2.algo = search2::Find<T>( fwd_result2.algo = search2::Find<T>(
args2, exhaustive_search, false, workspace_size, ctx); args2, exhaustive_search, false, workspace_size, ctx);
#else #else
using search2 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>; using search2 = SearchAlgorithm<ConvKind::kForward>;
fwd_result2 = search2::Find<T>(ctx, args2, exhaustive_search, false); fwd_result2 = search2::Find<T>(ctx, args2, exhaustive_search, false);
workspace_size = std::max( workspace_size = std::max(
workspace_size, search2::GetWorkspaceSize(args2, fwd_result2.algo)); workspace_size, search2::GetWorkspaceSize(args2, fwd_result2.algo));
...@@ -1003,7 +985,7 @@ void ConvCudnnGradGradKernel( ...@@ -1003,7 +985,7 @@ void ConvCudnnGradGradKernel(
filter_result.algo = search3::Find<T>( filter_result.algo = search3::Find<T>(
args3, exhaustive_search, deterministic, workspace_size, ctx); args3, exhaustive_search, deterministic, workspace_size, ctx);
#else #else
using search3 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>; using search3 = SearchAlgorithm<ConvKind::kBackwardFilter>;
filter_result = filter_result =
search3::Find<T>(ctx, args3, exhaustive_search, deterministic); search3::Find<T>(ctx, args3, exhaustive_search, deterministic);
workspace_size = std::max( workspace_size = std::max(
...@@ -1030,7 +1012,7 @@ void ConvCudnnGradGradKernel( ...@@ -1030,7 +1012,7 @@ void ConvCudnnGradGradKernel(
data_result.algo = search4::Find<T>( data_result.algo = search4::Find<T>(
args4, exhaustive_search, deterministic, workspace_size, ctx); args4, exhaustive_search, deterministic, workspace_size, ctx);
#else #else
using search4 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>; using search4 = SearchAlgorithm<ConvKind::kBackwardData>;
data_result = data_result =
search4::Find<T>(ctx, args4, exhaustive_search, deterministic); search4::Find<T>(ctx, args4, exhaustive_search, deterministic);
workspace_size = std::max( workspace_size = std::max(
...@@ -1088,27 +1070,19 @@ void ConvCudnnGradGradKernel( ...@@ -1088,27 +1070,19 @@ void ConvCudnnGradGradKernel(
}, },
workspace_size); workspace_size);
#else #else
for (int i = 0; i < groups; i++) { ConvRunner<T, ConvKind::kForward>::Apply(ctx,
workspace_handle.RunFunc( args1,
[&](void* workspace_ptr) { fwd_result1,
PADDLE_ENFORCE_GPU_SUCCESS( ddx,
paddle::platform::dynload::cudnnConvolutionForward( w,
handle, transformed_ddy_channel,
&alpha, groups,
args1.idesc.desc(), group_offset_in,
ddx + i * group_offset_in, group_offset_filter,
args1.wdesc.desc(), group_offset_out,
w + i * group_offset_filter, workspace_size,
args1.cdesc.desc(), &workspace_handle,
fwd_result1.algo, false);
workspace_ptr,
workspace_size,
&beta,
args1.odesc.desc(),
transformed_ddy_channel + i * group_offset_out));
},
workspace_size);
}
#endif #endif
} }
if (ddW) { if (ddW) {
...@@ -1134,27 +1108,19 @@ void ConvCudnnGradGradKernel( ...@@ -1134,27 +1108,19 @@ void ConvCudnnGradGradKernel(
}, },
workspace_size); workspace_size);
#else #else
for (int i = 0; i < groups; i++) { ConvRunner<T, ConvKind::kForward>::Apply(ctx,
workspace_handle.RunFunc( args2,
[&](void* workspace_ptr) { fwd_result2,
PADDLE_ENFORCE_GPU_SUCCESS( x,
paddle::platform::dynload::cudnnConvolutionForward( ddw,
handle, transformed_ddy_channel,
&alpha, groups,
args2.idesc.desc(), group_offset_in,
x + i * group_offset_in, group_offset_filter,
args2.wdesc.desc(), group_offset_out,
ddw + i * group_offset_filter, workspace_size,
args2.cdesc.desc(), &workspace_handle,
fwd_result2.algo, true);
workspace_ptr,
workspace_size,
&alpha,
args2.odesc.desc(),
transformed_ddy_channel + i * group_offset_out));
},
workspace_size);
}
#endif #endif
} }
if (channel_last) { if (channel_last) {
...@@ -1185,27 +1151,19 @@ void ConvCudnnGradGradKernel( ...@@ -1185,27 +1151,19 @@ void ConvCudnnGradGradKernel(
}, },
workspace_size); workspace_size);
#else #else
for (int i = 0; i < groups; i++) { ConvRunner<T, ConvKind::kBackwardFilter>::Apply(ctx,
workspace_handle.RunFunc( args3,
[&](void* workspace_ptr) { filter_result,
PADDLE_ENFORCE_GPU_SUCCESS( transformed_dy_channel,
paddle::platform::dynload::cudnnConvolutionBackwardFilter( ddx,
handle, dw,
&alpha, groups,
args3.idesc.desc(), group_offset_in,
ddx + i * group_offset_in, group_offset_filter,
args3.odesc.desc(), group_offset_out,
transformed_dy_channel + i * group_offset_out, workspace_size,
args3.cdesc.desc(), &workspace_handle,
filter_result.algo, false);
workspace_ptr,
workspace_size,
&beta,
args3.wdesc.desc(),
dw + i * group_offset_filter));
},
workspace_size);
}
#endif #endif
} }
...@@ -1232,27 +1190,19 @@ void ConvCudnnGradGradKernel( ...@@ -1232,27 +1190,19 @@ void ConvCudnnGradGradKernel(
}, },
workspace_size); workspace_size);
#else #else
for (int i = 0; i < groups; i++) { ConvRunner<T, ConvKind::kBackwardData>::Apply(ctx,
workspace_handle.RunFunc( args4,
[&](void* workspace_ptr) { data_result,
PADDLE_ENFORCE_GPU_SUCCESS( transformed_dy_channel,
paddle::platform::dynload::cudnnConvolutionBackwardData( ddw,
handle, transformed_dx,
&alpha, groups,
args4.wdesc.desc(), group_offset_in,
ddw + i * group_offset_filter, group_offset_filter,
args4.odesc.desc(), group_offset_out,
transformed_dy_channel + i * group_offset_out, workspace_size,
args4.cdesc.desc(), &workspace_handle,
data_result.algo, false);
workspace_ptr,
workspace_size,
&beta,
args4.idesc.desc(),
transformed_dx + i * group_offset_in));
},
workspace_size);
}
#endif #endif
if (!is_sys_pad) { if (!is_sys_pad) {
......
...@@ -315,7 +315,7 @@ void ConvCudnnKernel(const Context& ctx, ...@@ -315,7 +315,7 @@ void ConvCudnnKernel(const Context& ctx,
args, exhaustive_search, deterministic, workspace_size, ctx); args, exhaustive_search, deterministic, workspace_size, ctx);
#else #else
SearchResult<cudnnConvolutionFwdAlgo_t> fwd_result; SearchResult<cudnnConvolutionFwdAlgo_t> fwd_result;
using search = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>; using search = SearchAlgorithm<ConvKind::kForward>;
fwd_result = search::Find<T>(ctx, args, exhaustive_search, deterministic); fwd_result = search::Find<T>(ctx, args, exhaustive_search, deterministic);
workspace_size = fwd_result.workspace_size; workspace_size = fwd_result.workspace_size;
#endif #endif
...@@ -359,27 +359,19 @@ void ConvCudnnKernel(const Context& ctx, ...@@ -359,27 +359,19 @@ void ConvCudnnKernel(const Context& ctx,
}, },
workspace_size); workspace_size);
#else #else
for (int i = 0; i < groups; i++) { ConvRunner<T, ConvKind::kForward>::Apply(ctx,
workspace_handle.RunFunc( args,
[&](void* workspace_ptr) { fwd_result,
PADDLE_ENFORCE_GPU_SUCCESS( input_data,
paddle::platform::dynload::cudnnConvolutionForward( filter_data,
handle, output_data,
&alpha, groups,
args.idesc.desc(), group_offset_in,
input_data + i * group_offset_in, group_offset_filter,
args.wdesc.desc(), group_offset_out,
filter_data + i * group_offset_filter, workspace_size,
args.cdesc.desc(), &workspace_handle,
fwd_result.algo, false);
workspace_ptr,
workspace_size,
&beta,
args.odesc.desc(),
output_data + i * group_offset_out));
},
workspace_size);
}
#endif #endif
if (channel_last && compute_format == paddle::platform::DataLayout::kNCHW) { if (channel_last && compute_format == paddle::platform::DataLayout::kNCHW) {
......
...@@ -227,7 +227,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, ...@@ -227,7 +227,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
fwd_result.algo = fwd_result.algo =
search1::Find<T>(args1, false, deterministic, workspace_size, ctx); search1::Find<T>(args1, false, deterministic, workspace_size, ctx);
#else #else
using search1 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>; using search1 = SearchAlgorithm<ConvKind::kForward>;
fwd_result = search1::Find<T>(ctx, args1, false, deterministic, false); fwd_result = search1::Find<T>(ctx, args1, false, deterministic, false);
workspace_size = std::max( workspace_size = std::max(
workspace_size, search1::GetWorkspaceSize(args1, fwd_result.algo)); workspace_size, search1::GetWorkspaceSize(args1, fwd_result.algo));
...@@ -252,7 +252,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, ...@@ -252,7 +252,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
filter_result.algo = filter_result.algo =
search2::Find<T>(args2, false, deterministic, workspace_size, ctx); search2::Find<T>(args2, false, deterministic, workspace_size, ctx);
#else #else
using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>; using search2 = SearchAlgorithm<ConvKind::kBackwardFilter>;
filter_result = search2::Find<T>(ctx, args2, false, deterministic, false); filter_result = search2::Find<T>(ctx, args2, false, deterministic, false);
workspace_size = std::max( workspace_size = std::max(
workspace_size, search2::GetWorkspaceSize(args2, filter_result.algo)); workspace_size, search2::GetWorkspaceSize(args2, filter_result.algo));
...@@ -269,9 +269,9 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, ...@@ -269,9 +269,9 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
ScalingParamType<T> beta = 0.0f; ScalingParamType<T> beta = 0.0f;
auto workspace_handle = ctx.cudnn_workspace_handle(); auto workspace_handle = ctx.cudnn_workspace_handle();
if (dx) { if (dx) {
#ifdef PADDLE_WITH_HIP
// Because beta is zero, it is unnecessary to reset dx. // Because beta is zero, it is unnecessary to reset dx.
for (int g = 0; g < groups; g++) { for (int g = 0; g < groups; g++) {
#ifdef PADDLE_WITH_HIP
auto cudnn_func = [&](void* cudnn_workspace) { auto cudnn_func = [&](void* cudnn_workspace) {
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
dynload::miopenConvolutionForward(handle, dynload::miopenConvolutionForward(handle,
...@@ -288,26 +288,23 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, ...@@ -288,26 +288,23 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
cudnn_workspace, cudnn_workspace,
workspace_size)); workspace_size));
}; };
workspace_handle.RunFunc(cudnn_func, workspace_size);
}
#else // PADDLE_WITH_HIP #else // PADDLE_WITH_HIP
auto cudnn_func = [&](void* cudnn_workspace) { ConvRunner<T, ConvKind::kForward>::Apply(ctx,
PADDLE_ENFORCE_GPU_SUCCESS( args1,
dynload::cudnnConvolutionForward(handle, fwd_result,
&alpha, dout_data,
args1.idesc.desc(), filter_data,
dout_data + dout_offset * g, dx_data,
args1.wdesc.desc(), groups,
filter_data + filter_offset * g, dout_offset,
args1.cdesc.desc(), filter_offset,
fwd_result.algo, x_offset,
cudnn_workspace,
workspace_size, workspace_size,
&beta, &workspace_handle,
args1.odesc.desc(), false);
dx_data + x_offset * g));
};
#endif // PADDLE_WITH_HIP #endif // PADDLE_WITH_HIP
workspace_handle.RunFunc(cudnn_func, workspace_size);
}
if (data_layout == GPUDNNDataLayout::kNHWC) { if (data_layout == GPUDNNDataLayout::kNHWC) {
DenseTensor dx_transpose; DenseTensor dx_transpose;
...@@ -330,8 +327,8 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, ...@@ -330,8 +327,8 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
if (dfilter) { if (dfilter) {
// Because beta is zero, it is unnecessary to reset dfilter. // Because beta is zero, it is unnecessary to reset dfilter.
// Gradient with respect to the filter // Gradient with respect to the filter
for (int g = 0; g < groups; g++) {
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
for (int g = 0; g < groups; g++) {
auto cudnn_func = [&](void* cudnn_workspace) { auto cudnn_func = [&](void* cudnn_workspace) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardWeights( PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardWeights(
handle, handle,
...@@ -348,26 +345,23 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, ...@@ -348,26 +345,23 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
cudnn_workspace, cudnn_workspace,
workspace_size)); workspace_size));
}; };
#else // PADDLE_WITH_HIP
auto cudnn_func = [&](void* cudnn_workspace) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardFilter(
handle,
&alpha,
args2.idesc.desc(),
dout_data + dout_offset * g,
args2.odesc.desc(),
x_data + x_offset * g,
args2.cdesc.desc(),
filter_result.algo,
cudnn_workspace,
workspace_size,
&beta,
args2.wdesc.desc(),
dfilter_data + filter_offset * g));
};
#endif // PADDLE_WITH_HIP
workspace_handle.RunFunc(cudnn_func, workspace_size); workspace_handle.RunFunc(cudnn_func, workspace_size);
} }
#else // PADDLE_WITH_HIP
ConvRunner<T, ConvKind::kBackwardFilter>::Apply(ctx,
args2,
filter_result,
x_data,
dout_data,
dfilter_data,
groups,
dout_offset,
filter_offset,
x_offset,
workspace_size,
&workspace_handle,
false);
#endif // PADDLE_WITH_HIP
} }
} }
...@@ -704,7 +698,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ...@@ -704,7 +698,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
bwd_result1.algo = bwd_result1.algo =
search1::Find<T>(args1, false, deterministic, workspace_size, ctx); search1::Find<T>(args1, false, deterministic, workspace_size, ctx);
#else #else
using search1 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>; using search1 = SearchAlgorithm<ConvKind::kBackwardData>;
bwd_result1 = search1::Find<T>(ctx, args1, false, deterministic, false); bwd_result1 = search1::Find<T>(ctx, args1, false, deterministic, false);
workspace_size = search1::GetWorkspaceSize(args1, bwd_result1.algo); workspace_size = search1::GetWorkspaceSize(args1, bwd_result1.algo);
#endif #endif
...@@ -726,7 +720,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ...@@ -726,7 +720,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
bwd_result2.algo = bwd_result2.algo =
search2::Find<T>(args2, false, deterministic, workspace_size, ctx); search2::Find<T>(args2, false, deterministic, workspace_size, ctx);
#else #else
using search2 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>; using search2 = SearchAlgorithm<ConvKind::kBackwardData>;
bwd_result2 = search2::Find<T>(ctx, args2, false, deterministic, false); bwd_result2 = search2::Find<T>(ctx, args2, false, deterministic, false);
workspace_size = std::max( workspace_size = std::max(
workspace_size, search2::GetWorkspaceSize(args2, bwd_result2.algo)); workspace_size, search2::GetWorkspaceSize(args2, bwd_result2.algo));
...@@ -751,7 +745,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ...@@ -751,7 +745,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
filter_result.algo = filter_result.algo =
search3::Find<T>(args3, false, deterministic, workspace_size, ctx); search3::Find<T>(args3, false, deterministic, workspace_size, ctx);
#else #else
using search3 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>; using search3 = SearchAlgorithm<ConvKind::kBackwardFilter>;
filter_result = search3::Find<T>(ctx, args3, false, deterministic, false); filter_result = search3::Find<T>(ctx, args3, false, deterministic, false);
workspace_size = std::max( workspace_size = std::max(
workspace_size, search3::GetWorkspaceSize(args3, filter_result.algo)); workspace_size, search3::GetWorkspaceSize(args3, filter_result.algo));
...@@ -777,7 +771,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ...@@ -777,7 +771,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
fwd_result.algo = fwd_result.algo =
search4::Find<T>(args4, false, deterministic, workspace_size, ctx); search4::Find<T>(args4, false, deterministic, workspace_size, ctx);
#else #else
using search4 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>; using search4 = SearchAlgorithm<ConvKind::kForward>;
fwd_result = search4::Find<T>(ctx, args4, false, deterministic, false); fwd_result = search4::Find<T>(ctx, args4, false, deterministic, false);
workspace_size = std::max( workspace_size = std::max(
workspace_size, search4::GetWorkspaceSize(args4, fwd_result.algo)); workspace_size, search4::GetWorkspaceSize(args4, fwd_result.algo));
...@@ -815,8 +809,8 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ...@@ -815,8 +809,8 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
if (ddout) { if (ddout) {
ddx_ = transformed_ddx.data<T>(); ddx_ = transformed_ddx.data<T>();
for (int i = 0; i < groups; i++) {
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
for (int i = 0; i < groups; i++) {
workspace_handle.RunFunc( workspace_handle.RunFunc(
[&](void* workspace_ptr) { [&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardData( PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardData(
...@@ -835,30 +829,25 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ...@@ -835,30 +829,25 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
workspace_size)); workspace_size));
}, },
workspace_size); workspace_size);
}
#else // PADDLE_WITH_HIP #else // PADDLE_WITH_HIP
workspace_handle.RunFunc( ConvRunner<T, ConvKind::kBackwardData>::Apply(ctx,
[&](void* workspace_ptr) { args1,
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardData( bwd_result1,
handle, ddx_,
&alpha, filter_,
args1.wdesc.desc(), transformed_ddout_channel_,
filter_ + i * group_offset_filter, groups,
args1.odesc.desc(), group_offset_out,
ddx_ + i * group_offset_in, group_offset_filter,
args1.cdesc.desc(), group_offset_in,
bwd_result1.algo, workspace_size,
workspace_ptr, &workspace_handle,
workspace_size, false);
&beta,
args1.idesc.desc(),
transformed_ddout_channel_ + i * group_offset_out));
},
workspace_size);
#endif // PADDLE_WITH_HIP #endif // PADDLE_WITH_HIP
}
for (int i = 0; i < groups; i++) {
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
for (int i = 0; i < groups; i++) {
// MIOPEN ONLY support beta to be 0.0f // MIOPEN ONLY support beta to be 0.0f
DenseTensor conv_x_ddfilter(dout.type()); DenseTensor conv_x_ddfilter(dout.type());
conv_x_ddfilter.Resize(transformed_ddout_channel.dims()); conv_x_ddfilter.Resize(transformed_ddout_channel.dims());
...@@ -893,27 +882,22 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ...@@ -893,27 +882,22 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
&beta, &beta,
args2.idesc.desc(), args2.idesc.desc(),
transformed_ddout_channel_ + i * group_offset_out)); transformed_ddout_channel_ + i * group_offset_out));
}
#else // PADDLE_WITH_HIP #else // PADDLE_WITH_HIP
workspace_handle.RunFunc( ConvRunner<T, ConvKind::kBackwardData>::Apply(ctx,
[&](void* workspace_ptr) { args2,
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardData( bwd_result2,
handle, x_,
&alpha, ddfilter_,
args2.wdesc.desc(), transformed_ddout_channel_,
ddfilter_ + i * group_offset_filter, groups,
args2.odesc.desc(), group_offset_out,
x_ + i * group_offset_in, group_offset_filter,
args2.cdesc.desc(), group_offset_in,
bwd_result2.algo, workspace_size,
workspace_ptr, &workspace_handle,
workspace_size, true);
&alpha,
args2.idesc.desc(),
transformed_ddout_channel_ + i * group_offset_out));
},
workspace_size);
#endif // PADDLE_WITH_HIP #endif // PADDLE_WITH_HIP
}
if ((!is_sys_pad) && (!channel_last)) { if ((!is_sys_pad) && (!channel_last)) {
if (strides.size() == 2U) { if (strides.size() == 2U) {
...@@ -947,8 +931,8 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ...@@ -947,8 +931,8 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
T* transformed_dout_channel_ = transformed_dout.data<T>(); T* transformed_dout_channel_ = transformed_dout.data<T>();
if (dfilter) { if (dfilter) {
ddx_ = transformed_ddx_channel.data<T>(); ddx_ = transformed_ddx_channel.data<T>();
for (int i = 0; i < groups; i++) {
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
for (int i = 0; i < groups; i++) {
workspace_handle.RunFunc( workspace_handle.RunFunc(
[&](void* workspace_ptr) { [&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
...@@ -968,33 +952,28 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ...@@ -968,33 +952,28 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
workspace_size)); workspace_size));
}, },
workspace_size); workspace_size);
}
#else // PADDLE_WITH_HIP #else // PADDLE_WITH_HIP
workspace_handle.RunFunc( ConvRunner<T, ConvKind::kBackwardFilter>::Apply(ctx,
[&](void* workspace_ptr) { args3,
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardFilter( filter_result,
handle, ddx_,
&alpha, transformed_dout_channel_,
args3.idesc.desc(), dfilter_,
transformed_dout_channel_ + i * group_offset_out, groups,
args3.odesc.desc(), group_offset_out,
ddx_ + i * group_offset_in, group_offset_filter,
args3.cdesc.desc(), group_offset_in,
filter_result.algo, workspace_size,
workspace_ptr, &workspace_handle,
workspace_size, false);
&beta,
args3.wdesc.desc(),
dfilter_ + i * group_offset_filter));
},
workspace_size);
#endif // PADDLE_WITH_HIP #endif // PADDLE_WITH_HIP
}
} }
if (dx) { if (dx) {
ddfilter_ = ddfilter.data<T>(); ddfilter_ = ddfilter.data<T>();
for (int i = 0; i < groups; i++) {
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
for (int i = 0; i < groups; i++) {
workspace_handle.RunFunc( workspace_handle.RunFunc(
[&](void* workspace_ptr) { [&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionForward( PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionForward(
...@@ -1013,27 +992,23 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ...@@ -1013,27 +992,23 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
workspace_size)); workspace_size));
}, },
workspace_size); workspace_size);
}
#else // PADDLE_WITH_HIP #else // PADDLE_WITH_HIP
workspace_handle.RunFunc( ConvRunner<T, ConvKind::kForward>::Apply(ctx,
[&](void* workspace_ptr) { args4,
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionForward( fwd_result,
handle, transformed_dout_channel_,
&alpha, ddfilter_,
args4.idesc.desc(), transformed_dx_,
transformed_dout_channel_ + i * group_offset_out, groups,
args4.wdesc.desc(), group_offset_out,
ddfilter_ + i * group_offset_filter, group_offset_filter,
args4.cdesc.desc(), group_offset_in,
fwd_result.algo, workspace_size,
workspace_ptr, &workspace_handle,
workspace_size, false);
&beta,
args4.odesc.desc(),
transformed_dx_ + i * group_offset_in));
},
workspace_size);
#endif // PADDLE_WITH_HIP #endif // PADDLE_WITH_HIP
}
if (channel_last) { if (channel_last) {
TransToChannelLast<Context, T>(ctx, &transformed_dx_channel, dx); TransToChannelLast<Context, T>(ctx, &transformed_dx_channel, dx);
} }
......
...@@ -227,7 +227,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, ...@@ -227,7 +227,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
search::Find<T>(args, false, deterministic, workspace_size, ctx); search::Find<T>(args, false, deterministic, workspace_size, ctx);
#else #else
SearchResult<cudnnConvolutionBwdDataAlgo_t> bwd_result; SearchResult<cudnnConvolutionBwdDataAlgo_t> bwd_result;
using search = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>; using search = SearchAlgorithm<ConvKind::kBackwardData>;
bwd_result = search::Find<T>(ctx, args, false, deterministic, false); bwd_result = search::Find<T>(ctx, args, false, deterministic, false);
workspace_size = workspace_size =
std::max(workspace_size, search::GetWorkspaceSize(args, bwd_result.algo)); std::max(workspace_size, search::GetWorkspaceSize(args, bwd_result.algo));
...@@ -240,8 +240,8 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, ...@@ -240,8 +240,8 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
ScalingParamType<T> alpha = 1.0f; ScalingParamType<T> alpha = 1.0f;
ScalingParamType<T> beta = 0.0f; ScalingParamType<T> beta = 0.0f;
auto workspace_handle = ctx.cudnn_workspace_handle(); auto workspace_handle = ctx.cudnn_workspace_handle();
for (int g = 0; g < groups; g++) {
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
for (int g = 0; g < groups; g++) {
auto cudnn_func = [&](void* cudnn_workspace) { auto cudnn_func = [&](void* cudnn_workspace) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardData( PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardData(
handle, handle,
...@@ -258,26 +258,24 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, ...@@ -258,26 +258,24 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
cudnn_workspace, cudnn_workspace,
workspace_size)); workspace_size));
}; };
#else // PADDLE_WITH_HIP
auto cudnn_func = [&](void* cudnn_workspace) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardData(
handle,
&alpha,
args.wdesc.desc(),
filter_data + filter_offset * g,
args.odesc.desc(),
x_data + x_offset * g,
args.cdesc.desc(),
bwd_result.algo,
cudnn_workspace,
workspace_size,
&beta,
args.idesc.desc(),
transformed_out_data + out_offset * g));
};
#endif // PADDLE_WITH_HIP
workspace_handle.RunFunc(cudnn_func, workspace_size); workspace_handle.RunFunc(cudnn_func, workspace_size);
} }
#else // PADDLE_WITH_HIP
ConvRunner<T, ConvKind::kBackwardData>::Apply(ctx,
args,
bwd_result,
x_data,
filter_data,
transformed_out_data,
groups,
out_offset,
filter_offset,
x_offset,
workspace_size,
&workspace_handle,
false);
#endif // PADDLE_WITH_HIP
if (!is_sys_pad && strides.size() == 2U) { if (!is_sys_pad && strides.size() == 2U) {
funcs::Slice<Context, T, 4>(ctx, &transformed_out, out, starts, ends, axes); funcs::Slice<Context, T, 4>(ctx, &transformed_out, out, starts, ends, axes);
} else if (!is_sys_pad && strides.size() == 3U) { } else if (!is_sys_pad && strides.size() == 3U) {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册