Unverified commit bc47e7ac authored by Yiqun Liu, committed by GitHub

Enhance the implementation of some conv functions. (#47281)

Parent 2f3ad5ab
......@@ -75,9 +75,9 @@ void ChooseAlgoByWorkspace(const std::vector<PerfT>& perf_results,
SearchResult<AlgoT>* search_result) {
int best_algo_idx = -1;
for (size_t i = 0; i < perf_results.size(); ++i) {
auto result = perf_results[i];
const auto& result = perf_results[i];
if (result.status == CUDNN_STATUS_SUCCESS &&
result.memory < workspace_limit) {
result.memory <= workspace_limit) {
if (best_algo_idx == -1) {
// The algorithm with minimal time cost whose workspace_size also fits
// the workspace_limit constraint.
......@@ -87,8 +87,10 @@ void ChooseAlgoByWorkspace(const std::vector<PerfT>& perf_results,
break;
}
} else {
float best_algo_time = perf_results[best_algo_idx].time;
if ((result.time - best_algo_time) / best_algo_time < 0.01) {
// When a later algorithm is within 1% of the best one's time, prefer
// whichever needs less memory.
if (result.time < 1.01 * perf_results[best_algo_idx].time) {
best_algo_idx = (result.memory < perf_results[best_algo_idx].memory)
? i
: best_algo_idx;
......@@ -98,9 +100,15 @@ void ChooseAlgoByWorkspace(const std::vector<PerfT>& perf_results,
}
}
if (best_algo_idx != -1) {
search_result->algo = perf_results[best_algo_idx].algo;
search_result->time = perf_results[best_algo_idx].time;
search_result->workspace_size = perf_results[best_algo_idx].memory;
const auto& result = perf_results[best_algo_idx];
search_result->algo = result.algo;
search_result->time = result.time;
search_result->workspace_size = result.memory;
auto math_type_str = (result.mathType == CUDNN_TENSOR_OP_MATH) ? "T" : "F";
VLOG(3) << "Choose algo=" << result.algo
<< ", tensor_core=" << math_type_str << ", time=" << result.time
<< " ms, memory=" << ToMegaBytes(result.memory)
<< " MB, status=" << result.status;
} else {
VLOG(3) << "Can not find an algorithm that requires memory < "
<< ToMegaBytes(workspace_limit) << " MB";
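The new selection rule can be read on its own: take the fastest successful result that fits the workspace limit (cuDNN returns the results sorted by time), then let any later result that is within 1% of that time win if it needs less workspace memory. Below is a minimal standalone sketch of the policy; PerfResult is an illustrative stand-in for the cudnnConvolution*AlgoPerf_t structs, not Paddle's actual types.

#include <cstddef>
#include <vector>

// Illustrative stand-in for cudnnConvolution*AlgoPerf_t.
struct PerfResult {
  int algo = 0;
  float time = 0.0f;   // execution time in ms
  size_t memory = 0;   // required workspace in bytes
  bool ok = false;     // status == CUDNN_STATUS_SUCCESS
};

// Returns the index of the chosen result, or -1 if nothing fits.
// Assumes `results` is sorted by ascending time, as cuDNN guarantees.
int ChooseByWorkspace(const std::vector<PerfResult>& results,
                      size_t workspace_limit) {
  int best = -1;
  for (size_t i = 0; i < results.size(); ++i) {
    const PerfResult& r = results[i];
    if (!r.ok || r.memory > workspace_limit) continue;
    if (best == -1) {
      best = static_cast<int>(i);  // fastest feasible algorithm
    } else if (r.time >= 1.01f * results[best].time) {
      break;  // sorted by time, so every later result is outside the 1% band
    } else if (r.memory < results[best].memory) {
      best = static_cast<int>(i);  // nearly as fast, but needs less memory
    }
  }
  return best;
}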
......@@ -626,7 +634,8 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
perf_results,
perf_results.size(),
workspace_size_limit);
ChooseAlgo(perf_results, workspace_size_limit, &result);
ChooseAlgoByWorkspace<PerfT, AlgoT>(
perf_results, workspace_size_limit, &result);
}
result.workspace_size = GetWorkspaceSize(args, result.algo);
......@@ -673,42 +682,6 @@ struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
return workspace_size_limit;
}
}
static void ChooseAlgo(const std::vector<PerfT>& perf_results,
size_t workspace_limit,
SearchResult<AlgoT>* algo_result) {
for (size_t i = 0; i != perf_results.size(); ++i) {
const auto& result = perf_results[i];
if (result.status == CUDNN_STATUS_SUCCESS &&
(result.memory <= workspace_limit)) {
if ((result.mathType == CUDNN_TENSOR_OP_MATH) &&
(i != perf_results.size() - 1)) {
const auto& next_result = perf_results[i + 1];
if (next_result.status == CUDNN_STATUS_SUCCESS &&
next_result.algo == result.algo &&
next_result.memory == result.memory &&
next_result.mathType != CUDNN_TENSOR_OP_MATH &&
next_result.time < 1.01 * result.time) {
// Skip this result: it is not really a Tensor Core algorithm, since
// the equivalent non-Tensor-Core variant is within 1% of its time.
// Prefer that non-Tensor-Core algorithm instead.
continue;
}
}
algo_result->algo = result.algo;
algo_result->time = result.time;
auto math_type_str = "0";
if (result.mathType == CUDNN_TENSOR_OP_MATH) {
math_type_str = "1";
}
VLOG(3) << " choose algo: " << result.algo
<< ", TC: " << math_type_str << ", time: " << result.time
<< " ms, wksp = " << result.memory
<< ", status = " << result.status;
break;
}
}
}
};
template <typename PerfT>
......@@ -735,7 +708,7 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
// Auto-tune is only enabled within the specified range.
// 3. After the auto-tune process, run the cached algorithm when one is
// cached; run the default mode for the rest.
auto key = args.Convert2ConvCacheKey<T>();
auto key = args.ConvertToConvCacheKey<T>();
auto& cache = phi::autotune::AutoTuneCache::Instance().GetConv(
SearchAlgorithmBase<PerfT>::kAlgoType);
bool find_in_cache = cache.Find(key);
......@@ -746,7 +719,6 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
result.exhaustive_search = t.exhaustive_search;
}
if (!result.exhaustive_search) {
bool need_update_cache = false;
// In conv2d_transpose, enable_autotune is set to false because some
// algorithms picked by the exhaustive search method produce wrong
// results.
use_autotune = enable_autotune &&
......@@ -757,17 +729,18 @@ struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
result =
SearchAlgorithmBase<PerfT>::template FindAlgoExhaustiveSearch<T>(
args, ctx);
need_update_cache = true;
cache.Set(key,
phi::autotune::ConvAutoTuneResult(
static_cast<int64_t>(result.algo),
result.workspace_size,
true));
} else if (!find_in_cache) {
result = SearchAlgorithmBase<PerfT>::FindAlgoHeuristic(args, ctx);
need_update_cache = true;
}
if (need_update_cache) {
phi::autotune::ConvAutoTuneResult node(
static_cast<int64_t>(result.algo),
result.workspace_size,
exhaustive_search || use_autotune);
cache.Set(key, node);
cache.Set(key,
phi::autotune::ConvAutoTuneResult(
static_cast<int64_t>(result.algo),
result.workspace_size,
false));
}
}
}
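Dropping the need_update_cache flag makes the cache protocol easier to see: a hit on a tuned entry is returned directly; otherwise either the exhaustive search runs and its result is cached with exhaustive_search=true, or the heuristic runs on a plain miss and is cached with exhaustive_search=false so a later auto-tune pass can still replace it. A simplified sketch of that control flow, using stand-in types rather than the phi::autotune API:

#include <cstddef>
#include <cstdint>
#include <unordered_map>

// Illustrative stand-ins for phi::autotune::ConvAutoTuneResult and the cache.
struct Result {
  int64_t algo = 0;
  size_t workspace_size = 0;
  bool exhaustive_search = false;
};
using Cache = std::unordered_map<uint64_t, Result>;

template <typename ExhaustiveFn, typename HeuristicFn>
Result FindAlgo(Cache* cache, uint64_t key, bool exhaustive_search,
                bool use_autotune, ExhaustiveFn exhaustive,
                HeuristicFn heuristic) {
  auto it = cache->find(key);
  if (it != cache->end() && it->second.exhaustive_search) {
    return it->second;                 // tuned result already cached
  }
  Result result;
  if (exhaustive_search || use_autotune) {
    result = exhaustive();             // benchmark all algorithms (expensive)
    result.exhaustive_search = true;   // cache as a tuned result
    (*cache)[key] = result;
  } else if (it == cache->end()) {
    result = heuristic();              // cudnnGet*-style heuristic (cheap)
    result.exhaustive_search = false;  // may be replaced by a tuned result later
    (*cache)[key] = result;
  } else {
    result = it->second;               // reuse the cached heuristic result
  }
  return result;
}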
......
......@@ -69,10 +69,15 @@ static std::ostream& operator<<(std::ostream& out, const std::vector<T>& v) {
template <typename HandleT, typename DataT>
struct ConvArgsBase {
HandleT handle;
paddle::platform::TensorDescriptor idesc, odesc;
paddle::platform::TensorDescriptor idesc;
paddle::platform::TensorDescriptor odesc;
paddle::platform::FilterDescriptor wdesc;
paddle::platform::ConvolutionDescriptor cdesc;
const phi::DenseTensor *x, *w, *o;
const phi::DenseTensor* x = nullptr;
const phi::DenseTensor* w = nullptr;
const phi::DenseTensor* o = nullptr;
DataT cudnn_dtype;
// strides
......@@ -88,7 +93,8 @@ struct ConvArgsBase {
// data format
GPUDNNDataLayout data_layout;
ConvArgsBase(const phi::DenseTensor* x,
ConvArgsBase(const HandleT& h,
const phi::DenseTensor* x,
const phi::DenseTensor* w,
const phi::DenseTensor* o,
const std::vector<int> s,
......@@ -97,7 +103,8 @@ struct ConvArgsBase {
DataT dtype,
int g,
GPUDNNDataLayout layout)
: x(x),
: handle(h),
x(x),
w(w),
o(o),
s(s),
......@@ -108,7 +115,7 @@ struct ConvArgsBase {
data_layout(layout) {}
template <typename T>
phi::autotune::ConvCacheKey Convert2ConvCacheKey() const {
phi::autotune::ConvCacheKey ConvertToConvCacheKey() const {
auto x_shape = phi::vectorize(x->dims());
auto w_shape = phi::vectorize(w->dims());
VLOG(10) << "[ConvArgs] x_dims=" << x_shape << ", w_dims=" << w_shape
......
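Moving the handle into the ConvArgsBase constructor (together with the nullptr defaults for x, w and o) means call sites build a fully initialized ConvArgs in one expression instead of patching args.handle afterwards. Schematically, with placeholder argument names:

// Before this commit: two-step initialization.
ConvArgs args{&transformed_input, &filter, &output,
              strides, paddings, dilations, dtype, groups, layout};
args.handle = handle;

// After: the handle is a constructor argument, so args is complete on
// creation and the tensor pointers can no longer be left uninitialized.
ConvArgs args{handle, &transformed_input, &filter, &output,
              strides, paddings, dilations, dtype, groups, layout};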
......@@ -257,7 +257,8 @@ void ConvCudnnGradGradKernel(
auto layout = paddle::platform::GetCudnnTensorFormat(
paddle::platform::DataLayout::kNCHW);
ConvArgs args1{&transformed_ddX,
ConvArgs args1{handle,
&transformed_ddX,
W,
&transformed_ddO_channel,
strides,
......@@ -266,7 +267,8 @@ void ConvCudnnGradGradKernel(
dtype,
groups,
paddle::platform::DataLayout::kNCHW};
ConvArgs args2{&transformed_X,
ConvArgs args2{handle,
&transformed_X,
ddW,
&transformed_ddO_channel,
strides,
......@@ -275,7 +277,8 @@ void ConvCudnnGradGradKernel(
dtype,
groups,
paddle::platform::DataLayout::kNCHW};
ConvArgs args3{&transformed_ddX,
ConvArgs args3{handle,
&transformed_ddX,
dW,
&transformed_dO_channel,
strides,
......@@ -284,7 +287,8 @@ void ConvCudnnGradGradKernel(
dtype,
groups,
paddle::platform::DataLayout::kNCHW};
ConvArgs args4{&transformed_dX,
ConvArgs args4{handle,
&transformed_dX,
ddW,
&transformed_dO_channel,
strides,
......@@ -314,7 +318,6 @@ void ConvCudnnGradGradKernel(
ddy = ddO->data<T>();
transformed_ddy_channel = transformed_ddO_channel.data<T>();
if (ddX) {
args1.handle = handle;
args1.idesc.set(transformed_ddX, iwo_group);
args1.wdesc.set(*W, layout, iwo_group);
args1.odesc.set(transformed_ddO_channel, iwo_group);
......@@ -339,7 +342,6 @@ void ConvCudnnGradGradKernel(
if (ddW) {
ddw = ddW->data<T>();
args2.handle = handle;
args2.idesc.set(transformed_X, iwo_group);
args2.wdesc.set(*ddW, layout, iwo_group);
args2.odesc.set(transformed_ddO_channel, iwo_group);
......@@ -367,7 +369,6 @@ void ConvCudnnGradGradKernel(
if (dW && ddX) {
dw = dW->data<T>();
args3.handle = handle;
args3.idesc.set(transformed_ddX, iwo_group);
args3.wdesc.set(*dW, layout, iwo_group);
args3.odesc.set(transformed_dO_channel, iwo_group);
......@@ -395,7 +396,6 @@ void ConvCudnnGradGradKernel(
if (ddW && dX) {
transformed_dx = transformed_dX.data<T>();
args4.handle = handle;
args4.idesc.set(transformed_dX, iwo_group);
args4.wdesc.set(*ddW, layout, iwo_group);
args4.odesc.set(transformed_dO_channel, iwo_group);
......@@ -444,13 +444,13 @@ void ConvCudnnGradGradKernel(
// ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f :
// 0.0f;
// VLOG(4) << "Conv_grad_grad: use_addto = " << ctx.Attr<bool>("use_addto");
auto wkspace_handle = ctx.cudnn_workspace_handle();
auto workspace_handle = ctx.cudnn_workspace_handle();
if (ddO) {
if (ddX) {
ddx = transformed_ddX.data<T>();
#ifdef PADDLE_WITH_HIP
wkspace_handle.RunFunc(
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionForward(
......@@ -471,7 +471,7 @@ void ConvCudnnGradGradKernel(
workspace_size);
#else
for (int i = 0; i < groups; i++) {
wkspace_handle.RunFunc(
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnConvolutionForward(
......@@ -496,7 +496,7 @@ void ConvCudnnGradGradKernel(
if (ddW) {
#ifdef PADDLE_WITH_HIP
// MIOPEN only supports beta == 0.0f
wkspace_handle.RunFunc(
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionForward(
......@@ -517,7 +517,7 @@ void ConvCudnnGradGradKernel(
workspace_size);
#else
for (int i = 0; i < groups; i++) {
wkspace_handle.RunFunc(
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnConvolutionForward(
......@@ -547,7 +547,7 @@ void ConvCudnnGradGradKernel(
if (dW && ddX) {
ddx = transformed_ddX.data<T>();
#ifdef PADDLE_WITH_HIP
wkspace_handle.RunFunc(
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionBackwardWeights(
......@@ -568,7 +568,7 @@ void ConvCudnnGradGradKernel(
workspace_size);
#else
for (int i = 0; i < groups; i++) {
wkspace_handle.RunFunc(
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnConvolutionBackwardFilter(
......@@ -594,7 +594,7 @@ void ConvCudnnGradGradKernel(
if (dX && ddW) {
ddw = ddW->data<T>();
#ifdef PADDLE_WITH_HIP
wkspace_handle.RunFunc(
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionBackwardData(
......@@ -615,7 +615,7 @@ void ConvCudnnGradGradKernel(
workspace_size);
#else
for (int i = 0; i < groups; i++) {
wkspace_handle.RunFunc(
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnConvolutionBackwardData(
......
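Every call site in this kernel follows the same shape: ctx.cudnn_workspace_handle() lends a scratch allocation of workspace_size bytes to a lambda that issues one cuDNN/MIOpen call. A minimal sketch of that pattern; WorkspaceHandle below is a stand-in for Paddle's actual handle type, not its real implementation:

#include <cstddef>
#include <functional>
#include <vector>

// Illustrative stand-in for the workspace handle used above: it owns a
// scratch buffer, grows it on demand, and lends the pointer to a callback.
class WorkspaceHandle {
 public:
  void RunFunc(const std::function<void(void*)>& fn, size_t required_bytes) {
    if (buffer_.size() < required_bytes) buffer_.resize(required_bytes);
    fn(buffer_.empty() ? nullptr : buffer_.data());  // valid only inside fn
  }

 private:
  std::vector<unsigned char> buffer_;
};

// Usage mirrors the kernels above: one shared workspace_size (the max over
// all algorithms) means a single allocation serves every call.
// workspace_handle.RunFunc(
//     [&](void* workspace_ptr) {
//       // cudnnConvolutionForward(..., workspace_ptr, workspace_size, ...);
//     },
//     workspace_size);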
......@@ -251,12 +251,14 @@ void ConvCudnnGradKernel(const Context& ctx,
T* input_grad_data = nullptr;
T* transformed_input_grad_data = nullptr;
auto handle = ctx.cudnn_handle();
paddle::platform::DataLayout layout =
compute_format == paddle::platform::DataLayout::kNHWC
? paddle::platform::DataLayout::kNHWC
: paddle::platform::DataLayout::kNCHW;
ConvArgs args1{&transformed_input_grad,
ConvArgs args1{handle,
&transformed_input_grad,
&transformed_filter_channel,
&transformed_output_grad_channel,
strides,
......@@ -265,7 +267,8 @@ void ConvCudnnGradKernel(const Context& ctx,
dtype,
groups,
layout};
ConvArgs args2{&transformed_input,
ConvArgs args2{handle,
&transformed_input,
&transformed_filter_grad_channel,
&transformed_output_grad_channel,
strides,
......@@ -275,7 +278,6 @@ void ConvCudnnGradKernel(const Context& ctx,
groups,
layout};
auto handle = ctx.cudnn_handle();
// TODO(phlrain): replace paddle::platform::DataLayout with phi::DataLayout
if (transformed_input.dims().size() == 5) {
......@@ -332,10 +334,7 @@ void ConvCudnnGradKernel(const Context& ctx,
SearchResult<cudnnConvolutionBwdDataAlgo_t> bwd_result;
SearchResult<cudnnConvolutionBwdFilterAlgo_t> filter_result;
#endif
// input data workspace_size
size_t workspace_size_d = 0;
// weight workspace_size
size_t workspace_size_w = 0;
size_t workspace_size = 0;
int iwo_groups = groups;
int c_groups = 1;
......@@ -350,7 +349,6 @@ void ConvCudnnGradKernel(const Context& ctx,
input_grad_data = input_grad->data<T>();
transformed_input_grad_data = transformed_input_grad.data<T>();
args1.handle = handle;
args1.idesc.set(transformed_input_grad, layout_tensor);
args1.wdesc.set(transformed_filter_channel, layout_tensor, iwo_groups);
args1.odesc.set(transformed_output_grad_channel, layout_tensor);
......@@ -363,21 +361,20 @@ void ConvCudnnGradKernel(const Context& ctx,
#ifdef PADDLE_WITH_HIP
using search1 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
workspace_size_d =
std::max(workspace_size_d, search1::GetWorkspaceSize(args1));
workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1));
bwd_result.algo = search1::Find<T>(
args1, exhaustive_search, deterministic, workspace_size_d, ctx);
args1, exhaustive_search, deterministic, workspace_size, ctx);
#else
using search1 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
bwd_result = search1::Find<T>(ctx, args1, exhaustive_search, deterministic);
workspace_size_d = std::max(workspace_size_d, bwd_result.workspace_size);
workspace_size = std::max(workspace_size, bwd_result.workspace_size);
#endif
}
if (filter_grad) {
// ------------------- cudnn descriptors ---------------------
filter_grad_data = transformed_filter_grad_channel.data<T>();
args2.handle = handle;
args2.idesc.set(transformed_input, layout_tensor);
args2.wdesc.set(transformed_filter_grad_channel, layout_tensor, iwo_groups);
args2.odesc.set(transformed_output_grad_channel, layout_tensor);
......@@ -389,17 +386,16 @@ void ConvCudnnGradKernel(const Context& ctx,
c_groups);
#ifdef PADDLE_WITH_HIP
using search2 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
workspace_size_w =
std::max(workspace_size_w, search2::GetWorkspaceSize(args2));
workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2));
filter_result.algo = search2::Find<T>(
args2, exhaustive_search, deterministic, workspace_size_w, ctx);
args2, exhaustive_search, deterministic, workspace_size, ctx);
#else
using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
filter_result =
search2::Find<T>(ctx, args2, exhaustive_search, deterministic);
VLOG(3) << "filter algo: " << filter_result.algo << ", time "
<< filter_result.time;
workspace_size_w = std::max(workspace_size_w, filter_result.workspace_size);
workspace_size = std::max(workspace_size, filter_result.workspace_size);
#endif
}
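Since the separate workspace_size_d (backward data) and workspace_size_w (backward filter) collapse into a single workspace_size, the buffer is sized as a running max over both searches and one allocation serves both calls. Paraphrasing the change as a fragment (std::max needs <algorithm>):

size_t workspace_size = 0;
if (input_grad) {
  workspace_size = std::max(workspace_size, bwd_result.workspace_size);
}
if (filter_grad) {
  workspace_size = std::max(workspace_size, filter_result.workspace_size);
}
// A single scratch buffer of workspace_size bytes now serves both the
// backward-data and backward-filter calls.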
......@@ -438,9 +434,9 @@ void ConvCudnnGradKernel(const Context& ctx,
args1.idesc.desc(),
temp_tensor_data,
cudnn_workspace_ptr,
workspace_size_d));
workspace_size));
},
workspace_size_d);
workspace_size);
PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::miopenOpTensor(
handle,
miopenTensorOpAdd,
......@@ -470,9 +466,9 @@ void ConvCudnnGradKernel(const Context& ctx,
args1.idesc.desc(),
transformed_input_grad_data,
cudnn_workspace_ptr,
workspace_size_d));
workspace_size));
},
workspace_size_d);
workspace_size);
}
#else
......@@ -490,12 +486,12 @@ void ConvCudnnGradKernel(const Context& ctx,
args1.cdesc.desc(),
bwd_result.algo,
cudnn_workspace_ptr,
workspace_size_d,
workspace_size,
&beta,
args1.idesc.desc(),
transformed_input_grad_data + i * group_offset_in));
},
workspace_size_d);
workspace_size);
}
#endif
if (!is_sys_pad) {
......@@ -551,9 +547,9 @@ void ConvCudnnGradKernel(const Context& ctx,
args2.wdesc.desc(),
filter_grad_data,
cudnn_workspace_ptr,
workspace_size_w));
workspace_size));
},
workspace_size_w);
workspace_size);
#else
for (int i = 0; i < groups; i++) {
workspace_handle.RunFunc(
......@@ -569,12 +565,12 @@ void ConvCudnnGradKernel(const Context& ctx,
args2.cdesc.desc(),
filter_result.algo,
cudnn_workspace_ptr,
workspace_size_w,
workspace_size,
&beta_filter,
args2.wdesc.desc(),
filter_grad_data + i * group_offset_filter));
},
workspace_size_w);
workspace_size);
}
#endif
......
......@@ -201,11 +201,14 @@ void ConvCudnnKernel(const Context& ctx,
}
const T* input_data = transformed_input.data<T>();
const T* filter_data = transformed_filter_channel.data<T>();
auto handle = ctx.cudnn_handle();
auto workspace_handle = ctx.cudnn_workspace_handle();
// ------------------- cudnn descriptors ---------------------
ConvArgs args{&transformed_input,
ConvArgs args{handle,
&transformed_input,
&transformed_filter_channel,
&transformed_output,
strides,
......@@ -215,8 +218,6 @@ void ConvCudnnKernel(const Context& ctx,
groups,
compute_format};
auto handle = ctx.cudnn_handle();
auto workspace_handle = ctx.cudnn_workspace_handle();
paddle::platform::DataLayout layout =
compute_format == paddle::platform::DataLayout::kNHWC
? paddle::platform::DataLayout::kNHWC
......@@ -228,8 +229,6 @@ void ConvCudnnKernel(const Context& ctx,
}
auto layout_format = paddle::platform::GetCudnnTensorFormat(layout);
args.handle = handle;
#ifdef PADDLE_WITH_HIP
// MIOPEN needs groups to be set in cdesc; see miopen_desc.h
args.cdesc.set(dtype,
......
......@@ -172,8 +172,10 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
#endif
auto dtype = paddle::platform::CudnnDataType<T>::type;
auto handle = ctx.cudnn_handle();
ConvArgs args1{&transformed_dout,
ConvArgs args1{handle,
&transformed_dout,
&filter,
&x_transpose,
strides,
......@@ -182,7 +184,8 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
dtype,
groups,
layout};
ConvArgs args2{&transformed_dout,
ConvArgs args2{handle,
&transformed_dout,
&filter,
&x_transpose,
strides,
......@@ -202,14 +205,13 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
auto layout_tensor = paddle::platform::GetCudnnTensorFormat(layout);
size_t workspace_size = 0;
auto handle = ctx.cudnn_handle();
bool deterministic = FLAGS_cudnn_deterministic;
T* dx_data = nullptr;
T* dfilter_data = nullptr;
if (dx) {
dx_data = ctx.template Alloc<T>(dx);
args1.handle = handle;
args1.idesc.set(transformed_dout, iwo_groups);
args1.wdesc.set(filter, layout_tensor, iwo_groups);
args1.odesc.set(x_transpose, iwo_groups);
......@@ -234,7 +236,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
if (dfilter) {
dfilter_data = ctx.template Alloc<T>(dfilter);
args2.handle = handle;
args2.idesc.set(transformed_dout, iwo_groups);
args2.wdesc.set(*dfilter, layout_tensor, iwo_groups);
args2.odesc.set(x_transpose, iwo_groups);
......@@ -625,7 +627,8 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
auto handle = ctx.cudnn_handle();
auto layout = paddle::platform::GetCudnnTensorFormat(GPUDNNDataLayout::kNCHW);
ConvArgs args1{&transformed_ddout_channel,
ConvArgs args1{handle,
&transformed_ddout_channel,
&filter,
&transformed_ddx,
strides,
......@@ -634,7 +637,8 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
dtype,
groups,
GPUDNNDataLayout::kNCHW};
ConvArgs args2{&transformed_ddout_channel,
ConvArgs args2{handle,
&transformed_ddout_channel,
&ddfilter,
&transformed_x,
strides,
......@@ -644,7 +648,8 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
groups,
GPUDNNDataLayout::kNCHW};
ConvArgs args3{&transformed_dout,
ConvArgs args3{handle,
&transformed_dout,
dfilter,
&transformed_ddx_channel,
strides,
......@@ -653,7 +658,8 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
dtype,
groups,
GPUDNNDataLayout::kNCHW};
ConvArgs args4{&transformed_dout,
ConvArgs args4{handle,
&transformed_dout,
&ddfilter,
&transformed_dx_channel,
strides,
......@@ -683,7 +689,6 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
ddout_ = ddout->data<T>();
transformed_ddout_channel_ = transformed_ddout_channel.data<T>();
args1.handle = handle;
args1.idesc.set(transformed_ddout_channel, iwo_group);
args1.wdesc.set(filter, layout, iwo_group);
args1.odesc.set(transformed_ddx, iwo_group);
......@@ -730,7 +735,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
if (dfilter) {
dfilter_ = dfilter->data<T>();
args3.handle = handle;
args3.idesc.set(transformed_dout, iwo_group);
args3.wdesc.set(*dfilter, layout, iwo_group);
args3.odesc.set(transformed_ddx_channel, iwo_group);
......@@ -806,13 +811,13 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
ScalingParamType<T> alpha = 1.0f;
ScalingParamType<T> beta = 0.0f;
auto wkspace_handle = ctx.cudnn_workspace_handle();
auto workspace_handle = ctx.cudnn_workspace_handle();
if (ddout) {
ddx_ = transformed_ddx.data<T>();
for (int i = 0; i < groups; i++) {
#ifdef PADDLE_WITH_HIP
wkspace_handle.RunFunc(
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardData(
handle,
......@@ -831,7 +836,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
},
workspace_size);
#else // PADDLE_WITH_HIP
wkspace_handle.RunFunc(
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardData(
handle,
......@@ -858,7 +863,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
DenseTensor conv_x_ddfilter(dout.type());
conv_x_ddfilter.Resize(transformed_ddout_channel.dims());
T* conv_x_ddfilter_data = ctx.template Alloc<T>(&conv_x_ddfilter);
wkspace_handle.RunFunc(
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardData(
handle,
......@@ -889,7 +894,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
args2.idesc.desc(),
transformed_ddout_channel_ + i * group_offset_out));
#else // PADDLE_WITH_HIP
wkspace_handle.RunFunc(
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardData(
handle,
......@@ -944,7 +949,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
ddx_ = transformed_ddx_channel.data<T>();
for (int i = 0; i < groups; i++) {
#ifdef PADDLE_WITH_HIP
wkspace_handle.RunFunc(
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
dynload::miopenConvolutionBackwardWeights(
......@@ -964,7 +969,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
},
workspace_size);
#else // PADDLE_WITH_HIP
wkspace_handle.RunFunc(
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardFilter(
handle,
......@@ -990,7 +995,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
ddfilter_ = ddfilter.data<T>();
for (int i = 0; i < groups; i++) {
#ifdef PADDLE_WITH_HIP
wkspace_handle.RunFunc(
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionForward(
handle,
......@@ -1009,7 +1014,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
},
workspace_size);
#else // PADDLE_WITH_HIP
wkspace_handle.RunFunc(
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionForward(
handle,
......
......@@ -199,7 +199,8 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
auto dtype = paddle::platform::CudnnDataType<T>::type;
// ------------------- cudnn descriptors ---------------------
ConvArgs args{&transformed_out,
ConvArgs args{handle,
&transformed_out,
&filter,
&transformed_x,
strides,
......@@ -208,7 +209,6 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
dtype,
groups,
data_layout};
args.handle = handle;
args.idesc.set(transformed_out, iwo_groups);
args.wdesc.set(filter, layout_tensor, iwo_groups);
args.odesc.set(transformed_x, iwo_groups);
......