未验证 提交 b721e23b 编写于 作者: W wangchaochaohu 提交者: GitHub

transpose cudnn using cudnn v7 api (#19738)

* refine the transopose conv using v7 to choose algorithm
上级 11f94cdc
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/operators/conv_cudnn_helper.h"
#include "paddle/fluid/operators/conv_transpose_op.h" #include "paddle/fluid/operators/conv_transpose_op.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/padding.h" #include "paddle/fluid/operators/math/padding.h"
...@@ -24,13 +25,8 @@ namespace paddle { ...@@ -24,13 +25,8 @@ namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
using DataLayout = platform::DataLayout; using DataLayout = platform::DataLayout;
static constexpr size_t kConvCUDNNWorkspaceLimitBytes = 1024 * 1024 * 1024;
template <typename T, int D> template <typename T, int D>
static void DataTranspose(const framework::ExecutionContext& ctx, static void DataTranspose(const framework::ExecutionContext& ctx,
const Tensor* input, Tensor* output, const Tensor* input, Tensor* output,
...@@ -68,7 +64,6 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> { ...@@ -68,7 +64,6 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
// cudnn v5 does not support dilations // cudnn v5 does not support dilations
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations"); std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
int groups = ctx.Attr<int>("groups"); int groups = ctx.Attr<int>("groups");
int user_workspace_size = ctx.Attr<int>("workspace_size_MB");
const T* filter_data = filter->data<T>(); const T* filter_data = filter->data<T>();
const std::string data_layout_str = ctx.Attr<std::string>("data_format"); const std::string data_layout_str = ctx.Attr<std::string>("data_format");
const paddle::operators::DataLayout data_layout = const paddle::operators::DataLayout data_layout =
...@@ -200,60 +195,44 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> { ...@@ -200,60 +195,44 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
} }
T* transformed_output_data = transformed_output.data<T>(); T* transformed_output_data = transformed_output.data<T>();
// ------------------- cudnn descriptors ---------------------
ScopedTensorDescriptor input_desc;
ScopedTensorDescriptor output_desc;
ScopedFilterDescriptor filter_desc;
ScopedConvolutionDescriptor conv_desc;
DataLayout layout; DataLayout layout;
int iwo_groups = groups;
int c_groups = 1;
#if CUDNN_VERSION_MIN(7, 0, 1)
iwo_groups = 1;
c_groups = groups;
groups = 1;
#endif
if (strides.size() == 2U) { if (strides.size() == 2U) {
layout = DataLayout::kNCHW; layout = DataLayout::kNCHW;
} else { } else {
layout = DataLayout::kNCDHW; layout = DataLayout::kNCDHW;
} }
// (N, M, H, W) or (N, M, D, H, W) size_t workspace_size = 0;
cudnnTensorDescriptor_t cudnn_input_desc = cudnnConvolutionBwdDataAlgo_t algo{};
input_desc.descriptor<T>(layout, input_vec, groups);
// (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w)
cudnnTensorDescriptor_t cudnn_output_desc =
output_desc.descriptor<T>(layout, transformed_output_vec, groups);
// (M, C, K_h, K_w) or (M, C, K_d, K_h, K_w)
cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
layout, framework::vectorize<int>(filter->dims()), groups);
cudnnConvolutionDescriptor_t cudnn_conv_desc =
conv_desc.descriptor<T>(padding_common, strides, dilations);
// ------------------- cudnn conv workspace ---------------------
size_t workspace_size_in_bytes; // final workspace to allocate.
size_t workspace_size_limit = kConvCUDNNWorkspaceLimitBytes;
if (user_workspace_size > 0) {
workspace_size_limit = user_workspace_size * 1024 * 1024;
}
// ------------------- cudnn conv algorithm --------------------- // ------------------- cudnn conv algorithm ---------------------
cudnnConvolutionBwdDataAlgo_t algo;
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle(); auto handle = dev_ctx.cudnn_handle();
// Get the algorithm auto layout_tensor = GetCudnnTensorFormat(layout);
PADDLE_ENFORCE_CUDA_SUCCESS( bool deterministic = FLAGS_cudnn_deterministic;
platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc,
// dxDesc: Handle to the previously initialized output tensor
// descriptor.
cudnn_output_desc,
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &algo));
if (FLAGS_cudnn_deterministic) { auto dtype = platform::CudnnDataType<T>::type;
algo = static_cast<cudnnConvolutionBwdDataAlgo_t>(1); // ------------------- cudnn descriptors ---------------------
} ConvArgs args{&transformed_output, filter, &transformed_input, strides,
padding_common, dilations};
// get workspace size able to allocate args.handle = handle;
PADDLE_ENFORCE_CUDA_SUCCESS( args.idesc.set(transformed_output, iwo_groups);
platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( args.wdesc.set(*filter, layout_tensor, iwo_groups);
handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc, args.odesc.set(transformed_input, iwo_groups);
cudnn_output_desc, algo, &workspace_size_in_bytes)); args.cdesc.set(dtype, padding_common, strides, dilations, c_groups);
using search = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
algo = search::Find<T>(args, false, deterministic, 2, ctx);
workspace_size =
std::max(workspace_size, search::GetWorkspaceSize(args, algo));
// ------------------- cudnn conv transpose forward --------------------- // ------------------- cudnn conv transpose forward ---------------------
int input_offset = int input_offset =
...@@ -267,16 +246,14 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> { ...@@ -267,16 +246,14 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
auto cudnn_func = [&](void* cudnn_workspace) { auto cudnn_func = [&](void* cudnn_workspace) {
PADDLE_ENFORCE_CUDA_SUCCESS( PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnConvolutionBackwardData( platform::dynload::cudnnConvolutionBackwardData(
handle, &alpha, cudnn_filter_desc, handle, &alpha, args.wdesc.desc(),
filter_data + filter_offset * g, cudnn_input_desc, filter_data + filter_offset * g, args.odesc.desc(),
input_data + input_offset * g, cudnn_conv_desc, algo, input_data + input_offset * g, args.cdesc.desc(), algo,
cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_workspace, workspace_size, &beta, args.idesc.desc(),
cudnn_output_desc,
transformed_output_data + output_offset * g)); transformed_output_data + output_offset * g));
}; };
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); workspace_handle.RunFunc(cudnn_func, workspace_size);
} }
if (!is_sys_pad && strides.size() == 2U) { if (!is_sys_pad && strides.size() == 2U) {
Slice<paddle::platform::CUDADeviceContext, T, 4>( Slice<paddle::platform::CUDADeviceContext, T, 4>(
ctx, &transformed_output, output, starts, ends, axes); ctx, &transformed_output, output, starts, ends, axes);
...@@ -432,10 +409,6 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> { ...@@ -432,10 +409,6 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
output_vec = framework::vectorize<int>(transformed_output_grad.dims()); output_vec = framework::vectorize<int>(transformed_output_grad.dims());
// ------------------- cudnn descriptors --------------------- // ------------------- cudnn descriptors ---------------------
ScopedTensorDescriptor input_desc;
ScopedTensorDescriptor output_desc;
ScopedFilterDescriptor filter_desc;
ScopedConvolutionDescriptor conv_desc;
DataLayout layout; DataLayout layout;
if (strides.size() == 2U) { if (strides.size() == 2U) {
...@@ -444,68 +417,59 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> { ...@@ -444,68 +417,59 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
layout = DataLayout::kNCDHW; layout = DataLayout::kNCDHW;
} }
// Input: (N, M, H, W) or (N, M, D, H, W) int iwo_groups = groups;
cudnnTensorDescriptor_t cudnn_input_desc = int c_groups = 1;
input_desc.descriptor<T>(layout, input_vec, groups); #if CUDNN_VERSION_MIN(7, 0, 1)
// Output: (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w) iwo_groups = 1;
cudnnTensorDescriptor_t cudnn_output_desc = c_groups = groups;
output_desc.descriptor<T>(layout, output_vec, groups); groups = 1;
// Filter (M, C, K_h, K_w) or (M, C, K_d K_h, K_w) #endif
cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>( ConvArgs args1{&transformed_output_grad, filter,
layout, framework::vectorize<int>(filter->dims()), groups); &input_transpose, strides,
padding_common, dilations};
cudnnConvolutionDescriptor_t cudnn_conv_desc = ConvArgs args2{&transformed_output_grad, filter,
conv_desc.descriptor<T>(padding_common, strides, dilations); &input_transpose, strides,
padding_common, dilations};
// ------------------- cudnn backward algorithm --------------------- cudnnConvolutionFwdAlgo_t data_algo{};
cudnnConvolutionFwdAlgo_t data_algo; cudnnConvolutionBwdFilterAlgo_t filter_algo{};
cudnnConvolutionBwdFilterAlgo_t filter_algo;
size_t bwd_filter_ws_size, fwd_ws_size; auto layout_tensor = GetCudnnTensorFormat(layout);
size_t workspace_size_in_bytes = 0; size_t workspace_size = 0;
size_t workspace_size_limit = kConvCUDNNWorkspaceLimitBytes;
if (user_workspace_size > 0) {
workspace_size_limit = user_workspace_size * 1024 * 1024;
}
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle(); auto handle = dev_ctx.cudnn_handle();
if (input_grad) { auto dtype = platform::CudnnDataType<T>::type;
// choose backward algorithm for data bool deterministic = FLAGS_cudnn_deterministic;
PADDLE_ENFORCE_CUDA_SUCCESS( T* input_grad_data = nullptr;
platform::dynload::cudnnGetConvolutionForwardAlgorithm( T* filter_grad_data = nullptr;
handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc, if (input_grad)
cudnn_input_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
workspace_size_limit, &data_algo)); if (filter_grad)
filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
if (FLAGS_cudnn_deterministic) { if (input_grad) {
data_algo = static_cast<cudnnConvolutionFwdAlgo_t>(1); input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
} args1.handle = handle;
PADDLE_ENFORCE_CUDA_SUCCESS( args1.idesc.set(transformed_output_grad, iwo_groups);
platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( args1.wdesc.set(*filter, layout_tensor, iwo_groups);
handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc, args1.odesc.set(input_transpose, iwo_groups);
cudnn_input_desc, data_algo, &fwd_ws_size)); args1.cdesc.set(dtype, padding_common, strides, dilations, c_groups);
workspace_size_in_bytes = std::max(workspace_size_in_bytes, fwd_ws_size); using search1 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
data_algo = search1::Find<T>(args1, false, deterministic, 0, ctx);
workspace_size =
std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo));
} }
if (filter_grad) { if (filter_grad) {
// choose backward algorithm for filter filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
PADDLE_ENFORCE_CUDA_SUCCESS( args2.handle = handle;
platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( args2.idesc.set(transformed_output_grad, iwo_groups);
handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc, args2.wdesc.set(*filter_grad, layout_tensor, iwo_groups);
cudnn_filter_desc, args2.odesc.set(input_transpose, iwo_groups);
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, args2.cdesc.set(dtype, padding_common, strides, dilations, c_groups);
workspace_size_limit, &filter_algo)); using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
filter_algo = search2::Find<T>(args2, false, deterministic, 1, ctx);
if (FLAGS_cudnn_deterministic) { workspace_size = std::max(workspace_size,
filter_algo = static_cast<cudnnConvolutionBwdFilterAlgo_t>(1); search2::GetWorkspaceSize(args2, filter_algo));
}
// get workspace for backwards filter algorithm
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc,
cudnn_filter_desc, filter_algo, &bwd_filter_ws_size));
workspace_size_in_bytes =
std::max(workspace_size_in_bytes, bwd_filter_ws_size);
} }
// ------------------- cudnn conv backward data --------------------- // ------------------- cudnn conv backward data ---------------------
...@@ -517,19 +481,18 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> { ...@@ -517,19 +481,18 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
T alpha = static_cast<T>(1.0), beta = static_cast<T>(0.0); T alpha = static_cast<T>(1.0), beta = static_cast<T>(0.0);
auto workspace_handle = dev_ctx.cudnn_workspace_handle(); auto workspace_handle = dev_ctx.cudnn_workspace_handle();
if (input_grad) { if (input_grad) {
T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
// Because beta is zero, it is unnecessary to reset input_grad. // Because beta is zero, it is unnecessary to reset input_grad.
for (int g = 0; g < groups; g++) { for (int g = 0; g < groups; g++) {
auto cudnn_func = [&](void* cudnn_workspace) { auto cudnn_func = [&](void* cudnn_workspace) {
PADDLE_ENFORCE_CUDA_SUCCESS( PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnConvolutionForward( platform::dynload::cudnnConvolutionForward(
handle, &alpha, cudnn_output_desc, handle, &alpha, args1.idesc.desc(),
output_grad_data + output_grad_offset * g, cudnn_filter_desc, output_grad_data + output_grad_offset * g, args1.wdesc.desc(),
filter_data + filter_offset * g, cudnn_conv_desc, data_algo, filter_data + filter_offset * g, args1.cdesc.desc(),
cudnn_workspace, workspace_size_in_bytes, &beta, data_algo, cudnn_workspace, workspace_size, &beta,
cudnn_input_desc, input_grad_data + input_offset * g)); args1.odesc.desc(), input_grad_data + input_offset * g));
}; };
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); workspace_handle.RunFunc(cudnn_func, workspace_size);
} }
if (data_layout == DataLayout::kNHWC) { if (data_layout == DataLayout::kNHWC) {
...@@ -553,20 +516,19 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> { ...@@ -553,20 +516,19 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
// ------------------- cudnn conv backward filter --------------------- // ------------------- cudnn conv backward filter ---------------------
if (filter_grad) { if (filter_grad) {
T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
// Because beta is zero, it is unnecessary to reset filter_grad. // Because beta is zero, it is unnecessary to reset filter_grad.
// Gradient with respect to the filter // Gradient with respect to the filter
for (int g = 0; g < groups; g++) { for (int g = 0; g < groups; g++) {
auto cudnn_func = [&](void* cudnn_workspace) { auto cudnn_func = [&](void* cudnn_workspace) {
PADDLE_ENFORCE_CUDA_SUCCESS( PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnConvolutionBackwardFilter( platform::dynload::cudnnConvolutionBackwardFilter(
handle, &alpha, cudnn_output_desc, handle, &alpha, args2.idesc.desc(),
output_grad_data + output_grad_offset * g, cudnn_input_desc, output_grad_data + output_grad_offset * g, args2.odesc.desc(),
input_data + input_offset * g, cudnn_conv_desc, filter_algo, input_data + input_offset * g, args2.cdesc.desc(),
cudnn_workspace, workspace_size_in_bytes, &beta, filter_algo, cudnn_workspace, workspace_size, &beta,
cudnn_filter_desc, filter_grad_data + filter_offset * g)); args2.wdesc.desc(), filter_grad_data + filter_offset * g));
}; };
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); workspace_handle.RunFunc(cudnn_func, workspace_size);
} }
} }
} }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册