未验证 提交 ed2a1852 编写于 作者: G gongweibao 提交者: GitHub

optimize nhwc for tensor core in ConvOp and ConvGradOp (#20597)

上级 c918788b
......@@ -40,6 +40,10 @@ using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
using DataLayout = platform::DataLayout;
static inline bool IsVoltaOrLater(const platform::CUDADeviceContext& dev_ctx) {
return dev_ctx.GetComputeCapability() >= 70;
}
template <typename T>
class CUDNNConvOpKernel : public framework::OpKernel<T> {
public:
......@@ -68,11 +72,27 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
const std::string data_format = ctx.Attr<std::string>("data_format");
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
auto dtype = platform::CudnnDataType<T>::type;
// Tensor Core introduced from Volta GPUs supports more faster conv op
// with FP16 in NHWC data format.
const bool compute_in_nhwc =
dtype == CUDNN_DATA_HALF && IsVoltaOrLater(dev_ctx);
// We will only do data format conversion from NHWC to NCHW.
// cudnn will convert NCHW to NHWC automatically on Tensor Core.
auto compute_format =
compute_in_nhwc && channel_last ? DataLayout::kNHWC : DataLayout::kNCHW;
VLOG(3) << "Compute ConvOp with cuDNN:"
<< " data_format=" << data_format << " compute_format="
<< (compute_format == DataLayout::kNHWC ? "NHWC" : "NCHW");
// ------------ transformed tensor -----------
Tensor transformed_input_channel(input->type());
Tensor transformed_output(output->type());
Tensor transformed_filter_channel(filter->type());
T* output_data = nullptr;
if (channel_last) {
if (channel_last && compute_format == DataLayout::kNCHW) {
VLOG(3) << "Transform input tensor from NHWC to NCHW.";
ResizeToChannelFirst<platform::CUDADeviceContext, T>(
ctx, input, &transformed_input_channel);
TransToChannelFirst<platform::CUDADeviceContext, T>(
......@@ -82,19 +102,36 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
&transformed_output);
} else {
transformed_input_channel = *input;
transformed_output = *output;
transformed_input_channel.ShareDataWith(*input);
transformed_output.ShareDataWith(*output);
}
if (compute_format == DataLayout::kNHWC) {
VLOG(3) << "Transform filter tensor from NCHW to NHWC.";
ResizeToChannelLast<platform::CUDADeviceContext, T>(
ctx, filter, &transformed_filter_channel);
TransToChannelLast<platform::CUDADeviceContext, T>(
ctx, filter, &transformed_filter_channel);
} else {
transformed_filter_channel.ShareDataWith(*filter);
}
output_data = transformed_output.data<T>();
// update padding and dilation
auto in_dims = transformed_input_channel.dims();
auto filter_dims = filter->dims();
auto filter_dims = transformed_filter_channel.dims();
framework::DDim in_data_dims;
in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size());
framework::DDim filter_data_dims;
if (compute_format == DataLayout::kNCHW) {
in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size());
filter_data_dims =
framework::slice_ddim(filter_dims, 2, filter_dims.size());
} else {
in_data_dims = framework::slice_ddim(in_dims, 1, in_dims.size() - 1);
filter_data_dims =
framework::slice_ddim(filter_dims, 1, filter_dims.size() - 1);
}
framework::DDim filter_data_dims =
framework::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = framework::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
......@@ -108,17 +145,33 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
std::vector<int> padding_diff(data_dim);
std::vector<int> new_input_shape_vec(data_dim + 2);
new_input_shape_vec[0] = transformed_input_channel.dims()[0];
new_input_shape_vec[1] = transformed_input_channel.dims()[1];
if (compute_format == DataLayout::kNCHW) {
new_input_shape_vec[1] = transformed_input_channel.dims()[1];
} else {
new_input_shape_vec[data_dim + 1] =
transformed_input_channel.dims()[data_dim + 1];
}
std::vector<int> input_pad(transformed_input_channel.dims().size() * 2,
0);
for (size_t i = 0; i < data_dim; ++i) {
padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]);
padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]);
new_input_shape_vec[i + 2] =
transformed_input_channel.dims()[i + 2] + padding_diff[i];
input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i];
input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i];
if (compute_format == DataLayout::kNCHW) {
new_input_shape_vec[i + 2] =
transformed_input_channel.dims()[i + 2] + padding_diff[i];
} else {
new_input_shape_vec[i + 1] =
transformed_input_channel.dims()[i + 1] + padding_diff[i];
}
if (compute_format == DataLayout::kNCHW) {
input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i];
input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i];
} else {
input_pad[2 * i + 2] = paddings[2 * i] - padding_common[i];
input_pad[2 * i + 2 + 1] = paddings[2 * i + 1] - padding_common[i];
}
}
framework::DDim new_input_shape(
framework::make_ddim(new_input_shape_vec));
......@@ -147,7 +200,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
}
} else {
transformed_input = transformed_input_channel;
transformed_input.ShareDataWith(transformed_input_channel);
if (paddings.size() == data_dim) {
for (size_t i = 0; i < data_dim; ++i) {
padding_common[i] = paddings[i];
......@@ -160,18 +213,20 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
}
const T* input_data = transformed_input.data<T>();
const T* filter_data = filter->data<T>();
const T* filter_data = transformed_filter_channel.data<T>();
// ------------------- cudnn descriptors ---------------------
ConvArgs args{&transformed_input, filter, &transformed_output, strides,
padding_common, dilations};
ConvArgs args{&transformed_input, &transformed_filter_channel,
&transformed_output, strides,
padding_common, dilations};
auto handle = dev_ctx.cudnn_handle();
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
auto dtype = platform::CudnnDataType<T>::type;
DataLayout layout = DataLayout::kNCHW;
if (transformed_input_channel.dims().size() == 5) {
layout = DataLayout::kNCDHW;
DataLayout layout = compute_format == DataLayout::kNHWC ? DataLayout::kNHWC
: DataLayout::kNCHW;
if (transformed_input.dims().size() == 5) {
layout = compute_format == DataLayout::kNHWC ? DataLayout::kNDHWC
: DataLayout::kNCDHW;
}
auto layout_format = GetCudnnTensorFormat(layout);
......@@ -186,21 +241,27 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
args.cdesc.desc(), groups));
groups = 1;
#endif
args.idesc.set(transformed_input, groups);
args.wdesc.set(*filter, layout_format, groups);
args.odesc.set(transformed_output, groups);
args.idesc.set(transformed_input, layout_format);
args.wdesc.set(transformed_filter_channel, layout_format, groups);
args.odesc.set(transformed_output, layout_format);
int i_n, i_c, i_d, i_h, i_w;
GetNCDHW(transformed_input.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d,
&i_h, &i_w);
int o_n, o_c, o_d, o_h, o_w;
GetNCDHW(transformed_output.dims(), DataLayout::kNCHW, &o_n, &o_c, &o_d,
&o_h, &o_w);
if (compute_format == DataLayout::kNHWC) {
GetNCDHW(transformed_input.dims(), DataLayout::kNHWC, &i_n, &i_c, &i_d,
&i_h, &i_w);
GetNCDHW(transformed_output.dims(), DataLayout::kNHWC, &o_n, &o_c, &o_d,
&o_h, &o_w);
} else {
GetNCDHW(transformed_input.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d,
&i_h, &i_w);
GetNCDHW(transformed_output.dims(), DataLayout::kNCHW, &o_n, &o_c, &o_d,
&o_h, &o_w);
}
int group_offset_in = i_c / groups * i_h * i_w * i_d;
int group_offset_out = o_c / groups * o_h * o_w * o_d;
int group_offset_filter = filter->numel() / groups;
int group_offset_filter = transformed_filter_channel.numel() / groups;
// ------------------- cudnn conv workspace ---------------------
size_t workspace_size = 0; // final workspace to allocate.
// ------------------- cudnn conv algorithm ---------------------
......@@ -225,7 +286,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
workspace_size);
}
if (channel_last) {
if (channel_last && compute_format == DataLayout::kNCHW) {
TransToChannelLast<paddle::platform::CUDADeviceContext, T>(
ctx, &transformed_output, output);
}
......@@ -245,7 +306,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
auto input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
auto filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
const T* filter_data = filter->data<T>();
if (input_grad) {
input_grad->mutable_data<T>(ctx.GetPlace());
}
......@@ -269,12 +329,25 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
const std::string data_format = ctx.Attr<std::string>("data_format");
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
auto dtype = platform::CudnnDataType<T>::type;
const bool compute_in_nhwc =
dtype == CUDNN_DATA_HALF && IsVoltaOrLater(dev_ctx);
auto compute_format =
compute_in_nhwc && channel_last ? DataLayout::kNHWC : DataLayout::kNCHW;
VLOG(3) << "Compute ConvGradOp with cuDNN:"
<< " data_format=" << data_format << " compute_format="
<< (compute_format == DataLayout::kNHWC ? "NHWC" : "NCHW");
// transform Tensor
Tensor transformed_input_channel(input->type());
Tensor transformed_output_grad_channel(output_grad->type());
Tensor transformed_input_grad_channel(input->type());
Tensor transformed_filter_channel(filter->type());
Tensor transformed_filter_grad_channel(filter->type());
if (channel_last) {
if (channel_last && compute_format == DataLayout::kNCHW) {
VLOG(3) << "Transform input, output_grad, input_grad and tensor from "
"NHWC to NCHW.";
ResizeToChannelFirst<platform::CUDADeviceContext, T>(
ctx, input, &transformed_input_channel);
TransToChannelFirst<platform::CUDADeviceContext, T>(
......@@ -289,22 +362,46 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
ResizeToChannelFirst<platform::CUDADeviceContext, T>(
ctx, input_grad, &transformed_input_grad_channel);
}
} else {
transformed_input_channel = *input;
transformed_output_grad_channel = *output_grad;
transformed_input_channel.ShareDataWith(*input);
transformed_output_grad_channel.ShareDataWith(*output_grad);
if (input_grad) {
transformed_input_grad_channel.ShareDataWith(*input_grad);
}
}
if (compute_format == DataLayout::kNHWC) {
VLOG(3) << "Transform filter and filter_grad tensor from NCHW to NHWC.";
ResizeToChannelLast<platform::CUDADeviceContext, T>(
ctx, filter, &transformed_filter_channel);
TransToChannelLast<platform::CUDADeviceContext, T>(
ctx, filter, &transformed_filter_channel);
if (filter_grad) {
ResizeToChannelLast<platform::CUDADeviceContext, T>(
ctx, filter_grad, &transformed_filter_grad_channel);
}
} else {
transformed_filter_channel.ShareDataWith(*filter);
if (filter_grad) {
transformed_filter_grad_channel.ShareDataWith(*filter_grad);
}
}
// update paddings
auto in_dims = transformed_input_channel.dims();
auto filter_dims = filter->dims();
auto filter_dims = transformed_filter_channel.dims();
framework::DDim in_data_dims;
in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size());
framework::DDim filter_data_dims =
framework::slice_ddim(filter_dims, 2, filter_dims.size());
framework::DDim filter_data_dims;
if (compute_format == DataLayout::kNCHW) {
in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size());
filter_data_dims =
framework::slice_ddim(filter_dims, 2, filter_dims.size());
} else {
in_data_dims = framework::slice_ddim(in_dims, 1, in_dims.size() - 1);
filter_data_dims =
framework::slice_ddim(filter_dims, 1, filter_dims.size() - 1);
}
std::vector<int> ksize = framework::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
......@@ -323,15 +420,30 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
std::vector<int> padding_diff(data_dim);
std::vector<int> new_input_shape_vec(data_dim + 2);
new_input_shape_vec[0] = transformed_input_channel.dims()[0];
new_input_shape_vec[1] = transformed_input_channel.dims()[1];
if (compute_format == DataLayout::kNCHW) {
new_input_shape_vec[1] = transformed_input_channel.dims()[1];
} else {
new_input_shape_vec[data_dim + 1] =
transformed_input_channel.dims()[data_dim + 1];
}
for (size_t i = 0; i < data_dim; ++i) {
padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]);
padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]);
new_input_shape_vec[i + 2] =
transformed_input_channel.dims()[i + 2] + padding_diff[i];
input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i];
input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i];
if (compute_format == DataLayout::kNCHW) {
new_input_shape_vec[i + 2] =
transformed_input_channel.dims()[i + 2] + padding_diff[i];
} else {
new_input_shape_vec[i + 1] =
transformed_input_channel.dims()[i + 1] + padding_diff[i];
}
if (compute_format == DataLayout::kNCHW) {
input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i];
input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i];
} else {
input_pad[2 * i + 2] = paddings[2 * i] - padding_common[i];
input_pad[2 * i + 2 + 1] = paddings[2 * i + 1] - padding_common[i];
}
}
framework::DDim new_input_shape(
framework::make_ddim(new_input_shape_vec));
......@@ -384,42 +496,51 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
const T* input_data = transformed_input.data<T>();
const T* output_grad_data = transformed_output_grad_channel.data<T>();
const T* filter_data = transformed_filter_channel.data<T>();
T* filter_grad_data = nullptr;
T* input_grad_data = nullptr;
T* transformed_input_grad_data = nullptr;
ConvArgs args1{&transformed_input_grad,
filter,
&transformed_filter_channel,
&transformed_output_grad_channel,
strides,
padding_common,
dilations};
ConvArgs args2{&transformed_input,
filter_grad,
&transformed_filter_grad_channel,
&transformed_output_grad_channel,
strides,
padding_common,
dilations};
auto handle = dev_ctx.cudnn_handle();
auto dtype = platform::CudnnDataType<T>::type;
DataLayout layout = DataLayout::kNCHW;
if (input->dims().size() == 5) {
layout = DataLayout::kNCDHW;
DataLayout layout = compute_format == DataLayout::kNHWC ? DataLayout::kNHWC
: DataLayout::kNCHW;
if (transformed_input.dims().size() == 5) {
layout = compute_format == DataLayout::kNHWC ? DataLayout::kNDHWC
: DataLayout::kNCDHW;
}
auto layout_tensor = GetCudnnTensorFormat(layout);
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
int i_n, i_c, i_d, i_h, i_w;
GetNCDHW(transformed_input.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d,
&i_h, &i_w);
int o_n, o_c, o_d, o_h, o_w;
GetNCDHW(transformed_output_grad_channel.dims(), DataLayout::kNCHW, &o_n,
&o_c, &o_d, &o_h, &o_w);
if (compute_format == DataLayout::kNHWC) {
GetNCDHW(transformed_input.dims(), DataLayout::kNHWC, &i_n, &i_c, &i_d,
&i_h, &i_w);
GetNCDHW(transformed_output_grad_channel.dims(), DataLayout::kNHWC, &o_n,
&o_c, &o_d, &o_h, &o_w);
} else {
GetNCDHW(transformed_input.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d,
&i_h, &i_w);
GetNCDHW(transformed_output_grad_channel.dims(), DataLayout::kNCHW, &o_n,
&o_c, &o_d, &o_h, &o_w);
}
int group_offset_in = i_c / groups * i_h * i_w * i_d;
int group_offset_out = o_c / groups * o_h * o_w * o_d;
int group_offset_filter = filter->numel() / groups;
int group_offset_filter = transformed_filter_channel.numel() / groups;
// ------------------- cudnn backward algorithm ---------------------
cudnnConvolutionBwdDataAlgo_t data_algo =
static_cast<cudnnConvolutionBwdDataAlgo_t>(0);
......@@ -439,9 +560,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
input_grad_data = input_grad->data<T>();
transformed_input_grad_data = transformed_input_grad.data<T>();
args1.handle = handle;
args1.idesc.set(transformed_input_grad, iwo_groups);
args1.wdesc.set(*filter, layout_tensor, iwo_groups);
args1.odesc.set(transformed_output_grad_channel, iwo_groups);
args1.idesc.set(transformed_input_grad, layout_tensor);
args1.wdesc.set(transformed_filter_channel, layout_tensor, iwo_groups);
args1.odesc.set(transformed_output_grad_channel, layout_tensor);
args1.cdesc.set(dtype, padding_common, strides, dilations, c_groups);
using search1 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
......@@ -453,11 +574,12 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
if (filter_grad) {
// ------------------- cudnn descriptors ---------------------
filter_grad_data = filter_grad->data<T>();
filter_grad_data = transformed_filter_grad_channel.data<T>();
args2.handle = handle;
args2.idesc.set(transformed_input, iwo_groups);
args2.wdesc.set(*filter_grad, layout_tensor, iwo_groups);
args2.odesc.set(transformed_output_grad_channel, iwo_groups);
args2.idesc.set(transformed_input, layout_tensor);
args2.wdesc.set(transformed_filter_grad_channel, layout_tensor,
iwo_groups);
args2.odesc.set(transformed_output_grad_channel, layout_tensor);
args2.cdesc.set(dtype, padding_common, strides, dilations, c_groups);
using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
......@@ -506,7 +628,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
}
}
if (channel_last) {
if (channel_last && compute_format == DataLayout::kNCHW) {
TransToChannelLast<paddle::platform::CUDADeviceContext, T>(
ctx, &transformed_input_grad_channel, input_grad);
}
......@@ -527,6 +649,11 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
},
workspace_size);
}
if (compute_format == DataLayout::kNHWC) {
TransToChannelFirst<paddle::platform::CUDADeviceContext, T>(
ctx, &transformed_filter_grad_channel, filter_grad);
}
}
}
};
......
......@@ -97,13 +97,15 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
filter_dims[0], filter_dims, groups);
framework::DDim in_data_dims;
framework::DDim filter_data_dims;
if (channel_last) {
in_data_dims = framework::slice_ddim(in_dims, 1, in_dims.size() - 1);
} else {
in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size());
}
framework::DDim filter_data_dims =
framework::slice_ddim(filter_dims, 2, filter_dims.size());
filter_data_dims = framework::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = framework::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
......@@ -117,9 +119,9 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
(in_data_dims[i] <= 0 || filter_dims[i + 2] <= 0)) {
output_shape.push_back(-1);
} else {
output_shape.push_back(ConvOutputSize(in_data_dims[i], filter_dims[i + 2],
dilations[i], paddings[2 * i],
paddings[2 * i + 1], strides[i]));
output_shape.push_back(
ConvOutputSize(in_data_dims[i], filter_data_dims[i], dilations[i],
paddings[2 * i], paddings[2 * i + 1], strides[i]));
}
}
if (channel_last) {
......@@ -335,7 +337,7 @@ parameters is checked in the infer-shape.
Input(Input) and Output(Output) are in NCHW or NHWC format. Where N is batch
size, C is the number of channels, H is the height of the feature, and W is
the width of the feature.
Filters(Input) is MCHW format. Where M is the number of output image channels, C is
Filters(Input) is MCHW format format. Where M is the number of output image channels, C is
the number of input image channels, H is the height of the filter, and W
is the width of the filter.
Parameters(strides, paddings, dilations) are two elements. These two elements represent
......
......@@ -154,6 +154,36 @@ inline void ResizeToChannelFirst(const framework::ExecutionContext& context,
}
}
template <typename DeviceContext, typename T>
inline void ResizeToChannelLast(const framework::ExecutionContext& context,
const Tensor* input,
Tensor* transformed_input) {
int dim = input->dims().size() - 2;
if (dim == 3) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = framework::vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[3];
in_dims_vec[3] = input->dims()[4];
in_dims_vec[4] = input->dims()[1];
transformed_input->Resize(framework::make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
} else if (dim == 2) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = framework::vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[3];
in_dims_vec[3] = input->dims()[1];
transformed_input->Resize(framework::make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
}
}
template <typename DeviceContext, typename T>
inline void TransToChannelFirst(const framework::ExecutionContext& context,
const Tensor* input,
......
......@@ -34,6 +34,29 @@ inline cudnnDataType_t ToCudnnDataType(const T& t) {
return ToCudnnDataType(type);
}
inline std::vector<int> TransformDimOrder(const std::vector<int>& dims) {
std::vector<int> transformed_dims(dims.begin(), dims.end());
int H, W, D, C;
if (dims.size() == 4) {
H = dims[1];
W = dims[2];
C = dims[3];
transformed_dims[1] = C;
transformed_dims[2] = H;
transformed_dims[3] = W;
} else {
D = dims[1];
H = dims[2];
W = dims[3];
C = dims[4];
transformed_dims[1] = C;
transformed_dims[2] = D;
transformed_dims[3] = H;
transformed_dims[4] = W;
}
return transformed_dims;
}
template <>
inline cudnnDataType_t ToCudnnDataType(
const framework::proto::VarType::Type& t) {
......@@ -117,6 +140,19 @@ class TensorDescriptor {
dims_with_group.data(), strides.data()));
}
void set(const Tensor& tensor, const cudnnTensorFormat_t format) {
auto dims = framework::vectorize<int>(tensor.dims());
std::vector<int> transformed_dims;
if (format == CUDNN_TENSOR_NHWC) {
transformed_dims = TransformDimOrder(dims);
} else {
transformed_dims = dims;
}
CUDNN_ENFORCE(dynload::cudnnSetTensorNdDescriptorEx(
desc_.get(), format, ToCudnnDataType(tensor.type()),
transformed_dims.size(), transformed_dims.data()));
}
private:
std::unique_ptr<T, Deleter> desc_;
};
......@@ -143,12 +179,18 @@ class FilterDescriptor {
void set(const Tensor& tensor, const cudnnTensorFormat_t format,
const int groups = 1) {
auto dims = framework::vectorize<int>(tensor.dims());
std::vector<int> transformed_dims;
if (format == CUDNN_TENSOR_NHWC) {
transformed_dims = TransformDimOrder(dims);
} else {
transformed_dims = dims;
}
if (groups > 1) {
dims[1] = dims[1] / groups;
transformed_dims[1] = transformed_dims[1] / groups;
}
CUDNN_ENFORCE(dynload::cudnnSetFilterNdDescriptor(
desc_.get(), ToCudnnDataType(tensor.type()), format, dims.size(),
dims.data()));
desc_.get(), ToCudnnDataType(tensor.type()), format,
transformed_dims.size(), transformed_dims.data()));
}
private:
......
......@@ -81,7 +81,6 @@ def conv2d_forward_naive(input,
if len(pad) == 4:
pad_h_0, pad_h_1 = pad[0], pad[1]
pad_w_0, pad_w_1 = pad[2], pad[3]
out_h = 1 + (in_h + pad_h_0 + pad_h_1 - (dilation[0] *
(f_h - 1) + 1)) // stride[0]
out_w = 1 + (in_w + pad_w_0 + pad_w_1 - (dilation[1] *
......@@ -204,6 +203,50 @@ def create_test_cudnn_channel_last_class(parent):
globals()[cls_name] = TestCudnnChannelLastCase
def create_test_cudnn_channel_last_fp16_class(parent, grad_check=True):
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestCudnnChannelLastFp16(parent):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=2e-2)
def test_check_grad_no_filter(self):
place = core.CUDAPlace(0)
if core.is_float16_supported(place) and grad_check:
self.check_grad_with_place(
place, ['Input'],
'Output',
max_relative_error=0.02,
no_grad_set=set(['Filter']))
def test_check_grad_no_input(self):
place = core.CUDAPlace(0)
if core.is_float16_supported(place) and grad_check:
self.check_grad_with_place(
place, ['Filter'],
'Output',
max_relative_error=0.02,
no_grad_set=set(['Input']))
def init_data_format(self):
self.data_format = "NHWC"
def init_test_case_2(self):
N, C, H, W = self.input_size
self.input_size = [N, H, W, C]
cls_name = "{0}_{1}".format(parent.__name__, "CudnnChannelLastFp16")
TestCudnnChannelLastFp16.__name__ = cls_name
globals()[cls_name] = TestCudnnChannelLastFp16
def create_test_padding_SAME_class(parent):
class TestPaddingSMAECase(parent):
def init_paddings(self):
......@@ -699,7 +742,6 @@ class TestConv2dOp_v2(OpTest):
self.init_dilation()
self.init_data_format()
self.init_test_case()
self.init_paddings()
self.init_test_case_2()
......@@ -1195,6 +1237,17 @@ create_test_cudnn_channel_last_class(TestWithStride_AsyPadding)
create_test_cudnn_channel_last_class(TestWithGroup_AsyPadding)
create_test_cudnn_channel_last_class(TestWithDilation_AsyPadding)
create_test_cudnn_channel_last_fp16_class(
TestConv2dOp_AsyPadding, grad_check=False)
create_test_cudnn_channel_last_fp16_class(
TestWithPad_AsyPadding, grad_check=False)
create_test_cudnn_channel_last_fp16_class(
TestWithStride_AsyPadding, grad_check=False)
create_test_cudnn_channel_last_fp16_class(
TestWithGroup_AsyPadding, grad_check=False)
create_test_cudnn_channel_last_fp16_class(
TestWithDilation_AsyPadding, grad_check=False)
# --------- test python API ---------------
class TestConv2dAPI(OpTest):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册