提交 92e0dc0a 编写于 作者: M mindspore-ci-bot 提交者: Gitee

!622 GPU update conv kernels to accelerate computing fp16 data

Merge pull request !622 from VectorSL/update-conv-for-amp
......@@ -218,7 +218,7 @@ class BinaryOpGpuKernel : public GpuKernel {
}
}
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnSetOpTensorDescriptor(opTensor_descriptor_, tensor_op_, cudnn_data_type_, CUDNN_NOT_PROPAGATE_NAN),
cudnnSetOpTensorDescriptor(opTensor_descriptor_, tensor_op_, CUDNN_DATA_FLOAT, CUDNN_NOT_PROPAGATE_NAN),
"cudnnSetOpTensorDescriptor failed");
return;
}
......
......@@ -142,10 +142,14 @@ class Conv2dGpuFwdKernel : public GpuKernel {
}
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnSetConvolution2dDescriptor(conv_desc_, pad_height_, pad_width_, stride_, stride_, dilation_, dilation_,
CUDNN_CROSS_CORRELATION, cudnn_data_type_),
CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT),
"cudnnSetConvolution2dDescriptor failed");
input_descriptor_real = input_desc_;
}
if (cudnn_data_type_ == CUDNN_DATA_HALF) {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH),
"cudnnSetConvolutionMathType failed.")
}
SelectAlgorithm(input_descriptor_real);
InitSizeLists();
return true;
......@@ -240,7 +244,7 @@ class Conv2dGpuFwdKernel : public GpuKernel {
"cudnnSetTensor4dDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnSetConvolution2dDescriptor(conv_desc_, use_pad_ ? 0 : pad_top_, use_pad_ ? 0 : pad_left_, stride_, stride_,
dilation_, dilation_, CUDNN_CROSS_CORRELATION, cudnn_data_type_),
dilation_, dilation_, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT),
"cudnnSetConvolution2dDescriptor failed");
}
......@@ -276,6 +280,9 @@ class Conv2dGpuFwdKernel : public GpuKernel {
"cudnnGetConvolutionForwardAlgorithm_v7 failed");
conv_algorithm_ = perf_results.algo;
}
if (cudnn_data_type_ == CUDNN_DATA_HALF) {
conv_algorithm_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
}
}
cudnnHandle_t cudnn_handle_;
cudnnTensorDescriptor_t input_desc_;
......
......@@ -141,10 +141,14 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel {
}
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnSetConvolution2dDescriptor(conv_desc_, pad_height_, pad_width_, stride_, stride_, dilation_, dilation_,
CUDNN_CROSS_CORRELATION, cudnn_data_type_),
CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT),
"GetConvolution2dDescriptor failed");
x_desc_real = x_desc_;
}
if (cudnn_data_type_ == CUDNN_DATA_HALF) {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH),
"cudnnSetConvolutionMathType failed.")
}
SelectAlgorithm(x_desc_real);
InitSizeLists();
return true;
......@@ -239,7 +243,7 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel {
"cudnnSetTensor4dDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnSetConvolution2dDescriptor(conv_desc_, use_pad_ ? 0 : pad_top_, use_pad_ ? 0 : pad_left_, stride_, stride_,
dilation_, dilation_, CUDNN_CROSS_CORRELATION, cudnn_data_type_),
dilation_, dilation_, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT),
"cudnnSetConvolution2dDescriptor failed");
}
void SelectAlgorithm(cudnnTensorDescriptor_t x_desc_real) {
......@@ -258,6 +262,9 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel {
"GetConvolutionBackwardFilterAlgorithm failed");
algo_ = perf_results.algo;
}
if (cudnn_data_type_ == CUDNN_DATA_HALF) {
algo_ = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
}
}
void GetFilterShape(const CNodePtr &kernel_node, std::vector<int> *filter_shape) {
auto shp_tuple_x = AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("filter_sizes")->cast<ValueTuplePtr>()->value();
......
......@@ -142,10 +142,14 @@ class ConvGradInputGpuBkwKernel : public GpuKernel {
}
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnSetConvolution2dDescriptor(conv_desc_, pad_height_, pad_width_, stride_, stride_, dilation_, dilation_,
CUDNN_CROSS_CORRELATION, cudnn_data_type_),
CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT),
"cudnnSetConvolution2dDescriptor failed");
dx_desc_real = dx_desc_;
}
if (cudnn_data_type_ == CUDNN_DATA_HALF) {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH),
"cudnnSetConvolutionMathType failed.")
}
SelectAlgorithm(dx_desc_real);
InitSizeLists();
return true;
......@@ -239,7 +243,7 @@ class ConvGradInputGpuBkwKernel : public GpuKernel {
"cudnnSetTensor4dDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnSetConvolution2dDescriptor(conv_desc_, use_pad_ ? 0 : pad_top_, use_pad_ ? 0 : pad_left_, stride_, stride_,
dilation_, dilation_, CUDNN_CROSS_CORRELATION, cudnn_data_type_),
dilation_, dilation_, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT),
"cudnnSetConvolution2dDescriptor failed");
}
void SelectAlgorithm(cudnnTensorDescriptor_t dx_desc_real) {
......@@ -258,6 +262,9 @@ class ConvGradInputGpuBkwKernel : public GpuKernel {
"cudnnGetConvolutionBackwardDataAlgorithm_v7 failed");
algo_ = perf_results.algo;
}
if (cudnn_data_type_ == CUDNN_DATA_HALF) {
algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
}
}
void GetInputShape(const CNodePtr &kernel_node, std::vector<int> *input_shape) {
auto shp_tuple_x = AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("input_sizes")->cast<ValueTuplePtr>()->value();
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册