!622 GPU update conv kernels to accelerate computing fp16 data

Merge pull request !622 from VectorSL/update-conv-for-amp

!622 GPU update conv kernels to accelerate computing fp16 data
Merge pull request !622 from VectorSL/update-conv-for-amp
92e0dc0a · mindspore-ci-bot · Gitee · 78bbacda · ee7a6401 · 92e0dc0a
4 changed file
--- a/mindspore/ccsrc/kernel/gpu/math/binary_op_gpu_kernel.h
+++ b/mindspore/ccsrc/kernel/gpu/math/binary_op_gpu_kernel.h
@@ -218,7 +218,7 @@ class BinaryOpGpuKernel : public GpuKernel {
      }
    }
    CHECK_CUDNN_RET_WITH_EXCEPT(
-      cudnnSetOpTensorDescriptor(opTensor_descriptor_, tensor_op_, cudnn_data_type_, CUDNN_NOT_PROPAGATE_NAN),
+      cudnnSetOpTensorDescriptor(opTensor_descriptor_, tensor_op_, CUDNN_DATA_FLOAT, CUDNN_NOT_PROPAGATE_NAN),
      "cudnnSetOpTensorDescriptor failed");
    return;
  }

--- a/mindspore/ccsrc/kernel/gpu/nn/conv2d_gpu_kernel.h
+++ b/mindspore/ccsrc/kernel/gpu/nn/conv2d_gpu_kernel.h
@@ -142,10 +142,14 @@ class Conv2dGpuFwdKernel : public GpuKernel {
      }
      CHECK_CUDNN_RET_WITH_EXCEPT(
        cudnnSetConvolution2dDescriptor(conv_desc_, pad_height_, pad_width_, stride_, stride_, dilation_, dilation_,
-                                        CUDNN_CROSS_CORRELATION, cudnn_data_type_),
+                                        CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT),
        "cudnnSetConvolution2dDescriptor failed");
      input_descriptor_real = input_desc_;
    }
+    if (cudnn_data_type_ == CUDNN_DATA_HALF) {
+      CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH),
+                                  "cudnnSetConvolutionMathType failed.")
+    }
    SelectAlgorithm(input_descriptor_real);
    InitSizeLists();
    return true;
@@ -240,7 +244,7 @@ class Conv2dGpuFwdKernel : public GpuKernel {
                                "cudnnSetTensor4dDescriptor failed");
    CHECK_CUDNN_RET_WITH_EXCEPT(
      cudnnSetConvolution2dDescriptor(conv_desc_, use_pad_ ? 0 : pad_top_, use_pad_ ? 0 : pad_left_, stride_, stride_,
-                                      dilation_, dilation_, CUDNN_CROSS_CORRELATION, cudnn_data_type_),
+                                      dilation_, dilation_, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT),
      "cudnnSetConvolution2dDescriptor failed");
  }

@@ -276,6 +280,9 @@ class Conv2dGpuFwdKernel : public GpuKernel {
        "cudnnGetConvolutionForwardAlgorithm_v7 failed");
      conv_algorithm_ = perf_results.algo;
    }
+    if (cudnn_data_type_ == CUDNN_DATA_HALF) {
+      conv_algorithm_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
+    }
  }
  cudnnHandle_t cudnn_handle_;
  cudnnTensorDescriptor_t input_desc_;

--- a/mindspore/ccsrc/kernel/gpu/nn/conv2d_grad_filter_gpu_kernel.h
+++ b/mindspore/ccsrc/kernel/gpu/nn/conv2d_grad_filter_gpu_kernel.h
@@ -141,10 +141,14 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel {
      }
      CHECK_CUDNN_RET_WITH_EXCEPT(
        cudnnSetConvolution2dDescriptor(conv_desc_, pad_height_, pad_width_, stride_, stride_, dilation_, dilation_,
-                                        CUDNN_CROSS_CORRELATION, cudnn_data_type_),
+                                        CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT),
        "GetConvolution2dDescriptor failed");
      x_desc_real = x_desc_;
    }
+    if (cudnn_data_type_ == CUDNN_DATA_HALF) {
+      CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH),
+                                  "cudnnSetConvolutionMathType failed.")
+    }
    SelectAlgorithm(x_desc_real);
    InitSizeLists();
    return true;
@@ -239,7 +243,7 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel {
                                "cudnnSetTensor4dDescriptor failed");
    CHECK_CUDNN_RET_WITH_EXCEPT(
      cudnnSetConvolution2dDescriptor(conv_desc_, use_pad_ ? 0 : pad_top_, use_pad_ ? 0 : pad_left_, stride_, stride_,
-                                      dilation_, dilation_, CUDNN_CROSS_CORRELATION, cudnn_data_type_),
+                                      dilation_, dilation_, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT),
      "cudnnSetConvolution2dDescriptor failed");
  }
  void SelectAlgorithm(cudnnTensorDescriptor_t x_desc_real) {
@@ -258,6 +262,9 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel {
        "GetConvolutionBackwardFilterAlgorithm failed");
      algo_ = perf_results.algo;
    }
+    if (cudnn_data_type_ == CUDNN_DATA_HALF) {
+      algo_ = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
+    }
  }
  void GetFilterShape(const CNodePtr &kernel_node, std::vector<int> *filter_shape) {
    auto shp_tuple_x = AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("filter_sizes")->cast<ValueTuplePtr>()->value();

--- a/mindspore/ccsrc/kernel/gpu/nn/conv2d_grad_input_gpu_kernel.h
+++ b/mindspore/ccsrc/kernel/gpu/nn/conv2d_grad_input_gpu_kernel.h
@@ -142,10 +142,14 @@ class ConvGradInputGpuBkwKernel : public GpuKernel {
      }
      CHECK_CUDNN_RET_WITH_EXCEPT(
        cudnnSetConvolution2dDescriptor(conv_desc_, pad_height_, pad_width_, stride_, stride_, dilation_, dilation_,
-                                        CUDNN_CROSS_CORRELATION, cudnn_data_type_),
+                                        CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT),
        "cudnnSetConvolution2dDescriptor failed");
      dx_desc_real = dx_desc_;
    }
+    if (cudnn_data_type_ == CUDNN_DATA_HALF) {
+      CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH),
+                                  "cudnnSetConvolutionMathType failed.")
+    }
    SelectAlgorithm(dx_desc_real);
    InitSizeLists();
    return true;
@@ -239,7 +243,7 @@ class ConvGradInputGpuBkwKernel : public GpuKernel {
                                "cudnnSetTensor4dDescriptor failed");
    CHECK_CUDNN_RET_WITH_EXCEPT(
      cudnnSetConvolution2dDescriptor(conv_desc_, use_pad_ ? 0 : pad_top_, use_pad_ ? 0 : pad_left_, stride_, stride_,
-                                      dilation_, dilation_, CUDNN_CROSS_CORRELATION, cudnn_data_type_),
+                                      dilation_, dilation_, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT),
      "cudnnSetConvolution2dDescriptor failed");
  }
  void SelectAlgorithm(cudnnTensorDescriptor_t dx_desc_real) {
@@ -258,6 +262,9 @@ class ConvGradInputGpuBkwKernel : public GpuKernel {
        "cudnnGetConvolutionBackwardDataAlgorithm_v7 failed");
      algo_ = perf_results.algo;
    }
+    if (cudnn_data_type_ == CUDNN_DATA_HALF) {
+      algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
+    }
  }
  void GetInputShape(const CNodePtr &kernel_node, std::vector<int> *input_shape) {
    auto shp_tuple_x = AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("input_sizes")->cast<ValueTuplePtr>()->value();