Commit 8776f751 (unverified) — [CUDA] Add cudnn_softmax. (#4040)
Authored by Wilber on Aug 04, 2020; committed via GitHub on Aug 04, 2020. Parent: 737fedeb.
Showing 12 changed files with 683 additions and 186 deletions (+683 −186).
lite/backends/cuda/math/CMakeLists.txt       +5   −1
lite/backends/cuda/math/cudnn_conv.cc        +10  −19
lite/backends/cuda/math/cudnn_helper.cc      +38  −0
lite/backends/cuda/math/cudnn_helper.h       +12  −1
lite/backends/cuda/math/cudnn_softmax.cc     +105 −0
lite/backends/cuda/math/cudnn_softmax.h      +64  −0
lite/kernels/cuda/CMakeLists.txt             +1   −1
lite/kernels/cuda/softmax_compute.cu         +270 −71
lite/kernels/cuda/softmax_compute.h          +4   −1
lite/kernels/cuda/softmax_compute_test.cc    +170 −92
lite/operators/op_params.h                   +1   −0
lite/operators/softmax_op.cc                 +3   −0
lite/backends/cuda/math/CMakeLists.txt
@@ -8,7 +8,9 @@ nv_library(cuda_activation SRCS activation.cu DEPS ${cuda_static_deps})
 nv_library(cuda_scale SRCS scale.cu DEPS ${cuda_static_deps})
 nv_library(cuda_type_trans SRCS type_trans.cu DEPS ${cuda_static_deps})
 nv_library(cuda_transpose SRCS transpose.cu DEPS ${cuda_static_deps})
-nv_library(cudnn_conv SRCS cudnn_conv.cc DEPS cuda_activation cuda_scale cuda_type_trans ${cuda_static_deps})
+nv_library(cudnn_helper SRCS cudnn_helper.cc DEPS ${cuda_static_deps})
+nv_library(cudnn_conv SRCS cudnn_conv.cc DEPS cuda_activation cuda_scale cuda_type_trans cudnn_helper ${cuda_static_deps})
+nv_library(cudnn_softmax SRCS cudnn_softmax.cc DEPS cudnn_helper ${cuda_static_deps} tensor)
 nv_library(cuda_elementwise SRCS elementwise.cu DEPS ${cuda_static_deps})
 nv_library(cudnn_pool SRCS cudnn_pool.cc DEPS ${cuda_static_deps})
 nv_library(cuda_gru_forward SRCS gru_forward.cu DEPS cuda_activation ${cuda_static_deps})
@@ -22,6 +24,7 @@ nv_library(cuda_bias SRCS bias.cu DEPS ${cuda_static_deps})
 set(math_cuda
     cudnn_conv
+    cudnn_softmax
     cuda_activation
     cuda_scale
     cuda_type_trans
@@ -35,6 +38,7 @@ set (
     cuda_strided_gemm
     cuda_sequence_padding
     cuda_bias
+    cudnn_helper
 )
 set(math_cuda "${math_cuda}" CACHE GLOBAL "math cuda")
lite/backends/cuda/math/cudnn_conv.cc
@@ -15,6 +15,7 @@
 #include "lite/backends/cuda/math/cudnn_conv.h"
 #include "lite/backends/cuda/math/activation.h"
 #include "lite/backends/cuda/math/conv_op_cache_cudnn.h"
+#include "lite/backends/cuda/math/cudnn_helper.h"
 #include "lite/backends/cuda/math/scale.h"
 #include "lite/backends/cuda/math/type_trans.h"
@@ -23,19 +24,6 @@ namespace lite {
 namespace cuda {
 namespace math {

-template <PrecisionType PType>
-cudnnDataType_t GetDataType();
-
-template <>
-cudnnDataType_t GetDataType<PRECISION(kFloat)>() {
-  return CUDNN_DATA_FLOAT;
-}
-
-template <>
-cudnnDataType_t GetDataType<PRECISION(kFP16)>() {
-  return CUDNN_DATA_HALF;
-}
-
 template <typename T, PrecisionType Ptype_out>
 bool CudnnConv2D<T, Ptype_out>::create(const operators::ConvParam& param,
                                        Context<TARGET(kCUDA)>* ctx) {
@@ -67,13 +55,13 @@ bool CudnnConv2D<T, Ptype_out>::create(const operators::ConvParam& param,
   CUDNN_CHECK(cudnnSetTensor4dDescriptor(this->input_desc_,
                                          CUDNN_TENSOR_NCHW,
-                                         GetDataType<Ptype_out>(),
+                                         GetCudnnDataType<Ptype_out>(),
                                          batch,
                                          ic,
                                          ih,
                                          iw));
   CUDNN_CHECK(cudnnSetFilter4dDescriptor(this->filter_desc_,
-                                         GetDataType<Ptype_out>(),
+                                         GetCudnnDataType<Ptype_out>(),
                                          CUDNN_TENSOR_NCHW,
                                          oc,
                                          ic / param.groups,
@@ -87,11 +75,11 @@ bool CudnnConv2D<T, Ptype_out>::create(const operators::ConvParam& param,
                                              dh,
                                              dw,
                                              CUDNN_CROSS_CORRELATION,
-                                             GetDataType<Ptype_out>()));
+                                             GetCudnnDataType<Ptype_out>()));
   CUDNN_CHECK(cudnnSetConvolutionGroupCount(this->conv_desc_, param.groups));
   CUDNN_CHECK(cudnnSetTensor4dDescriptor(this->output_desc_,
                                          CUDNN_TENSOR_NCHW,
-                                         GetDataType<Ptype_out>(),
+                                         GetCudnnDataType<Ptype_out>(),
                                          batch,
                                          oc,
                                          oh,
@@ -190,8 +178,11 @@ bool CudnnConv2D<T, Ptype_out>::create(const operators::ConvParam& param,
   if (param.bias) {
     int dim_bias[] = {1, oc, 1, 1};
     int stride_bias[] = {oc, 1, 1, 1};
-    cudnnSetTensorNdDescriptor(
-        this->bias_desc_, GetDataType<Ptype_out>(), 4, dim_bias, stride_bias);
+    cudnnSetTensorNdDescriptor(this->bias_desc_,
+                               GetCudnnDataType<Ptype_out>(),
+                               4,
+                               dim_bias,
+                               stride_bias);
   }
   return true;
 }
lite/backends/cuda/math/cudnn_helper.cc
new file mode 100644
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/cuda/math/cudnn_helper.h"
#include <string>
#include <vector>

namespace paddle {
namespace lite {
namespace cuda {
namespace math {

template <>
cudnnDataType_t GetCudnnDataType<PRECISION(kFloat)>() {
  return CUDNN_DATA_FLOAT;
}

template <>
cudnnDataType_t GetCudnnDataType<PRECISION(kFP16)>() {
  return CUDNN_DATA_HALF;
}

}  // namespace math
}  // namespace cuda
}  // namespace lite
}  // namespace paddle
lite/backends/cuda/math/cudnn_helper.h
@@ -13,12 +13,23 @@
 // limitations under the License.

 #pragma once

+#include <cudnn.h>
+#include <string>
+#include <vector>
+
+#include "lite/api/paddle_place.h"
+#include "lite/backends/cuda/cuda_utils.h"
+
 namespace paddle {
 namespace lite {
 namespace cuda {
-namespace math {}  // namespace math
+namespace math {
+
+template <lite_api::PrecisionType PType>
+cudnnDataType_t GetCudnnDataType();
+
+}  // namespace math
 }  // namespace cuda
 }  // namespace lite
 }  // namespace paddle
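
Note: the header declares only the primary template; the two specializations live in cudnn_helper.cc, so the set of supported precisions is fixed at link time — instantiating GetCudnnDataType for any other PrecisionType compiles but fails to link. A hedged sketch of what adding one more precision would look like (kInt8 is hypothetical here, not part of this commit):

// Hypothetical addition to cudnn_helper.cc, not part of this commit.
template <>
cudnnDataType_t GetCudnnDataType<PRECISION(kInt8)>() {
  return CUDNN_DATA_INT8;  // int8 tensor data type, available since cuDNN 6
}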
lite/backends/cuda/math/cudnn_softmax.cc
new file mode 100644
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/cuda/math/cudnn_softmax.h"
#include "lite/backends/cuda/math/cudnn_helper.h"

namespace paddle {
namespace lite {
namespace cuda {
namespace math {

template <typename T, PrecisionType Ptype>
bool CudnnSoftmax<T, Ptype>::Init(const operators::SoftmaxParam& param,
                                  Context<TARGET(kCUDA)>* ctx) {
  this->stream_ = ctx->exec_stream();
  CUDNN_CHECK(cudnnCreate(&this->handle_));
  CUDNN_CHECK(cudnnSetStream(this->handle_, this->stream_));
  cudnnCreateTensorDescriptor(&this->bottom_desc_);
  cudnnCreateTensorDescriptor(&this->top_desc_);
  return Create(param, ctx);
}

template <typename T, PrecisionType Ptype>
bool CudnnSoftmax<T, Ptype>::Create(const operators::SoftmaxParam& param,
                                    Context<TARGET(kCUDA)>* ctx) {
  int axis = param.axis;
  if (axis < 0) {
    axis += param.x->dims().size();
  }
  int outer_num = param.x->dims().count(0, axis);
  int inner_num = param.x->dims().count(axis + 1, param.x->dims().size());
  int N = outer_num;
  int C = param.x->dims()[axis];
  int H = inner_num;
  int W = 1;

  const int stride_w = 1;
  const int stride_h = W * stride_w;
  const int stride_c = H * stride_h;
  const int stride_n = C * stride_c;
  CUDNN_CHECK(cudnnSetTensor4dDescriptorEx(bottom_desc_,
                                           GetCudnnDataType<Ptype>(),
                                           N,
                                           C,
                                           H,
                                           W,
                                           stride_n,
                                           stride_c,
                                           stride_h,
                                           stride_w));
  CUDNN_CHECK(cudnnSetTensor4dDescriptorEx(top_desc_,
                                           GetCudnnDataType<Ptype>(),
                                           N,
                                           C,
                                           H,
                                           W,
                                           stride_n,
                                           stride_c,
                                           stride_h,
                                           stride_w));
  handle_setup_ = true;
  return true;
}

template <typename T, PrecisionType Ptype>
bool CudnnSoftmax<T, Ptype>::Run(const operators::SoftmaxParam& param) {
  T* output_data = param.output->mutable_data<T>(TARGET(kCUDA));
  const T* input_data = param.x->data<T>();
  float alpha = 1.0f;
  float beta = 0.f;
  CUDNN_CHECK(cudnnSoftmaxForward(handle_,
                                  CUDNN_SOFTMAX_ACCURATE,
                                  CUDNN_SOFTMAX_MODE_CHANNEL,
                                  &alpha,
                                  bottom_desc_,
                                  reinterpret_cast<const void*>(input_data),
                                  &beta,
                                  top_desc_,
                                  reinterpret_cast<void*>(output_data)));
  return true;
}

template class CudnnSoftmax<float, PRECISION(kFloat)>;
template class CudnnSoftmax<half, PRECISION(kFP16)>;

}  // namespace math
}  // namespace cuda
}  // namespace lite
}  // namespace paddle
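
Create() flattens an arbitrary-rank softmax into the 4-D NCHW descriptor cuDNN expects: everything before the axis becomes N, the axis itself becomes C, everything after it becomes H, and W is fixed at 1, so CUDNN_SOFTMAX_MODE_CHANNEL normalizes exactly over the requested axis. A quick self-contained check of that mapping, using a hypothetical shape not taken from the diff:

#include <cassert>

int main() {
  const int dims[] = {8, 16, 4, 4};  // stand-in for param.x->dims()
  const int axis = 1;
  const int outer_num = 8;           // product of dims before axis
  const int inner_num = 4 * 4;       // product of dims after axis
  const int N = outer_num, C = dims[axis], H = inner_num, W = 1;
  const int stride_w = 1;
  const int stride_h = W * stride_w;  // 1
  const int stride_c = H * stride_h;  // 16
  const int stride_n = C * stride_c;  // 256
  assert(N == 8 && C == 16 && H == 16 && W == 1);
  assert(stride_n == 256 && stride_c == 16 && stride_h == 1);
  return 0;
}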
lite/backends/cuda/math/cudnn_softmax.h
new file mode 100644
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cudnn.h>
#include <string>
#include <vector>
#include "lite/api/paddle_place.h"
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/core/context.h"
#include "lite/core/target_wrapper.h"
#include "lite/operators/op_params.h"

namespace paddle {
namespace lite {
namespace cuda {
namespace math {

template <typename T, PrecisionType Ptype>
class CudnnSoftmax {
 public:
  CudnnSoftmax()
      : handle_(nullptr),
        bottom_desc_(nullptr),
        top_desc_(nullptr),
        handle_setup_(false) {}

  virtual ~CudnnSoftmax() {
    if (!handle_setup_) return;
    cudnnDestroyTensorDescriptor(bottom_desc_);
    cudnnDestroyTensorDescriptor(top_desc_);
    cudnnDestroy(handle_);
  }

  bool Init(const operators::SoftmaxParam& param, Context<TARGET(kCUDA)>* ctx);
  bool Create(const operators::SoftmaxParam& param,
              Context<TARGET(kCUDA)>* ctx);
  bool Run(const operators::SoftmaxParam& param);

 private:
  cudaStream_t stream_;
  cudnnHandle_t handle_;
  cudnnTensorDescriptor_t bottom_desc_;
  cudnnTensorDescriptor_t top_desc_;
  bool handle_setup_;
};

}  // namespace math
}  // namespace cuda
}  // namespace lite
}  // namespace paddle
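
The Init/Create split mirrors the kernel lifecycle in softmax_compute.cu below: Init builds the cuDNN handle, binds the stream, and allocates the descriptors once, while Create only refills the two 4-D descriptors and is cheap enough to call before every Run in case input shapes changed. A minimal caller sketch, assuming a populated SoftmaxParam `param` and a CUDA context `ctx`:

// Sketch only; `param` and `ctx` are assumed to be set up by the caller.
lite::cuda::math::CudnnSoftmax<float, PRECISION(kFloat)> softmax;
softmax.Init(param, &ctx);    // once: cuDNN handle + stream + descriptors
softmax.Create(param, &ctx);  // per shape change: refill descriptors
softmax.Run(param);           // enqueues cudnnSoftmaxForward on ctx's stream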
lite/kernels/cuda/CMakeLists.txt
@@ -30,7 +30,7 @@ add_kernel(feed_compute_cuda CUDA basic SRCS feed_compute.cc DEPS ${lite_kernel_
 add_kernel(fetch_compute_cuda CUDA basic SRCS fetch_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(scale_compute_cuda CUDA basic SRCS scale_compute.cc DEPS ${lite_kernel_deps} cuda_scale)
 add_kernel(dropout_compute_cuda CUDA basic SRCS dropout_compute.cc DEPS ${lite_kernel_deps} cuda_scale)
-add_kernel(softmax_compute_cuda CUDA basic SRCS softmax_compute.cu DEPS ${lite_kernel_deps})
+add_kernel(softmax_compute_cuda CUDA basic SRCS softmax_compute.cu DEPS ${lite_kernel_deps} ${math_cuda})
 add_kernel(pool_compute_cuda CUDA basic SRCS pool_compute.cu DEPS ${lite_kernel_deps} cudnn_pool)
 add_kernel(bilinear_interp_compute_cuda CUDA basic SRCS bilinear_interp_compute.cu DEPS ${lite_kernel_deps})
lite/kernels/cuda/softmax_compute.cu
@@ -12,6 +12,8 @@ limitations under the License. */
 #pragma once
 #include <limits>
 #include <vector>
+#include "lite/backends/cuda/cuda_utils.h"
+#include "lite/core/op_registry.h"
 #include "lite/kernels/cuda/softmax_compute.h"
@@ -21,8 +23,6 @@ namespace kernels {
 namespace cuda {

-using Tensor = lite::Tensor;
-
 const int CUDA_NUM_THREADS = 512;
 extern __shared__ char tile[];

 template <typename dtype>
 __global__ void sharemem_softmax_kernel(int total_size,
@@ -73,6 +73,69 @@ __global__ void sharemem_softmax_kernel(int total_size,
   }
 }

+template <>
+__global__ void sharemem_softmax_kernel(int total_size,
+                                        const half* in_data,
+                                        half* out_data,
+                                        int inner_num,
+                                        int outer_num,
+                                        int axis_size) {
+  half* data = reinterpret_cast<half*>(tile) + threadIdx.x;
+  //! compute thread index and real data index
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < total_size) {
+    int idx_inner = idx % inner_num;
+    int idx_outer = (idx / inner_num) * axis_size;
+    int blocksize = blockDim.x;
+    int real_index = idx_outer * inner_num + idx_inner;
+    int loop_idx = real_index;
+//! read all data to sharemem in softmax channel
+#pragma unroll
+    for (int i = 0; i < axis_size; ++i) {
+      data[i * blocksize] = in_data[loop_idx];
+      loop_idx += inner_num;
+    }
+    //! get maximum value in softmax channel
+    half max_data = data[0];
+#pragma unroll
+    for (int i = 1; i < axis_size; ++i) {
+      half dt = data[i * blocksize];
+#if __CUDA_ARCH__ >= 530
+      if (__hlt(max_data, dt)) {
+#else
+      if (__half2float(max_data) < __half2float(dt)) {
+#endif
+        max_data = dt;
+      }
+    }
+    //! subtract then summarize
+    half sum = 0;
+#pragma unroll
+    for (int i = 0; i < axis_size; ++i) {
+      half* dt = data + i * blocksize;
+#if __CUDA_ARCH__ >= 530
+      *dt = hexp(__hsub(*dt, max_data));
+      sum = __hadd(sum, *dt);
+#else
+      *dt = __float2half(expf(__half2float(*dt) - __half2float(max_data)));
+      sum = __float2half(__half2float(sum) + __half2float(*dt));
+#endif
+    }
+    //! write back result
+    loop_idx = real_index;
+#pragma unroll
+    for (int i = 0; i < axis_size; ++i) {
+#if __CUDA_ARCH__ >= 530
+      out_data[loop_idx] = __hdiv(data[i * blocksize], sum);
+#else
+      out_data[loop_idx] =
+          __float2half(__half2float(data[i * blocksize]) / __half2float(sum));
+#endif
+      loop_idx += inner_num;
+    }
+  }
+}
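
Each fp16 specialization in this file repeats the same guard: native half intrinsics (__hlt, __hgt, __hsub, __hadd, __hdiv, hexp) require compute capability 5.3+, so every operation carries a float round-trip fallback for older devices. The pattern, distilled into a hypothetical helper that is not part of the diff:

// Hypothetical helper showing the sm_53 guard pattern used by the kernels above.
__device__ inline half half_max(half a, half b) {
#if __CUDA_ARCH__ >= 530
  return __hgt(a, b) ? a : b;  // native half comparison
#else
  // Round-trip through float on pre-sm_53 devices.
  return __float2half(fmaxf(__half2float(a), __half2float(b)));
#endif
}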
 //! general kernel for softmax
 template <typename dtype>
 __global__ void softmax_max_kernel(int total_size,
@@ -99,6 +162,38 @@ __global__ void softmax_max_kernel(int total_size,
   }
 }

+template <>
+__global__ void softmax_max_kernel(int total_size,
+                                   const half* in_data,
+                                   half* out_data,
+                                   half min_data,
+                                   int inner_num,
+                                   int outer_num,
+                                   int axis_size) {
+  //! compute data index
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < total_size) {
+    int idx_inner = idx % inner_num;
+    int idx_outer = (idx / inner_num) * axis_size;
+    int real_index = idx_outer * inner_num + idx_inner;
+    //! get maximum data across softmax axis
+    half max_data = min_data;
+    for (int i = 0; i < axis_size; ++i) {
+#if __CUDA_ARCH__ >= 530
+      max_data = __hgt(in_data[real_index], max_data) ? in_data[real_index]
+                                                      : max_data;
+#else
+      float a = __half2float(in_data[real_index]);
+      float b = __half2float(max_data);
+      float res = a > b ? a : b;
+      max_data = __float2half(res);
+#endif
+      real_index += inner_num;
+    }
+    out_data[idx] = max_data;
+  }
+}
 template <typename dtype>
 __global__ void softmax_sub_exp_sum_kernel(int total_size,
                                            const dtype* in_data,
@@ -129,6 +224,44 @@ __global__ void softmax_sub_exp_sum_kernel(int total_size,
   }
 }

+template <>
+__global__ void softmax_sub_exp_sum_kernel(int total_size,
+                                           const half* in_data,
+                                           half* out_data,
+                                           const half* max_data,
+                                           half* sum_data,
+                                           int inner_num,
+                                           int outer_num,
+                                           int axis_size) {
+  //! compute data index
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < total_size) {
+    int idx_inner = idx % inner_num;
+    int idx_outer = (idx / inner_num) * axis_size;
+    half max_data_cur = max_data[idx];
+    half sum_data_cur = 0;
+    int real_index = idx_outer * inner_num + idx_inner;
+    //! compute exp and summarize across the softmax axis
+    for (int i = 0; i < axis_size; ++i) {
+#if __CUDA_ARCH__ >= 530
+      half sub_data = __hsub(in_data[real_index], max_data_cur);
+      sub_data = hexp(sub_data);
+      sum_data_cur = __hadd(sum_data_cur, sub_data);
+#else
+      half sub_data = __float2half(__half2float(in_data[real_index]) -
+                                   __half2float(max_data_cur));
+      sub_data = __float2half(expf(__half2float(sub_data)));
+      sum_data_cur = __float2half(__half2float(sum_data_cur) +
+                                  __half2float(sub_data));
+#endif
+      out_data[real_index] = sub_data;
+      real_index += inner_num;
+    }
+    sum_data[idx] = sum_data_cur;
+  }
+}
+
 template <typename dtype>
 __global__ void softmax_divid_output_kernel(int total_size,
                                             dtype* io_data,
@@ -151,75 +284,122 @@ __global__ void softmax_divid_output_kernel(int total_size,
   }
 }

-void SoftmaxCompute::PrepareForRun() {
+template <>
+__global__ void softmax_divid_output_kernel(int total_size,
+                                            half* io_data,
+                                            const half* sum_data,
+                                            int inner_num,
+                                            int outer_num,
+                                            int axis_size) {
+  //! compute data index
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < total_size) {
+    int idx_inner = idx % inner_num;
+    int idx_outer = (idx / inner_num) * axis_size;
+#if __CUDA_ARCH__ >= 530
+    half sum_data_cur = __hdiv(__float2half(1.f), sum_data[idx]);
+#else
+    half sum_data_cur = __float2half(1.f / __half2float(sum_data[idx]));
+#endif
+    int real_index = idx_outer * inner_num + idx_inner;
+    //! compute final result
+    for (int i = 0; i < axis_size; ++i) {
+#if __CUDA_ARCH__ >= 530
+      io_data[real_index] = __hmul(io_data[real_index], sum_data_cur);
+#else
+      io_data[real_index] = __float2half(__half2float(io_data[real_index]) *
+                                         __half2float(sum_data_cur));
+#endif
+      real_index += inner_num;
+    }
+  }
+}
+
+template <typename Dtype, PrecisionType Ptype>
+void SoftmaxCompute<Dtype, Ptype>::PrepareForRun() {
+  auto& param = this->template Param<param_t>();
+  auto& ctx = this->ctx_->template As<CUDAContext>();
   int device_id;
   cudaGetDevice(&device_id);
   cudaDeviceProp deviceProp;
   cudaGetDeviceProperties(&deviceProp, device_id);
   sharedmem_size_ = deviceProp.sharedMemPerBlock;
   max_dimsize_ = sharedmem_size_ / sizeof(float) / CUDA_NUM_THREADS;
+  if (param.use_cudnn) {
+    cudnn_softmax_.Init(param, &ctx);
+  }
 }

-void SoftmaxCompute::Run() {
-  auto& param = this->Param<param_t>();
+template <typename Dtype, PrecisionType Ptype>
+void SoftmaxCompute<Dtype, Ptype>::Run() {
+  auto& param = this->template Param<param_t>();
   auto& ctx = this->ctx_->template As<CUDAContext>();
   auto stream = ctx.exec_stream();

-  auto x_dims = param.x->dims();
-  auto x_rank = x_dims.size();
-  int axis = param.axis;
-  if (axis < 0) {
-    axis += x_rank;
-  }
-  int outer_num = x_dims.Slice(0, axis).production();
-  int inner_num = x_dims.Slice(axis + 1, x_rank).production();
-  int total_threads = inner_num * outer_num;
-  axis_size_ = x_dims[axis];
-
-  const int threads = CUDA_NUM_THREADS;
-  const int blocks = (total_threads + threads - 1) / threads;
-  auto input_data = param.x->data<float>();
-  auto output_data = param.output->mutable_data<float>(TARGET(kCUDA));
-  if (axis_size_ <= max_dimsize_) {
-    int use_sharemem_size = axis_size_ * threads * sizeof(float);
-    sharemem_softmax_kernel<<<blocks, threads, use_sharemem_size, stream>>>(
-        total_threads,
-        input_data,
-        output_data,
-        inner_num,
-        outer_num,
-        axis_size_);
+  if (param.use_cudnn) {
+    cudnn_softmax_.Create(param, &ctx);
+    cudnn_softmax_.Run(param);
   } else {
-    //! re_alloc device memory
-    tmax_data_.Resize({1, 1, 1, outer_num * inner_num});
-    tsum_data_.Resize({1, 1, 1, outer_num * inner_num});
-    auto max_data = tmax_data_.mutable_data<float>(TARGET(kCUDA));
-    auto sum_data = tsum_data_.mutable_data<float>(TARGET(kCUDA));
-    //! firstly, get maximum data
-    float min_data = std::numeric_limits<float>::lowest();
-    softmax_max_kernel<float><<<blocks, threads, 0, stream>>>(total_threads,
-                                                              input_data,
-                                                              max_data,
-                                                              min_data,
-                                                              inner_num,
-                                                              outer_num,
-                                                              axis_size_);
-    //! then, compute exp and sum data
-    softmax_sub_exp_sum_kernel<float><<<blocks, threads, 0, stream>>>(
-        total_threads,
-        input_data,
-        output_data,
-        max_data,
-        sum_data,
-        inner_num,
-        outer_num,
-        axis_size_);
-    //! last, compute divided output
-    softmax_divid_output_kernel<float><<<blocks, threads, 0, stream>>>(
-        total_threads,
-        output_data,
-        sum_data,
-        inner_num,
-        outer_num,
-        axis_size_);
+    auto x_dims = param.x->dims();
+    auto x_rank = x_dims.size();
+    int axis = param.axis;
+    if (axis < 0) {
+      axis += x_rank;
+    }
+    int outer_num = x_dims.Slice(0, axis).production();
+    int inner_num = x_dims.Slice(axis + 1, x_rank).production();
+    int total_threads = inner_num * outer_num;
+    axis_size_ = x_dims[axis];
+
+    const int threads = CUDA_NUM_THREADS;
+    const int blocks = (total_threads + threads - 1) / threads;
+    auto input_data = param.x->template data<Dtype>();
+    auto output_data = param.output->template mutable_data<Dtype>(TARGET(kCUDA));
+    if (axis_size_ <= max_dimsize_) {
+      int use_sharemem_size = axis_size_ * threads * sizeof(Dtype);
+      sharemem_softmax_kernel<Dtype><<<blocks, threads, use_sharemem_size, stream>>>(
+          total_threads,
+          input_data,
+          output_data,
+          inner_num,
+          outer_num,
+          axis_size_);
+    } else {
+      //! re_alloc device memory
+      tmax_data_.Resize({1, 1, 1, outer_num * inner_num});
+      tsum_data_.Resize({1, 1, 1, outer_num * inner_num});
+      auto max_data = tmax_data_.mutable_data<Dtype>(TARGET(kCUDA));
+      auto sum_data = tsum_data_.mutable_data<Dtype>(TARGET(kCUDA));
+      //! firstly, get maximum data
+      float min_data = std::numeric_limits<float>::lowest();
+      softmax_max_kernel<Dtype><<<blocks, threads, 0, stream>>>(total_threads,
+                                                                input_data,
+                                                                max_data,
+                                                                min_data,
+                                                                inner_num,
+                                                                outer_num,
+                                                                axis_size_);
+      //! then, compute exp and sum data
+      softmax_sub_exp_sum_kernel<Dtype><<<blocks, threads, 0, stream>>>(
+          total_threads,
+          input_data,
+          output_data,
+          max_data,
+          sum_data,
+          inner_num,
+          outer_num,
+          axis_size_);
+      //! last, compute divided output
+      softmax_divid_output_kernel<Dtype><<<blocks, threads, 0, stream>>>(
+          total_threads,
+          output_data,
+          sum_data,
+          inner_num,
+          outer_num,
+          axis_size_);
+    }
   }
   cudaError_t error = cudaGetLastError();
   if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error);
   CUDA_POST_KERNEL_CHECK;
 }

 }  // namespace cuda
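
The dispatch threshold in the non-cuDNN path comes from PrepareForRun: max_dimsize_ = sharedMemPerBlock / sizeof(float) / CUDA_NUM_THREADS, so with a typical 48 KB of shared memory per block (an assumption; the real value comes from cudaGetDeviceProperties) a 512-thread block handles softmax axes up to 24 elements in the shared-memory kernel, and longer axes take the three-kernel path. A small check of that arithmetic:

#include <cstdio>

int main() {
  const int shared_mem_per_block = 48 * 1024;  // assumed deviceProp.sharedMemPerBlock
  const int threads = 512;                     // CUDA_NUM_THREADS
  const int max_dimsize =
      static_cast<int>(shared_mem_per_block / sizeof(float)) / threads;
  const int total_threads = 3 * 112 * 112;     // hypothetical outer_num * inner_num
  const int blocks = (total_threads + threads - 1) / threads;
  printf("max_dimsize = %d, blocks = %d\n", max_dimsize, blocks);  // 24, 74
  return 0;
}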
@@ -227,12 +407,12 @@ void SoftmaxCompute::Run() {
 }  // namespace lite
 }  // namespace paddle

-REGISTER_LITE_KERNEL(softmax,
-                     kCUDA,
-                     kFloat,
-                     kNCHW,
-                     paddle::lite::kernels::cuda::SoftmaxCompute,
-                     def)
+using SoftmaxFp32 =
+    paddle::lite::kernels::cuda::SoftmaxCompute<float, PRECISION(kFloat)>;
+using SoftmaxFp16 =
+    paddle::lite::kernels::cuda::SoftmaxCompute<half, PRECISION(kFP16)>;
+
+REGISTER_LITE_KERNEL(softmax, kCUDA, kFloat, kNCHW, SoftmaxFp32, def)
     .BindInput("X",
                {LiteType::GetTensorTy(TARGET(kCUDA),
                                       PRECISION(kFloat),
@@ -246,12 +426,19 @@ REGISTER_LITE_KERNEL(softmax,
                                        PRECISION(kFloat),
                                        DATALAYOUT(kNCHW))})
     .Finalize();

-REGISTER_LITE_KERNEL(search_seq_softmax,
-                     kCUDA,
-                     kFloat,
-                     kNCHW,
-                     paddle::lite::kernels::cuda::SoftmaxCompute,
-                     def)
+REGISTER_LITE_KERNEL(softmax, kCUDA, kFP16, kNCHW, SoftmaxFp16, def)
+    .BindInput("X",
+               {LiteType::GetTensorTy(
+                   TARGET(kCUDA), PRECISION(kFP16), DATALAYOUT(kNCHW))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(
+                    TARGET(kCUDA), PRECISION(kFP16), DATALAYOUT(kNCHW))})
+    .BindOutput("Out_log",
+                {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(search_seq_softmax, kCUDA, kFloat, kNCHW, SoftmaxFp32, def)
     .BindInput("X",
                {LiteType::GetTensorTy(TARGET(kCUDA),
                                       PRECISION(kFloat),
@@ -262,3 +449,15 @@ REGISTER_LITE_KERNEL(search_seq_softmax,
                                       DATALAYOUT(kNCHW))})
     .BindOutput("Out_log", {LiteType::GetTensorTy(TARGET(kCUDA))})
     .Finalize();
+
+REGISTER_LITE_KERNEL(search_seq_softmax, kCUDA, kFP16, kNCHW, SoftmaxFp16, def)
+    .BindInput("X",
+               {LiteType::GetTensorTy(
+                   TARGET(kCUDA), PRECISION(kFP16), DATALAYOUT(kNCHW))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(
+                    TARGET(kCUDA), PRECISION(kFP16), DATALAYOUT(kNCHW))})
+    .BindOutput("Out_log",
+                {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))})
+    .Finalize();
lite/kernels/cuda/softmax_compute.h
@@ -13,6 +13,7 @@
 // limitations under the License.

 #pragma once
+#include "lite/backends/cuda/math/cudnn_softmax.h"
 #include "lite/core/kernel.h"
@@ -20,8 +21,9 @@ namespace lite {
 namespace kernels {
 namespace cuda {

+template <typename Dtype, PrecisionType Ptype>
 class SoftmaxCompute
-    : public KernelLite<TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNCHW)> {
+    : public KernelLite<TARGET(kCUDA), Ptype, DATALAYOUT(kNCHW)> {
  public:
   using param_t = operators::SoftmaxParam;
@@ -30,6 +32,7 @@ class SoftmaxCompute
   virtual ~SoftmaxCompute() = default;

  private:
+  lite::cuda::math::CudnnSoftmax<Dtype, Ptype> cudnn_softmax_;
   lite::Tensor tmax_data_;
   lite::Tensor tsum_data_;
   size_t sharedmem_size_;
lite/kernels/cuda/softmax_compute_test.cc
@@ -20,114 +20,192 @@
 #include <utility>
 #include <vector>
 #include "lite/api/test_helper.h"
+#include "lite/utils/float16.h"

 namespace paddle {
 namespace lite {
 namespace kernels {
 namespace cuda {

 using Tensor = lite::Tensor;
 using DDim = lite::DDim;

-template <typename dtype>
-static void softmax_compute_ref(const operators::SoftmaxParam& param) {
-  const dtype* x_data = param.x->mutable_data<const dtype>();
-  dtype* output_data = param.output->mutable_data<dtype>();
-  DDim x_dims = param.x->dims();
-  ASSERT_EQ(x_dims.data(), param.output->dims().data());
-  auto x_rank = x_dims.size();
-  int axis = param.axis;
-  if (axis < 0) {
-    axis += x_rank;
-  }
-  int axis_size = x_dims[axis];
-  int outer_num = x_dims.Slice(0, axis).production();
-  int inner_num = x_dims.Slice(axis + 1, x_rank).production();
-  int compute_size = outer_num * inner_num;
-  for (int i = 0; i < compute_size; i++) {
-    int idx_inner = i % inner_num;
-    int idx_outer = (i / inner_num) * axis_size;
-    int start = idx_outer * inner_num + idx_inner;
-    int offset;
-    offset = start;
-    dtype max_data = std::numeric_limits<dtype>::lowest();
-    for (int j = 0; j < axis_size; j++) {
-      max_data = x_data[offset] > max_data ? x_data[offset] : max_data;
-      offset += inner_num;
-    }
-    offset = start;
-    dtype sum_data = (dtype)0;
-    for (int j = 0; j < axis_size; j++) {
-      output_data[offset] = exp(x_data[offset] - max_data);
-      sum_data += output_data[offset];
-      offset += inner_num;
-    }
-    offset = start;
-    for (int j = 0; j < axis_size; j++) {
-      output_data[offset] /= sum_data;
-      offset += inner_num;
-    }
-  }
-}
+class SoftmaxTest : public ::testing::Test {
+ protected:
+  SoftmaxTest()
+      : n_(2),
+        c_(2),
+        h_(2),
+        w_(2),
+        axis_(1),
+        use_cudnn_(true),
+        shape_({n_, c_, h_, w_}) {
+    x_ref_.Resize(lite::DDim(shape_));
+    x_gpu_.Resize(lite::DDim(shape_));
+
+    auto x_ref_data = x_ref_.mutable_data<float>();
+    for (int64_t i = 0; i < x_ref_.numel(); i++) {
+      x_ref_data[i] = static_cast<float>(i % 10 * 0.2);
+    }
+
+    out_ref_.Resize(lite::DDim(shape_));
+    out_cpu_.Resize(out_ref_.dims());
+    out_gpu_.Resize(out_ref_.dims());
+    RunBaseLine();
+    InitParamAndContext();
+  }
+
+  void InitParamAndContext() {
+    ctx_.reset(new KernelContext);
+    cudaStreamCreate(&stream_);
+    auto& context = ctx_->As<CUDAContext>();
+    context.SetExecStream(stream_);
+    param_.x = &x_gpu_;
+    param_.axis = axis_;
+    param_.output = &out_gpu_;
+    param_.use_cudnn = use_cudnn_;
+  }
+
+  void InitFloatInput() {
+    x_gpu_.Assign<float, lite::DDim, TARGET(kCUDA)>(x_ref_.data<float>(),
+                                                    x_gpu_.dims());
+  }
+
+  void InitHalfInput() {
+    x_half_.Resize(x_ref_.dims());
+    auto x_half_data = x_half_.mutable_data<half>();
+    for (int64_t i = 0; i < x_half_.numel(); i++) {
+      x_half_data[i] = half(lite::float16(x_ref_.data<float>()[i]));
+    }
+    x_gpu_.Assign<half, lite::DDim, TARGET(kCUDA)>(x_half_data, x_gpu_.dims());
+  }
+
+  void RunBaseLine() {
+    const float* x_data = x_ref_.mutable_data<float>();
+    float* output_data = out_ref_.mutable_data<float>();
+    DDim x_dims = x_ref_.dims();
+    ASSERT_EQ(x_dims.data(), out_ref_.dims().data());
+    auto x_rank = x_dims.size();
+    int axis = axis_;
+    if (axis < 0) {
+      axis += x_rank;
+    }
+    int axis_size = x_dims[axis];
+    int outer_num = x_dims.Slice(0, axis).production();
+    int inner_num = x_dims.Slice(axis + 1, x_rank).production();
+    int compute_size = outer_num * inner_num;
+    for (int i = 0; i < compute_size; i++) {
+      int idx_inner = i % inner_num;
+      int idx_outer = (i / inner_num) * axis_size;
+      int start = idx_outer * inner_num + idx_inner;
+      int offset;
+      offset = start;
+      float max_data = std::numeric_limits<float>::lowest();
+      for (int j = 0; j < axis_size; j++) {
+        max_data = x_data[offset] > max_data ? x_data[offset] : max_data;
+        offset += inner_num;
+      }
+      offset = start;
+      float sum_data = 0.f;
+      for (int j = 0; j < axis_size; j++) {
+        output_data[offset] = exp(x_data[offset] - max_data);
+        sum_data += output_data[offset];
+        offset += inner_num;
+      }
+      offset = start;
+      for (int j = 0; j < axis_size; j++) {
+        output_data[offset] /= sum_data;
+        offset += inner_num;
+      }
+    }
+  }
+
+  int n_, c_, h_, w_, axis_;
+  bool use_cudnn_;
+  std::vector<int64_t> shape_;
+  lite::Tensor x_ref_, out_ref_;
+  lite::Tensor x_gpu_;
+  lite::Tensor x_half_;
+  lite::Tensor out_cpu_, out_gpu_;
+  operators::SoftmaxParam param_;
+  std::unique_ptr<KernelContext> ctx_;
+  cudaStream_t stream_;
+};

-TEST(softmax_cuda, compute) {
-  std::unique_ptr<KernelContext> ctx(new KernelContext);
-  auto& context = ctx->As<CUDAContext>();
-  cudaStream_t stream;
-  cudaStreamCreate(&stream);
-  context.SetExecStream(stream);
-  SoftmaxCompute softmax;
-  operators::SoftmaxParam param;
-  softmax.SetContext(std::move(ctx));
-  lite::Tensor x;
-  lite::Tensor x_cpu;
-  lite::Tensor output;
-  lite::Tensor output_cpu;
-  lite::Tensor output_ref;
-  for (auto n : {1, 3}) {
-    for (auto c : {1, 4}) {
-      for (auto h : {5, 1, 112}) {
-        for (auto w : {1, 6, 112}) {
-          for (auto axis : {-2, -1, 0, 1, 2}) {
-            x.Resize({n, c, h, w});
-            x_cpu.Resize({n, c, h, w});
-            output.Resize({n, c, h, w});
-            output_cpu.Resize({n, c, h, w});
-            output_ref.Resize({n, c, h, w});
-            auto* x_cpu_data = x_cpu.mutable_data<float>();
-            auto* output_data = output.mutable_data<float>(TARGET(kCUDA));
-            auto* output_cpu_data = output_ref.mutable_data<float>();
-            auto* output_ref_data = output_ref.mutable_data<float>();
-            for (int i = 0; i < x.dims().production(); i++) {
-              x_cpu_data[i] = i;
-            }
-            x.Assign<float, lite::DDim, TARGET(kCUDA)>(x_cpu_data,
-                                                       x_cpu.dims());
-            param.x = &x;
-            param.axis = axis;
-            param.output = &output;
-            softmax.SetParam(param);
-            softmax.Launch();
-            param.x = &x_cpu;
-            param.output = &output_ref;
-            softmax_compute_ref<float>(param);
-            cudaDeviceSynchronize();
-            CopySync<TARGET(kCUDA)>(output_cpu_data,
-                                    output_data,
-                                    sizeof(float) * output.numel(),
-                                    IoDirection::DtoH);
-            for (int i = 0; i < output.dims().production(); i++) {
-              EXPECT_NEAR(output_cpu_data[i], output_ref_data[i], 1e-5);
-            }
-          }
-        }
-      }
-    }
-  }
-}
+TEST_F(SoftmaxTest, TestFP32) {
+  InitFloatInput();
+  SoftmaxCompute<float, PRECISION(kFloat)> kernel;
+  kernel.SetParam(param_);
+  kernel.SetContext(std::move(ctx_));
+
+  for (int i = 0; i < FLAGS_warmup; ++i) {
+    kernel.Launch();
+    cudaDeviceSynchronize();
+  }
+
+  auto start = GetCurrentUS();
+  kernel.PrepareForRun();
+  for (int i = 0; i < FLAGS_repeats; ++i) {
+    kernel.Run();
+  }
+  cudaDeviceSynchronize();
+  auto duration = (GetCurrentUS() - start) / 1000.0;
+  LOG(INFO) << "fp32, warmup: " << FLAGS_warmup
+            << ", repeats: " << FLAGS_repeats << ", spend "
+            << duration / FLAGS_repeats << " ms in average.";
+
+  CopySync<TARGET(kCUDA)>(out_cpu_.mutable_data<float>(),
+                          out_gpu_.data<float>(),
+                          sizeof(float) * out_gpu_.numel(),
+                          IoDirection::DtoH);
+
+  for (int i = 0; i < out_gpu_.numel(); ++i) {
+    float res = out_cpu_.data<float>()[i];
+    float ref = out_ref_.data<float>()[i];
+    EXPECT_NEAR(fabs(res - ref) / ref, 0.f, 1e-5);
+  }
+}
+
+TEST_F(SoftmaxTest, TestFP16) {
+  InitHalfInput();
+  SoftmaxCompute<half, PRECISION(kFP16)> kernel;
+  kernel.SetParam(param_);
+  kernel.SetContext(std::move(ctx_));
+
+  for (int i = 0; i < FLAGS_warmup; ++i) {
+    kernel.Launch();
+    cudaDeviceSynchronize();
+  }
+
+  auto start = GetCurrentUS();
+  kernel.PrepareForRun();
+  for (int i = 0; i < FLAGS_repeats; ++i) {
+    kernel.Run();
+  }
+  cudaDeviceSynchronize();
+  auto duration = (GetCurrentUS() - start) / 1000.0;
+  LOG(INFO) << "fp16, warmup: " << FLAGS_warmup
+            << ", repeats: " << FLAGS_repeats << ", spend "
+            << duration / FLAGS_repeats << " ms in average.";
+
+  const half* out_gpu_data = out_gpu_.data<half>();
+  half* out_cpu_data = out_cpu_.mutable_data<half>();
+  CopySync<TARGET(kCUDA)>(out_cpu_data,
+                          out_gpu_data,
+                          sizeof(half) * out_gpu_.numel(),
+                          IoDirection::DtoH);
+
+  for (int i = 0; i < out_gpu_.numel(); ++i) {
+    float res = static_cast<float>(lite::float16(out_cpu_data[i]));
+    float ref = out_ref_.data<float>()[i];
+    EXPECT_NEAR(fabs(res - ref) / (ref + 1e-5), 0., 1e-2);
+  }
+}

 }  // namespace cuda
 }  // namespace kernels
 }  // namespace lite
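
Note: the timing loops read FLAGS_warmup and FLAGS_repeats, which lite/api/test_helper.h defines as gflags, so the iteration counts can be overridden on the command line, e.g. --warmup=10 --repeats=100 (the exact test-binary name follows from the build target and is not shown in this diff).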
lite/operators/op_params.h
@@ -268,6 +268,7 @@ struct SoftmaxParam : ParamBase {
   lite::Tensor* x{};
   lite::Tensor* output{};
   int axis{-1};
+  bool use_cudnn{true};
   ///////////////////////////////////////////////////////////////////////////////////
   // get a vector of input tensors
   const std::vector<const Tensor*>* input_tensor_ptrs() override {
lite/operators/softmax_op.cc
@@ -52,6 +52,9 @@ bool SoftmaxOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) {
   }
   CHECK(param_.x);
   CHECK(param_.output);
+  if (opdesc.HasAttr("use_cudnn")) {
+    param_.use_cudnn = opdesc.GetAttr<bool>("use_cudnn");
+  }
   return true;
 }
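
With the attribute optional and the param defaulting to true, the cuDNN path is used unless a model explicitly opts out. A hedged sketch of opting a softmax op out when building an op desc programmatically (assuming cpp::OpDesc::SetAttr, which this diff does not show):

// Sketch only; assumes the cpp::OpDesc attribute API.
cpp::OpDesc desc;
desc.SetType("softmax");
desc.SetAttr<bool>("use_cudnn", false);  // fall back to the hand-written kernels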