diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu
index 34ea6a91ce7743462d378cf471a5ec3a12ca51d1..5518ebed3f792a5acdfbb27976bc2c6dbd78069a 100644
--- a/paddle/fluid/operators/math/softmax.cu
+++ b/paddle/fluid/operators/math/softmax.cu
@@ -89,6 +89,7 @@ void SoftmaxGradCUDNNFunctor<T>::operator()(
       XGrad->mutable_data<T>(context.GetPlace())));
 }
 
+template class SoftmaxCUDNNFunctor<platform::float16>;
 template class SoftmaxCUDNNFunctor<float>;
 template class SoftmaxCUDNNFunctor<double>;
 template class SoftmaxGradCUDNNFunctor<float>;
diff --git a/paddle/fluid/operators/softmax_cudnn_op.cu.cc b/paddle/fluid/operators/softmax_cudnn_op.cu.cc
index 47cb336d87f8627d86ac33d6ac32c04d5d93f753..5596fa0648ccc151bc0d11de9c556599428a8d71 100644
--- a/paddle/fluid/operators/softmax_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/softmax_cudnn_op.cu.cc
@@ -56,7 +56,9 @@ class SoftmaxGradCUDNNKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_KERNEL(softmax, CUDNN, ::paddle::platform::CUDAPlace,
-                   ops::SoftmaxCUDNNKernel<float>);
-REGISTER_OP_KERNEL(softmax_grad, CUDNN, ::paddle::platform::CUDAPlace,
+namespace plat = paddle::platform;
+REGISTER_OP_KERNEL(softmax, CUDNN, plat::CUDAPlace,
+                   ops::SoftmaxCUDNNKernel<float>,
+                   ops::SoftmaxCUDNNKernel<plat::float16>);
+REGISTER_OP_KERNEL(softmax_grad, CUDNN, plat::CUDAPlace,  // float only
                    ops::SoftmaxGradCUDNNKernel<float>);
diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc
index 4c8326eeabcc56b32755fc49a6df2f1cb192af78..e2c0f915d96b7746191572fa27b725d90cb6e2e5 100644
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/softmax_op.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/cudnn_helper.h"
+#endif
 
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
@@ -41,29 +44,30 @@ class SoftmaxOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     // choose cudnn kernel if the runtime supported.
-    bool use_cudnn = ctx.Attr<bool>("use_cudnn");
-    bool runtime_cudnn_support = false;
+    framework::LibraryType library_{framework::LibraryType::kPlain};
 #ifdef PADDLE_WITH_CUDA
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-      auto& dev_ctx =
-          ctx.template device_context<platform::CUDADeviceContext>();
-      runtime_cudnn_support = dev_ctx.cudnn_handle() != nullptr ? true : false;
-    }
-#endif
-    framework::LibraryType library_ = framework::LibraryType::kPlain;
-    if (use_cudnn && runtime_cudnn_support) {
+    if (platform::CanCUDNNBeUsed(ctx)) {  // takes precedence over MKLDNN
       library_ = framework::LibraryType::kCUDNN;
     }
+#endif  // PADDLE_WITH_CUDA
 #ifdef PADDLE_WITH_MKLDNN
     if (library_ == framework::LibraryType::kPlain &&
         platform::CanMKLDNNBeUsed(ctx)) {
       library_ = framework::LibraryType::kMKLDNN;
     }
 #endif
+
+    auto input_data_type =  // feeds the FP16 check and the result
+        framework::ToDataType(ctx.Input<Tensor>("X")->type());
+    if (input_data_type == framework::proto::VarType::FP16) {
+      PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN,
+                        "float16 can only be used when CUDNN is used");
+    }
+
     std::string data_format = ctx.Attr<std::string>("data_format");
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
-        framework::StringToDataLayout(data_format), library_);
+    return framework::OpKernelType(input_data_type, ctx.GetPlace(),
+                                   framework::StringToDataLayout(data_format),
+                                   library_);
   }
 };
 class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -130,19 +134,12 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     // choose cudnn kernel if the runtime supported.
-    bool use_cudnn = ctx.Attr<bool>("use_cudnn");
-    bool runtime_cudnn_support = false;
+    framework::LibraryType library_{framework::LibraryType::kPlain};
 #ifdef PADDLE_WITH_CUDA
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-      auto& dev_ctx =
-          ctx.template device_context<platform::CUDADeviceContext>();
-      runtime_cudnn_support = dev_ctx.cudnn_handle() != nullptr ? true : false;
-    }
-#endif
-    framework::LibraryType library_ = framework::LibraryType::kPlain;
-    if (use_cudnn && runtime_cudnn_support) {
+    if (platform::CanCUDNNBeUsed(ctx)) {
       library_ = framework::LibraryType::kCUDNN;
     }
+#endif
     std::string data_format = ctx.Attr<std::string>("data_format");
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
diff --git a/python/paddle/fluid/tests/unittests/test_softmax_op.py b/python/paddle/fluid/tests/unittests/test_softmax_op.py
index d32c719a5f137a96a8af6c77f46ad14869679455..33d60c7e31ce0817ad26ea1c1c974339936052d3 100644
--- a/python/paddle/fluid/tests/unittests/test_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py
@@ -27,22 +27,22 @@ def stable_softmax(x):
 
 class TestSoftmaxOp(OpTest):
     def setUp(self):
-        self.use_mkldnn = False
         self.op_type = "softmax"
         self.use_cudnn = False
-        self.init_op_type()
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [10, 10]).astype("float32")
-        }
-        self.outputs = {
-            'Out': np.apply_along_axis(stable_softmax, 1, self.inputs['X'])
-        }
+        self.use_mkldnn = False
+        self.dtype = np.float32
+        self.init_kernel_type()  # hook: subclasses override flags/dtype
+
+        x = np.random.uniform(0.1, 1, [10, 10]).astype(self.dtype)
+        out = np.apply_along_axis(stable_softmax, 1, x)  # per-row reference
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
         self.attrs = {
             'use_cudnn': self.use_cudnn,
             'use_mkldnn': self.use_mkldnn
         }
 
-    def init_op_type(self):
+    def init_kernel_type(self):  # default: keep the values set in setUp
         pass
 
     def test_check_output(self):
@@ -53,6 +53,8 @@ class TestSoftmaxOp(OpTest):
             self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         if self.use_cudnn:
             place = core.CUDAPlace(0)
             self.check_grad_with_place(
@@ -62,12 +64,24 @@ class TestSoftmaxOp(OpTest):
 
 
 class TestSoftmaxCUDNNOp(TestSoftmaxOp):
-    def init_op_type(self):
+    def init_kernel_type(self):  # called by setUp before attrs are built
         self.use_cudnn = True
 
 
-class TestMKLDNN(TestSoftmaxOp):
-    def init_op_type(self):
+class TestSoftmaxFP16CUDNNOp(TestSoftmaxOp):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16  # replaces setUp's np.float32 default
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():  # otherwise nothing is checked
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):  # else nothing is checked
+                self.check_output_with_place(place, atol=1e-3)
+
+
+class TestSoftmaxMKLDNNOp(TestSoftmaxOp):
+    def init_kernel_type(self):
         self.use_mkldnn = True