[bf16] add bf16 kernel: softmax & log_softmax (#39999)

* add softmax log_softmax * refine rocm * refine unittest

[bf16] add bf16 kernel: softmax & log_softmax (#39999)
* add softmax log_softmax * refine rocm * refine unittest
4a4215ff · zhangbo9674 · GitHub · c9cd47d9 · 4a4215ff · 4a4215ff
14 changed file
--- a/paddle/fluid/operators/log_softmax_op.cu
+++ b/paddle/fluid/operators/log_softmax_op.cu
@@ -13,9 +13,9 @@
 // limitations under the License.
 #include <limits>
-#include "paddle/fluid/operators/amp/fp16_type_traits.h"
 #include "paddle/fluid/operators/log_softmax_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
+#include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/kernels/funcs/elementwise_functor.h"
 #include "paddle/phi/kernels/funcs/functors.h"
@@ -311,7 +311,7 @@ void LaunchLogSoftmaxForwardCUDAKernelNotLastAxis(T *output_data,
 template <typename T>
 class LogSoftmaxKernel<platform::CUDADeviceContext, T>
    : public framework::OpKernel<T> {
-  using MPDType = typename details::MPTypeTrait<T>::Type;
+  using MPDType = typename phi::dtype::MPTypeTrait<T>::Type;
 public:
  void Compute(const framework::ExecutionContext &context) const override {
@@ -433,7 +433,7 @@ void LaunchSoftmaxBackwardForLastAxis(T *grad_input, const T *grad_output,
 template <typename T>
 class LogSoftmaxGradKernel<platform::CUDADeviceContext, T>
    : public framework::OpKernel<T> {
-  using MPDType = typename details::MPTypeTrait<T>::Type;
+  using MPDType = typename phi::dtype::MPTypeTrait<T>::Type;
 public:
  void Compute(const framework::ExecutionContext &context) const override {
@@ -468,16 +468,18 @@ class LogSoftmaxGradKernel<platform::CUDADeviceContext, T>
  }
 };
-}  // operators
+}  // namespace operators
-}  // paddle
+}  // namespace paddle
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
    log_softmax, ops::LogSoftmaxKernel<plat::CUDADeviceContext, float>,
    ops::LogSoftmaxKernel<plat::CUDADeviceContext, double>,
-    ops::LogSoftmaxKernel<plat::CUDADeviceContext, plat::float16>);
+    ops::LogSoftmaxKernel<plat::CUDADeviceContext, plat::float16>,
+    ops::LogSoftmaxKernel<plat::CUDADeviceContext, plat::bfloat16>);
 REGISTER_OP_CUDA_KERNEL(
    log_softmax_grad, ops::LogSoftmaxGradKernel<plat::CUDADeviceContext, float>,
    ops::LogSoftmaxGradKernel<plat::CUDADeviceContext, double>,
-    ops::LogSoftmaxGradKernel<plat::CUDADeviceContext, plat::float16>);
+    ops::LogSoftmaxGradKernel<plat::CUDADeviceContext, plat::float16>,
+    ops::LogSoftmaxGradKernel<plat::CUDADeviceContext, plat::bfloat16>);
--- a/paddle/fluid/operators/math/softmax.cu
+++ b/paddle/fluid/operators/math/softmax.cu
@@ -120,6 +120,10 @@ template class SoftmaxCUDNNFunctor<float>;
 template class SoftmaxCUDNNFunctor<platform::float16>;
 template class SoftmaxGradCUDNNFunctor<float>;
 template class SoftmaxGradCUDNNFunctor<platform::float16>;
+#if CUDNN_VERSION_MIN(8, 1, 0)
+template class SoftmaxCUDNNFunctor<platform::bfloat16>;
+template class SoftmaxGradCUDNNFunctor<platform::bfloat16>;
+#endif
 // MIOPEN do not support double
 #ifndef PADDLE_WITH_HIP
@@ -131,6 +135,10 @@ template class SoftmaxFunctor<platform::CUDADeviceContext, platform::float16,
                              false>;
 template class SoftmaxFunctor<platform::CUDADeviceContext, platform::float16,
                              true>;
+template class SoftmaxFunctor<platform::CUDADeviceContext, platform::bfloat16,
+                              false>;
+template class SoftmaxFunctor<platform::CUDADeviceContext, platform::bfloat16,
+                              true>;
 template class SoftmaxFunctor<platform::CUDADeviceContext, float, false>;
 template class SoftmaxFunctor<platform::CUDADeviceContext, double, false>;
 template class SoftmaxFunctor<platform::CUDADeviceContext, float, true>;
@@ -139,9 +147,13 @@ template class SoftmaxGradFunctor<platform::CUDADeviceContext, float>;
 template class SoftmaxGradFunctor<platform::CUDADeviceContext, double>;
 template class SoftmaxGradFunctor<platform::CUDADeviceContext,
                                  platform::float16>;
+template class SoftmaxGradFunctor<platform::CUDADeviceContext,
+                                  platform::bfloat16>;
 template class SoftmaxFunctor<phi::GPUContext, platform::float16, false>;
 template class SoftmaxFunctor<phi::GPUContext, platform::float16, true>;
+template class SoftmaxFunctor<phi::GPUContext, platform::bfloat16, false>;
+template class SoftmaxFunctor<phi::GPUContext, platform::bfloat16, true>;
 template class SoftmaxFunctor<phi::GPUContext, float, false>;
 template class SoftmaxFunctor<phi::GPUContext, double, false>;
 template class SoftmaxFunctor<phi::GPUContext, float, true>;
@@ -149,6 +161,7 @@ template class SoftmaxFunctor<phi::GPUContext, double, true>;
 template class SoftmaxGradFunctor<phi::GPUContext, float>;
 template class SoftmaxGradFunctor<phi::GPUContext, double>;
 template class SoftmaxGradFunctor<phi::GPUContext, platform::float16>;
+template class SoftmaxGradFunctor<phi::GPUContext, platform::bfloat16>;
 }  // namespace math
 }  // namespace operators

--- a/paddle/fluid/operators/math/softmax_impl.h
+++ b/paddle/fluid/operators/math/softmax_impl.h
@@ -156,6 +156,65 @@ class SoftmaxEigen<DeviceContext, platform::float16, is_test> {
  }
 };
+template <typename DeviceContext, bool is_test>
+class SoftmaxEigen<DeviceContext, platform::bfloat16, is_test> {
+ public:
+  void operator()(const DeviceContext& context, const int axis_dim,
+                  const framework::Tensor* X, framework::Tensor* Y) {
+    constexpr int kBatchDim = 0;
+    constexpr int kClassDim = 1;
+    constexpr int kAxisDim = 1;
+    auto logits = EigenMatrix<platform::bfloat16>::From(*X);
+    auto softmax = EigenMatrix<platform::bfloat16>::From(*Y);
+    const int batch_size = logits.dimension(kBatchDim);
+    const int num_classes = logits.dimension(kClassDim);
+    const int num_remain = num_classes / axis_dim;
+    Eigen::DSizes<int, 1> along_axis(kAxisDim);
+    Eigen::DSizes<int, 2> batch_classes(batch_size, num_classes);
+    Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
+    Eigen::DSizes<int, 2> one_by_class(1, num_classes);
+    Eigen::DSizes<int, 3> batch_one_remain(batch_size, 1, num_remain);
+    Eigen::DSizes<int, 3> one_axis_one(1, axis_dim, 1);
+    Eigen::DSizes<int, 2> one_axis(1, axis_dim);
+    Eigen::DSizes<int, 3> batch_axis_remain(batch_size, axis_dim, num_remain);
+    // For numerical stability, logits should be shifted by maximum number along
+    // axis, calculate shifted_logits into softmax tensor for memory reuse.
+    if (num_remain == 1) {
+      // axis == -1, axis and class in same dimension, calculate along
+      // class dimension directly for higher performance
+      softmax.device(*context.eigen_device()) =
+          (logits -
+           logits.maximum(along_axis)
+               .reshape(batch_by_one)
+               .broadcast(one_by_class))
+              .unaryExpr(ValueClip<platform::bfloat16>());
+    } else {
+      // axis != -1, class dimension split into (axis, remain), max and sum
+      // should be calculated along axis dimension
+      softmax.device(*context.eigen_device()) =
+          (logits.reshape(batch_axis_remain) -
+           logits.reshape(batch_axis_remain)
+               .maximum(along_axis)
+               .reshape(batch_one_remain)
+               .broadcast(one_axis_one)
+               .reshape(batch_classes))
+              .unaryExpr(ValueClip<platform::bfloat16>());
+    }
+    softmax.device(*context.eigen_device()) = softmax.exp();
+    softmax.device(*context.eigen_device()) =
+        (softmax *
+         softmax.reshape(batch_axis_remain)
+             .sum(along_axis)
+             .inverse()
+             .broadcast(one_axis));
+  }
+};
 template <typename DeviceContext, typename T, bool is_test, typename Enable>
 void SoftmaxFunctor<DeviceContext, T, is_test, Enable>::operator()(
    const DeviceContext& context, const int axis_dim,
@@ -289,6 +348,38 @@ class SoftmaxGradEigen<DeviceContext, platform::float16> {
  }
 };
+template <typename DeviceContext>
+class SoftmaxGradEigen<DeviceContext, platform::bfloat16> {
+ public:
+  void operator()(const DeviceContext& context, const int axis_dim,
+                  const framework::Tensor* y, const framework::Tensor* y_grad,
+                  framework::Tensor* x_grad) {
+    auto softmax = EigenMatrix<platform::bfloat16>::From(*y);
+    auto softmax_grad = EigenMatrix<platform::bfloat16>::From(*y_grad);
+    auto logits_grad = EigenMatrix<platform::bfloat16>::From(*x_grad);
+    constexpr int kBatchDim = 0;
+    constexpr int kClassDim = 1;
+    const int batch_size = softmax.dimension(kBatchDim);
+    const int num_classes = softmax.dimension(kClassDim);
+    const int num_remain = num_classes / axis_dim;
+    Eigen::DSizes<int, 1> along_class(kClassDim);
+    Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
+    Eigen::DSizes<int, 2> one_by_class(1, num_classes);
+    Eigen::DSizes<int, 3> batch_axis_remain(batch_size, axis_dim, num_remain);
+    Eigen::DSizes<int, 2> one_axis(1, axis_dim);
+    auto dot = (softmax * softmax_grad)
+                   .reshape(batch_axis_remain)
+                   .sum(along_class)
+                   .broadcast(one_axis);
+    logits_grad.device(*context.eigen_device()) =
+        (softmax_grad - dot) * softmax;
+  }
+};
 template <typename DeviceContext, typename T, typename Enable>
 void SoftmaxGradFunctor<DeviceContext, T, Enable>::operator()(
    const DeviceContext& context, const int axis_dim,

--- a/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h
+++ b/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h
@@ -140,6 +140,23 @@ class CudnnDataType<float16> {
  }
 };
+template <>
+class CudnnDataType<bfloat16> {
+ public:
+  static const miopenDataType_t type = miopenBFloat16;
+  // The scaling param type is float for HALF and FLOAT tensors
+  using ScalingParamType = const float;
+  using BatchNormParamType = float;
+  static ScalingParamType* kOne() {
+    static ScalingParamType v = 1.0;
+    return &v;
+  }
+  static ScalingParamType* kZero() {
+    static ScalingParamType v = 0.0;
+    return &v;
+  }
+};
 template <>
 class CudnnDataType<float> {
 public:

--- a/paddle/phi/common/amp_type_traits.h
+++ b/paddle/phi/common/amp_type_traits.h
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "paddle/phi/common/bfloat16.h"
+#include "paddle/phi/common/float16.h"
+namespace phi {
+namespace dtype {
+template <typename T>
+class MPTypeTrait {
+ public:
+  using Type = T;
+};
+template <>
+class MPTypeTrait<phi::dtype::float16> {
+ public:
+  using Type = float;
+};
+template <>
+class MPTypeTrait<phi::dtype::bfloat16> {
+ public:
+  using Type = float;
+};
+}  // namespace dtype
+}  // namespace phi
--- a/paddle/phi/common/bfloat16.h
+++ b/paddle/phi/common/bfloat16.h
@@ -377,31 +377,31 @@ struct numeric_limits<phi::dtype::bfloat16> {
  static const bool traps = true;
  static const bool tinyness_before = false;
-  static phi::dtype::bfloat16(min)() {
+  HOSTDEVICE static phi::dtype::bfloat16(min)() {
    return phi::dtype::raw_uint16_to_bfloat16(0x007f);
  }
-  static phi::dtype::bfloat16 lowest() {
+  HOSTDEVICE static phi::dtype::bfloat16 lowest() {
    return phi::dtype::raw_uint16_to_bfloat16(0xff7f);
  }
-  static phi::dtype::bfloat16(max)() {
+  HOSTDEVICE static phi::dtype::bfloat16(max)() {
    return phi::dtype::raw_uint16_to_bfloat16(0x7f7f);
  }
-  static phi::dtype::bfloat16 epsilon() {
+  HOSTDEVICE static phi::dtype::bfloat16 epsilon() {
    return phi::dtype::raw_uint16_to_bfloat16(0x3400);
  }
-  static phi::dtype::bfloat16 round_error() {
+  HOSTDEVICE static phi::dtype::bfloat16 round_error() {
    return phi::dtype::bfloat16(0.5);
  }
-  static phi::dtype::bfloat16 infinity() {
+  HOSTDEVICE static phi::dtype::bfloat16 infinity() {
    return phi::dtype::raw_uint16_to_bfloat16(0x7f80);
  }
-  static phi::dtype::bfloat16 quiet_NaN() {
+  HOSTDEVICE static phi::dtype::bfloat16 quiet_NaN() {
    return phi::dtype::raw_uint16_to_bfloat16(0xffc1);
  }
-  static phi::dtype::bfloat16 signaling_NaN() {
+  HOSTDEVICE static phi::dtype::bfloat16 signaling_NaN() {
    return phi::dtype::raw_uint16_to_bfloat16(0xff81);
  }
-  static phi::dtype::bfloat16 denorm_min() {
+  HOSTDEVICE static phi::dtype::bfloat16 denorm_min() {
    return phi::dtype::raw_uint16_to_bfloat16(0x0001);
  }
 };

--- a/paddle/phi/common/float16.h
+++ b/paddle/phi/common/float16.h
@@ -988,18 +988,6 @@ inline std::ostream& operator<<(std::ostream& os, const float16& a) {
  return os;
 }
-template <typename T>
-class MPTypeTrait {
- public:
-  using Type = T;
-};
-template <>
-class MPTypeTrait<float16> {
- public:
-  using Type = float;
-};
 }  // namespace dtype
 }  // namespace phi

--- a/paddle/phi/kernels/gpu/softmax_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/softmax_grad_kernel.cu
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/phi/kernels/softmax_grad_kernel.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/softmax_grad_kernel_impl.h"
@@ -25,4 +26,5 @@ PD_REGISTER_KERNEL(softmax_grad,
                   phi::SoftmaxGradKernel,
                   float,
                   double,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
--- a/paddle/phi/kernels/gpu/softmax_kernel.cu
+++ b/paddle/phi/kernels/gpu/softmax_kernel.cu
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/phi/kernels/softmax_kernel.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/softmax_kernel_impl.h"
@@ -25,4 +26,5 @@ PD_REGISTER_KERNEL(softmax,
                   phi::SoftmaxRawKernel,
                   float,
                   double,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
--- a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h
+++ b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h
@@ -15,6 +15,8 @@ limitations under the License. */
 #pragma once
 #include "paddle/phi/backends/gpu/gpu_info.h"
+#include "paddle/phi/common/amp_type_traits.h"
+#include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/funcs/axis_utils.h"
@@ -47,6 +49,11 @@ class VecT4<phi::dtype::float16> {
 public:
  using Type = int2;
 };
+template <>
+class VecT4<phi::dtype::bfloat16> {
+ public:
+  using Type = int2;
+};
 // Vectorization trait 2 * sizeof(T)
 template <typename T>
@@ -66,6 +73,11 @@ class VecT2<phi::dtype::float16> {
 public:
  using Type = int;
 };
+template <>
+class VecT2<phi::dtype::bfloat16> {
+ public:
+  using Type = int;
+};
 static inline int log2_ceil(int value) {
  int log2_value = 0;

--- a/paddle/phi/kernels/gpudnn/softmax_grad_kernel_gpudnn.cu
+++ b/paddle/phi/kernels/gpudnn/softmax_grad_kernel_gpudnn.cu
@@ -38,7 +38,18 @@ PD_REGISTER_KERNEL(softmax_grad,
                   ALL_LAYOUT,
                   phi::SoftmaxGradGPUDNNKernel,
                   float,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
+#else
+#if CUDNN_VERSION_MIN(8, 1, 0)
+PD_REGISTER_KERNEL(softmax_grad,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::SoftmaxGradGPUDNNKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 #else
 PD_REGISTER_KERNEL(softmax_grad,
                   GPUDNN,
@@ -48,3 +59,4 @@ PD_REGISTER_KERNEL(softmax_grad,
                   double,
                   phi::dtype::float16) {}
 #endif
+#endif
--- a/paddle/phi/kernels/gpudnn/softmax_kernel_gpudnn.cu
+++ b/paddle/phi/kernels/gpudnn/softmax_kernel_gpudnn.cu
@@ -37,7 +37,18 @@ PD_REGISTER_KERNEL(softmax,
                   ALL_LAYOUT,
                   phi::SoftmaxRawGPUDNNKernel,
                   float,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
+#else
+#if CUDNN_VERSION_MIN(8, 1, 0)
+PD_REGISTER_KERNEL(softmax,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::SoftmaxRawGPUDNNKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 #else
 PD_REGISTER_KERNEL(softmax,
                   GPUDNN,
@@ -47,3 +58,4 @@ PD_REGISTER_KERNEL(softmax,
                   double,
                   phi::dtype::float16) {}
 #endif
+#endif
--- a/python/paddle/fluid/tests/unittests/test_log_softmax.py
+++ b/python/paddle/fluid/tests/unittests/test_log_softmax.py
@@ -14,8 +14,9 @@
 import unittest
 import numpy as np
-from paddle.fluid.tests.unittests.op_test import OpTest
+from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16
 import paddle
+import paddle.fluid.core as core
 import paddle.nn.functional as F
 np.random.seed(10)
@@ -74,6 +75,33 @@ class TestLogSoftmaxAxis(TestLogSoftmaxOp):
        self.axis = 1
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestLogSoftmaxBF16Op(OpTest):
+    def setUp(self):
+        self.op_type = 'log_softmax'
+        self.dtype = np.uint16
+        self.shape = [2, 3, 4, 5]
+        self.axis = -1
+        x = np.random.uniform(0.1, 1., self.shape).astype(np.float32)
+        out = np.apply_along_axis(ref_log_softmax, self.axis, x)
+        self.x_grad = ref_log_softmax_grad(x, self.axis)
+        self.inputs = {'X': convert_float_to_uint16(x)}
+        self.outputs = {'Out': convert_float_to_uint16(out)}
+        self.attrs = {'axis': self.axis}
+    def test_check_output(self):
+        place = core.CUDAPlace(0)
+        self.check_output_with_place(place)
+    def test_check_grad(self):
+        place = core.CUDAPlace(0)
+        self.check_grad_with_place(
+            place, ['X'], ['Out'], user_defined_grads=[self.x_grad])
 class TestNNLogSoftmaxAPI(unittest.TestCase):
    def setUp(self):
        self.x_shape = [2, 3, 4, 5]

--- a/python/paddle/fluid/tests/unittests/test_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py
@@ -16,7 +16,7 @@ from __future__ import print_function
 import unittest
 import numpy as np
-from op_test import OpTest
+from op_test import OpTest, convert_float_to_uint16
 import paddle.fluid.core as core
 import paddle.fluid as fluid
 from paddle.fluid import compiler, Program, program_guard
@@ -296,6 +296,56 @@ class TestSoftmaxFP16CUDNNOp2(TestSoftmaxFP16CUDNNOp):
        return [2, 3, 4, 5]
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestSoftmaxBF16Op(OpTest):
+    def setUp(self):
+        self.op_type = "softmax"
+        self.use_cudnn = self.init_cudnn()
+        self.use_mkldnn = False
+        self.dtype = np.uint16
+        self.shape = [10, 10]
+        self.axis = -1
+        np.random.seed(0)
+        x = np.random.uniform(0.1, 1, self.shape).astype(np.float32)
+        out = np.apply_along_axis(stable_softmax, self.axis, x)
+        self.inputs = {
+            'X': OpTest.np_dtype_to_fluid_dtype(convert_float_to_uint16(x))
+        }
+        self.outputs = {'Out': convert_float_to_uint16(out)}
+        self.attrs = {
+            'axis': self.axis,
+            'use_cudnn': self.use_cudnn,
+            'use_mkldnn': self.use_mkldnn
+        }
+    def init_cudnn(self):
+        return False
+    def test_check_output(self):
+        place = core.CUDAPlace(0)
+        self.check_output_with_place(
+            place, check_dygraph=(self.use_mkldnn == False))
+    def test_check_grad(self):
+        place = core.CUDAPlace(0)
+        self.check_grad_with_place(
+            place, ["X"],
+            "Out",
+            numeric_grad_delta=0.05,
+            check_dygraph=(self.use_mkldnn == False))
+@unittest.skipIf(
+    not core.is_compiled_with_cuda() or core.cudnn_version() < 8100,
+    "core is not compiled with CUDA and cudnn version need larger than 8.1.0")
+class TestSoftmaxBF16CUDNNOp(TestSoftmaxBF16Op):
+    def init_cudnn(self):
+        return True
 class TestSoftmaxAPI(unittest.TestCase):
    def setUp(self):
        self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(