From acca0352dd37c64896fbe3ef8a41e08277b950d8 Mon Sep 17 00:00:00 2001 From: qipengh Date: Wed, 27 Apr 2022 11:47:54 +0800 Subject: [PATCH] [MLU]add dropout op (#42274) --- paddle/fluid/operators/dropout_op_mlu.cc | 165 +++++++++++ paddle/fluid/operators/mlu/mlu_baseop.cc | 94 +++++- paddle/fluid/operators/mlu/mlu_baseop.h | 19 +- .../unittests/mlu/test_dropout_op_mlu.py | 273 ++++++++++++++++++ 4 files changed, 535 insertions(+), 16 deletions(-) create mode 100644 paddle/fluid/operators/dropout_op_mlu.cc create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_dropout_op_mlu.py diff --git a/paddle/fluid/operators/dropout_op_mlu.cc b/paddle/fluid/operators/dropout_op_mlu.cc new file mode 100644 index 0000000000..b88974a51c --- /dev/null +++ b/paddle/fluid/operators/dropout_op_mlu.cc @@ -0,0 +1,165 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class DropoutMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + auto dropout_prob = ctx.Attr("dropout_prob"); + auto is_test = ctx.Attr("is_test"); + auto* seed_tensor = + ctx.HasInput("Seed") ? ctx.Input("Seed") : nullptr; + auto dropout_implementation = + ctx.Attr("dropout_implementation"); + + const bool is_upscale = (dropout_implementation == "upscale_in_train"); + + out->mutable_data(ctx.GetPlace()); + MLUCnnlTensorDesc x_desc(*x); + MLUCnnlTensorDesc out_desc(*out); + + if (!is_test) { + // exec dropout op for training only. + int seed_data = 0; + if (seed_tensor) { + if (platform::is_mlu_place(seed_tensor->place())) { + memory::Copy(platform::CPUPlace(), &seed_data, seed_tensor->place(), + seed_tensor->data(), sizeof(int)); + } else { + seed_data = *(seed_tensor->data()); + } + } else { + seed_data = ctx.Attr("fix_seed") ? ctx.Attr("seed") : 0; + } + + auto* mask = ctx.Output("Mask"); + mask->mutable_data(ctx.GetPlace()); + MLUCnnlTensorDesc mask_desc(*mask); + // Special case when dropout_prob is 1.0 + if (dropout_prob == 1.0f) { + auto value_t = static_cast(0.0f); + MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &value_t, out_desc.get(), + GetBasePtr(out)); + MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &value_t, mask_desc.get(), + GetBasePtr(mask)); + return; + } + + // create mlu random generator + const int device_id = ctx.GetPlace().GetDeviceId(); + auto mlu_gen_random = GetMLURandomGenerator(ctx, device_id, seed_data); + + const float prob = is_upscale ? dropout_prob : 0.0f; + MLUCnnl::FusedDropout( + ctx, mlu_gen_random->get(), x_desc.get(), GetBasePtr(x), prob, + GetBasePtr(&(mlu_gen_random->get_state())), mask_desc.get(), + GetBasePtr(mask), out_desc.get(), GetBasePtr(out)); + } else { + // exec dropout op for inference only. 
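+      // In upscale_in_train mode inference returns the input unchanged;
+      // otherwise (downgrade_in_infer) the input is scaled by (1 - dropout_prob).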
+ if (is_upscale) { + framework::TensorCopy( + *x, ctx.GetPlace(), + ctx.template device_context(), out); + } else { + float scale = static_cast(1.0f - dropout_prob); + Tensor scale_tensor(x->dtype()); + scale_tensor.mutable_data({1}, ctx.GetPlace()); + MLUCnnlTensorDesc scale_desc(scale_tensor); + MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &scale, scale_desc.get(), + GetBasePtr(&scale_tensor)); + + auto data_type = ToCnnlDataType(); + MLUCnnlOpTensorDesc op_tensor_desc(CNNL_OP_TENSOR_MUL, data_type, + CNNL_NOT_PROPAGATE_NAN); + MLUCnnl::OpTensor(ctx, op_tensor_desc.get(), x_desc.get(), + GetBasePtr(x), scale_desc.get(), + GetBasePtr(&scale_tensor), out_desc.get(), + GetBasePtr(out), data_type); + } + } + } +}; + +template +class DropoutGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE_EQ(!ctx.Attr("is_test"), true, + platform::errors::InvalidArgument( + "GradOp is only callable when is_test is false")); + auto* grad_x = ctx.Output(framework::GradVarName("X")); + auto* grad_out = ctx.Input(framework::GradVarName("Out")); + auto* mask = ctx.Input("Mask"); + auto dropout_prob = ctx.Attr("dropout_prob"); + auto dropout_impl = ctx.Attr("dropout_implementation"); + + grad_x->mutable_data(ctx.GetPlace()); + MLUCnnlTensorDesc grad_x_desc(*grad_x); + + if (dropout_prob == 1.) { + auto value_t = static_cast(0.0f); + MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &value_t, grad_x_desc.get(), + GetBasePtr(grad_x)); + return; + } + + // cast mask from uint8 to float32/float16 + Tensor cast_mask(grad_x->dtype()); + cast_mask.Resize(mask->dims()); + cast_mask.mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc mask_desc(*mask); + MLUCnnlTensorDesc cast_mask_desc(cast_mask); + cnnlCastDataType_t cast_type = + GetCastDataType(framework::TransToProtoVarType(mask->dtype()), + framework::TransToProtoVarType(cast_mask.dtype())); + + MLUCnnl::Cast(ctx, cast_type, mask_desc.get(), GetBasePtr(mask), + cast_mask_desc.get(), GetBasePtr(&cast_mask)); + + const bool is_upscale = (dropout_impl == "upscale_in_train"); + const float scale = is_upscale ? 
(1.0f / (1.0f - dropout_prob)) : (1.0f); + + auto data_type = ToCnnlDataType(); + MLUCnnlTensorDesc grad_out_desc(*grad_out); + MLUCnnlOpTensorDesc op_tensor_desc(CNNL_OP_TENSOR_MUL, data_type, + CNNL_NOT_PROPAGATE_NAN); + MLUCnnl::OpTensor(ctx, op_tensor_desc.get(), cast_mask_desc.get(), + GetBasePtr(&cast_mask), grad_out_desc.get(), + GetBasePtr(grad_out), grad_x_desc.get(), + GetBasePtr(grad_x), data_type, scale); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(dropout, ops::DropoutMLUKernel, + ops::DropoutMLUKernel); + +REGISTER_OP_MLU_KERNEL(dropout_grad, ops::DropoutGradMLUKernel, + ops::DropoutGradMLUKernel); diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc index 793aa2644b..eacab46800 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.cc +++ b/paddle/fluid/operators/mlu/mlu_baseop.cc @@ -44,6 +44,32 @@ bool MLUSupportsCast(const VT::Type& src_type, const VT::Type& dst_type) { return false; } +const std::shared_ptr& GetMLURandomGenerator( + const ExecutionContext& ctx, const int64_t device_id, const int seed) { + static int64_t num_mlu_devices = -1; + static std::once_flag num_devices_init_flag; + static std::deque mlu_device_flags; + static std::vector> + mlu_rand_generators; + + std::call_once(num_devices_init_flag, []() { + num_mlu_devices = paddle::platform::GetMLUDeviceCount(); + mlu_device_flags.resize(num_mlu_devices); + mlu_rand_generators.resize(num_mlu_devices); + }); + if (device_id < 0) { + PADDLE_THROW(platform::errors::InvalidArgument( + "mlu device id shoule be greater than 0")); + } + + std::call_once(mlu_device_flags[device_id], [&]() { + mlu_rand_generators[device_id].reset( + new MLUCnnlRandomGeneratorDesc(ctx, seed)); + VLOG(4) << "device_id: " << device_id << ", initial seed: " << seed; + }); + return mlu_rand_generators[device_id]; +} + class MLUCnnlTensorDescPool { public: cnnlTensorDescriptor_t Pop() { @@ -266,23 +292,32 @@ MLUCnnlPoolingDesc::~MLUCnnlPoolingDesc() { } } -MLUCnnlRandomGeneratorDesc::MLUCnnlRandomGeneratorDesc(const bool is_mlu200, - const int seed) { - if (is_mlu200) { - PADDLE_ENFORCE_MLU_SUCCESS( - cnnlRandCreateGenerator(&mlu_generator, CNNL_RAND_RNG_FAST)); - } else { - PADDLE_ENFORCE_MLU_SUCCESS( - cnnlRandCreateGenerator(&mlu_generator, CNNL_RAND_RNG_MTGP32)); - PADDLE_ENFORCE_MLU_SUCCESS( - cnnlRandSetPseudoRandomGeneratorSeed(mlu_generator, seed)); - } +MLUCnnlRandomGeneratorDesc::MLUCnnlRandomGeneratorDesc( + const ExecutionContext& ctx, const int seed) { + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlRandCreateGenerator(&mlu_generator, CNNL_RAND_RNG_MTGP32)); + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlRandSetPseudoRandomGeneratorSeed(mlu_generator, seed)); + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlRandGetMTGP32StateSize(mlu_generator, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + mlu_state = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* mlu_state_ptr = mlu_state.mutable_data(ctx.GetPlace()); + + cnnlHandle_t handle = GetHandleFromCTX(ctx); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlRandMakeMTGP32KernelState( + handle, mlu_state_ptr, nullptr, nullptr, seed)); } const cnnlRandGenerator_t MLUCnnlRandomGeneratorDesc::get() const { return mlu_generator; } +Tensor& MLUCnnlRandomGeneratorDesc::get_state() { return mlu_state; } + MLUCnnlRandomGeneratorDesc::~MLUCnnlRandomGeneratorDesc() { if (mlu_generator) { 
PADDLE_ENFORCE_MLU_SUCCESS(cnnlRandDestroyGenerator(mlu_generator)); @@ -947,6 +982,26 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { workspace_ptr, workspace_size, beta_ptr, output_desc, output)); } +/* static */ void MLUCnnl::MulAx(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t alpha_desc, + const void* alpha, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlGetAxWorkspaceSize(handle, alpha_desc, output_desc, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlAx_v2(handle, alpha_desc, alpha, output_desc, + output, workspace_ptr, workspace_size)); +} + /* static */ void MLUCnnl::BiasAddGrad( const ExecutionContext& ctx, const int axis, const cnnlTensorDescriptor_t out_backprop_desc, const void* out_backprop, @@ -959,12 +1014,23 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { /* static */ void MLUCnnl::RandomUniform( const ExecutionContext& ctx, const int num, const cnnlDataType_t data_type, - const cnnlRandGenerator_t mlu_generator, const float min, const float max, - void* output) { + const cnnlRandGenerator_t mlu_generator, void* mlu_state, void* output) { cnnlHandle_t handle = GetHandleFromCTX(ctx); PADDLE_ENFORCE_MLU_SUCCESS(cnnlRandGenerateUniform( - handle, mlu_generator, data_type, nullptr, num, min, max, output)); + handle, mlu_generator, data_type, mlu_state, num, 0, 1, output)); +} + +/* static */ void MLUCnnl::FusedDropout( + const ExecutionContext& ctx, const cnnlRandGenerator_t generator, + const cnnlTensorDescriptor_t input_desc, const void* input, const float p, + void* state, const cnnlTensorDescriptor_t mask_desc, const void* mask, + const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlFusedDropout_v2(handle, generator, input_desc, + input, p, state, mask_desc, + mask, output_desc, output)); } /* static */ void MLUCnnl::TopK( diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index 9948c45e24..572b7aa2bb 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -273,14 +273,19 @@ class MLUCnnlPoolingDesc { class MLUCnnlRandomGeneratorDesc { public: - MLUCnnlRandomGeneratorDesc(const bool is_mlu200, const int seed); + MLUCnnlRandomGeneratorDesc(const ExecutionContext& ctx, const int seed); const cnnlRandGenerator_t get() const; + Tensor& get_state(); ~MLUCnnlRandomGeneratorDesc(); private: + Tensor mlu_state; cnnlRandGenerator_t mlu_generator = nullptr; }; +const std::shared_ptr& GetMLURandomGenerator( + const ExecutionContext& ctx, const int64_t device_id, const int seed); + class MLUCnnlReduceDesc { public: MLUCnnlReduceDesc(const MLUCnnlReduceDesc& desc) = delete; @@ -537,7 +542,13 @@ class MLUCnnl { static void RandomUniform(const ExecutionContext& ctx, const int num, const cnnlDataType_t data_type, const cnnlRandGenerator_t mlu_generator, - const float min, const float max, void* output); + void* mlu_state, void* output); + + static void FusedDropout( + const ExecutionContext& ctx, const cnnlRandGenerator_t generator, + const cnnlTensorDescriptor_t input_desc, const void* input, const float p, + void* state, const cnnlTensorDescriptor_t mask_desc, const void* mask, + const 
cnnlTensorDescriptor_t output_desc, void* output); static void Cumsum(const ExecutionContext& ctx, const int axis, const bool exclusive, const bool reverse, @@ -709,6 +720,10 @@ class MLUCnnl { const void* in0, const cnnlTensorDescriptor_t in1_desc, const void* in1, const cnnlTensorDescriptor_t output_desc, void* output); + static void MulAx(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t alpha_desc, const void* alpha, + const cnnlTensorDescriptor_t output_desc, void* output); + static void OpTensor(const ExecutionContext& ctx, const cnnlOpTensorDescriptor_t op_tensor_desc, const cnnlTensorDescriptor_t a_desc, const void* a, diff --git a/python/paddle/fluid/tests/unittests/mlu/test_dropout_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_dropout_op_mlu.py new file mode 100644 index 0000000000..f8984f5c6d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_dropout_op_mlu.py @@ -0,0 +1,273 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest, skip_check_grad_ci +import paddle +import paddle.fluid as fluid + +paddle.enable_static() + +SEED = 2022 + + +class TestDropoutOp(OpTest): + def setUp(self): + self.op_type = "dropout" + self.set_mlu() + self.init_dtype() + self.inputs = {'X': np.random.random((32, 64)).astype(self.dtype)} + self.attrs = { + 'dropout_prob': 0.0, + 'fix_seed': True, + 'is_test': False, + 'dropout_implementation': 'upscale_in_train' + } + self.outputs = { + 'Out': self.inputs['X'], + 'Mask': np.ones((32, 64)).astype('uint8') + } + + def init_dtype(self): + self.dtype = np.float32 + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + +class TestDropoutOpInput1d(TestDropoutOp): + # change input shape + def setUp(self): + self.op_type = "dropout" + self.set_mlu() + self.init_dtype() + self.inputs = {'X': np.random.random((3, 62)).astype(self.dtype)} + self.attrs = { + 'dropout_prob': 0.0, + 'fix_seed': True, + 'is_test': False, + 'dropout_implementation': 'upscale_in_train' + } + self.outputs = { + 'Out': self.inputs['X'], + 'Mask': np.ones((3, 62)).astype('uint8') + } + + +class TestDropoutOpInput1d_1(TestDropoutOp): + # the input is 1-D + def setUp(self): + self.op_type = "dropout" + self.set_mlu() + self.init_dtype() + self.inputs = {'X': np.random.random((2000)).astype(self.dtype)} + self.attrs = { + 'dropout_prob': 0.0, + 'fix_seed': True, + 'is_test': False, + 'dropout_implementation': 'upscale_in_train' + } + self.outputs = { + 'Out': self.inputs['X'], + 'Mask': np.ones((2000)).astype('uint8') + } + + +class TestDropoutOp2(TestDropoutOp): + # the dropout_prob is 1.0 + def setUp(self): + self.op_type = "dropout" + 
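+        # dropout_prob = 1.0 drops every element, so Out and Mask are expected to be all zeros.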
self.set_mlu() + self.init_dtype() + self.inputs = {'X': np.random.random((32, 64)).astype(self.dtype)} + self.attrs = { + 'dropout_prob': 1.0, + 'fix_seed': True, + 'is_test': False, + 'dropout_implementation': 'upscale_in_train' + } + self.outputs = { + 'Out': np.zeros((32, 64)).astype('float32'), + 'Mask': np.zeros((32, 64)).astype('uint8') + } + + +class TestDropoutOp3(TestDropoutOp): + # the input dim is 3 + def setUp(self): + self.op_type = "dropout" + self.set_mlu() + self.init_dtype() + self.inputs = {'X': np.random.random((32, 64, 2)).astype(self.dtype)} + self.attrs = { + 'dropout_prob': 0.0, + 'fix_seed': True, + 'is_test': False, + 'dropout_implementation': 'upscale_in_train' + } + self.outputs = { + 'Out': self.inputs['X'], + 'Mask': np.ones((32, 64, 2)).astype('uint8') + } + + +@skip_check_grad_ci(reason="For inference, check_grad is not required.") +class TestDropoutOpInference(OpTest): + # is_test = True + def setUp(self): + self.op_type = "dropout" + self.set_mlu() + self.init_dtype() + self.inputs = {'X': np.random.random((32, 64)).astype(self.dtype)} + self.attrs = { + 'dropout_prob': 0.35, + 'fix_seed': True, + 'is_test': True, + 'dropout_implementation': 'upscale_in_train' + } + self.outputs = {'Out': self.inputs['X']} + + def init_dtype(self): + self.dtype = np.float32 + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + +@skip_check_grad_ci(reason="For inference, check_grad is not required.") +class TestDropoutOpInference2(TestDropoutOpInference): + def setUp(self): + self.op_type = "dropout" + self.set_mlu() + self.init_dtype() + self.inputs = {'X': np.random.random((32, 64, 3)).astype(self.dtype)} + self.attrs = { + 'dropout_prob': 0.75, + 'is_test': True, + 'dropout_implementation': 'upscale_in_train' + } + self.outputs = {'Out': self.inputs['X']} + + +class TestDropoutOpWithSeed(TestDropoutOp): + # the seed is a Tensor + def setUp(self): + self.op_type = "dropout" + self.set_mlu() + self.init_dtype() + self.inputs = { + "X": np.random.random((32, 64)).astype(self.dtype), + "Seed": np.asarray( + [125], dtype="int32") + } + self.attrs = { + 'dropout_prob': 0.0, + 'is_test': False, + 'dropout_implementation': 'upscale_in_train' + } + self.outputs = { + 'Out': self.inputs['X'], + 'Mask': np.ones((32, 64)).astype('uint8') + } + + +class TestDropoutOpFp16(TestDropoutOp): + # float16 + def init_dtype(self): + self.dtype = np.float16 + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + self.__class__.no_need_check_grad = True + + +class TestDropoutAPI(unittest.TestCase): + def setUp(self): + np.random.seed(123) + self.places = [fluid.CPUPlace(), paddle.device.MLUPlace(0)] + + def check_static_result(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + input = fluid.data(name="input", shape=[40, 40], dtype="float32") + res1 = paddle.nn.functional.dropout( + x=input, p=0., training=False, mode='upscale_in_train') + res2 = paddle.nn.functional.dropout( + x=input, p=0., axis=0, training=True, mode='upscale_in_train') + res3 = paddle.nn.functional.dropout( + x=input, p=0., axis=0, training=False, mode='upscale_in_train') + res4 = paddle.nn.functional.dropout( + x=input, + p=0., + axis=[0, 1], + training=True, + mode='upscale_in_train') + res5 = paddle.nn.functional.dropout( + x=input, + p=0., + axis=[0, 1], + training=False, + mode='upscale_in_train') + res6 = 
paddle.nn.functional.dropout( + x=input, p=1., training=True, mode='upscale_in_train') + res7 = paddle.fluid.layers.dropout( + x=input, + dropout_prob=0., + dropout_implementation='upscale_in_train') + res8 = paddle.nn.functional.dropout( + x=input, + p=0., + axis=(0, 1), + training=False, + mode='upscale_in_train') + + in_np = np.random.random([40, 40]).astype("float32") + res_np = in_np + res_np2 = np.zeros_like(in_np) + + exe = fluid.Executor(place) + res_list = [res1, res2, res3, res4, res5, res7, res8] + for res in res_list: + fetches = exe.run(fluid.default_main_program(), + feed={"input": in_np}, + fetch_list=[res]) + self.assertTrue(np.allclose(fetches[0], res_np)) + fetches2 = exe.run(fluid.default_main_program(), + feed={"input": in_np}, + fetch_list=[res6]) + self.assertTrue(np.allclose(fetches2[0], res_np2)) + + def test_static(self): + for place in self.places: + self.check_static_result(place=place) + + +if __name__ == '__main__': + unittest.main() -- GitLab
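A minimal usage sketch (not part of the patch): it mirrors the static-graph pattern used in
test_dropout_op_mlu.py above and assumes a PaddlePaddle build with MLU support and an
available MLU device 0; the tensor name "x" and the [4, 8] shape are illustrative only.

    import numpy as np
    import paddle
    import paddle.fluid as fluid

    paddle.enable_static()

    main_prog, startup_prog = fluid.Program(), fluid.Program()
    with fluid.program_guard(main_prog, startup_prog):
        # Static-graph input; the dropout call lowers to the dropout op added by this patch.
        x = fluid.data(name="x", shape=[4, 8], dtype="float32")
        y = paddle.nn.functional.dropout(
            x, p=0.5, training=True, mode='upscale_in_train')

    place = paddle.device.MLUPlace(0)  # select MLU device 0
    exe = fluid.Executor(place)
    exe.run(startup_prog)
    out = exe.run(main_prog,
                  feed={"x": np.random.random([4, 8]).astype("float32")},
                  fetch_list=[y])[0]
    print(out.shape)  # (4, 8); surviving elements are scaled by 1 / (1 - p)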