diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 851bd81403a85e52fbbb3c4c8bf0da1df63c8848..cafd1406b256f87197172e0519131b39cb556c13 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -258,6 +258,12 @@ copy(inference_lib_dist copy(inference_lib_dist SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/any.h DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/) +copy(inference_lib_dist + SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/optional.h + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/) + copy(inference_lib_dist + SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/none.h + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/) copy(inference_lib_dist SRCS ${PADDLE_SOURCE_DIR}/paddle/extension.h DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/) diff --git a/paddle/fluid/eager/accumulation/accumulation_node.cc b/paddle/fluid/eager/accumulation/accumulation_node.cc index 9c4089af092e418d6845864671124917c6498cf1..10696dbacd35bb2b4895de5a1efa0a8a249fc508 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.cc +++ b/paddle/fluid/eager/accumulation/accumulation_node.cc @@ -39,8 +39,9 @@ static void CopyOrAddTensor(paddle::experimental::Tensor* tensor, } std::vector> GradNodeAccumulation:: -operator()(const std::vector>& grads, - bool create_graph) { +operator()( + std::vector>& grads, // NOLINT + bool create_graph) { VLOG(3) << "Running Eager Backward Node: GradNodeAccumulation"; PADDLE_ENFORCE(grads.size() == 1, paddle::platform::errors::Fatal( diff --git a/paddle/fluid/eager/accumulation/accumulation_node.h b/paddle/fluid/eager/accumulation/accumulation_node.h index a91a0b6e34c0d9440e3645d1a6982748c4315962..2e38d7e9e91e227681ec8dfbc252ca59465ccf09 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.h +++ b/paddle/fluid/eager/accumulation/accumulation_node.h @@ -35,7 +35,7 @@ class GradNodeAccumulation : public GradNodeBase { // Functor: perform backward computations virtual std::vector> operator()( - const std::vector>& grads, + std::vector>& grads, // NOLINT bool create_graph = false) override; void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc index 0bc998a03a80b7b8a1e486ad68f1575c130d2c1b..d9f5447a88e9bf2c8f95c55911066b76078ea25f 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc @@ -145,8 +145,9 @@ void GradNodeScale::SetTensorWrappers_X( void GradNodeScale::SetAttributes_scale(float scale) { scale_ = scale; } std::vector> GradNodeScale:: -operator()(const std::vector>& grads, - bool create_graph) { +operator()( + std::vector>& grads, // NOLINT + bool create_graph) { // 1. 
Check Output Size PADDLE_ENFORCE( ((grads.size() == 1) && (grads[0].size() == 1)), diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h index e263f73a6b8a4a1f9ce23d9b5ca383fd6828016b..0b942d2a06707817e58af6d42441c7fa7620f7a0 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h @@ -39,7 +39,7 @@ class GradNodeScale : public GradNodeBase { // Functor: perform backward computations virtual std::vector> operator()( - const std::vector>& grads, + std::vector>& grads, // NOLINT bool create_graph = false) override; void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index df2cdc35626a8aa27899f7340fa14285299a11d1..229817596423cd46c29db3f0dae589d0655b8485 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -47,6 +47,9 @@ std::unordered_map> static std::unordered_map operators_with_attrs = {}; +static std::unordered_set ops_to_fill_zero_for_empty_grads = { + "split"}; + /* --- Black Ops list that's NO NEED to apply code generation --- */ static std::unordered_set black_ops_list = {"run_program"}; @@ -2243,11 +2246,21 @@ static std::string GenerateGradNodeCCContents( // [Generation] Get Full Grad Function const char* GRAD_FUNCTION_TEMPLATE = "std::vector> " - "GradNode%s::operator()(const " - "std::vector>& grads, " - "bool create_graph) {\n%s\n}"; - std::string grad_function_str = paddle::string::Sprintf( - GRAD_FUNCTION_TEMPLATE, fwd_op_type, generated_grad_function_body); + "GradNode%s::operator()(" + "std::vector>& grads, bool " + "create_graph) {\n" + "%s" + "%s" + "\n}"; + std::string fill_zero_str = ""; + if (ops_to_fill_zero_for_empty_grads.count(fwd_op_type)) { + fill_zero_str = + "egr::EagerUtils::FillZeroForEmptyGradInputs(&grads, " + "this->InputMeta());\n"; + } + std::string grad_function_str = + paddle::string::Sprintf(GRAD_FUNCTION_TEMPLATE, fwd_op_type, + fill_zero_str, generated_grad_function_body); VLOG(6) << "Generated returns"; @@ -2279,9 +2292,9 @@ static std::string GenerateGradNodeHeaderContents( " ~GradNode%s() override { VLOG(6) << \" Destruct GradNode%s \"; }\n" "\n" " virtual std::vector> " - "operator()(const " - "std::vector>& grads, const " - "bool create_graph = false) " + "operator()(" + "std::vector>& grads, bool " + "create_graph = false) " "override;\n" "\n" " void ClearTensorWrappers() override { \n" diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index 1c33d1c2f4f0b52d21584e099c5bf3a0dabd5f6e..6736a281a821f597461744abd01e7ec37b3d9876 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -17,6 +17,8 @@ import re import argparse import os +ops_to_fill_zero_for_empty_grads = set(list("split")) + # For API dispatch used at python-level # { op_name : [arg_name, ...] 
} core_ops_returns_info = {} @@ -599,7 +601,8 @@ class {} : public egr::GradNodeBase {{ ~{}() override = default; virtual std::vector> operator()( - const std::vector>& grads, bool create_graph = false) override; + std::vector>& grads, bool create_graph = false) override; + std::string name() override {{ return \" {} \"; }} void ClearTensorWrappers() override {{ @@ -657,10 +660,11 @@ def GenerateNodeDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map, for _, (ttype, fwd_position, grad_api_position) in backward_grad_input_map.items(): if IsPlainTensorType(ttype): - grad_api_args[grad_api_position] = f"grads[{fwd_position}][0]" + grad_api_args[ + grad_api_position] = f"hooked_grads[{fwd_position}][0]" else: assert IsVectorTensorType(ttype) - grad_api_args[grad_api_position] = f"grads[{fwd_position}]" + grad_api_args[grad_api_position] = f"hooked_grads[{fwd_position}]" for name, _, _, grad_api_position in backward_attrs_list: saved_attribute_name = GetSavedName(name) @@ -688,23 +692,30 @@ def GenerateNodeDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map, grad_node_name = GetGradNodeName(fwd_api_name) + fill_zero_str = "" + if fwd_api_name in ops_to_fill_zero_for_empty_grads: + fill_zero_str = "egr::EagerUtils::FillZeroForEmptyGradInputs(&grads, this->InputMeta());\n" + if len(namespace) > 0: grad_api_namespace = f"paddle::experimental::{namespace}" else: grad_api_namespace = f"paddle::experimental" FUNCTION_TEMPLATE = """ -std::vector> {}::operator()(const std::vector>& grads, bool create_graph) {{ +std::vector> {}::operator()(std::vector>& grads, bool create_graph) {{ + {} + auto hooked_grads = ApplyGradientHooks(grads); + // Call grad_api function - VLOG(3) << \"Finally State Running: \" << \"{}\"; + VLOG(3) << \"Final State Running: \" << \"{}\"; auto grad_api_returns = {}::{}({}); {} }} """ node_definition_str = FUNCTION_TEMPLATE.format( - grad_node_name, grad_node_name, grad_api_namespace, bwd_api_name, - grad_api_args_str, returns_str) + grad_node_name, fill_zero_str, grad_node_name, grad_api_namespace, + bwd_api_name, grad_api_args_str, returns_str) return node_definition_str @@ -799,8 +810,15 @@ def GenerateNodeCreationCodes( # SetAttributes set_attributes_list = [] - for name, _, _, _ in backward_attrs_list: - set_attributes = f" grad_node->SetAttribute{name}({name});" + forward_attrs_name_set = set() + for name, _, _, _ in forward_attrs_list: + forward_attrs_name_set.add(name) + + for name, _, default_val_attr, _ in backward_attrs_list: + if name in forward_attrs_name_set: + set_attributes = f" grad_node->SetAttribute{name}({name});" + else: + set_attributes = f" grad_node->SetAttribute{name}({default_val_attr});" set_attributes_list.append(set_attributes) set_attributes_str = "\n".join(set_attributes_list) diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.cc b/paddle/fluid/eager/custom_operator/custom_operator_node.cc index 72af1cc4b068679e72ae6bdc5e09fab8f56bac04..08ca3bed5a653637b77ba512a812a0b863fc80c7 100644 --- a/paddle/fluid/eager/custom_operator/custom_operator_node.cc +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.cc @@ -20,8 +20,8 @@ namespace egr { std::vector> RunCustomOpNode:: -operator()(const std::vector>& grads, - bool create_graph) { +operator()(std::vector>& grads, + bool create_graph) { // NOLINT paddle::CustomOpKernelContext ctx; auto grad_inputs_name = paddle::framework::OpMetaInfoHelper::GetInputs( egr::Controller::Instance().GetOpMetaInfoMap().at(op_type_)[1]); diff --git 
a/paddle/fluid/eager/custom_operator/custom_operator_node.h b/paddle/fluid/eager/custom_operator/custom_operator_node.h index 6ece2658575c795856438904c2716d61f0985879..33b56fc8c863ac23dc29f2468198f2610beef164 100644 --- a/paddle/fluid/eager/custom_operator/custom_operator_node.h +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.h @@ -37,8 +37,9 @@ class RunCustomOpNode : public GradNodeBase { // Functor: perform backward computations virtual std::vector> operator()( - const std::vector>& grads, - bool create_graph) override; + std::vector>& grads, + bool create_graph = false) // NOLINT + override; std::string name() { return paddle::string::Sprintf("RunCustomOpNode: %s_grad", op_type_); diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index 1d44d842b0825aa96380c947c67082fbcb5e1642..25610a3f95fe5d969ffafa8379842b1ef2333b54 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -102,6 +102,7 @@ const std::vector>& GradNodeBase::OutputMeta() const { void GradNodeBase::SetGradInMeta(const paddle::experimental::Tensor& fwd_out, size_t slot_rank) { + VLOG(6) << "Set GradSlotMeta for Grad Inputs"; auto* fwd_out_meta = egr::EagerUtils::nullable_autograd_meta(fwd_out); PADDLE_ENFORCE_LE( slot_rank, (bwd_in_meta_.size() - 1), @@ -117,6 +118,12 @@ void GradNodeBase::SetGradInMeta(const paddle::experimental::Tensor& fwd_out, auto& meta = metas[0]; meta.SetStopGradient(fwd_out_meta->StopGradient()); + if (!fwd_out.is_initialized()) { + VLOG(6) + << "Skip Configuring GradSlotMeta for uninitialized GradInput Tensor"; + return; + } + // Record TensorMeta if (phi::DenseTensor::classof(fwd_out.impl().get())) { // Only Copy Meta @@ -128,7 +135,9 @@ void GradNodeBase::SetGradInMeta(const paddle::experimental::Tensor& fwd_out, paddle::platform::errors::Fatal( "Attempting to copy DenseTensorMeta with phi::DataType::UNDEFINED," "which is illegal.")); + meta.SetTensorMeta(dense_tensor->meta()); + meta.SetPlace(fwd_out.inner_place()); if (paddle::framework::IsComplexType( paddle::framework::TransToProtoVarType(dense_tensor->type()))) { @@ -143,6 +152,7 @@ void GradNodeBase::SetGradInMeta(const paddle::experimental::Tensor& fwd_out, void GradNodeBase::SetGradInMeta( const std::vector& fwd_out, size_t slot_rank) { + VLOG(6) << "Set GradSlotMeta for Grad Inputs"; size_t slot_size = fwd_out.size(); PADDLE_ENFORCE_LE( slot_rank, (bwd_in_meta_.size() - 1), @@ -172,6 +182,12 @@ void GradNodeBase::SetGradInMeta( meta.SetStopGradient(fwd_out_meta->StopGradient()); } + if (!fwd_out_tensor.is_initialized()) { + VLOG(6) + << "Skip Configuring GradSlotMeta for uninitialized GradInput Tensor"; + return; + } + // Record TensorMeta if (phi::DenseTensor::classof(fwd_out_tensor.impl().get())) { // Only Copy Meta @@ -184,6 +200,8 @@ void GradNodeBase::SetGradInMeta( "with phi::DataType::UNDEFINED," "which is illegal.")); meta.SetTensorMeta(dense_tensor->meta()); + meta.SetPlace(fwd_out_tensor.inner_place()); + if (paddle::framework::IsComplexType( paddle::framework::TransToProtoVarType(dense_tensor->type()))) { need_complex_to_real_ = true; @@ -228,6 +246,7 @@ void GradNodeBase::SetGradOutMeta(const paddle::experimental::Tensor& fwd_in, "with phi::DataType::UNDEFINED," "which is illegal.")); meta.SetTensorMeta(dense_tensor->meta()); + meta.SetPlace(fwd_in.inner_place()); } } else { VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta with " @@ -272,6 +291,7 @@ void GradNodeBase::SetGradOutMeta( "phi::DataType::UNDEFINED," 
"which is illegal.")); meta.SetTensorMeta(dense_tensor->meta()); + meta.SetPlace(fwd_in_tensor.inner_place()); } } else { VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta " diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index 28c12717a24b0c89b8a3b6544124ad6533d6c70d..4dec1c1f9f4e5c0088fc05a8e581bb637117b4a4 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -76,8 +76,12 @@ class GradSlotMeta { return *meta_.get(); } + void SetPlace(const phi::Place& place) { place_ = place; } + const phi::Place& GetPlace() const { return place_; } + private: bool stop_gradient_{false}; + phi::Place place_; std::shared_ptr meta_ = nullptr; }; @@ -102,7 +106,7 @@ class GradNodeBase { * is better choice to fit this format. * **/ virtual std::vector> operator()( - const std::vector>& grads, + std::vector>& grads, // NOLINT bool create_graph = false) = 0; virtual void ClearTensorWrappers() = 0; diff --git a/paddle/fluid/eager/grad_tensor_holder.h b/paddle/fluid/eager/grad_tensor_holder.h index 8c00f9161b629f7a3f093a1225d3d5b0b9bcca8b..db03789ea7632b0baf6e72e6da7624c04284d316 100644 --- a/paddle/fluid/eager/grad_tensor_holder.h +++ b/paddle/fluid/eager/grad_tensor_holder.h @@ -53,7 +53,7 @@ class GradTensorHolder { return buffer_[pos]; } - const std::vector>& Buffers() { + std::vector>& Buffers() { return buffer_; } diff --git a/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc b/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc index 28682ab0fe094df6d27eb27e9118e6576685c95a..6c6c7fd25e5e53c7bd96d649fcab8b694b697d7b 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc @@ -80,13 +80,15 @@ TEST(AccumulationNode, Tensor) { grad_meta->SetStopGradient(false); // operator() - paddle::experimental::Tensor ret_et0 = node->operator()({{et0}})[0][0]; + std::vector> et0_vec = {{et0}}; + paddle::experimental::Tensor ret_et0 = node->operator()(et0_vec)[0][0]; auto* ret_et0_ptr = std::dynamic_pointer_cast(ret_et0.impl()) ->data(); CHECK_EQ(ret_et0_ptr[0], paddle::platform::float16(10.0f)); - paddle::experimental::Tensor ret_et1 = node->operator()({{et1}})[0][0]; + std::vector> et1_vec = {{et1}}; + paddle::experimental::Tensor ret_et1 = node->operator()(et1_vec)[0][0]; auto* ret_et1_ptr = std::dynamic_pointer_cast(ret_et1.impl()) @@ -121,7 +123,7 @@ TEST(AccumulationNode, Tensor) { std::make_shared(reduce_hook_1)); // operator() - paddle::experimental::Tensor _ret = node->operator()({{et0}})[0][0]; + paddle::experimental::Tensor _ret = node->operator()(et0_vec)[0][0]; // Check operator() result, should be 36.0 auto* _ret_ptr = std::dynamic_pointer_cast(_ret.impl()) diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h index 0b167203735d65683b0f978fa34fe7f457aae4f2..dff12fdfc34a1330cf7be300a99ee95c0086f668 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h @@ -32,7 +32,7 @@ class GradTestNode : public egr::GradNodeBase { GradTestNode() : GradNodeBase() { val_ = 1.0; } std::string name() override { return "GradTestNode"; } std::vector> operator()( - const std::vector>& grads, + std::vector>& grads, bool create_graph = false) override { val_ = std::dynamic_pointer_cast(grads[0][0].impl()) 
->data()[0]; diff --git a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc index 217055e4e9e4a19e695f42bf57c2331c9b98e2bd..7486e711641fc9ae4a02d8e66dbcd1099c548abf 100644 --- a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc +++ b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc @@ -247,4 +247,20 @@ TEST(EagerUtils, GetGradAccumulationNode) { ASSERT_ANY_THROW(egr::EagerUtils::GetGradAccumulationNode(t0)); } +TEST(EagerUtils, FillZeroForEmptyGradInputs) { + std::vector> grads = { + std::vector(1)}; + std::vector> slot_metas = { + std::vector(1)}; + + phi::DenseTensorMeta tensor_meta; + tensor_meta.dtype = paddle::experimental::DataType::FLOAT32; + tensor_meta.dims = {2, 4}; + slot_metas[0][0].SetTensorMeta(tensor_meta); + slot_metas[0][0].SetPlace(phi::CPUPlace()); + + EagerUtils::FillZeroForEmptyGradInputs(&grads, slot_metas); + eager_test::CompareTensorWithValue(grads[0][0], 0.0); +} + } // namespace egr diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h index 4eaa64d3ac659ca0ec76083b70855d8b6b241556..c83e16e9a1ec21b3e7303834ac35b55fed60b2a6 100644 --- a/paddle/fluid/eager/to_static/run_program_op_node.h +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -370,7 +370,7 @@ class GradNodeRunProgram : public egr::GradNodeBase { ~GradNodeRunProgram() override = default; // Functor: perform backward computations virtual std::vector> operator()( - const std::vector> &grads, + std::vector> &grads, // NOLINT bool create_graph) override { VLOG(3) << "Running Eager Backward Node: GradNodeRunProgram"; PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 048087903a47c1699a7d7f32199c313146bd37ab..20faae95281db87ad4896b19e63857cf4b7e5e02 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -20,6 +20,7 @@ #include "paddle/phi/api/all.h" #include "paddle/phi/common/layout.h" +#include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/tensor_meta.h" #include "paddle/fluid/framework/data_layout.h" @@ -392,4 +393,28 @@ std::shared_ptr EagerUtils::GetGradAccumulationNode( } } +void EagerUtils::FillZeroForEmptyGradInputs( + std::vector>* in_grads, + const std::vector>& grad_in_metas) { + for (size_t i = 0; i < in_grads->size(); i++) { + for (size_t j = 0; j < (*in_grads)[0].size(); j++) { + paddle::experimental::Tensor& grad = (*in_grads)[i][j]; + if (!grad.is_initialized()) { + const GradSlotMeta& grad_in_meta = grad_in_metas[i][j]; + PADDLE_ENFORCE( + grad_in_meta.HasTensorMeta(), + paddle::platform::errors::Fatal( + "Unable to fill empty grad inputs due to empty GradSlotMeta")); + + const auto& tensor_meta = grad_in_meta.GetTensorMeta(); + phi::Place place = grad_in_meta.GetPlace(); + + auto tensor_with_zero = paddle::experimental::full( + phi::vectorize(tensor_meta.dims), 0.0, tensor_meta.dtype, place); + grad.set_impl(tensor_with_zero.impl()); + } + } + } +} + } // namespace egr diff --git a/paddle/fluid/eager/utils.h b/paddle/fluid/eager/utils.h index fbd080ef70e25408abcb979360610ad08d752f96..396837f101c6518d32a908a80ff4782bf45ea090 100644 --- a/paddle/fluid/eager/utils.h +++ b/paddle/fluid/eager/utils.h @@ -217,6 +217,13 @@ class EagerUtils { const std::vector& tensors); static std::shared_ptr GetGradAccumulationNode( const paddle::experimental::Tensor& tensor); + + /** + * Fill Zero + * **/ + static void FillZeroForEmptyGradInputs( + std::vector>* out_grads, + const 
std::vector>& grad_out_metas); }; } // namespace egr diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index d18c8e96c49b6a993fbd0a8d632212ae8d7f8c6d..3d4cfa2df31798e476dbd96945b69d781edfa421 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -176,6 +176,20 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap& ins, const std::map& inplace_map, paddle::framework::AttributeMap* passed_default_attrs_, bool use_default_attr_map) { + TraceOpImpl(type, ins, outs, attrs, place, trace_backward, + inplace_map, passed_default_attrs_, + use_default_attr_map); +} + +template +void Tracer::TraceOpImpl(const std::string& type, + const NameVarMap& ins, + const NameVarMap& outs, + framework::AttributeMap& attrs, + const platform::Place& place, bool trace_backward, + const std::map& inplace_map, + paddle::framework::AttributeMap* passed_default_attrs_, + bool use_default_attr_map) { platform::RecordEvent op_type_record_event( type + " trace_op", platform::TracerEventType::Operator, 1); platform::ScopedFlushDenormal flush; @@ -340,25 +354,33 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins, void Tracer::TraceOp(const std::string& type, const NameTensorMap& ins, const NameTensorMap& outs, - paddle::framework::AttributeMap attrs, + paddle::framework::AttributeMap& attrs, const paddle::platform::Place& place, paddle::framework::AttributeMap* default_attrs, bool use_default_attr_map, const std::map& inplace_map) { VLOG(6) << "Running On Eager TraceOp with use_default_attr_map: " << use_default_attr_map; - TraceOp(type, ins, outs, std::move(attrs), place, false, - inplace_map, default_attrs, use_default_attr_map); + TraceOpImpl(type, ins, outs, attrs, place, false, + inplace_map, default_attrs, + use_default_attr_map); +} + +void Tracer::TraceOp(const std::string& type, const NameTensorMap& ins, + const NameTensorMap& outs, + paddle::framework::AttributeMap attrs) { + VLOG(6) << "Running On Eager TraceOp(4 agrs): "; + TraceOpImpl(type, ins, outs, attrs, expected_place_, + false, {}, nullptr, true); } void Tracer::TraceOp(const std::string& type, const NameTensorMap& ins, const NameTensorMap& outs, - paddle::framework::AttributeMap attrs, + paddle::framework::AttributeMap& attrs, const std::map& inplace_map) { VLOG(6) << "Running On Eager TraceOp(less): "; - TraceOp(type, ins, outs, std::move(attrs), - expected_place_, false, inplace_map, nullptr, - true); + TraceOpImpl(type, ins, outs, attrs, expected_place_, + false, inplace_map, nullptr, true); } void Tracer::SetExpectedPlace(platform::Place place) { diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index f24961885c9b85b03c561f60f375b1a21bf086dd..4e671d52457e203b7d64420490d9420db013b673 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -74,16 +74,32 @@ class Tracer { paddle::framework::AttributeMap* passed_default_attrs_ = nullptr, bool use_default_attr_map = true); + template + void TraceOpImpl( + const std::string& type, const NameVarMap& ins, + const NameVarMap& outs, + framework::AttributeMap& attrs, // NOLINT + const platform::Place& place, bool trace_backward, + const std::map& inplace_map = {}, + paddle::framework::AttributeMap* passed_default_attrs_ = nullptr, + bool use_default_attr_map = true); + void TraceOp(const std::string& type, const NameVarBaseMap& ins, const NameVarBaseMap& outs, framework::AttributeMap attrs, const std::map& inplace_map = {}); void TraceOp(const 
std::string& type, const NameTensorMap& ins, - const NameTensorMap& outs, paddle::framework::AttributeMap attrs, + const NameTensorMap& outs, + paddle::framework::AttributeMap& attrs, // NOLINT const std::map& inplace_map = {}); void TraceOp(const std::string& type, const NameTensorMap& ins, - const NameTensorMap& outs, paddle::framework::AttributeMap attrs, + const NameTensorMap& outs, + paddle::framework::AttributeMap attrs); + + void TraceOp(const std::string& type, const NameTensorMap& ins, + const NameTensorMap& outs, + paddle::framework::AttributeMap& attrs, // NOLINT const paddle::platform::Place& place, paddle::framework::AttributeMap* default_attrs, bool use_default_attr_map, diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 4a44448dc84cf744cdf061031bdf7fae8f658c4b..abf72564753367bb7ea4f537ed5952753329de07 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -34,6 +34,7 @@ #include "paddle/fluid/memory/allocation/thread_local_allocator.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h" @@ -210,13 +211,28 @@ class AllocatorFacadePrivate { InitNaiveBestFitCPUAllocator(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) allow_free_idle_chunk_ = allow_free_idle_chunk; - if (!FLAGS_use_stream_safe_cuda_allocator) { - for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); - ++dev_id) { - InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id), - allow_free_idle_chunk_); - } + for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { + InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id), + allow_free_idle_chunk_); + } + + // Note(Ruibiao): For GPU multi-stream case, the 'allocators_' map(place + // -> Allocator) hold the StreamSafeCUDAAllocator releate to default + // stream (i.e., the stream directly got from DeviceContex), while the + // 'cuda_allocators_' map(place -> map(stream -> Allocator)) hold the + // StreamSafeCUDAAllocator releate to non-default stream (i.e., the + // stream users pass in). The default stream Allocator is built in the + // structure of AllocatorFacadePrivate, while the non-default stream is + // build in a delayed manner in GetAllocator function with + // 'create_if_not_found = ture'. We make special treatment for the + // default stream for performance reasons. Since most Alloc calls are + // for default stream in application, treating it separately can avoid + // lots of overhead of acquiring default stream and applying read-write + // lock. 
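// ---------------------------------------------------------------------------
// [Editor's illustrative sketch, not part of this patch] The note above
// describes a two-level lookup: a per-place allocator bound to the default
// stream (fast path, no stream-map lock) plus a lazily built per-(place,
// stream) map guarded by a shared mutex (slow path, created on first use).
// A minimal, self-contained sketch of that pattern, using hypothetical names
// (Place, Stream, Allocator, TwoLevelAllocatorRegistry), is:
#include <map>
#include <memory>
#include <mutex>
#include <shared_mutex>
#include <utility>

namespace sketch {
using Place = int;
using Stream = void*;
struct Allocator {};

class TwoLevelAllocatorRegistry {
 public:
  // Fast path: the default-stream allocator is looked up without touching
  // the per-stream map or its read-write lock.
  std::shared_ptr<Allocator> GetDefault(const Place& place) const {
    return default_allocators_.at(place);
  }

  // Slow path: per-stream allocators are created on first use under the
  // unique (write) lock; later lookups only take the shared (read) lock.
  std::shared_ptr<Allocator> Get(const Place& place, Stream stream) {
    const auto key = std::make_pair(place, stream);
    {
      std::shared_lock<std::shared_timed_mutex> read_lock(mutex_);
      auto it = stream_allocators_.find(key);
      if (it != stream_allocators_.end()) return it->second;
    }
    std::unique_lock<std::shared_timed_mutex> write_lock(mutex_);
    auto& slot = stream_allocators_[key];
    if (!slot) slot = std::make_shared<Allocator>();
    return slot;
  }

 private:
  std::map<Place, std::shared_ptr<Allocator>> default_allocators_;
  std::map<std::pair<Place, Stream>, std::shared_ptr<Allocator>>
      stream_allocators_;
  std::shared_timed_mutex mutex_;
};
}  // namespace sketch
// ---------------------------------------------------------------------------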
+ if (FLAGS_use_stream_safe_cuda_allocator) { + WrapStreamSafeCUDAAllocatorForDefault(); } + InitNaiveBestFitCUDAPinnedAllocator(); #endif #ifdef PADDLE_WITH_ASCEND_CL @@ -301,7 +317,8 @@ class AllocatorFacadePrivate { CheckAllocThreadSafe(); #ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) { + if (FLAGS_use_stream_safe_cuda_allocator == false && + UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) { WrapCUDAGraphAllocator(); } #endif @@ -341,7 +358,12 @@ class AllocatorFacadePrivate { const std::shared_ptr& GetAllocator( const platform::CUDAPlace& place, const gpuStream_t& stream, bool create_if_not_found = false) { - { // shared_lock_guard + if (stream == GetDefaultStream(place)) { + VLOG(7) << "Get Allocator by passing in a default stream"; + return GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); + } + + /* shared_lock_guard */ { std::shared_lock lock_guard( cuda_allocator_mutex_); if (LIKELY(HasCUDAAllocator(place, stream))) { @@ -355,7 +377,7 @@ class AllocatorFacadePrivate { } } - { // unique_lock_guard + /* unique_lock_guard */ { std::unique_lock lock_guard( cuda_allocator_mutex_); InitStreamSafeCUDAAllocator(place, stream); @@ -363,9 +385,40 @@ class AllocatorFacadePrivate { } } - gpuStream_t GetDefaultStream(const platform::CUDAPlace& place) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - return static_cast(pool.Get(place))->stream(); + const std::shared_ptr + GetDefaultStreamSafeCUDAAllocator(const platform::CUDAPlace& place) const { + const auto iter = default_stream_safe_cuda_allocators_.find(place); + PADDLE_ENFORCE_NE( + iter, default_stream_safe_cuda_allocators_.end(), + platform::errors::NotFound( + "No StreamSafeCUDAAllocator found for the place, %s", place)); + return iter->second; + } + + const gpuStream_t& GetDefaultStream(const platform::CUDAPlace& place) const { + const std::shared_ptr& allocator = + GetDefaultStreamSafeCUDAAllocator(place); + return allocator->GetDefaultStream(); + } + + void SetDefaultStream(const platform::CUDAPlace& place, + const gpuStream_t& stream) { + const std::shared_ptr& allocator = + GetDefaultStreamSafeCUDAAllocator(place); + allocator->SetDefaultStream(stream); + VLOG(8) << "Set default stream to " << stream + << " for StreamSafeCUDAAllocator(" << allocator.get() << ") in " + << place; + } + + void SetDefaultStreamFromDeviceContext() { + VLOG(8) << "Set default stream from DeviceContex"; + for (auto& pair : default_stream_safe_cuda_allocators_) { + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + pair.second->SetDefaultStream( + static_cast(pool.Get(pair.first))->stream()); + } } void RecordStream(std::shared_ptr allocation, @@ -635,6 +688,26 @@ class AllocatorFacadePrivate { /* in_cuda_graph_capturing = */ !allow_free_idle_chunk_); } + void WrapStreamSafeCUDAAllocatorForDefault() { + for (auto& pair : allocators_) { + auto& place = pair.first; + if (platform::is_gpu_place(place)) { + std::shared_ptr&& allocator = + std::make_shared( + pair.second, place, /* default_stream = */ nullptr, + /* in_cuda_graph_capturing = */ !allow_free_idle_chunk_); + pair.second = allocator; + + // NOTE(Ruibiao): A tricky implement to give StreamSafeCUDAAllocator an + // ability to interact with the outside world, i.e., change default + // stream from outside + default_stream_safe_cuda_allocators_[place] = allocator; + VLOG(8) << "WrapStreamSafeCUDAAllocator for " << place + << ", allocator address = " << pair.second.get(); + } + } 
+ } + void WrapCUDARetryAllocator(platform::CUDAPlace p, gpuStream_t stream, size_t retry_time) { PADDLE_ENFORCE_GT( @@ -813,7 +886,6 @@ class AllocatorFacadePrivate { #endif } - // NOTE(Ruibiao): Old single-stream version, will be removed later void WrapCUDARetryAllocator(size_t retry_time) { PADDLE_ENFORCE_GT( retry_time, 0, @@ -828,6 +900,8 @@ class AllocatorFacadePrivate { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // a standalone CUDA allocator to support multi-stream GC in new executor + std::map> + default_stream_safe_cuda_allocators_; CUDAAllocatorMap cuda_allocators_; std::shared_timed_mutex cuda_allocator_mutex_; #endif @@ -870,15 +944,6 @@ AllocatorFacadePrivate* AllocatorFacade::GetPrivate() const { const std::shared_ptr& AllocatorFacade::GetAllocator( const platform::Place& place) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && - FLAGS_use_system_allocator == false) { - AllocatorFacadePrivate* m = GetPrivate(); - platform::CUDAPlace cuda_place(place.GetDeviceId()); - return m->GetAllocator(cuda_place, m->GetDefaultStream(cuda_place)); - } -#endif - return GetPrivate()->GetAllocator( place, /* A non-zero num to choose allocator_ */ 1); } @@ -898,19 +963,6 @@ void* AllocatorFacade::GetBasePtr( return GetPrivate()->GetBasePtr(allocation); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -const std::shared_ptr& AllocatorFacade::GetAllocator( - const platform::Place& place, const gpuStream_t& stream) { - if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && - FLAGS_use_system_allocator == false) { - return GetPrivate()->GetAllocator(place, stream, - /*create_if_not_found=*/true); - } - return GetPrivate()->GetAllocator( - place, /* A non-zero num to choose allocator_ */ 1); -} -#endif - const std::shared_ptr& AllocatorFacade::GetZeroAllocator( const platform::Place& place) { return GetPrivate()->GetAllocator(place, /* zero size */ 0); @@ -923,26 +975,10 @@ std::shared_ptr AllocatorFacade::AllocShared( AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && - size > 0 && FLAGS_use_system_allocator == false) { - platform::CUDAPlace cuda_place(place.GetDeviceId()); - phi::Stream default_stream = phi::Stream(reinterpret_cast( - GetPrivate()->GetDefaultStream(cuda_place))); - return Alloc(cuda_place, size, default_stream); - } -#endif return GetPrivate()->GetAllocator(place, size)->Allocate(size); } uint64_t AllocatorFacade::Release(const platform::Place& place) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && - FLAGS_use_system_allocator == false) { - platform::CUDAPlace cuda_place(place.GetDeviceId()); - return Release(cuda_place, GetPrivate()->GetDefaultStream(cuda_place)); - } -#endif return GetPrivate() ->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1) ->Release(place); @@ -1028,6 +1064,17 @@ void AllocatorFacade::RecordStream(std::shared_ptr allocation, GetPrivate()->RecordStream(allocation, stream); } +const std::shared_ptr& AllocatorFacade::GetAllocator( + const platform::Place& place, const gpuStream_t& stream) { + if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && + FLAGS_use_system_allocator == false) { + return 
GetPrivate()->GetAllocator(place, stream, + /*create_if_not_found=*/true); + } + return GetPrivate()->GetAllocator( + place, /* A non-zero num to choose allocator_ */ 1); +} + const gpuStream_t& AllocatorFacade::GetStream( const std::shared_ptr& allocation) const { PADDLE_ENFORCE_EQ( @@ -1040,6 +1087,13 @@ const gpuStream_t& AllocatorFacade::GetStream( return GetPrivate()->GetStream(allocation); } +void AllocatorFacade::SetDefaultStream(const platform::CUDAPlace& place, + const gpuStream_t& stream) { + if (FLAGS_use_stream_safe_cuda_allocator) { + GetPrivate()->SetDefaultStream(place, stream); + } +} + #ifdef PADDLE_WITH_CUDA void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) { PADDLE_ENFORCE_EQ(GetAllocatorStrategy(), AllocatorStrategy::kAutoGrowth, @@ -1055,6 +1109,8 @@ void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) { "The memory pool of the CUDA Graph with ID %d have been prepared.", id)); allocator.reset(new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false)); + allocator->SetDefaultStreamFromDeviceContext(); + VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id; } diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index 9066bb284e28af197111b5d3ea129cc65b5fe914..1ea872f7ecaf4b411d828fbc44cf8e0be320aa6d 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -55,11 +55,6 @@ class AllocatorFacade { void* GetBasePtr(const std::shared_ptr& allocation); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - const std::shared_ptr& GetAllocator(const platform::Place& place, - const gpuStream_t& stream); -#endif - const std::shared_ptr& GetZeroAllocator( const platform::Place& place); @@ -86,8 +81,12 @@ class AllocatorFacade { uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream); void RecordStream(std::shared_ptr allocation, const gpuStream_t& stream); + const std::shared_ptr& GetAllocator(const platform::Place& place, + const gpuStream_t& stream); const gpuStream_t& GetStream( const std::shared_ptr& allocation) const; + void SetDefaultStream(const platform::CUDAPlace& place, + const gpuStream_t& stream); #endif #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index 072c4dee3bc45b4ff5f23f5288d3412a14f63b0f..7e47d35176bac520ce00616d513f23b08c80beb4 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -154,6 +154,14 @@ StreamSafeCUDAAllocator::~StreamSafeCUDAAllocator() { bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; } +const gpuStream_t& StreamSafeCUDAAllocator::GetDefaultStream() const { + return default_stream_; +} + +void StreamSafeCUDAAllocator::SetDefaultStream(const gpuStream_t& stream) { + default_stream_ = stream; +} + phi::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) { platform::RecordEvent("StreamSafeCUDAAllocator::Allocate", platform::TracerEventType::UserDefined, 9 /*level*/); @@ -187,12 +195,8 @@ void StreamSafeCUDAAllocator::FreeImpl(phi::Allocation* allocation) { platform::RecordEvent("StreamSafeCUDAAllocator::Free", platform::TracerEventType::UserDefined, 9 /*level*/); StreamSafeCUDAAllocation* stream_safe_cuda_allocation = - dynamic_cast(allocation); - PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation, - 
platform::errors::InvalidArgument( - "Failed to dynamic cast %p from Allocation* to " - "StreamSafeCUDAAllocation*", - allocation)); + static_cast(allocation); + VLOG(8) << "Try free allocation " << stream_safe_cuda_allocation->ptr(); if (stream_safe_cuda_allocation->CanBeFreed()) { VLOG(9) << "Directly delete allocation"; @@ -221,6 +225,12 @@ uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) { } void StreamSafeCUDAAllocator::ProcessUnfreedAllocations() { + // NOTE(Ruibiao): This condition is to reduce lock competion. It does not need + // to be thread-safe since here occasional misjudgments are permissible. + if (unfreed_allocations_.empty()) { + return; + } + std::lock_guard lock_guard(unfreed_allocation_lock_); for (auto it = unfreed_allocations_.begin(); it != unfreed_allocations_.end();) { diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h index ecddff97c206be968148e32ddf3f9c6623bf8bde..65af32c701b756c981f9576bdb06db8a0e53809a 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h @@ -64,7 +64,10 @@ class StreamSafeCUDAAllocator platform::CUDAPlace place, gpuStream_t default_stream, bool in_cuda_graph_capturing = false); ~StreamSafeCUDAAllocator(); + bool IsAllocThreadSafe() const override; + const gpuStream_t &GetDefaultStream() const; + void SetDefaultStream(const gpuStream_t &stream); protected: phi::Allocation *AllocateImpl(size_t size) override; diff --git a/paddle/fluid/operators/cinn/cinn_instruction_run_op.cc b/paddle/fluid/operators/cinn/cinn_instruction_run_op.cc index edf854a9c95b088225ac0eb225f056f0c531c393..8139530b809abf17c7c0f65662afaa5fdabd767f 100644 --- a/paddle/fluid/operators/cinn/cinn_instruction_run_op.cc +++ b/paddle/fluid/operators/cinn/cinn_instruction_run_op.cc @@ -24,7 +24,9 @@ class CinnInstructionRunOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInputs(kX), "Input", kX, "CinnInstructionRun"); + // The cinn-graph may hasn't input for CINN now support fill_constant, + // and its all inputs may generated by fill_constant instead of by fetch. + // OP_INOUT_CHECK(ctx->HasInputs(kX), "Input", kX, "CinnInstructionRun"); OP_INOUT_CHECK(ctx->HasOutputs(kOutputs), "Output", kOutputs, "CinnInstructionRun"); const CinnCompiledObject& compiled_object = @@ -43,6 +45,53 @@ class CinnInstructionRunOp : public framework::OperatorWithKernel { }); ctx->SetOutputsDim(kOutputs, output_dims); } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + // Why we need override GetExpectedKernelType? + // A cinn-graph may has no inpute var, if we use the base function, + // it will check wheter input tensors is initialized. Here we rewrite + // the function so that we can infer kernel type by output date type. 
+ if (ctx.InputSize(kX)) { + // if the instruction has input, infer kernel type by input date type: + return OperatorWithKernel::GetExpectedKernelType(ctx); + } + + // Else infer kernel type by output date type: + // The `OutputVar` will check wheter the kOutputs iff has one output var + const framework::Variable* var = ctx.OutputVar(kOutputs); + PADDLE_ENFORCE_NE( + var, nullptr, + platform::errors::InvalidArgument( + "The cinn_instruction_run Op's Output Variable should not empty.")); + + const framework::Tensor* tensor = nullptr; + if (var->IsType()) { + tensor = &var->Get(); + } else if (var->IsType()) { + tensor = &var->Get(); + } else if (var->IsType()) { + tensor = &(var->Get().value()); + } else if (var->IsType()) { + auto t_arr = &var->Get(); + PADDLE_ENFORCE_EQ(t_arr->size(), 1UL, + platform::errors::InvalidArgument( + "The cinn_instruction_run Op should just has One " + "Output when Input empty.")); + tensor = &(t_arr->front()); + } + + PADDLE_ENFORCE_NE( + tensor, nullptr, + platform::errors::InvalidArgument( + "The cinn_instruction_run Op's Output Tensor should not empty.")); + + VLOG(4) << "The tensor [" << ctx.OutputName(kOutputs) << "]'s dtype is " + << paddle::framework::DataType2String(tensor->dtype()); + auto output_type = paddle::framework::TransToProtoVarType(tensor->dtype()); + return framework::OpKernelType(output_type, ctx.device_context()); + } }; class CinnInstructionRunOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.cc b/paddle/fluid/operators/cinn/cinn_launch_op.cc index d918b7216c4d2f1e8cd0891d3a0dc0a5d2ed4339..5d006a947be1997b003c0dc717c1bec9ed136e7f 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op.cc @@ -87,9 +87,12 @@ class CinnLaunchOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInputs(kX) || ctx->HasInputs(kNoNeedBufferX), - "Input", string::format_string("%s|%s", kX, kNoNeedBufferX), - "CinnLaunchOp"); + // The cinn-graph may hasn't input for CINN now support fill_constant, + // and its all inputs may generated by fill_constant instead of by fetch. + // OP_INOUT_CHECK(ctx->HasInputs(kX) || ctx->HasInputs(kNoNeedBufferX), + // "Input", string::format_string("%s|%s", kX, + // kNoNeedBufferX), + // "CinnLaunchOp"); OP_INOUT_CHECK(ctx->HasOutputs(kOutputs), "Output", kOutputs, "CinnLaunchOp"); } diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h index 144198367d538e178a745c22902bb77a65f45fe4..94db4c62e391229416ea6b3763177c56b65f4252 100644 --- a/paddle/fluid/operators/dropout_impl.cu.h +++ b/paddle/fluid/operators/dropout_impl.cu.h @@ -35,143 +35,99 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/platform/aligned_vector.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/kernels/funcs/distribution_helper.h" #include "paddle/phi/kernels/funcs/functors.h" - namespace paddle { namespace operators { +template +struct DstMaskGenerator { + const float dropout_prob_; + const bool is_upscale_in_train_; + using MT = typename details::MPTypeTrait::Type; + MT factor; + HOSTDEVICE inline DstMaskGenerator(const float dropout_prob, + const bool is_upscale_in_train) + : dropout_prob_(dropout_prob), is_upscale_in_train_(is_upscale_in_train) { + factor = static_cast(1.0f / (1.0f - dropout_prob_)); + } -template -__global__ void RandomGenerator(const size_t n, uint64_t seed, - const float dropout_prob, const T* src, - MaskType* mask, T* dst, - bool is_upscale_in_train, uint64_t increment) { - using MT = typename details::MPTypeTrait::Type; - int idx = blockDim.x * blockIdx.x + threadIdx.x; -#ifdef PADDLE_WITH_HIP - hiprandStatePhilox4_32_10_t state; - hiprand_init(seed, idx, increment, &state); -#else - curandStatePhilox4_32_10_t state; - curand_init(seed, idx, increment, &state); -#endif - - MaskType mask_val; - T dst_val; - MT factor = static_cast(1.0f / (1.0f - dropout_prob)); - for (; idx < n; idx += blockDim.x * gridDim.x) { - T src_val = src[idx]; -#ifdef PADDLE_WITH_HIP - if (hiprand_uniform(&state) < dropout_prob) { -#else - if (curand_uniform(&state) < dropout_prob) { -#endif - mask_val = 0; - dst_val = 0; - } else { - mask_val = 1; - dst_val = is_upscale_in_train - ? static_cast(static_cast(src_val) * factor) - : src_val; + HOSTDEVICE inline void operator()(OutT* dst, const T1* src_val, + const T2* rand, int num) const { + static constexpr int kCount = + phi::funcs::uniform_distribution::kReturnsCount; +// 0 ~ kCount -1 is dist , kCount ~ 2 * kCount - 1 is mask +#pragma unroll + for (int i = 0; i < kCount; i++) { + if (rand[i] < dropout_prob_) { + dst[i] = static_cast(0); + dst[i + kCount] = dst[i]; + } else { + dst[i] = is_upscale_in_train_ + ? 
static_cast(static_cast(src_val[i]) * factor) + : static_cast(src_val[i]); + dst[i + kCount] = static_cast(1); + } } - mask[idx] = mask_val; - dst[idx] = dst_val; } -} +}; -template +template __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed, const float dropout_prob, const T* src, MaskType* mask, T* dst, bool is_upscale_in_train, - uint64_t increment) { - using MT = typename details::MPTypeTrait::Type; - using LoadT = phi::AlignedVector; - using MaskLoadT = phi::AlignedVector; - + uint64_t increment, + size_t main_offset) { + size_t idx = static_cast(BLOCK_ID_X * BLOCK_NUM_X); + static constexpr int kCount = + phi::funcs::uniform_distribution::kReturnsCount; + size_t stride = BLOCK_NUM_X * GRID_NUM_X * kCount; #ifdef PADDLE_WITH_HIP - int64_t idx = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; hiprandStatePhilox4_32_10_t state; - hiprand_init(seed, idx, increment, &state); + hiprand_init(seed, idx + THREAD_ID_X, increment, &state); + using SType = hiprandStatePhilox4_32_10_t; #else - int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; curandStatePhilox4_32_10_t state; - curand_init(seed, idx, increment, &state); -#endif - - MT factor = static_cast(1.0f / (1.0f - dropout_prob)); - for (int i = idx * VecSize; i < n; i += blockDim.x * gridDim.x * VecSize) { - LoadT src_val; - phi::Load(&src[i], &src_val); - -#ifdef PADDLE_WITH_HIP - float4 rand = hiprand_uniform4(&state); -#else - float4 rand = curand_uniform4(&state); + curand_init(seed, idx + THREAD_ID_X, increment, &state); + using SType = curandStatePhilox4_32_10_t; #endif - - LoadT dst_val; - MaskLoadT mask_val; - -#pragma unroll - for (int j = 0; j < VecSize; j++) { - if ((&rand.x)[j] < dropout_prob) { - dst_val[j] = 0; - mask_val[j] = 0; - } else { - dst_val[j] = is_upscale_in_train - ? 
static_cast(static_cast(src_val[j]) * factor) - : src_val[j]; - mask_val[j] = 1; - } - } - - phi::Store(dst_val, &dst[i]); - phi::Store(mask_val, &mask[i]); + T dst_mask[kCount * 2]; // 0 ~ kCount -1 : dst;kCount ~ 2 * kCount - 1: mask + float rands[kCount]; + MaskType mask_result[kCount]; + using Rand = phi::funcs::uniform_distribution; + using Cast = kps::IdentityFunctor; + int deal_size = BLOCK_NUM_X * kCount; + auto dst_functor = + DstMaskGenerator(dropout_prob, is_upscale_in_train); + size_t fix = idx * kCount; + for (; fix < main_offset; fix += stride) { + kps::ReadData(&dst_mask[0], src + fix, deal_size); + kps::ElementwiseRandom(&rands[0], Rand(), + &state); + // dst + kps::OperatorTernary>( + &dst_mask[0], &dst_mask[0], &rands[0], dst_functor, kCount); + kps::WriteData(dst + fix, &dst_mask[0], deal_size); + // mask + kps::ElementwiseUnary( + &mask_result[0], &dst_mask[kCount], Cast()); + kps::WriteData(mask + fix, &mask_result[0], + deal_size); } -} - -template -struct CudaDropoutGradFunctor { - using MT = typename details::MPTypeTrait::Type; - - explicit CudaDropoutGradFunctor(const MT factor) : factor_(factor) {} - - __device__ __forceinline__ T operator()(const T dout, - const MaskType mask) const { - return static_cast(static_cast(dout) * static_cast(mask) * - factor_); - } - - private: - MT factor_; -}; - -template -__global__ void DropoutGradCUDAKernel( - const T* dout, const MaskType* mask, - const typename details::MPTypeTrait::Type factor, const int64_t size, - T* dx) { - using MT = typename details::MPTypeTrait::Type; - using LoadT = phi::AlignedVector; - using MaskLoadT = phi::AlignedVector; - - int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; - for (int i = idx * VecSize; i < size; i += blockDim.x * gridDim.x * VecSize) { - LoadT dout_val; - phi::Load(&dout[i], &dout_val); - - MaskLoadT mask_val; - phi::Load(&mask[i], &mask_val); - - LoadT dx_val; - -#pragma unroll - for (int j = 0; j < VecSize; j++) { - dx_val[j] = static_cast(static_cast(dout_val[j]) * - static_cast(mask_val[j]) * factor); - } - - phi::Store(dx_val, &dx[i]); + int remainder = n - fix; + if (remainder > 0) { + kps::ReadData(&dst_mask[0], src + fix, remainder); + kps::ElementwiseRandom(&rands[0], Rand(), + &state); + // dst + kps::OperatorTernary>( + &dst_mask[0], &dst_mask[0], &rands[0], dst_functor, kCount); + kps::WriteData(dst + fix, &dst_mask[0], remainder); + // mask + kps::ElementwiseUnary( + &mask_result[0], &dst_mask[kCount], Cast()); + kps::WriteData(mask + fix, &mask_result[0], + remainder); } } @@ -218,42 +174,21 @@ void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx, bool is_test, uint64_t seed_data; uint64_t increment; // VectorizedRandomGenerator use curand_uniform4, so we only support - // vec_size is 4; - int vec_size = (phi::GetVectorizedSize(x_data) == 4) ? 
4 : 1; + // kVecSize is 4; + constexpr int kVecSize = + phi::funcs::uniform_distribution::kReturnsCount; auto gpu_config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_numel, vec_size); + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_numel, kVecSize); auto offset = - ((x_numel - 1) / (gpu_config.GetThreadNum() * vec_size) + 1) * vec_size; - + ((x_numel - 1) / (gpu_config.GetThreadNum() * kVecSize) + 1) * kVecSize; GetSeedDataAndIncrement(dev_ctx, seed, is_fix_seed, seed_val, offset, &seed_data, &increment); - -#ifdef __HIPCC__ - if (vec_size == 4 && size % 4 == 0) { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(VectorizedRandomGenerator), - gpu_config.GetGridSize(), gpu_config.GetBlockSize(), 0, stream, size, - seed_data, dropout_prob, x_data, mask_data, y_data, upscale_in_train, - increment); - } else { - hipLaunchKernelGGL(HIP_KERNEL_NAME(RandomGenerator), - gpu_config.GetGridSize(), gpu_config.GetBlockSize(), 0, - stream, size, seed_data, dropout_prob, x_data, - mask_data, y_data, upscale_in_train, increment); - } -#else - if (vec_size == 4 && size % 4 == 0) { - VectorizedRandomGenerator<<< - gpu_config.block_per_grid, gpu_config.thread_per_block, 0, stream>>>( - size, seed_data, dropout_prob, x_data, mask_data, y_data, - upscale_in_train, increment); - } else { - RandomGenerator<<>>( - size, seed_data, dropout_prob, x_data, mask_data, y_data, - upscale_in_train, increment); - } -#endif + size_t main_offset = size / (gpu_config.GetBlockSize() * kVecSize) * + (gpu_config.GetBlockSize() * kVecSize); + VectorizedRandomGenerator<<< + gpu_config.GetGridSize(), gpu_config.GetBlockSize(), 0, stream>>>( + size, seed_data, dropout_prob, x_data, mask_data, y_data, + upscale_in_train, increment, main_offset); } else { if (upscale_in_train) { // todo: can y share with data with x directly? 
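// ---------------------------------------------------------------------------
// [Editor's illustrative sketch, not part of this patch] The vectorized
// kernels above fuse Philox random-number generation, mask writing and the
// optional up-scaling into one pass. The per-element rule they implement is
// small; a plain scalar host-side reference (hypothetical names, std::mt19937
// instead of curand) is:
#include <cstdint>
#include <random>
#include <vector>

namespace sketch {
// y[i] = 0 with probability p; otherwise y[i] = x[i] / (1 - p) when
// upscale_in_train is set (so the expectation of y matches x), or plain
// x[i] otherwise. mask[i] records whether the element was kept.
inline void DropoutForwardReference(const std::vector<float>& x, float p,
                                    bool upscale_in_train, uint64_t seed,
                                    std::vector<float>* y,
                                    std::vector<uint8_t>* mask) {
  std::mt19937_64 rng(seed);
  std::uniform_real_distribution<float> uniform(0.0f, 1.0f);
  const float factor = upscale_in_train ? 1.0f / (1.0f - p) : 1.0f;
  y->resize(x.size());
  mask->resize(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    const bool keep = uniform(rng) >= p;  // same predicate as rand < p => drop
    (*mask)[i] = keep ? 1 : 0;
    (*y)[i] = keep ? x[i] * factor : 0.0f;
  }
}
}  // namespace sketch
// ---------------------------------------------------------------------------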
@@ -278,6 +213,22 @@ void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx, bool is_test, } } +template +struct CudaDropoutGradFunctor { + using MT = typename details::MPTypeTrait::Type; + + explicit CudaDropoutGradFunctor(const MT factor) : factor_(factor) {} + + __device__ __forceinline__ T operator()(const T dout, + const MaskType mask) const { + return static_cast(static_cast(dout) * static_cast(mask) * + factor_); + } + + private: + MT factor_; +}; + template void DropoutGradGPUKernelDriver(const phi::GPUContext& dev_ctx, const std::string dropout_implementation, diff --git a/paddle/fluid/operators/fake_dequantize_op.cu b/paddle/fluid/operators/fake_dequantize_op.cu index c88a8fe196edf8c031d40b13ad1a9295ce08cd9c..c0ec44909a5f3fc868a8dc1a0657305bbcbb92ad 100644 --- a/paddle/fluid/operators/fake_dequantize_op.cu +++ b/paddle/fluid/operators/fake_dequantize_op.cu @@ -58,19 +58,15 @@ __global__ void DequantizeOneScaleQuantAxis0(const T* in, const T* scale, } template -__global__ void DequantizeOneScaleQuantAxis1(const T* in, const T* scale, - T max_range, const int num, - const int cin, const int cout, - T* out) { - int bid = blockIdx.x; - T s = scale[bid % cout]; - - int wh_size = num / (cin * cout); - const T* in_current = in + bid * wh_size; - T* out_current = out + bid * wh_size; - - for (int i = threadIdx.x; i < wh_size; i += blockDim.x) { - out_current[i] = in_current[i] * s / max_range; +__global__ void DequantizeOneScaleQuantAxisN(const T* in, const T* scale, + const T max_range, + const int64_t num, + const int n_scales, + const int quant_stride, T* out) { + int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; + for (int64_t i = idx; i < num; i += blockDim.x * gridDim.x) { + T s = scale[(i / quant_stride) % n_scales]; + out[i] = in[i] * s / max_range; } } @@ -98,20 +94,32 @@ struct ChannelDequantizeFunctor { const T* in_data = in->data(); T* out_data = out->mutable_data(dev_ctx.GetPlace()); if (scale_num == 1) { - int num = in->numel(); + int64_t num = in->numel(); const T* scale_factor = scales[0]->data(); if (quant_axis == 0) { int grid = in_dims[0]; int block = 1024; DequantizeOneScaleQuantAxis0<<>>( in_data, scale_factor, max_range, num, in_dims[0], out_data); - } else if (quant_axis == 1) { - // Dequantize weight of Cin * Cout * W * H - int grid = in_dims[0] * in_dims[1]; - int block = 1024; - DequantizeOneScaleQuantAxis1<<>>( - in_data, scale_factor, max_range, num, in_dims[0], in_dims[1], - out_data); + } else { + int quant_stride = 1; + for (int i = quant_axis + 1; i < in_dims.size(); i++) { + quant_stride *= in_dims[i]; + } + + int64_t block_size = std::min( + num, static_cast(dev_ctx.GetMaxThreadsPerBlock() / 4)); + int64_t max_threads = + dev_ctx.GetMaxPhysicalThreadCount(); // SM * block_per_SM + const int64_t max_blocks = std::max( + ((max_threads - 1) / block_size + 1), static_cast(1)); + const int64_t grid_size = + std::min(max_blocks, (num + block_size - 1) / block_size); + + DequantizeOneScaleQuantAxisN< + T><<>>( + in_data, scale_factor, max_range, num, in_dims[quant_axis], + quant_stride, out_data); } } else if (scale_num == 2) { // Not need to consider quant_axis diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu index 70597be393c35e6939b83d86ce2f9be8f2c36805..01384a6cafef980e2706ce4ebecca7c59ba79e4e 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu +++ b/paddle/fluid/operators/fake_quantize_op.cu @@ -273,18 +273,18 @@ struct ClipAndFakeQuantDequantFunctor { template __global__ void 
ChannelClipAndQuantKernelQuantAxis0(const T* in, const T* scale, const int bin_cnt, - const int n, const int c, - T* out) { + const int64_t n, + const int c, T* out) { int tid = threadIdx.x; - int channel_size = n / c; + int64_t channel_size = n / c; const T* in_c = in + blockIdx.x * channel_size; T* out_c = out + blockIdx.x * channel_size; T s = scale[blockIdx.x]; T inv_s = inverse(s); - for (int i = tid; i < channel_size; i += blockDim.x) { + for (int64_t i = tid; i < channel_size; i += blockDim.x) { T x = in_c[i]; T v = x > s ? s : x; v = v < -s ? -s : v; @@ -293,25 +293,20 @@ __global__ void ChannelClipAndQuantKernelQuantAxis0(const T* in, const T* scale, } } -// ChannelClipAndQuantKernel for quant_axis is 1 +// ChannelClipAndQuantKernel for quant_axis is N template -__global__ void ChannelClipAndQuantKernelQuantAxis1(const T* in, const T* scale, - const int bin_cnt, - const int n, const int cin, - const int cout, T* out) { - T s = scale[blockIdx.x % cout]; - T inv_s = inverse(s); - - int wh_size = n / (cin * cout); - const T* in_c = in + blockIdx.x * wh_size; - T* out_c = out + blockIdx.x * wh_size; - - for (int i = threadIdx.x; i < wh_size; i += blockDim.x) { - T x = in_c[i]; +__global__ void ChannelClipAndQuantKernelQuantAxisN( + const T* in, const T* scale, const int bin_cnt, const int64_t n, + const int nScale, const int quant_stride, T* out) { + int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; + for (int64_t i = idx; i < n; i += blockDim.x * gridDim.x) { + T s = scale[(i / quant_stride) % nScale]; + T inv_s = 1.0 / s; + T x = in[i]; T v = x > s ? s : x; v = v < -s ? -s : v; v = bin_cnt * inv_s * v; - out_c[i] = round(v); + out[i] = round(v); } } @@ -327,7 +322,7 @@ struct ChannelClipAndFakeQuantFunctor { "the received is %d", quant_axis)); - int num = in.numel(); + int64_t num = in.numel(); auto in_dims = in.dims(); const T* in_data = in.data(); const T* scale_data = scale.data(); @@ -338,11 +333,24 @@ struct ChannelClipAndFakeQuantFunctor { int block = 1024; ChannelClipAndQuantKernelQuantAxis0<<>>( in_data, scale_data, bin_cnt, num, in_dims[0], out_data); - } else if (quant_axis == 1) { - int grid = in_dims[0] * in_dims[1]; - int block = 1024; - ChannelClipAndQuantKernelQuantAxis1<<>>( - in_data, scale_data, bin_cnt, num, in_dims[0], in_dims[1], out_data); + } else { + int quant_stride = 1; + for (int i = quant_axis + 1; i < in_dims.size(); i++) { + quant_stride *= in_dims[i]; + } + int64_t block_size = + std::min(num, static_cast(ctx.GetMaxThreadsPerBlock() / 4)); + int64_t max_threads = + ctx.GetMaxPhysicalThreadCount(); // SM * block_per_SM + const int64_t max_blocks = std::max(((max_threads - 1) / block_size + 1), + static_cast(1)); + + const int64_t grid_size = + std::min(max_blocks, (num + block_size - 1) / block_size); + + ChannelClipAndQuantKernelQuantAxisN<<>>( + in_data, scale_data, bin_cnt, num, in_dims[quant_axis], quant_stride, + out_data); } } }; diff --git a/paddle/fluid/operators/frame_op.cc b/paddle/fluid/operators/frame_op.cc index 9aa71e094484d65519610f27e6969ba3b7b257ee..2ff9beb36f284b04e27123316467ce626682a502 100644 --- a/paddle/fluid/operators/frame_op.cc +++ b/paddle/fluid/operators/frame_op.cc @@ -64,18 +64,26 @@ class FrameOp : public framework::OperatorWithKernel { end_axis = x_rank - 2; } - PADDLE_ENFORCE_LE(frame_length, seq_length, - platform::errors::InvalidArgument( - "Attribute(frame_length) of FrameOp should be less " - "equal than sequence length, but got (%s) > (%s).", - frame_length, seq_length)); + bool contain_unknown_dim = 
phi::contain_unknown_dim(x_dims); + bool check = ctx->IsRuntime() || !contain_unknown_dim; + if (check) { + PADDLE_ENFORCE_LE(frame_length, seq_length, + platform::errors::InvalidArgument( + "Attribute(frame_length) of FrameOp should be less " + "equal than sequence length, but got (%s) > (%s).", + frame_length, seq_length)); + } // It won't go into for loop when x_rank == 1U. for (int i = start_axis; i <= end_axis; i++) { output_shape.push_back(x_dims[i]); } - n_frames = 1 + (seq_length - frame_length) / hop_length; + if (seq_length == -1) { + n_frames = -1; + } else { + n_frames = 1 + (seq_length - frame_length) / hop_length; + } if (axis == 0) { // (n_frames, frame_length, ...) diff --git a/paddle/fluid/operators/mean_op.cc b/paddle/fluid/operators/mean_op.cc index 83fe1aa6dd1482bfa2a3ba99f488e4ec66d2b49f..785b16ae283b9c5472ff6797a9faa6b3e287c6f5 100644 --- a/paddle/fluid/operators/mean_op.cc +++ b/paddle/fluid/operators/mean_op.cc @@ -98,9 +98,17 @@ REGISTER_OP_CPU_KERNEL( mean, ops::MeanKernel, ops::MeanKernel, ops::MeanKernel); + paddle::platform::bfloat16>, + ops::MeanKernel>, + ops::MeanKernel>); REGISTER_OP_CPU_KERNEL( mean_grad, ops::MeanGradKernel, ops::MeanGradKernel, ops::MeanGradKernel); + paddle::platform::bfloat16>, + ops::MeanGradKernel>, + ops::MeanGradKernel>); diff --git a/paddle/fluid/operators/mean_op.cu b/paddle/fluid/operators/mean_op.cu index 01a5632a960c3611e0638200e7130ed8de879426..e8964765ec6549c106f877341b3d013cfe102e25 100644 --- a/paddle/fluid/operators/mean_op.cu +++ b/paddle/fluid/operators/mean_op.cu @@ -102,10 +102,17 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( mean, ops::MeanCUDAKernel, ops::MeanCUDAKernel, - ops::MeanCUDAKernel); + ops::MeanCUDAKernel, + ops::MeanCUDAKernel>, + ops::MeanCUDAKernel>); REGISTER_OP_CUDA_KERNEL( mean_grad, ops::MeanCUDAGradKernel, ops::MeanCUDAGradKernel, + ops::MeanCUDAGradKernel, ops::MeanCUDAGradKernel); + paddle::platform::complex>, + ops::MeanCUDAGradKernel>); diff --git a/paddle/fluid/operators/overlap_add_op.cc b/paddle/fluid/operators/overlap_add_op.cc index adae2c8f8adaae2ae24011cc5405d308e2168998..0e6f0f8422106b03cb545fd76369c7966a83370b 100644 --- a/paddle/fluid/operators/overlap_add_op.cc +++ b/paddle/fluid/operators/overlap_add_op.cc @@ -54,6 +54,7 @@ class OverlapAddOp : public framework::OperatorWithKernel { std::vector output_shape; int n_frames; int frame_length; + int seq_length; int start_axis; int end_axis; @@ -69,14 +70,22 @@ class OverlapAddOp : public framework::OperatorWithKernel { end_axis = x_rank - 3; } - PADDLE_ENFORCE_LE( - hop_length, frame_length, - platform::errors::InvalidArgument( - "Attribute(hop_length) of OverlapAddOp should be less or equal " - "than frame_length, but got hop_length(%s) > frame_length(%s).", - hop_length, frame_length)); + bool contain_unknown_dim = phi::contain_unknown_dim(x_dims); + bool check = ctx->IsRuntime() || !contain_unknown_dim; + if (check) { + PADDLE_ENFORCE_LE( + hop_length, frame_length, + platform::errors::InvalidArgument( + "Attribute(hop_length) of OverlapAddOp should be less or equal " + "than frame_length, but got hop_length(%s) > frame_length(%s).", + hop_length, frame_length)); + } - const int seq_length = (n_frames - 1) * hop_length + frame_length; + if (n_frames == -1) { + seq_length = -1; + } else { + seq_length = (n_frames - 1) * hop_length + frame_length; + } // It won't go into for loop when x_rank == 2U. 
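// Illustrative sketch only (hypothetical helper names, not Paddle APIs): the
// compile-time shape rule used by the FrameOp/OverlapAddOp changes above, where a
// dimension of -1 means "unknown until runtime" and must be propagated rather than
// computed from other attributes.
#include <cstdint>

// frame: number of frames produced from a sequence, or -1 if the sequence length
// is still unknown at compile time.
static int64_t InferNumFrames(int64_t seq_length, int64_t frame_length,
                              int64_t hop_length) {
  if (seq_length == -1) return -1;  // unknown stays unknown
  return 1 + (seq_length - frame_length) / hop_length;
}

// overlap_add: reconstructed sequence length, or -1 if n_frames is unknown.
static int64_t InferSeqLength(int64_t n_frames, int64_t frame_length,
                              int64_t hop_length) {
  if (n_frames == -1) return -1;
  return (n_frames - 1) * hop_length + frame_length;
}

// The validity checks (frame_length <= seq_length for FrameOp, hop_length <=
// frame_length for OverlapAddOp) are likewise skipped while any input dim is
// unknown and re-run at runtime, which is what the ctx->IsRuntime() /
// contain_unknown_dim guards above implement.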
for (int i = start_axis; i <= end_axis; i++) { diff --git a/paddle/fluid/operators/spectral_helper.h b/paddle/fluid/operators/spectral_helper.h index 39639768241d4986af75455b7bb5d91b444be3e0..c9889ad539d0810c507e454060dab82e96c75e76 100644 --- a/paddle/fluid/operators/spectral_helper.h +++ b/paddle/fluid/operators/spectral_helper.h @@ -16,451 +16,469 @@ #include "paddle/fluid/operators/spectral_op.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/dynload/hipfft.h" -#endif - -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/dynload/cufft.h" +#if defined(PADDLE_WITH_ONEMKL) +#include "paddle/phi/backends/dynload/mklrt.h" +#elif defined(PADDLE_WITH_POCKETFFT) +#include "extern_pocketfft/pocketfft_hdronly.h" #endif namespace paddle { namespace operators { -using ScalarType = framework::proto::VarType::Type; -const int64_t kMaxFFTNdim = 3; -const int64_t kMaxDataNdim = kMaxFFTNdim + 1; -// This struct is used to easily compute hashes of the -// parameters. It will be the **key** to the plan cache. -struct FFTConfigKey { - // between 1 and kMaxFFTNdim, i.e., 1 <= signal_ndim <= 3 - int64_t signal_ndim_; - // These include additional batch dimension as well. - int64_t sizes_[kMaxDataNdim]; - int64_t input_shape_[kMaxDataNdim]; - int64_t output_shape_[kMaxDataNdim]; - FFTTransformType fft_type_; - ScalarType value_type_; - - FFTConfigKey() = default; - - FFTConfigKey(const std::vector& in_shape, - const std::vector& out_shape, - const std::vector& signal_size, - FFTTransformType fft_type, ScalarType value_type) { - // Padding bits must be zeroed for hashing - memset(this, 0, sizeof(*this)); - signal_ndim_ = signal_size.size() - 1; - fft_type_ = fft_type; - value_type_ = value_type; - - std::copy(signal_size.cbegin(), signal_size.cend(), sizes_); - std::copy(in_shape.cbegin(), in_shape.cend(), input_shape_); - std::copy(out_shape.cbegin(), out_shape.cend(), output_shape_); - } -}; - -#if defined(PADDLE_WITH_CUDA) -// An RAII encapsulation of cuFFTHandle -class CuFFTHandle { - ::cufftHandle handle_; - public: - CuFFTHandle() { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftCreate(&handle_)); - } - - CuFFTHandle(const CuFFTHandle& other) = delete; - CuFFTHandle& operator=(const CuFFTHandle& other) = delete; +using Tensor = framework::Tensor; - CuFFTHandle(CuFFTHandle&& other) = delete; - CuFFTHandle& operator=(CuFFTHandle&& other) = delete; +// FFT Functors +#if defined(PADDLE_WITH_ONEMKL) - ::cufftHandle& get() { return handle_; } - const ::cufftHandle& get() const { return handle_; } +#define MKL_DFTI_CHECK(expr) \ + do { \ + MKL_LONG status = (expr); \ + if (!phi::dynload::DftiErrorClass(status, DFTI_NO_ERROR)) \ + PADDLE_THROW( \ + platform::errors::External(phi::dynload::DftiErrorMessage(status))); \ + } while (0); - ~CuFFTHandle() { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftDestroy(handle_)); +struct DftiDescriptorDeleter { + void operator()(DFTI_DESCRIPTOR_HANDLE handle) { + if (handle != nullptr) { + MKL_DFTI_CHECK(phi::dynload::DftiFreeDescriptor(&handle)); + } } }; -using plan_size_type = long long int; // NOLINT -// This class contains all the information needed to execute a cuFFT plan: -// 1. the plan -// 2. the workspace size needed -class FFTConfig { +// A RAII wrapper for MKL_DESCRIPTOR* +class DftiDescriptor { public: - // Only move semantics is enought for this class. Although we already use - // unique_ptr for the plan, still remove copy constructor and assignment op so - // we don't accidentally copy and take perf hit. 
- explicit FFTConfig(const FFTConfigKey& plan_key) - : FFTConfig( - std::vector(plan_key.sizes_, - plan_key.sizes_ + plan_key.signal_ndim_ + 1), - plan_key.signal_ndim_, plan_key.fft_type_, plan_key.value_type_) {} - - // sizes are full signal, including batch size and always two-sided - FFTConfig(const std::vector& sizes, const int64_t signal_ndim, - FFTTransformType fft_type, ScalarType dtype) - : fft_type_(fft_type), value_type_(dtype) { - // signal sizes (excluding batch dim) - std::vector signal_sizes(sizes.begin() + 1, sizes.end()); - - // input batch size - const auto batch = static_cast(sizes[0]); - // const int64_t signal_ndim = sizes.size() - 1; - PADDLE_ENFORCE_EQ(signal_ndim, sizes.size() - 1, - platform::errors::InvalidArgument( - "The signal_ndim must be equal to sizes.size() - 1," - "But signal_ndim is: [%d], sizes.size() - 1 is: [%d]", - signal_ndim, sizes.size() - 1)); - - cudaDataType itype, otype, exec_type; - const auto complex_input = has_complex_input(fft_type); - const auto complex_output = has_complex_output(fft_type); - if (dtype == framework::proto::VarType::FP32) { - itype = complex_input ? CUDA_C_32F : CUDA_R_32F; - otype = complex_output ? CUDA_C_32F : CUDA_R_32F; - exec_type = CUDA_C_32F; - } else if (dtype == framework::proto::VarType::FP64) { - itype = complex_input ? CUDA_C_64F : CUDA_R_64F; - otype = complex_output ? CUDA_C_64F : CUDA_R_64F; - exec_type = CUDA_C_64F; - } else if (dtype == framework::proto::VarType::FP16) { - itype = complex_input ? CUDA_C_16F : CUDA_R_16F; - otype = complex_output ? CUDA_C_16F : CUDA_R_16F; - exec_type = CUDA_C_16F; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "cuFFT only support transforms of type float16, float32 and " - "float64")); - } - - // disable auto allocation of workspace to use allocator from the framework - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftSetAutoAllocation( - plan(), /* autoAllocate */ 0)); - - size_t ws_size_t; - - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftXtMakePlanMany( - plan(), signal_ndim, signal_sizes.data(), - /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, itype, - /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, otype, - batch, &ws_size_t, exec_type)); - - ws_size = ws_size_t; + void init(DFTI_CONFIG_VALUE precision, DFTI_CONFIG_VALUE signal_type, + MKL_LONG signal_ndim, MKL_LONG* sizes) { + PADDLE_ENFORCE_EQ(desc_.get(), nullptr, + platform::errors::AlreadyExists( + "DftiDescriptor has already been initialized.")); + + DFTI_DESCRIPTOR* raw_desc; + MKL_DFTI_CHECK(phi::dynload::DftiCreateDescriptorX( + &raw_desc, precision, signal_type, signal_ndim, sizes)); + desc_.reset(raw_desc); } - FFTConfig(const FFTConfig& other) = delete; - FFTConfig& operator=(const FFTConfig& other) = delete; - - FFTConfig(FFTConfig&& other) = delete; - FFTConfig& operator=(FFTConfig&& other) = delete; - - const cufftHandle& plan() const { return plan_ptr.get(); } - - FFTTransformType transform_type() const { return fft_type_; } - ScalarType data_type() const { return value_type_; } - size_t workspace_size() const { return ws_size; } + DFTI_DESCRIPTOR* get() const { + DFTI_DESCRIPTOR* raw_desc = desc_.get(); + PADDLE_ENFORCE_NOT_NULL(raw_desc, + platform::errors::PreconditionNotMet( + "DFTI DESCRIPTOR has not been initialized.")); + return raw_desc; + } private: - CuFFTHandle plan_ptr; - size_t ws_size; - FFTTransformType fft_type_; - ScalarType value_type_; + std::unique_ptr desc_; }; -#elif defined(PADDLE_WITH_HIP) -// An RAII encapsulation of cuFFTHandle 
-class HIPFFTHandle { - ::hipfftHandle handle_; - - public: - HIPFFTHandle() { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftCreate(&handle_)); +static DftiDescriptor _plan_mkl_fft( + const framework::proto::VarType::Type& in_dtype, + const framework::proto::VarType::Type& out_dtype, + const framework::DDim& in_strides, const framework::DDim& out_strides, + const std::vector& signal_sizes, FFTNormMode normalization, + bool forward) { + const DFTI_CONFIG_VALUE precision = [&] { + switch (in_dtype) { + case framework::proto::VarType::FP32: + return DFTI_SINGLE; + case framework::proto::VarType::COMPLEX64: + return DFTI_SINGLE; + case framework::proto::VarType::FP64: + return DFTI_DOUBLE; + case framework::proto::VarType::COMPLEX128: + return DFTI_DOUBLE; + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid input datatype (%s), input data type should be FP32, " + "FP64, COMPLEX64 or COMPLEX128.", + framework::DataTypeToString(in_dtype))); + } + }(); + + // C2C, R2C, C2R + const FFTTransformType fft_type = GetFFTTransformType(in_dtype, out_dtype); + const DFTI_CONFIG_VALUE domain = + (fft_type == FFTTransformType::C2C) ? DFTI_COMPLEX : DFTI_REAL; + + DftiDescriptor descriptor; + std::vector fft_sizes(signal_sizes.cbegin(), signal_sizes.cend()); + const MKL_LONG signal_ndim = fft_sizes.size() - 1; + descriptor.init(precision, domain, signal_ndim, fft_sizes.data() + 1); + + // placement inplace or not inplace + MKL_DFTI_CHECK(phi::dynload::DftiSetValue(descriptor.get(), DFTI_PLACEMENT, + DFTI_NOT_INPLACE)); + + // number of transformations + const MKL_LONG batch_size = fft_sizes[0]; + MKL_DFTI_CHECK(phi::dynload::DftiSetValue( + descriptor.get(), DFTI_NUMBER_OF_TRANSFORMS, batch_size)); + + // input & output distance + const MKL_LONG idist = in_strides[0]; + const MKL_LONG odist = out_strides[0]; + MKL_DFTI_CHECK( + phi::dynload::DftiSetValue(descriptor.get(), DFTI_INPUT_DISTANCE, idist)); + MKL_DFTI_CHECK(phi::dynload::DftiSetValue(descriptor.get(), + DFTI_OUTPUT_DISTANCE, odist)); + + // input & output stride + std::vector mkl_in_stride(1 + signal_ndim, 0); + std::vector mkl_out_stride(1 + signal_ndim, 0); + for (MKL_LONG i = 1; i <= signal_ndim; i++) { + mkl_in_stride[i] = in_strides[i]; + mkl_out_stride[i] = out_strides[i]; } - - HIPFFTHandle(const HIPFFTHandle& other) = delete; - HIPFFTHandle& operator=(const HIPFFTHandle& other) = delete; - - HIPFFTHandle(HIPFFTHandle&& other) = delete; - HIPFFTHandle& operator=(HIPFFTHandle&& other) = delete; - - ::hipfftHandle& get() { return handle_; } - const ::hipfftHandle& get() const { return handle_; } - - ~HIPFFTHandle() { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftDestroy(handle_)); + MKL_DFTI_CHECK(phi::dynload::DftiSetValue( + descriptor.get(), DFTI_INPUT_STRIDES, mkl_in_stride.data())); + MKL_DFTI_CHECK(phi::dynload::DftiSetValue( + descriptor.get(), DFTI_OUTPUT_STRIDES, mkl_out_stride.data())); + + // conjugate even storage + if (!(fft_type == FFTTransformType::C2C)) { + MKL_DFTI_CHECK(phi::dynload::DftiSetValue( + descriptor.get(), DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX)); } -}; -using plan_size_type = int; -// This class contains all the information needed to execute a cuFFT plan: -// 1. the plan -// 2. the workspace size needed -class FFTConfig { - public: - // Only move semantics is enought for this class. Although we already use - // unique_ptr for the plan, still remove copy constructor and assignment op so - // we don't accidentally copy and take perf hit. 
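// Illustrative sketch (not part of the patch): how the batched-layout values fed to
// DftiSetValue in _plan_mkl_fft above are derived for a contiguous, row-major tensor
// collapsed to [batch, s1, ..., sn]. RowMajorStrides is a hypothetical helper, not a
// Paddle API.
#include <cstdint>
#include <vector>

static std::vector<int64_t> RowMajorStrides(const std::vector<int64_t>& shape) {
  std::vector<int64_t> strides(shape.size(), 1);
  for (int i = static_cast<int>(shape.size()) - 2; i >= 0; --i) {
    strides[i] = strides[i + 1] * shape[i + 1];
  }
  return strides;
}

// Example: collapsed shape {batch = 4, 16, 32}
//   element strides            = {512, 32, 1}
//   DFTI_NUMBER_OF_TRANSFORMS  = 4           (shape[0])
//   DFTI_INPUT_DISTANCE        = 512         (strides[0], elements between batches)
//   DFTI_INPUT_STRIDES         = {0, 32, 1}  (slot 0 is the first-element offset,
//                                             left at 0 in the code above)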
- explicit FFTConfig(const FFTConfigKey& plan_key) - : FFTConfig( - std::vector(plan_key.sizes_, - plan_key.sizes_ + plan_key.signal_ndim_ + 1), - plan_key.signal_ndim_, plan_key.fft_type_, plan_key.value_type_) {} - - // sizes are full signal, including batch size and always two-sided - FFTConfig(const std::vector& sizes, const int64_t signal_ndim, - FFTTransformType fft_type, ScalarType dtype) - : fft_type_(fft_type), value_type_(dtype) { - // signal sizes (excluding batch dim) - std::vector signal_sizes(sizes.begin() + 1, sizes.end()); - - // input batch size - const auto batch = static_cast(sizes[0]); - // const int64_t signal_ndim = sizes.size() - 1; - PADDLE_ENFORCE_EQ(signal_ndim, sizes.size() - 1, - platform::errors::InvalidArgument( - "The signal_ndim must be equal to sizes.size() - 1," - "But signal_ndim is: [%d], sizes.size() - 1 is: [%d]", - signal_ndim, sizes.size() - 1)); - - hipfftType exec_type = [&] { - if (dtype == framework::proto::VarType::FP32) { - switch (fft_type) { - case FFTTransformType::C2C: - return HIPFFT_C2C; - case FFTTransformType::R2C: - return HIPFFT_R2C; - case FFTTransformType::C2R: - return HIPFFT_C2R; - } - } else if (dtype == framework::proto::VarType::FP64) { - switch (fft_type) { - case FFTTransformType::C2C: - return HIPFFT_Z2Z; - case FFTTransformType::R2C: - return HIPFFT_D2Z; - case FFTTransformType::C2R: - return HIPFFT_Z2D; - } + + MKL_LONG signal_numel = + std::accumulate(fft_sizes.cbegin() + 1, fft_sizes.cend(), 1UL, + std::multiplies()); + if (normalization != FFTNormMode::none) { + const double scale = + ((normalization == FFTNormMode::by_sqrt_n) + ? 1.0 / std::sqrt(static_cast(signal_numel)) + : 1.0 / static_cast(signal_numel)); + const auto scale_direction = [&]() { + if (fft_type == FFTTransformType::R2C || + (fft_type == FFTTransformType::C2C && forward)) { + return DFTI_FORWARD_SCALE; + } else { + // (fft_type == FFTTransformType::C2R || + // (fft_type == FFTTransformType::C2C && !forward)) + return DFTI_BACKWARD_SCALE; } - PADDLE_THROW(platform::errors::InvalidArgument( - "hipFFT only support transforms of type float32 and float64")); }(); - - // disable auto allocation of workspace to use allocator from the framework - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftSetAutoAllocation( - plan(), /* autoAllocate */ 0)); - - size_t ws_size_t; - - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftMakePlanMany( - plan(), signal_ndim, signal_sizes.data(), - /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, - /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, exec_type, - batch, &ws_size_t)); - - ws_size = ws_size_t; + MKL_DFTI_CHECK( + phi::dynload::DftiSetValue(descriptor.get(), scale_direction, scale)); } - const hipfftHandle& plan() const { return plan_ptr.get(); } - - FFTTransformType transform_type() const { return fft_type_; } - ScalarType data_type() const { return value_type_; } - size_t workspace_size() const { return ws_size; } + // commit the descriptor + MKL_DFTI_CHECK(phi::dynload::DftiCommitDescriptor(descriptor.get())); + return descriptor; +} - private: - HIPFFTHandle plan_ptr; - size_t ws_size; - FFTTransformType fft_type_; - ScalarType value_type_; -}; -#endif +// Execute a general fft operation (can be c2c, onesided r2c or onesided c2r) +template +void exec_fft(const DeviceContext& ctx, const Tensor* x, Tensor* out, + const std::vector& axes, FFTNormMode normalization, + bool forward) { + const framework::DDim& in_sizes = x->dims(); + const int ndim = in_sizes.size(); + const int signal_ndim = 
axes.size(); + const int batch_ndim = ndim - signal_ndim; + const framework::DDim& out_sizes = out->dims(); + + // make a dim permutation + std::vector dim_permute(ndim); + std::iota(dim_permute.begin(), dim_permute.end(), 0); + std::vector is_transformed_dim(ndim, false); + for (const auto& d : axes) { + is_transformed_dim[d] = true; + } + const auto batch_end = + std::partition(dim_permute.begin(), dim_permute.end(), + [&](size_t axis) { return !is_transformed_dim[axis]; }); + std::copy(axes.cbegin(), axes.cend(), batch_end); + + // transpose input according to that permutation + framework::DDim transposed_input_shape = in_sizes.transpose(dim_permute); + std::vector transposed_input_shape_ = + phi::vectorize(transposed_input_shape); + framework::Tensor transposed_input; + transposed_input.Resize(transposed_input_shape); + const auto place = ctx.GetPlace(); + transposed_input.mutable_data(place); + TransCompute(ndim, ctx, *x, &transposed_input, + dim_permute); + + // make an collapsed input: collapse batch axes for input + const int batch_size = std::accumulate( + transposed_input_shape.Get(), transposed_input_shape.Get() + batch_ndim, + 1L, std::multiplies()); + std::vector collapsed_input_shape_(1 + signal_ndim); + collapsed_input_shape_[0] = batch_size; + std::copy(transposed_input_shape_.begin() + batch_ndim, + transposed_input_shape_.end(), collapsed_input_shape_.begin() + 1); + const framework::DDim collapsed_input_shape = + phi::make_ddim(collapsed_input_shape_); + transposed_input.Resize(collapsed_input_shape); + framework::Tensor& collapsed_input = transposed_input; + + // make a collapsed output + std::vector collapsed_output_shape_(1 + signal_ndim); + collapsed_output_shape_[0] = batch_size; + for (int i = 0; i < signal_ndim; i++) { + collapsed_output_shape_[1 + i] = out_sizes[axes[i]]; + } + const framework::DDim collapsed_output_shape = + phi::make_ddim(collapsed_output_shape_); + framework::Tensor collapsed_output; + collapsed_output.Resize(collapsed_output_shape); + collapsed_output.mutable_data(place, out->type()); + + // signal sizes + std::vector signal_sizes(1 + signal_ndim); + signal_sizes[0] = batch_size; + for (int i = 0; i < signal_ndim; i++) { + signal_sizes[1 + i] = + std::max(collapsed_input_shape[1 + i], collapsed_output_shape[1 + i]); + } -// Hashing machinery for Key -// Fowler–Noll–Vo hash function -// see -// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function -template -struct KeyHash { - // Key must be a POD because we read out its memory - // contenst as char* when hashing - static_assert(std::is_pod::value, "Key must be plain old data type"); - - size_t operator()(const Key& params) const { - auto ptr = reinterpret_cast(¶ms); - uint32_t value = 0x811C9DC5; - for (int i = 0; i < static_cast(sizeof(Key)); ++i) { - value ^= ptr[i]; - value *= 0x01000193; + // input & output stride + const framework::DDim input_stride = phi::stride(collapsed_input_shape); + const framework::DDim output_stride = phi::stride(collapsed_output_shape); + + // make a DFTI_DESCRIPTOR + DftiDescriptor desc = + _plan_mkl_fft(framework::TransToProtoVarType(x->dtype()), + framework::TransToProtoVarType(out->dtype()), input_stride, + output_stride, signal_sizes, normalization, forward); + + const FFTTransformType fft_type = + GetFFTTransformType(framework::TransToProtoVarType(x->dtype()), + framework::TransToProtoVarType(out->type())); + if (fft_type == FFTTransformType::C2R && forward) { + framework::Tensor collapsed_input_conj(collapsed_input.dtype()); + 
collapsed_input_conj.mutable_data(collapsed_input.dims(), + ctx.GetPlace()); + // conjugate the input + platform::ForRange for_range(ctx, collapsed_input.numel()); + phi::funcs::ConjFunctor functor(collapsed_input.data(), + collapsed_input.numel(), + collapsed_input_conj.data()); + for_range(functor); + MKL_DFTI_CHECK(phi::dynload::DftiComputeBackward( + desc.get(), collapsed_input_conj.data(), collapsed_output.data())); + } else if (fft_type == FFTTransformType::R2C && !forward) { + framework::Tensor collapsed_output_conj(collapsed_output.dtype()); + collapsed_output_conj.mutable_data(collapsed_output.dims(), + ctx.GetPlace()); + MKL_DFTI_CHECK(phi::dynload::DftiComputeForward( + desc.get(), collapsed_input.data(), collapsed_output_conj.data())); + // conjugate the output + platform::ForRange for_range(ctx, collapsed_output.numel()); + phi::funcs::ConjFunctor functor(collapsed_output_conj.data(), + collapsed_output.numel(), + collapsed_output.data()); + for_range(functor); + } else { + if (forward) { + MKL_DFTI_CHECK(phi::dynload::DftiComputeForward( + desc.get(), collapsed_input.data(), collapsed_output.data())); + } else { + MKL_DFTI_CHECK(phi::dynload::DftiComputeBackward( + desc.get(), collapsed_input.data(), collapsed_output.data())); } - return static_cast(value); } -}; -template -struct KeyEqual { - // Key must be a POD because we read out its memory - // contenst as char* when comparing - static_assert(std::is_pod::value, "Key must be plain old data type"); + // resize for the collapsed output + framework::DDim transposed_output_shape = out_sizes.transpose(dim_permute); + collapsed_output.Resize(transposed_output_shape); + framework::Tensor& transposed_output = collapsed_output; - bool operator()(const Key& a, const Key& b) const { - auto ptr1 = reinterpret_cast(&a); - auto ptr2 = reinterpret_cast(&b); - return memcmp(ptr1, ptr2, sizeof(Key)) == 0; + // reverse the transposition + std::vector reverse_dim_permute(ndim); + for (int i = 0; i < ndim; i++) { + reverse_dim_permute[dim_permute[i]] = i; } -}; - -#if CUDA_VERSION < 10000 -// Note that the max plan number for CUDA version < 10 has to be 1023 -// due to a bug that fails on the 1024th plan -constexpr size_t CUFFT_MAX_PLAN_NUM = 1023; -constexpr size_t CUFFT_DEFAULT_CACHE_SIZE = CUFFT_MAX_PLAN_NUM; -#else -constexpr size_t CUFFT_MAX_PLAN_NUM = std::numeric_limits::max(); -// The default max cache size chosen for CUDA version > 10 is arbitrary. -// This number puts a limit on how big of a plan cache should we maintain by -// default. Users can always configure it via cufft_set_plan_cache_max_size. -constexpr size_t CUFFT_DEFAULT_CACHE_SIZE = 4096; -#endif -static_assert(CUFFT_MAX_PLAN_NUM >= 0 && - CUFFT_MAX_PLAN_NUM <= std::numeric_limits::max(), - "CUFFT_MAX_PLAN_NUM not in size_t range"); -static_assert(CUFFT_DEFAULT_CACHE_SIZE >= 0 && - CUFFT_DEFAULT_CACHE_SIZE <= CUFFT_MAX_PLAN_NUM, - "CUFFT_DEFAULT_CACHE_SIZE not in [0, CUFFT_MAX_PLAN_NUM] range"); - -// This cache assumes that the mapping from key to value never changes. -// This is **NOT** thread-safe. Please use a mutex when using it **AND** the -// value returned from try_emplace_value. -// The contract of using this cache is that try_emplace_value should only be -// used when the max_size is positive. 
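// Illustrative sketch (standalone, not part of the patch): the conjugation identity
// behind the C2R-forward / R2C-backward branches above, checked with a naive DFT.
// With F the unnormalized forward DFT and B the unnormalized backward DFT,
//   F(x) = conj(B(conj(x))),
// so when the result is real only the input needs conjugating (C2R forward), and
// when the input is real only the output needs conjugating (R2C backward), which is
// exactly what the two branches do.
#include <cmath>
#include <complex>
#include <cstddef>
#include <cstdio>
#include <vector>

using cd = std::complex<double>;

static std::vector<cd> NaiveDFT(const std::vector<cd>& x, int sign) {
  const size_t n = x.size();
  std::vector<cd> y(n);
  for (size_t k = 0; k < n; ++k) {
    cd acc(0.0, 0.0);
    for (size_t j = 0; j < n; ++j) {
      const double angle = sign * 2.0 * 3.141592653589793 * j * k / n;
      acc += x[j] * cd(std::cos(angle), std::sin(angle));
    }
    y[k] = acc;
  }
  return y;
}

int main() {
  std::vector<cd> x = {{1, 2}, {0, -1}, {3, 0.5}, {-2, 1}};
  auto forward = NaiveDFT(x, -1);            // F(x)
  std::vector<cd> xc(x);
  for (auto& v : xc) v = std::conj(v);
  auto backward_of_conj = NaiveDFT(xc, +1);  // B(conj(x))
  for (size_t k = 0; k < x.size(); ++k) {
    // forward[k] and conj(backward_of_conj[k]) agree to rounding error.
    std::printf("%zu: (%.3f,%.3f) vs (%.3f,%.3f)\n", k, forward[k].real(),
                forward[k].imag(), std::conj(backward_of_conj[k]).real(),
                std::conj(backward_of_conj[k]).imag());
  }
  return 0;
}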
-class FFTConfigCache { - public: - using kv_t = typename std::pair; - using map_t = typename std::unordered_map< - std::reference_wrapper, typename std::list::iterator, - KeyHash, KeyEqual>; - using map_kkv_iter_t = typename map_t::iterator; - - FFTConfigCache() : FFTConfigCache(CUFFT_DEFAULT_CACHE_SIZE) {} - - explicit FFTConfigCache(int64_t max_size) { _set_max_size(max_size); } - - FFTConfigCache(const FFTConfigCache& other) = delete; - FFTConfigCache& operator=(const FFTConfigCache& other) = delete; - - FFTConfigCache(FFTConfigCache&& other) noexcept - : _usage_list(std::move(other._usage_list)), - _cache_map(std::move(other._cache_map)), - _max_size(other._max_size) {} + TransCompute(ndim, ctx, transposed_output, + out, reverse_dim_permute); +} - FFTConfigCache& operator=(FFTConfigCache&& other) noexcept { - _usage_list = std::move(other._usage_list); - _cache_map = std::move(other._cache_map); - _max_size = other._max_size; - return *this; +template +struct FFTC2CFunctor { + void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x, + Tensor* out, const std::vector& axes, + FFTNormMode normalization, bool forward) { + exec_fft(ctx, x, out, axes, + normalization, forward); } +}; - // If key is in this cache, return the cached config. Otherwise, emplace the - // config in this cache and return it. - FFTConfig& lookup(FFTConfigKey params) { - PADDLE_ENFORCE_GT(_max_size, 0, - platform::errors::InvalidArgument( - "The max size of FFTConfigCache must be great than 0," - "But received is [%d]", - _max_size)); - - map_kkv_iter_t map_it = _cache_map.find(params); - // Hit, put to list front - if (map_it != _cache_map.end()) { - _usage_list.splice(_usage_list.begin(), _usage_list, map_it->second); - return map_it->second->second; - } +template +struct FFTR2CFunctor { + void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x, + Tensor* out, const std::vector& axes, + FFTNormMode normalization, bool forward) { + exec_fft(ctx, x, out, axes, + normalization, forward); + } +}; - // Miss - // remove if needed - if (_usage_list.size() >= _max_size) { - auto last = _usage_list.end(); - last--; - _cache_map.erase(last->first); - _usage_list.pop_back(); +template +struct FFTC2RFunctor { + void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x, + Tensor* out, const std::vector& axes, + FFTNormMode normalization, bool forward) { + if (axes.size() > 1) { + const std::vector c2c_dims(axes.begin(), axes.end() - 1); + Tensor temp; + temp.mutable_data(x->dims(), ctx.GetPlace()); + + FFTC2CFunctor c2c_functor; + c2c_functor(ctx, x, &temp, c2c_dims, normalization, forward); + + const std::vector new_axes{axes.back()}; + exec_fft(ctx, &temp, out, new_axes, + normalization, forward); + } else { + exec_fft(ctx, x, out, axes, + normalization, forward); } - - // construct new plan at list front, then insert into _cache_map - _usage_list.emplace_front(std::piecewise_construct, - std::forward_as_tuple(params), - std::forward_as_tuple(params)); - auto kv_it = _usage_list.begin(); - _cache_map.emplace(std::piecewise_construct, - std::forward_as_tuple(kv_it->first), - std::forward_as_tuple(kv_it)); - return kv_it->second; } - - void clear() { - _cache_map.clear(); - _usage_list.clear(); +}; +#elif defined(PADDLE_WITH_POCKETFFT) + +template +T compute_factor(int64_t size, FFTNormMode normalization) { + constexpr auto one = static_cast(1); + switch (normalization) { + case FFTNormMode::none: + return one; + case FFTNormMode::by_n: + return one / static_cast(size); + case 
FFTNormMode::by_sqrt_n: + return one / std::sqrt(static_cast(size)); } + PADDLE_THROW( + platform::errors::InvalidArgument("Unsupported normalization type")); +} - void resize(int64_t new_size) { - _set_max_size(new_size); - auto cur_size = _usage_list.size(); - if (cur_size > _max_size) { - auto delete_it = _usage_list.end(); - for (size_t i = 0; i < cur_size - _max_size; i++) { - delete_it--; - _cache_map.erase(delete_it->first); - } - _usage_list.erase(delete_it, _usage_list.end()); +template +struct FFTC2CFunctor { + void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x, + Tensor* out, const std::vector& axes, + FFTNormMode normalization, bool forward) { + using R = typename Ti::value_type; + using C = std::complex; + + const auto& input_dim = x->dims(); + const std::vector in_sizes = phi::vectorize(input_dim); + std::vector in_strides = + phi::vectorize(phi::stride(input_dim)); + const int64_t data_size = sizeof(C); + std::transform(in_strides.begin(), in_strides.end(), in_strides.begin(), + [&](std::ptrdiff_t s) { return s * data_size; }); + + const auto* in_data = reinterpret_cast(x->data()); + auto* out_data = reinterpret_cast(out->data()); + // pocketfft requires std::vector + std::vector axes_(axes.size()); + std::copy(axes.begin(), axes.end(), axes_.begin()); + // compuet factor + int64_t signal_numel = 1; + for (auto i : axes) { + signal_numel *= in_sizes[i]; } + R factor = compute_factor(signal_numel, normalization); + pocketfft::c2c(in_sizes, in_strides, in_strides, axes_, forward, in_data, + out_data, factor); } +}; - size_t size() const { return _cache_map.size(); } - - size_t max_size() const noexcept { return _max_size; } +template +struct FFTR2CFunctor { + void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x, + Tensor* out, const std::vector& axes, + FFTNormMode normalization, bool forward) { + using R = Ti; + using C = std::complex; + + const auto& input_dim = x->dims(); + const std::vector in_sizes = phi::vectorize(input_dim); + std::vector in_strides = + phi::vectorize(phi::stride(input_dim)); + { + const int64_t data_size = sizeof(R); + std::transform(in_strides.begin(), in_strides.end(), in_strides.begin(), + [&](std::ptrdiff_t s) { return s * data_size; }); + } - std::mutex mutex; + const auto& output_dim = out->dims(); + const std::vector out_sizes = phi::vectorize(output_dim); + std::vector out_strides = + phi::vectorize(phi::stride(output_dim)); + { + const int64_t data_size = sizeof(C); + std::transform(out_strides.begin(), out_strides.end(), + out_strides.begin(), + [&](std::ptrdiff_t s) { return s * data_size; }); + } - private: - // Only sets size and does value check. Does not resize the data structures. - void _set_max_size(int64_t new_size) { - // We check that 0 <= new_size <= CUFFT_MAX_PLAN_NUM here. Since - // CUFFT_MAX_PLAN_NUM is of type size_t, we need to do non-negativity check - // first. 
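// Illustrative sketch (not part of the patch): pocketfft takes strides in bytes,
// which is why the functors above multiply the element strides by sizeof(C) or
// sizeof(R) before calling pocketfft::c2c / r2c / c2r. ByteStrides is a hypothetical
// helper showing the conversion for a contiguous row-major tensor.
#include <cstddef>
#include <vector>

template <typename T>
static std::vector<std::ptrdiff_t> ByteStrides(
    const std::vector<std::ptrdiff_t>& shape) {
  std::vector<std::ptrdiff_t> strides(shape.size(), sizeof(T));
  for (int i = static_cast<int>(shape.size()) - 2; i >= 0; --i) {
    strides[i] = strides[i + 1] * shape[i + 1];
  }
  return strides;
}

// Example: a complex<double> tensor of shape {4, 16, 32} has element strides
// {512, 32, 1}; scaling by sizeof(std::complex<double>) == 16 gives the byte
// strides {8192, 512, 16} that pocketfft receives above.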
- PADDLE_ENFORCE_GE( - new_size, 0, - platform::errors::InvalidArgument( - "cuFFT plan cache size must be non-negative, But received is [%d]", - new_size)); - PADDLE_ENFORCE_LE(new_size, CUFFT_MAX_PLAN_NUM, - platform::errors::InvalidArgument( - "cuFFT plan cache size can not be larger than [%d], " - "But received is [%d]", - CUFFT_MAX_PLAN_NUM, new_size)); - _max_size = static_cast(new_size); + const auto* in_data = x->data(); + auto* out_data = reinterpret_cast(out->data()); + // pocketfft requires std::vector + std::vector axes_(axes.size()); + std::copy(axes.begin(), axes.end(), axes_.begin()); + // compuet normalization factor + int64_t signal_numel = 1; + for (auto i : axes) { + signal_numel *= in_sizes[i]; + } + R factor = compute_factor(signal_numel, normalization); + pocketfft::r2c(in_sizes, in_strides, out_strides, axes_, forward, in_data, + out_data, factor); } - - std::list _usage_list; - map_t _cache_map; - size_t _max_size; }; -static std::vector> plan_caches; -static std::mutex plan_caches_mutex; - -static inline FFTConfigCache& get_fft_plan_cache(int64_t device_index) { - std::lock_guard guard(plan_caches_mutex); +template +struct FFTC2RFunctor { + void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x, + Tensor* out, const std::vector& axes, + FFTNormMode normalization, bool forward) { + using R = To; + using C = std::complex; + + const auto& input_dim = x->dims(); + const std::vector in_sizes = phi::vectorize(input_dim); + std::vector in_strides = + phi::vectorize(phi::stride(input_dim)); + { + const int64_t data_size = sizeof(C); + std::transform(in_strides.begin(), in_strides.end(), in_strides.begin(), + [&](std::ptrdiff_t s) { return s * data_size; }); + } - if (device_index >= plan_caches.size()) { - plan_caches.resize(device_index + 1); - } + const auto& output_dim = out->dims(); + const std::vector out_sizes = phi::vectorize(output_dim); + std::vector out_strides = + phi::vectorize(phi::stride(output_dim)); + { + const int64_t data_size = sizeof(R); + std::transform(out_strides.begin(), out_strides.end(), + out_strides.begin(), + [&](std::ptrdiff_t s) { return s * data_size; }); + } - if (!plan_caches[device_index]) { - plan_caches[device_index] = std::make_unique(); + const auto* in_data = reinterpret_cast(x->data()); + auto* out_data = out->data(); + // pocketfft requires std::vector + std::vector axes_(axes.size()); + std::copy(axes.begin(), axes.end(), axes_.begin()); + // compuet normalization factor + int64_t signal_numel = 1; + for (auto i : axes) { + signal_numel *= out_sizes[i]; + } + R factor = compute_factor(signal_numel, normalization); + pocketfft::c2r(out_sizes, in_strides, out_strides, axes_, forward, in_data, + out_data, factor); } +}; - return *plan_caches[device_index]; -} +#endif } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/spectral_op.cc b/paddle/fluid/operators/spectral_op.cc index db3dc214bfe7ae7ae7facc59deca71ce9dfe91f6..0270f7e0576c8ea85e8464c4bc6236434210a4a3 100644 --- a/paddle/fluid/operators/spectral_op.cc +++ b/paddle/fluid/operators/spectral_op.cc @@ -13,28 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/operators/spectral_op.h" - -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" - -#if defined(PADDLE_WITH_ONEMKL) -#include "paddle/phi/backends/dynload/mklrt.h" -#elif defined(PADDLE_WITH_POCKETFFT) -#include "extern_pocketfft/pocketfft_hdronly.h" -#endif - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/for_range.h" +#include "paddle/fluid/operators/spectral_helper.h" namespace paddle { namespace operators { @@ -355,465 +334,6 @@ FFTNormMode get_norm_from_string(const std::string& norm, bool forward) { norm)); } -// FFT Functors -#if defined(PADDLE_WITH_ONEMKL) - -#define MKL_DFTI_CHECK(expr) \ - do { \ - MKL_LONG status = (expr); \ - if (!phi::dynload::DftiErrorClass(status, DFTI_NO_ERROR)) \ - PADDLE_THROW( \ - platform::errors::External(phi::dynload::DftiErrorMessage(status))); \ - } while (0); - -namespace { - -struct DftiDescriptorDeleter { - void operator()(DFTI_DESCRIPTOR_HANDLE handle) { - if (handle != nullptr) { - MKL_DFTI_CHECK(phi::dynload::DftiFreeDescriptor(&handle)); - } - } -}; - -// A RAII wrapper for MKL_DESCRIPTOR* -class DftiDescriptor { - public: - void init(DFTI_CONFIG_VALUE precision, DFTI_CONFIG_VALUE signal_type, - MKL_LONG signal_ndim, MKL_LONG* sizes) { - PADDLE_ENFORCE_EQ(desc_.get(), nullptr, - platform::errors::AlreadyExists( - "DftiDescriptor has already been initialized.")); - - DFTI_DESCRIPTOR* raw_desc; - MKL_DFTI_CHECK(phi::dynload::DftiCreateDescriptorX( - &raw_desc, precision, signal_type, signal_ndim, sizes)); - desc_.reset(raw_desc); - } - - DFTI_DESCRIPTOR* get() const { - DFTI_DESCRIPTOR* raw_desc = desc_.get(); - PADDLE_ENFORCE_NOT_NULL(raw_desc, - platform::errors::PreconditionNotMet( - "DFTI DESCRIPTOR has not been initialized.")); - return raw_desc; - } - - private: - std::unique_ptr desc_; -}; - -DftiDescriptor _plan_mkl_fft(const framework::proto::VarType::Type& in_dtype, - const framework::proto::VarType::Type& out_dtype, - const framework::DDim& in_strides, - const framework::DDim& out_strides, - const std::vector& signal_sizes, - FFTNormMode normalization, bool forward) { - const DFTI_CONFIG_VALUE precision = [&] { - switch (in_dtype) { - case framework::proto::VarType::FP32: - return DFTI_SINGLE; - case framework::proto::VarType::COMPLEX64: - return DFTI_SINGLE; - case framework::proto::VarType::FP64: - return DFTI_DOUBLE; - case framework::proto::VarType::COMPLEX128: - return DFTI_DOUBLE; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Invalid input datatype (%s), input data type should be FP32, " - "FP64, COMPLEX64 or COMPLEX128.", - framework::DataTypeToString(in_dtype))); - } - }(); - - // C2C, R2C, C2R - const FFTTransformType fft_type = GetFFTTransformType(in_dtype, out_dtype); - const DFTI_CONFIG_VALUE domain = - (fft_type == FFTTransformType::C2C) ? 
DFTI_COMPLEX : DFTI_REAL; - - DftiDescriptor descriptor; - std::vector fft_sizes(signal_sizes.cbegin(), signal_sizes.cend()); - const MKL_LONG signal_ndim = fft_sizes.size() - 1; - descriptor.init(precision, domain, signal_ndim, fft_sizes.data() + 1); - - // placement inplace or not inplace - MKL_DFTI_CHECK(phi::dynload::DftiSetValue(descriptor.get(), DFTI_PLACEMENT, - DFTI_NOT_INPLACE)); - - // number of transformations - const MKL_LONG batch_size = fft_sizes[0]; - MKL_DFTI_CHECK(phi::dynload::DftiSetValue( - descriptor.get(), DFTI_NUMBER_OF_TRANSFORMS, batch_size)); - - // input & output distance - const MKL_LONG idist = in_strides[0]; - const MKL_LONG odist = out_strides[0]; - MKL_DFTI_CHECK( - phi::dynload::DftiSetValue(descriptor.get(), DFTI_INPUT_DISTANCE, idist)); - MKL_DFTI_CHECK(phi::dynload::DftiSetValue(descriptor.get(), - DFTI_OUTPUT_DISTANCE, odist)); - - // input & output stride - std::vector mkl_in_stride(1 + signal_ndim, 0); - std::vector mkl_out_stride(1 + signal_ndim, 0); - for (MKL_LONG i = 1; i <= signal_ndim; i++) { - mkl_in_stride[i] = in_strides[i]; - mkl_out_stride[i] = out_strides[i]; - } - MKL_DFTI_CHECK(phi::dynload::DftiSetValue( - descriptor.get(), DFTI_INPUT_STRIDES, mkl_in_stride.data())); - MKL_DFTI_CHECK(phi::dynload::DftiSetValue( - descriptor.get(), DFTI_OUTPUT_STRIDES, mkl_out_stride.data())); - - // conjugate even storage - if (!(fft_type == FFTTransformType::C2C)) { - MKL_DFTI_CHECK(phi::dynload::DftiSetValue( - descriptor.get(), DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX)); - } - - MKL_LONG signal_numel = - std::accumulate(fft_sizes.cbegin() + 1, fft_sizes.cend(), 1UL, - std::multiplies()); - if (normalization != FFTNormMode::none) { - const double scale = - ((normalization == FFTNormMode::by_sqrt_n) - ? 
1.0 / std::sqrt(static_cast(signal_numel)) - : 1.0 / static_cast(signal_numel)); - const auto scale_direction = [&]() { - if (fft_type == FFTTransformType::R2C || - (fft_type == FFTTransformType::C2C && forward)) { - return DFTI_FORWARD_SCALE; - } else { - // (fft_type == FFTTransformType::C2R || - // (fft_type == FFTTransformType::C2C && !forward)) - return DFTI_BACKWARD_SCALE; - } - }(); - MKL_DFTI_CHECK( - phi::dynload::DftiSetValue(descriptor.get(), scale_direction, scale)); - } - - // commit the descriptor - MKL_DFTI_CHECK(phi::dynload::DftiCommitDescriptor(descriptor.get())); - return descriptor; -} - -// Execute a general fft operation (can be c2c, onesided r2c or onesided c2r) -template -void exec_fft(const DeviceContext& ctx, const Tensor* x, Tensor* out, - const std::vector& axes, FFTNormMode normalization, - bool forward) { - const framework::DDim& in_sizes = x->dims(); - const int ndim = in_sizes.size(); - const int signal_ndim = axes.size(); - const int batch_ndim = ndim - signal_ndim; - const framework::DDim& out_sizes = out->dims(); - - // make a dim permutation - std::vector dim_permute(ndim); - std::iota(dim_permute.begin(), dim_permute.end(), 0); - std::vector is_transformed_dim(ndim, false); - for (const auto& d : axes) { - is_transformed_dim[d] = true; - } - const auto batch_end = - std::partition(dim_permute.begin(), dim_permute.end(), - [&](size_t axis) { return !is_transformed_dim[axis]; }); - std::copy(axes.cbegin(), axes.cend(), batch_end); - - // transpose input according to that permutation - framework::DDim transposed_input_shape = in_sizes.transpose(dim_permute); - std::vector transposed_input_shape_ = - phi::vectorize(transposed_input_shape); - framework::Tensor transposed_input; - transposed_input.Resize(transposed_input_shape); - const auto place = ctx.GetPlace(); - transposed_input.mutable_data(place); - TransCompute(ndim, ctx, *x, &transposed_input, - dim_permute); - - // make an collapsed input: collapse batch axes for input - const int batch_size = std::accumulate( - transposed_input_shape.Get(), transposed_input_shape.Get() + batch_ndim, - 1L, std::multiplies()); - std::vector collapsed_input_shape_(1 + signal_ndim); - collapsed_input_shape_[0] = batch_size; - std::copy(transposed_input_shape_.begin() + batch_ndim, - transposed_input_shape_.end(), collapsed_input_shape_.begin() + 1); - const framework::DDim collapsed_input_shape = - phi::make_ddim(collapsed_input_shape_); - transposed_input.Resize(collapsed_input_shape); - framework::Tensor& collapsed_input = transposed_input; - - // make a collapsed output - std::vector collapsed_output_shape_(1 + signal_ndim); - collapsed_output_shape_[0] = batch_size; - for (int i = 0; i < signal_ndim; i++) { - collapsed_output_shape_[1 + i] = out_sizes[axes[i]]; - } - const framework::DDim collapsed_output_shape = - phi::make_ddim(collapsed_output_shape_); - framework::Tensor collapsed_output; - collapsed_output.Resize(collapsed_output_shape); - collapsed_output.mutable_data(place, out->type()); - - // signal sizes - std::vector signal_sizes(1 + signal_ndim); - signal_sizes[0] = batch_size; - for (int i = 0; i < signal_ndim; i++) { - signal_sizes[1 + i] = - std::max(collapsed_input_shape[1 + i], collapsed_output_shape[1 + i]); - } - - // input & output stride - const framework::DDim input_stride = phi::stride(collapsed_input_shape); - const framework::DDim output_stride = phi::stride(collapsed_output_shape); - - // make a DFTI_DESCRIPTOR - DftiDescriptor desc = - 
_plan_mkl_fft(framework::TransToProtoVarType(x->dtype()), - framework::TransToProtoVarType(out->dtype()), input_stride, - output_stride, signal_sizes, normalization, forward); - - const FFTTransformType fft_type = - GetFFTTransformType(framework::TransToProtoVarType(x->dtype()), - framework::TransToProtoVarType(out->type())); - if (fft_type == FFTTransformType::C2R && forward) { - framework::Tensor collapsed_input_conj(collapsed_input.dtype()); - collapsed_input_conj.mutable_data(collapsed_input.dims(), - ctx.GetPlace()); - // conjugate the input - platform::ForRange for_range(ctx, collapsed_input.numel()); - phi::funcs::ConjFunctor functor(collapsed_input.data(), - collapsed_input.numel(), - collapsed_input_conj.data()); - for_range(functor); - MKL_DFTI_CHECK(phi::dynload::DftiComputeBackward( - desc.get(), collapsed_input_conj.data(), collapsed_output.data())); - } else if (fft_type == FFTTransformType::R2C && !forward) { - framework::Tensor collapsed_output_conj(collapsed_output.dtype()); - collapsed_output_conj.mutable_data(collapsed_output.dims(), - ctx.GetPlace()); - MKL_DFTI_CHECK(phi::dynload::DftiComputeForward( - desc.get(), collapsed_input.data(), collapsed_output_conj.data())); - // conjugate the output - platform::ForRange for_range(ctx, collapsed_output.numel()); - phi::funcs::ConjFunctor functor(collapsed_output_conj.data(), - collapsed_output.numel(), - collapsed_output.data()); - for_range(functor); - } else { - if (forward) { - MKL_DFTI_CHECK(phi::dynload::DftiComputeForward( - desc.get(), collapsed_input.data(), collapsed_output.data())); - } else { - MKL_DFTI_CHECK(phi::dynload::DftiComputeBackward( - desc.get(), collapsed_input.data(), collapsed_output.data())); - } - } - - // resize for the collapsed output - framework::DDim transposed_output_shape = out_sizes.transpose(dim_permute); - collapsed_output.Resize(transposed_output_shape); - framework::Tensor& transposed_output = collapsed_output; - - // reverse the transposition - std::vector reverse_dim_permute(ndim); - for (int i = 0; i < ndim; i++) { - reverse_dim_permute[dim_permute[i]] = i; - } - TransCompute(ndim, ctx, transposed_output, - out, reverse_dim_permute); -} -} // anonymous namespace - -template -struct FFTC2CFunctor { - void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x, - Tensor* out, const std::vector& axes, - FFTNormMode normalization, bool forward) { - exec_fft(ctx, x, out, axes, - normalization, forward); - } -}; - -template -struct FFTR2CFunctor { - void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x, - Tensor* out, const std::vector& axes, - FFTNormMode normalization, bool forward) { - exec_fft(ctx, x, out, axes, - normalization, forward); - } -}; - -template -struct FFTC2RFunctor { - void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x, - Tensor* out, const std::vector& axes, - FFTNormMode normalization, bool forward) { - if (axes.size() > 1) { - const std::vector c2c_dims(axes.begin(), axes.end() - 1); - Tensor temp; - temp.mutable_data(x->dims(), ctx.GetPlace()); - - FFTC2CFunctor c2c_functor; - c2c_functor(ctx, x, &temp, c2c_dims, normalization, forward); - - const std::vector new_axes{axes.back()}; - exec_fft(ctx, &temp, out, new_axes, - normalization, forward); - } else { - exec_fft(ctx, x, out, axes, - normalization, forward); - } - } -}; - -#elif defined(PADDLE_WITH_POCKETFFT) - -namespace { -template -T compute_factor(int64_t size, FFTNormMode normalization) { - constexpr auto one = static_cast(1); - switch (normalization) { - case 
FFTNormMode::none: - return one; - case FFTNormMode::by_n: - return one / static_cast(size); - case FFTNormMode::by_sqrt_n: - return one / std::sqrt(static_cast(size)); - } - PADDLE_THROW( - platform::errors::InvalidArgument("Unsupported normalization type")); -} -} // anonymous namespace - -template -struct FFTC2CFunctor { - void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x, - Tensor* out, const std::vector& axes, - FFTNormMode normalization, bool forward) { - using R = typename Ti::value_type; - using C = std::complex; - - const auto& input_dim = x->dims(); - const std::vector in_sizes = phi::vectorize(input_dim); - std::vector in_strides = - phi::vectorize(phi::stride(input_dim)); - const int64_t data_size = sizeof(C); - std::transform(in_strides.begin(), in_strides.end(), in_strides.begin(), - [&](std::ptrdiff_t s) { return s * data_size; }); - - const auto* in_data = reinterpret_cast(x->data()); - auto* out_data = reinterpret_cast(out->data()); - // pocketfft requires std::vector - std::vector axes_(axes.size()); - std::copy(axes.begin(), axes.end(), axes_.begin()); - // compuet factor - int64_t signal_numel = 1; - for (auto i : axes) { - signal_numel *= in_sizes[i]; - } - R factor = compute_factor(signal_numel, normalization); - pocketfft::c2c(in_sizes, in_strides, in_strides, axes_, forward, in_data, - out_data, factor); - } -}; - -template -struct FFTR2CFunctor { - void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x, - Tensor* out, const std::vector& axes, - FFTNormMode normalization, bool forward) { - using R = Ti; - using C = std::complex; - - const auto& input_dim = x->dims(); - const std::vector in_sizes = phi::vectorize(input_dim); - std::vector in_strides = - phi::vectorize(phi::stride(input_dim)); - { - const int64_t data_size = sizeof(R); - std::transform(in_strides.begin(), in_strides.end(), in_strides.begin(), - [&](std::ptrdiff_t s) { return s * data_size; }); - } - - const auto& output_dim = out->dims(); - const std::vector out_sizes = phi::vectorize(output_dim); - std::vector out_strides = - phi::vectorize(phi::stride(output_dim)); - { - const int64_t data_size = sizeof(C); - std::transform(out_strides.begin(), out_strides.end(), - out_strides.begin(), - [&](std::ptrdiff_t s) { return s * data_size; }); - } - - const auto* in_data = x->data(); - auto* out_data = reinterpret_cast(out->data()); - // pocketfft requires std::vector - std::vector axes_(axes.size()); - std::copy(axes.begin(), axes.end(), axes_.begin()); - // compuet normalization factor - int64_t signal_numel = 1; - for (auto i : axes) { - signal_numel *= in_sizes[i]; - } - R factor = compute_factor(signal_numel, normalization); - pocketfft::r2c(in_sizes, in_strides, out_strides, axes_, forward, in_data, - out_data, factor); - } -}; - -template -struct FFTC2RFunctor { - void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x, - Tensor* out, const std::vector& axes, - FFTNormMode normalization, bool forward) { - using R = To; - using C = std::complex; - - const auto& input_dim = x->dims(); - const std::vector in_sizes = phi::vectorize(input_dim); - std::vector in_strides = - phi::vectorize(phi::stride(input_dim)); - { - const int64_t data_size = sizeof(C); - std::transform(in_strides.begin(), in_strides.end(), in_strides.begin(), - [&](std::ptrdiff_t s) { return s * data_size; }); - } - - const auto& output_dim = out->dims(); - const std::vector out_sizes = phi::vectorize(output_dim); - std::vector out_strides = - phi::vectorize(phi::stride(output_dim)); 
- { - const int64_t data_size = sizeof(R); - std::transform(out_strides.begin(), out_strides.end(), - out_strides.begin(), - [&](std::ptrdiff_t s) { return s * data_size; }); - } - - const auto* in_data = reinterpret_cast(x->data()); - auto* out_data = out->data(); - // pocketfft requires std::vector - std::vector axes_(axes.size()); - std::copy(axes.begin(), axes.end(), axes_.begin()); - // compuet normalization factor - int64_t signal_numel = 1; - for (auto i : axes) { - signal_numel *= out_sizes[i]; - } - R factor = compute_factor(signal_numel, normalization); - pocketfft::c2r(out_sizes, in_strides, out_strides, axes_, forward, in_data, - out_data, factor); - } -}; - -#endif - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/spectral_op.cu b/paddle/fluid/operators/spectral_op.cu index b7b6b5302afd637b02f76492e04ea5a5f4209d6a..b7fb83d9d5cefc8e107862989fee8af4003de8e7 100644 --- a/paddle/fluid/operators/spectral_op.cu +++ b/paddle/fluid/operators/spectral_op.cu @@ -8,496 +8,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "paddle/fluid/operators/conj_op.h" -#include "paddle/fluid/operators/spectral_helper.h" +#include "paddle/fluid/operators/spectral_op.cu.h" #include "paddle/fluid/operators/spectral_op.h" -#include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" - -namespace paddle { -namespace operators { - -namespace { - -// Calculates the normalization constant -double fft_normalization_scale(FFTNormMode normalization, - const std::vector& sizes, - const std::vector& dims) { - // auto norm = static_cast(normalization); - if (normalization == FFTNormMode::none) { - return static_cast(1.0); - } - - int64_t signal_numel = 1; - for (auto dim : dims) { - signal_numel *= sizes[dim]; - } - const double scale_denom = (normalization == FFTNormMode::by_sqrt_n) - ? std::sqrt(signal_numel) - : static_cast(signal_numel); - return static_cast(1.0 / scale_denom); -} - -template -void exec_normalization(const DeviceContext& ctx, const Tensor* in, Tensor* out, - FFTNormMode normalization, - const std::vector& sizes, - const std::vector& axes) { - double scale = fft_normalization_scale(normalization, sizes, axes); - if (scale != 1.0) { - auto eigen_out = framework::EigenVector::Flatten(*out); - auto eigen_in = framework::EigenVector::Flatten(*in); - auto dev = ctx.eigen_device(); - EigenScale::Eval(*dev, eigen_out, eigen_in, - static_cast(scale), - static_cast(0), false); - } else { - framework::TensorCopy(*in, ctx.GetPlace(), out); - } -} - -#if defined(PADDLE_WITH_CUDA) -FFTConfigKey create_fft_configkey(const framework::Tensor& input, - const framework::Tensor& output, - int signal_ndim) { - // Create the transform plan (either from cache or locally) - const auto value_type = - framework::IsComplexType(framework::TransToProtoVarType(input.dtype())) - ? 
framework::ToRealType(framework::TransToProtoVarType(input.dtype())) - : framework::TransToProtoVarType(input.dtype()); - auto fft_type = - GetFFTTransformType(framework::TransToProtoVarType(input.dtype()), - framework::TransToProtoVarType(output.dtype())); - // signal sizes - std::vector signal_size(signal_ndim + 1); - - signal_size[0] = input.dims()[0]; - for (int64_t i = 1; i <= signal_ndim; ++i) { - auto in_size = input.dims()[i]; - auto out_size = output.dims()[i]; - signal_size[i] = std::max(in_size, out_size); - } - FFTConfigKey key(phi::vectorize(input.dims()), phi::vectorize(output.dims()), - signal_size, fft_type, value_type); - return key; -} - -// Execute a pre-planned transform -static void exec_cufft_plan_raw(const FFTConfig& config, void* in_data, - void* out_data, bool forward) { - auto& plan = config.plan(); - - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftXtExec( - plan, in_data, out_data, forward ? CUFFT_FORWARD : CUFFT_INVERSE)); -} - -template -void exec_cufft_plan(const DeviceContext& ctx, const FFTConfig& config, - framework::Tensor* input, framework::Tensor* output, - bool forward) { - // execute transform plan - auto fft_type = config.transform_type(); - if (fft_type == FFTTransformType::C2R && forward) { - forward = false; - framework::Tensor input_conj(input->type()); - input_conj.mutable_data(input->dims(), ctx.GetPlace()); - platform::ForRange for_range(ctx, input->numel()); - phi::funcs::ConjFunctor functor(input->data(), input->numel(), - input_conj.data()); - for_range(functor); - exec_cufft_plan_raw(config, input_conj.data(), output->data(), forward); - } else if (fft_type == FFTTransformType::R2C && !forward) { - forward = true; - framework::Tensor out_conj(output->type()); - out_conj.mutable_data(output->dims(), ctx.GetPlace()); - exec_cufft_plan_raw(config, input->data(), out_conj.data(), forward); - - platform::ForRange for_range(ctx, output->numel()); - phi::funcs::ConjFunctor functor(out_conj.data(), output->numel(), - output->data()); - for_range(functor); - } else { - exec_cufft_plan_raw(config, input->data(), output->data(), forward); - } -} - -#elif defined(PADDLE_WITH_HIP) - -FFTConfigKey create_fft_configkey(const framework::Tensor& input, - const framework::Tensor& output, - int signal_ndim) { - // Create the transform plan (either from cache or locally) - const auto value_type = - framework::IsComplexType(framework::TransToProtoVarType(input.dtype())) - ? 
framework::ToRealType(framework::TransToProtoVarType(input.dtype())) - : framework::TransToProtoVarType(input.dtype()); - auto fft_type = - GetFFTTransformType(framework::TransToProtoVarType(input.dtype()), - framework::TransToProtoVarType(output.type())); - // signal sizes - std::vector signal_size(signal_ndim + 1); - - signal_size[0] = input.dims()[0]; - for (int64_t i = 1; i <= signal_ndim; ++i) { - auto in_size = input.dims()[i]; - auto out_size = output.dims()[i]; - signal_size[i] = std::max(in_size, out_size); - } - FFTConfigKey key(phi::vectorize(input.dims()), phi::vectorize(output.dims()), - signal_size, fft_type, value_type); - return key; -} - -// Execute a pre-planned transform -static void exec_hipfft_plan_raw(const FFTConfig& config, void* in_data, - void* out_data, bool forward) { - auto& plan = config.plan(); - - auto value_type = config.data_type(); - if (value_type == framework::proto::VarType::FP32) { - switch (config.transform_type()) { - case FFTTransformType::C2C: { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecC2C( - plan, static_cast(in_data), - static_cast(out_data), - forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD)); - return; - } - case FFTTransformType::R2C: { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecR2C( - plan, static_cast(in_data), - static_cast(out_data))); - return; - } - case FFTTransformType::C2R: { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecC2R( - plan, static_cast(in_data), - static_cast(out_data))); - return; - } - } - } else if (value_type == framework::proto::VarType::FP64) { - switch (config.transform_type()) { - case FFTTransformType::C2C: { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecZ2Z( - plan, static_cast(in_data), - static_cast(out_data), - forward ? 
HIPFFT_FORWARD : HIPFFT_BACKWARD)); - return; - } - case FFTTransformType::R2C: { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecD2Z( - plan, static_cast(in_data), - static_cast(out_data))); - return; - } - case FFTTransformType::C2R: { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecZ2D( - plan, static_cast(in_data), - static_cast(out_data))); - return; - } - } - } - PADDLE_THROW(platform::errors::InvalidArgument( - "hipFFT only support transforms of type float32 and float64")); -} - -template -void exec_hipfft_plan(const DeviceContext& ctx, const FFTConfig& config, - framework::Tensor* input, framework::Tensor* output, - bool forward) { - auto fft_type = config.transform_type(); - if (fft_type == FFTTransformType::C2R && forward) { - forward = false; - framework::Tensor input_conj(input->type()); - input_conj.mutable_data(input->dims(), ctx.GetPlace()); - platform::ForRange for_range(ctx, input->numel()); - phi::funcs::ConjFunctor functor(input->data(), input->numel(), - input_conj.data()); - for_range(functor); - exec_hipfft_plan_raw(config, input_conj.data(), output->data(), forward); - } else if (fft_type == FFTTransformType::R2C && !forward) { - forward = true; - framework::Tensor out_conj(output->type()); - out_conj.mutable_data(output->dims(), ctx.GetPlace()); - exec_hipfft_plan_raw(config, input->data(), out_conj.data(), forward); - - platform::ForRange for_range(ctx, output->numel()); - phi::funcs::ConjFunctor functor(out_conj.data(), output->numel(), - output->data()); - for_range(functor); - } else { - exec_hipfft_plan_raw(config, input->data(), output->data(), forward); - } -} - -#endif - -// Execute a general unnormalized fft operation (can be c2c, onesided r2c or -// onesided c2r) -template -void exec_fft(const DeviceContext& ctx, const Tensor* X, Tensor* out, - const std::vector& dim, bool forward) { - const auto x_dims = phi::vectorize(X->dims()); - const int64_t ndim = static_cast(X->dims().size()); - auto tensor_place = ctx.GetPlace(); - - // make a dim permutation - std::vector dim_permute(ndim); - std::iota(dim_permute.begin(), dim_permute.end(), int{0}); - std::vector is_transformed_dim(ndim); - for (const auto& d : dim) { - is_transformed_dim[d] = true; - } - auto batch_end = - std::partition(dim_permute.begin(), dim_permute.end(), - [&](int64_t d) { return !is_transformed_dim[d]; }); - std::sort(dim_permute.begin(), batch_end); - std::copy(dim.cbegin(), dim.cend(), batch_end); - - // transpose input according to dim permutation - auto transposed_input_shape = X->dims().transpose(dim_permute); - framework::Tensor transposed_input; - transposed_input.Resize(transposed_input_shape); - transposed_input.mutable_data(tensor_place); - TransCompute(ndim, ctx, *X, &transposed_input, - dim_permute); - - // Reshape batch dimensions into a single dimension - const int64_t signal_ndim = static_cast(dim.size()); - std::vector collapsed_input_shape(signal_ndim + 1); - - auto transposed_input_shape_ = phi::vectorize(transposed_input_shape); - const int64_t batch_dims = ndim - signal_ndim; - auto batch_size = - std::accumulate(transposed_input_shape_.begin(), - transposed_input_shape_.begin() + batch_dims, - static_cast(1), std::multiplies()); - collapsed_input_shape[0] = batch_size; - - std::copy(transposed_input_shape_.begin() + batch_dims, - transposed_input_shape_.end(), collapsed_input_shape.begin() + 1); - - framework::Tensor& collapsed_input = transposed_input; - collapsed_input.Resize(phi::make_ddim(collapsed_input_shape)); - - // make a collpased output 
- const auto out_dims = phi::vectorize(out->dims()); - std::vector collapsed_output_shape(1 + signal_ndim); - collapsed_output_shape[0] = batch_size; - for (size_t i = 0; i < dim.size(); ++i) { - collapsed_output_shape[i + 1] = out_dims[dim[i]]; - } - framework::Tensor collapsed_output; - collapsed_output.Resize(phi::make_ddim(collapsed_output_shape)); - collapsed_output.mutable_data(tensor_place); - - FFTConfig* config = nullptr; - -#if defined(PADDLE_WITH_CUDA) - std::unique_ptr config_ = nullptr; - // create plan - FFTConfigKey key = - create_fft_configkey(collapsed_input, collapsed_output, signal_ndim); - bool using_cache = false; -#if !defined(CUFFT_VERSION) || (CUFFT_VERSION < 10200) - using_cache = true; -#endif - - if (using_cache) { - const int64_t device_id = static_cast( - reinterpret_cast(&collapsed_input.place()) - ->GetDeviceId()); - FFTConfigCache& plan_cache = get_fft_plan_cache(device_id); - std::unique_lock guard(plan_cache.mutex, std::defer_lock); - guard.lock(); - config = &(plan_cache.lookup(key)); - } else { - config_ = std::make_unique(key); - config = config_.get(); - } - - // prepare cufft for execution - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cufftSetStream(config->plan(), ctx.stream())); - framework::Tensor workspace_tensor; - workspace_tensor.mutable_data(tensor_place, config->workspace_size()); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftSetWorkArea( - config->plan(), workspace_tensor.data())); - // execute transform plan - exec_cufft_plan(ctx, *config, &collapsed_input, - &collapsed_output, forward); - -#elif defined(PADDLE_WITH_HIP) - // create plan - FFTConfigKey key = - create_fft_configkey(collapsed_input, collapsed_output, signal_ndim); - const int64_t device_id = static_cast( - reinterpret_cast(&collapsed_input.place()) - ->GetDeviceId()); - FFTConfigCache& plan_cache = get_fft_plan_cache(device_id); - std::unique_lock guard(plan_cache.mutex, std::defer_lock); - guard.lock(); - config = &(plan_cache.lookup(key)); - - // prepare cufft for execution - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::hipfftSetStream(config->plan(), ctx.stream())); - framework::Tensor workspace_tensor; - workspace_tensor.mutable_data(tensor_place, config->workspace_size()); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftSetWorkArea( - config->plan(), workspace_tensor.data())); - // execute transform plan - exec_hipfft_plan(ctx, *config, &collapsed_input, - &collapsed_output, forward); -#endif - - // Inverting output by reshape and transpose to original batch and dimension - auto transposed_out_shape = out->dims().transpose(dim_permute); - - collapsed_output.Resize(transposed_out_shape); - auto& transposed_output = collapsed_output; - - std::vector reverse_dim_permute(ndim); - for (size_t i = 0; i < ndim; i++) { - reverse_dim_permute[dim_permute[i]] = i; - } - - TransCompute(ndim, ctx, transposed_output, out, - reverse_dim_permute); -} - -} // anonymous namespace - -// Use the optimized path to perform single R2C or C2R if transformation dim is -// supported by cuFFT -bool use_optimized_fft_path(const std::vector& axes) { - // For performance reason, when axes starts with (0, 1), do not use the - // optimized path. 
- if (axes.size() > kMaxFFTNdim || - (axes.size() >= 2 && axes[0] == 0 && axes[1] == 1)) { - return false; - } else { - return true; - } -} - -template -struct FFTC2CFunctor { - void operator()(const platform::CUDADeviceContext& ctx, const Tensor* X, - Tensor* out, const std::vector& axes, - FFTNormMode normalization, bool forward) { - if (axes.empty()) { - framework::TensorCopy(*X, ctx.GetPlace(), out); - return; - } - - framework::Tensor* p_out = out; - std::vector out_dims = phi::vectorize(X->dims()); - std::vector working_axes(axes.begin(), axes.end()); - std::vector first_dims; - size_t max_dims; - framework::Tensor working_tensor; - working_tensor.mutable_data(X->dims(), ctx.GetPlace()); - framework::Tensor* p_working_tensor = &working_tensor; - framework::TensorCopy(*X, ctx.GetPlace(), &working_tensor); - - while (true) { - max_dims = - std::min(static_cast(kMaxFFTNdim), working_axes.size()); - first_dims.assign(working_axes.end() - max_dims, working_axes.end()); - - exec_fft(ctx, p_working_tensor, - p_out, first_dims, forward); - working_axes.resize(working_axes.size() - max_dims); - first_dims.clear(); - - if (working_axes.empty()) { - break; - } - - std::swap(p_out, p_working_tensor); - } - exec_normalization( - ctx, p_out, out, normalization, out_dims, axes); - } -}; - -template -struct FFTC2RFunctor { - void operator()(const platform::CUDADeviceContext& ctx, const Tensor* X, - Tensor* out, const std::vector& axes, - FFTNormMode normalization, bool forward) { - std::vector in_dims = phi::vectorize(X->dims()); - std::vector out_dims = phi::vectorize(out->dims()); - - if (use_optimized_fft_path(axes)) { - framework::Tensor x_copy(X->type()); - x_copy.mutable_data(X->dims(), ctx.GetPlace()); - framework::TensorCopy(*X, ctx.GetPlace(), &x_copy); - exec_fft(ctx, &x_copy, out, axes, - forward); - } else { - framework::Tensor temp_tensor; - temp_tensor.mutable_data(X->dims(), ctx.GetPlace()); - const std::vector dims(axes.begin(), axes.end() - 1); - - FFTC2CFunctor c2c_functor; - c2c_functor(ctx, X, &temp_tensor, dims, FFTNormMode::none, forward); - - exec_fft(ctx, &temp_tensor, out, - {axes.back()}, forward); - } - exec_normalization( - ctx, out, out, normalization, out_dims, axes); - } -}; - -// n dimension real to complex FFT use cufft lib -template -struct FFTR2CFunctor { - void operator()(const platform::CUDADeviceContext& ctx, const Tensor* X, - Tensor* out, const std::vector& axes, - FFTNormMode normalization, bool forward) { - // Step1: R2C transform on the last dimension - framework::Tensor* r2c_out = out; - const std::vector last_dim{axes.back()}; - std::vector out_dims = phi::vectorize(out->dims()); - exec_fft(ctx, X, r2c_out, last_dim, - forward); - - // Step2: C2C transform on the remaining dimension - framework::Tensor c2c_out; - if (axes.size() > 1) { - c2c_out.mutable_data(out->dims(), ctx.GetPlace()); - std::vector remain_dim(axes.begin(), axes.end() - 1); - FFTC2CFunctor fft_c2c_func; - fft_c2c_func(ctx, r2c_out, &c2c_out, remain_dim, FFTNormMode::none, - forward); - } - - const auto in_sizes = phi::vectorize(X->dims()); - framework::Tensor* norm_tensor = axes.size() > 1 ? 
&c2c_out : r2c_out; - exec_normalization( - ctx, norm_tensor, out, normalization, in_sizes, axes); - } -}; - -} // namespace operators -} // namespace paddle namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( diff --git a/paddle/fluid/operators/spectral_op.cu.h b/paddle/fluid/operators/spectral_op.cu.h new file mode 100644 index 0000000000000000000000000000000000000000..fdb0e0d2848846fba4e369d5e4f2ef48f263bd29 --- /dev/null +++ b/paddle/fluid/operators/spectral_op.cu.h @@ -0,0 +1,944 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/operators/conj_op.h" +#include "paddle/fluid/operators/spectral_op.h" +#include "paddle/fluid/operators/transpose_op.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/kernels/funcs/complex_functors.h" + +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/platform/dynload/hipfft.h" +#endif + +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/dynload/cufft.h" +#endif + +namespace paddle { +namespace operators { + +using ScalarType = framework::proto::VarType::Type; +const int64_t kMaxFFTNdim = 3; +const int64_t kMaxDataNdim = kMaxFFTNdim + 1; +// This struct is used to easily compute hashes of the +// parameters. It will be the **key** to the plan cache. +struct FFTConfigKey { + // between 1 and kMaxFFTNdim, i.e., 1 <= signal_ndim <= 3 + int64_t signal_ndim_; + // These include additional batch dimension as well. 
+ int64_t sizes_[kMaxDataNdim]; + int64_t input_shape_[kMaxDataNdim]; + int64_t output_shape_[kMaxDataNdim]; + FFTTransformType fft_type_; + ScalarType value_type_; + + FFTConfigKey() = default; + + FFTConfigKey(const std::vector& in_shape, + const std::vector& out_shape, + const std::vector& signal_size, + FFTTransformType fft_type, ScalarType value_type) { + // Padding bits must be zeroed for hashing + memset(this, 0, sizeof(*this)); + signal_ndim_ = signal_size.size() - 1; + fft_type_ = fft_type; + value_type_ = value_type; + + std::copy(signal_size.cbegin(), signal_size.cend(), sizes_); + std::copy(in_shape.cbegin(), in_shape.cend(), input_shape_); + std::copy(out_shape.cbegin(), out_shape.cend(), output_shape_); + } +}; + +#if defined(PADDLE_WITH_CUDA) +// An RAII encapsulation of cuFFTHandle +class CuFFTHandle { + ::cufftHandle handle_; + + public: + CuFFTHandle() { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftCreate(&handle_)); + } + + CuFFTHandle(const CuFFTHandle& other) = delete; + CuFFTHandle& operator=(const CuFFTHandle& other) = delete; + + CuFFTHandle(CuFFTHandle&& other) = delete; + CuFFTHandle& operator=(CuFFTHandle&& other) = delete; + + ::cufftHandle& get() { return handle_; } + const ::cufftHandle& get() const { return handle_; } + + ~CuFFTHandle() { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftDestroy(handle_)); + } +}; + +using plan_size_type = long long int; // NOLINT +// This class contains all the information needed to execute a cuFFT plan: +// 1. the plan +// 2. the workspace size needed +class FFTConfig { + public: + // Only move semantics is enought for this class. Although we already use + // unique_ptr for the plan, still remove copy constructor and assignment op so + // we don't accidentally copy and take perf hit. + explicit FFTConfig(const FFTConfigKey& plan_key) + : FFTConfig( + std::vector(plan_key.sizes_, + plan_key.sizes_ + plan_key.signal_ndim_ + 1), + plan_key.signal_ndim_, plan_key.fft_type_, plan_key.value_type_) {} + + // sizes are full signal, including batch size and always two-sided + FFTConfig(const std::vector& sizes, const int64_t signal_ndim, + FFTTransformType fft_type, ScalarType dtype) + : fft_type_(fft_type), value_type_(dtype) { + // signal sizes (excluding batch dim) + std::vector signal_sizes(sizes.begin() + 1, sizes.end()); + + // input batch size + const auto batch = static_cast(sizes[0]); + // const int64_t signal_ndim = sizes.size() - 1; + PADDLE_ENFORCE_EQ(signal_ndim, sizes.size() - 1, + platform::errors::InvalidArgument( + "The signal_ndim must be equal to sizes.size() - 1," + "But signal_ndim is: [%d], sizes.size() - 1 is: [%d]", + signal_ndim, sizes.size() - 1)); + + cudaDataType itype, otype, exec_type; + const auto complex_input = has_complex_input(fft_type); + const auto complex_output = has_complex_output(fft_type); + if (dtype == framework::proto::VarType::FP32) { + itype = complex_input ? CUDA_C_32F : CUDA_R_32F; + otype = complex_output ? CUDA_C_32F : CUDA_R_32F; + exec_type = CUDA_C_32F; + } else if (dtype == framework::proto::VarType::FP64) { + itype = complex_input ? CUDA_C_64F : CUDA_R_64F; + otype = complex_output ? CUDA_C_64F : CUDA_R_64F; + exec_type = CUDA_C_64F; + } else if (dtype == framework::proto::VarType::FP16) { + itype = complex_input ? CUDA_C_16F : CUDA_R_16F; + otype = complex_output ? 
CUDA_C_16F : CUDA_R_16F; + exec_type = CUDA_C_16F; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "cuFFT only support transforms of type float16, float32 and " + "float64")); + } + + // disable auto allocation of workspace to use allocator from the framework + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftSetAutoAllocation( + plan(), /* autoAllocate */ 0)); + + size_t ws_size_t; + + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftXtMakePlanMany( + plan(), signal_ndim, signal_sizes.data(), + /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, itype, + /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, otype, + batch, &ws_size_t, exec_type)); + + ws_size = ws_size_t; + } + + FFTConfig(const FFTConfig& other) = delete; + FFTConfig& operator=(const FFTConfig& other) = delete; + + FFTConfig(FFTConfig&& other) = delete; + FFTConfig& operator=(FFTConfig&& other) = delete; + + const cufftHandle& plan() const { return plan_ptr.get(); } + + FFTTransformType transform_type() const { return fft_type_; } + ScalarType data_type() const { return value_type_; } + size_t workspace_size() const { return ws_size; } + + private: + CuFFTHandle plan_ptr; + size_t ws_size; + FFTTransformType fft_type_; + ScalarType value_type_; +}; + +#elif defined(PADDLE_WITH_HIP) +// An RAII encapsulation of cuFFTHandle +class HIPFFTHandle { + ::hipfftHandle handle_; + + public: + HIPFFTHandle() { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftCreate(&handle_)); + } + + HIPFFTHandle(const HIPFFTHandle& other) = delete; + HIPFFTHandle& operator=(const HIPFFTHandle& other) = delete; + + HIPFFTHandle(HIPFFTHandle&& other) = delete; + HIPFFTHandle& operator=(HIPFFTHandle&& other) = delete; + + ::hipfftHandle& get() { return handle_; } + const ::hipfftHandle& get() const { return handle_; } + + ~HIPFFTHandle() { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftDestroy(handle_)); + } +}; +using plan_size_type = int; +// This class contains all the information needed to execute a cuFFT plan: +// 1. the plan +// 2. the workspace size needed +class FFTConfig { + public: + // Only move semantics is enought for this class. Although we already use + // unique_ptr for the plan, still remove copy constructor and assignment op so + // we don't accidentally copy and take perf hit. 
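The CUDA FFTConfig above disables cuFFT's internal work-area allocation (cufftSetAutoAllocation(plan(), 0)) and only records the required size; exec_fft later binds a buffer obtained from the framework allocator via cufftSetWorkArea. For reference, here is a minimal standalone sketch of that call sequence against the raw cuFFT/CUDA runtime APIs, with hypothetical sizes and error checking omitted; the HIP class below follows the same pattern through hipfftMakePlanMany.

// Sketch only: plain CUDA runtime + cuFFT, outside the Paddle allocator.
#include <cstdio>
#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>
#include <library_types.h>

int main() {
  cufftHandle plan;
  cufftCreate(&plan);
  cufftSetAutoAllocation(plan, /*autoAllocate=*/0);  // caller manages the work area

  long long n[1] = {1024};  // signal length (hypothetical), 1-D C2C, FP32
  long long batch = 8;
  size_t ws_size = 0;
  cufftXtMakePlanMany(plan, /*rank=*/1, n,
                      /*inembed=*/nullptr, /*istride=*/1, /*idist=*/1, CUDA_C_32F,
                      /*onembed=*/nullptr, /*ostride=*/1, /*odist=*/1, CUDA_C_32F,
                      batch, &ws_size, CUDA_C_32F);
  std::printf("required work area: %zu bytes\n", ws_size);

  void *work = nullptr, *in = nullptr, *out = nullptr;
  cudaMalloc(&work, ws_size);  // the framework allocator plays this role in exec_fft
  cudaMalloc(&in, sizeof(cufftComplex) * n[0] * batch);
  cudaMalloc(&out, sizeof(cufftComplex) * n[0] * batch);
  cufftSetWorkArea(plan, work);

  cufftXtExec(plan, in, out, CUFFT_FORWARD);  // same call as exec_cufft_plan_raw

  cudaFree(out); cudaFree(in); cudaFree(work);
  cufftDestroy(plan);
  return 0;
}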
+ explicit FFTConfig(const FFTConfigKey& plan_key) + : FFTConfig( + std::vector(plan_key.sizes_, + plan_key.sizes_ + plan_key.signal_ndim_ + 1), + plan_key.signal_ndim_, plan_key.fft_type_, plan_key.value_type_) {} + + // sizes are full signal, including batch size and always two-sided + FFTConfig(const std::vector& sizes, const int64_t signal_ndim, + FFTTransformType fft_type, ScalarType dtype) + : fft_type_(fft_type), value_type_(dtype) { + // signal sizes (excluding batch dim) + std::vector signal_sizes(sizes.begin() + 1, sizes.end()); + + // input batch size + const auto batch = static_cast(sizes[0]); + // const int64_t signal_ndim = sizes.size() - 1; + PADDLE_ENFORCE_EQ(signal_ndim, sizes.size() - 1, + platform::errors::InvalidArgument( + "The signal_ndim must be equal to sizes.size() - 1," + "But signal_ndim is: [%d], sizes.size() - 1 is: [%d]", + signal_ndim, sizes.size() - 1)); + + hipfftType exec_type = [&] { + if (dtype == framework::proto::VarType::FP32) { + switch (fft_type) { + case FFTTransformType::C2C: + return HIPFFT_C2C; + case FFTTransformType::R2C: + return HIPFFT_R2C; + case FFTTransformType::C2R: + return HIPFFT_C2R; + } + } else if (dtype == framework::proto::VarType::FP64) { + switch (fft_type) { + case FFTTransformType::C2C: + return HIPFFT_Z2Z; + case FFTTransformType::R2C: + return HIPFFT_D2Z; + case FFTTransformType::C2R: + return HIPFFT_Z2D; + } + } + PADDLE_THROW(platform::errors::InvalidArgument( + "hipFFT only support transforms of type float32 and float64")); + }(); + + // disable auto allocation of workspace to use allocator from the framework + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftSetAutoAllocation( + plan(), /* autoAllocate */ 0)); + + size_t ws_size_t; + + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftMakePlanMany( + plan(), signal_ndim, signal_sizes.data(), + /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, + /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, exec_type, + batch, &ws_size_t)); + + ws_size = ws_size_t; + } + + const hipfftHandle& plan() const { return plan_ptr.get(); } + + FFTTransformType transform_type() const { return fft_type_; } + ScalarType data_type() const { return value_type_; } + size_t workspace_size() const { return ws_size; } + + private: + HIPFFTHandle plan_ptr; + size_t ws_size; + FFTTransformType fft_type_; + ScalarType value_type_; +}; +#endif + +// Hashing machinery for Key +// Fowler–Noll–Vo hash function +// see +// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function +template +struct KeyHash { + // Key must be a POD because we read out its memory + // contenst as char* when hashing + static_assert(std::is_pod::value, "Key must be plain old data type"); + + size_t operator()(const Key& params) const { + auto ptr = reinterpret_cast(¶ms); + uint32_t value = 0x811C9DC5; + for (int i = 0; i < static_cast(sizeof(Key)); ++i) { + value ^= ptr[i]; + value *= 0x01000193; + } + return static_cast(value); + } +}; + +template +struct KeyEqual { + // Key must be a POD because we read out its memory + // contenst as char* when comparing + static_assert(std::is_pod::value, "Key must be plain old data type"); + + bool operator()(const Key& a, const Key& b) const { + auto ptr1 = reinterpret_cast(&a); + auto ptr2 = reinterpret_cast(&b); + return memcmp(ptr1, ptr2, sizeof(Key)) == 0; + } +}; + +#if CUDA_VERSION < 10000 +// Note that the max plan number for CUDA version < 10 has to be 1023 +// due to a bug that fails on the 1024th plan +constexpr size_t CUFFT_MAX_PLAN_NUM = 
1023; +constexpr size_t CUFFT_DEFAULT_CACHE_SIZE = CUFFT_MAX_PLAN_NUM; +#else +constexpr size_t CUFFT_MAX_PLAN_NUM = std::numeric_limits::max(); +// The default max cache size chosen for CUDA version > 10 is arbitrary. +// This number puts a limit on how big of a plan cache should we maintain by +// default. Users can always configure it via cufft_set_plan_cache_max_size. +constexpr size_t CUFFT_DEFAULT_CACHE_SIZE = 4096; +#endif +static_assert(CUFFT_MAX_PLAN_NUM >= 0 && + CUFFT_MAX_PLAN_NUM <= std::numeric_limits::max(), + "CUFFT_MAX_PLAN_NUM not in size_t range"); +static_assert(CUFFT_DEFAULT_CACHE_SIZE >= 0 && + CUFFT_DEFAULT_CACHE_SIZE <= CUFFT_MAX_PLAN_NUM, + "CUFFT_DEFAULT_CACHE_SIZE not in [0, CUFFT_MAX_PLAN_NUM] range"); + +// This cache assumes that the mapping from key to value never changes. +// This is **NOT** thread-safe. Please use a mutex when using it **AND** the +// value returned from try_emplace_value. +// The contract of using this cache is that try_emplace_value should only be +// used when the max_size is positive. +class FFTConfigCache { + public: + using kv_t = typename std::pair; + using map_t = typename std::unordered_map< + std::reference_wrapper, typename std::list::iterator, + KeyHash, KeyEqual>; + using map_kkv_iter_t = typename map_t::iterator; + + FFTConfigCache() : FFTConfigCache(CUFFT_DEFAULT_CACHE_SIZE) {} + + explicit FFTConfigCache(int64_t max_size) { _set_max_size(max_size); } + + FFTConfigCache(const FFTConfigCache& other) = delete; + FFTConfigCache& operator=(const FFTConfigCache& other) = delete; + + FFTConfigCache(FFTConfigCache&& other) noexcept + : _usage_list(std::move(other._usage_list)), + _cache_map(std::move(other._cache_map)), + _max_size(other._max_size) {} + + FFTConfigCache& operator=(FFTConfigCache&& other) noexcept { + _usage_list = std::move(other._usage_list); + _cache_map = std::move(other._cache_map); + _max_size = other._max_size; + return *this; + } + + // If key is in this cache, return the cached config. Otherwise, emplace the + // config in this cache and return it. 
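KeyHash/KeyEqual above hash and compare the POD key byte-for-byte, which is why the FFTConfigKey constructor zeroes itself with memset before filling its members: uninitialized padding bytes would otherwise make logically equal keys hash and compare differently. The cache below then pairs an std::list (usage order) with an std::unordered_map (key to list iterator); the real class keys the map with std::reference_wrapper into the list node to avoid copying the key. A minimal self-contained sketch of that LRU pattern, with int keys and std::string values standing in for FFTConfigKey/FFTConfig:

#include <iostream>
#include <list>
#include <string>
#include <unordered_map>
#include <utility>

// Minimal LRU sketch: the most recently used entry lives at the front of the list.
class LruCache {
 public:
  explicit LruCache(size_t max_size) : max_size_(max_size) {}

  std::string& Lookup(int key) {
    auto it = map_.find(key);
    if (it != map_.end()) {  // hit: splice the node to the front, keep iterators valid
      list_.splice(list_.begin(), list_, it->second);
      return it->second->second;
    }
    if (list_.size() >= max_size_) {  // miss: evict the least recently used entry
      map_.erase(list_.back().first);
      list_.pop_back();
    }
    list_.emplace_front(key, "value-for-" + std::to_string(key));
    map_[key] = list_.begin();
    return list_.front().second;
  }

 private:
  size_t max_size_;
  std::list<std::pair<int, std::string>> list_;  // usage order
  std::unordered_map<int, std::list<std::pair<int, std::string>>::iterator> map_;
};

int main() {
  LruCache cache(2);
  cache.Lookup(1);
  cache.Lookup(2);
  cache.Lookup(1);                        // 1 becomes most recent
  cache.Lookup(3);                        // evicts 2
  std::cout << cache.Lookup(3) << "\n";   // prints "value-for-3"
  return 0;
}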
+ FFTConfig& lookup(FFTConfigKey params) { + PADDLE_ENFORCE_GT(_max_size, 0, + platform::errors::InvalidArgument( + "The max size of FFTConfigCache must be great than 0," + "But received is [%d]", + _max_size)); + + map_kkv_iter_t map_it = _cache_map.find(params); + // Hit, put to list front + if (map_it != _cache_map.end()) { + _usage_list.splice(_usage_list.begin(), _usage_list, map_it->second); + return map_it->second->second; + } + + // Miss + // remove if needed + if (_usage_list.size() >= _max_size) { + auto last = _usage_list.end(); + last--; + _cache_map.erase(last->first); + _usage_list.pop_back(); + } + + // construct new plan at list front, then insert into _cache_map + _usage_list.emplace_front(std::piecewise_construct, + std::forward_as_tuple(params), + std::forward_as_tuple(params)); + auto kv_it = _usage_list.begin(); + _cache_map.emplace(std::piecewise_construct, + std::forward_as_tuple(kv_it->first), + std::forward_as_tuple(kv_it)); + return kv_it->second; + } + + void clear() { + _cache_map.clear(); + _usage_list.clear(); + } + + void resize(int64_t new_size) { + _set_max_size(new_size); + auto cur_size = _usage_list.size(); + if (cur_size > _max_size) { + auto delete_it = _usage_list.end(); + for (size_t i = 0; i < cur_size - _max_size; i++) { + delete_it--; + _cache_map.erase(delete_it->first); + } + _usage_list.erase(delete_it, _usage_list.end()); + } + } + + size_t size() const { return _cache_map.size(); } + + size_t max_size() const noexcept { return _max_size; } + + std::mutex mutex; + + private: + // Only sets size and does value check. Does not resize the data structures. + void _set_max_size(int64_t new_size) { + // We check that 0 <= new_size <= CUFFT_MAX_PLAN_NUM here. Since + // CUFFT_MAX_PLAN_NUM is of type size_t, we need to do non-negativity check + // first. + PADDLE_ENFORCE_GE( + new_size, 0, + platform::errors::InvalidArgument( + "cuFFT plan cache size must be non-negative, But received is [%d]", + new_size)); + PADDLE_ENFORCE_LE(new_size, CUFFT_MAX_PLAN_NUM, + platform::errors::InvalidArgument( + "cuFFT plan cache size can not be larger than [%d], " + "But received is [%d]", + CUFFT_MAX_PLAN_NUM, new_size)); + _max_size = static_cast(new_size); + } + + std::list _usage_list; + map_t _cache_map; + size_t _max_size; +}; + +static std::vector> plan_caches; +static std::mutex plan_caches_mutex; + +static inline FFTConfigCache& get_fft_plan_cache(int64_t device_index) { + std::lock_guard guard(plan_caches_mutex); + + if (device_index >= plan_caches.size()) { + plan_caches.resize(device_index + 1); + } + + if (!plan_caches[device_index]) { + plan_caches[device_index] = std::make_unique(); + } + + return *plan_caches[device_index]; +} + +// Calculates the normalization constant +static double fft_normalization_scale(FFTNormMode normalization, + const std::vector& sizes, + const std::vector& dims) { + // auto norm = static_cast(normalization); + if (normalization == FFTNormMode::none) { + return static_cast(1.0); + } + + int64_t signal_numel = 1; + for (auto dim : dims) { + signal_numel *= sizes[dim]; + } + const double scale_denom = (normalization == FFTNormMode::by_sqrt_n) + ? 
std::sqrt(signal_numel) + : static_cast(signal_numel); + return static_cast(1.0 / scale_denom); +} + +template +void exec_normalization(const DeviceContext& ctx, const Tensor* in, Tensor* out, + FFTNormMode normalization, + const std::vector& sizes, + const std::vector& axes) { + double scale = fft_normalization_scale(normalization, sizes, axes); + if (scale != 1.0) { + auto eigen_out = framework::EigenVector::Flatten(*out); + auto eigen_in = framework::EigenVector::Flatten(*in); + auto dev = ctx.eigen_device(); + EigenScale::Eval(*dev, eigen_out, eigen_in, + static_cast(scale), + static_cast(0), false); + } else { + framework::TensorCopy(*in, ctx.GetPlace(), out); + } +} + +#if defined(PADDLE_WITH_CUDA) +static FFTConfigKey create_fft_configkey(const framework::Tensor& input, + const framework::Tensor& output, + int signal_ndim) { + // Create the transform plan (either from cache or locally) + const auto value_type = + framework::IsComplexType(framework::TransToProtoVarType(input.dtype())) + ? framework::ToRealType(framework::TransToProtoVarType(input.dtype())) + : framework::TransToProtoVarType(input.dtype()); + auto fft_type = + GetFFTTransformType(framework::TransToProtoVarType(input.dtype()), + framework::TransToProtoVarType(output.dtype())); + // signal sizes + std::vector signal_size(signal_ndim + 1); + + signal_size[0] = input.dims()[0]; + for (int64_t i = 1; i <= signal_ndim; ++i) { + auto in_size = input.dims()[i]; + auto out_size = output.dims()[i]; + signal_size[i] = std::max(in_size, out_size); + } + FFTConfigKey key(phi::vectorize(input.dims()), phi::vectorize(output.dims()), + signal_size, fft_type, value_type); + return key; +} + +// Execute a pre-planned transform +static void exec_cufft_plan_raw(const FFTConfig& config, void* in_data, + void* out_data, bool forward) { + auto& plan = config.plan(); + + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftXtExec( + plan, in_data, out_data, forward ? CUFFT_FORWARD : CUFFT_INVERSE)); +} + +template +void exec_cufft_plan(const DeviceContext& ctx, const FFTConfig& config, + framework::Tensor* input, framework::Tensor* output, + bool forward) { + // execute transform plan + auto fft_type = config.transform_type(); + if (fft_type == FFTTransformType::C2R && forward) { + forward = false; + framework::Tensor input_conj(input->type()); + input_conj.mutable_data(input->dims(), ctx.GetPlace()); + platform::ForRange for_range(ctx, input->numel()); + phi::funcs::ConjFunctor functor(input->data(), input->numel(), + input_conj.data()); + for_range(functor); + exec_cufft_plan_raw(config, input_conj.data(), output->data(), forward); + } else if (fft_type == FFTTransformType::R2C && !forward) { + forward = true; + framework::Tensor out_conj(output->type()); + out_conj.mutable_data(output->dims(), ctx.GetPlace()); + exec_cufft_plan_raw(config, input->data(), out_conj.data(), forward); + + platform::ForRange for_range(ctx, output->numel()); + phi::funcs::ConjFunctor functor(out_conj.data(), output->numel(), + output->data()); + for_range(functor); + } else { + exec_cufft_plan_raw(config, input->data(), output->data(), forward); + } +} + +#elif defined(PADDLE_WITH_HIP) + +static FFTConfigKey create_fft_configkey(const framework::Tensor& input, + const framework::Tensor& output, + int signal_ndim) { + // Create the transform plan (either from cache or locally) + const auto value_type = + framework::IsComplexType(framework::TransToProtoVarType(input.dtype())) + ? 
framework::ToRealType(framework::TransToProtoVarType(input.dtype())) + : framework::TransToProtoVarType(input.dtype()); + auto fft_type = + GetFFTTransformType(framework::TransToProtoVarType(input.dtype()), + framework::TransToProtoVarType(output.type())); + // signal sizes + std::vector signal_size(signal_ndim + 1); + + signal_size[0] = input.dims()[0]; + for (int64_t i = 1; i <= signal_ndim; ++i) { + auto in_size = input.dims()[i]; + auto out_size = output.dims()[i]; + signal_size[i] = std::max(in_size, out_size); + } + FFTConfigKey key(phi::vectorize(input.dims()), phi::vectorize(output.dims()), + signal_size, fft_type, value_type); + return key; +} + +// Execute a pre-planned transform +static void exec_hipfft_plan_raw(const FFTConfig& config, void* in_data, + void* out_data, bool forward) { + auto& plan = config.plan(); + + auto value_type = config.data_type(); + if (value_type == framework::proto::VarType::FP32) { + switch (config.transform_type()) { + case FFTTransformType::C2C: { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecC2C( + plan, static_cast(in_data), + static_cast(out_data), + forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD)); + return; + } + case FFTTransformType::R2C: { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecR2C( + plan, static_cast(in_data), + static_cast(out_data))); + return; + } + case FFTTransformType::C2R: { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecC2R( + plan, static_cast(in_data), + static_cast(out_data))); + return; + } + } + } else if (value_type == framework::proto::VarType::FP64) { + switch (config.transform_type()) { + case FFTTransformType::C2C: { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecZ2Z( + plan, static_cast(in_data), + static_cast(out_data), + forward ? 
HIPFFT_FORWARD : HIPFFT_BACKWARD)); + return; + } + case FFTTransformType::R2C: { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecD2Z( + plan, static_cast(in_data), + static_cast(out_data))); + return; + } + case FFTTransformType::C2R: { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecZ2D( + plan, static_cast(in_data), + static_cast(out_data))); + return; + } + } + } + PADDLE_THROW(platform::errors::InvalidArgument( + "hipFFT only support transforms of type float32 and float64")); +} + +template +void exec_hipfft_plan(const DeviceContext& ctx, const FFTConfig& config, + framework::Tensor* input, framework::Tensor* output, + bool forward) { + auto fft_type = config.transform_type(); + if (fft_type == FFTTransformType::C2R && forward) { + forward = false; + framework::Tensor input_conj(input->type()); + input_conj.mutable_data(input->dims(), ctx.GetPlace()); + platform::ForRange for_range(ctx, input->numel()); + phi::funcs::ConjFunctor functor(input->data(), input->numel(), + input_conj.data()); + for_range(functor); + exec_hipfft_plan_raw(config, input_conj.data(), output->data(), forward); + } else if (fft_type == FFTTransformType::R2C && !forward) { + forward = true; + framework::Tensor out_conj(output->type()); + out_conj.mutable_data(output->dims(), ctx.GetPlace()); + exec_hipfft_plan_raw(config, input->data(), out_conj.data(), forward); + + platform::ForRange for_range(ctx, output->numel()); + phi::funcs::ConjFunctor functor(out_conj.data(), output->numel(), + output->data()); + for_range(functor); + } else { + exec_hipfft_plan_raw(config, input->data(), output->data(), forward); + } +} + +#endif + +// Execute a general unnormalized fft operation (can be c2c, onesided r2c or +// onesided c2r) +template +void exec_fft(const DeviceContext& ctx, const Tensor* X, Tensor* out, + const std::vector& dim, bool forward) { + const auto x_dims = phi::vectorize(X->dims()); + const int64_t ndim = static_cast(X->dims().size()); + auto tensor_place = ctx.GetPlace(); + + // make a dim permutation + std::vector dim_permute(ndim); + std::iota(dim_permute.begin(), dim_permute.end(), int{0}); + std::vector is_transformed_dim(ndim); + for (const auto& d : dim) { + is_transformed_dim[d] = true; + } + auto batch_end = + std::partition(dim_permute.begin(), dim_permute.end(), + [&](int64_t d) { return !is_transformed_dim[d]; }); + std::sort(dim_permute.begin(), batch_end); + std::copy(dim.cbegin(), dim.cend(), batch_end); + + // transpose input according to dim permutation + auto transposed_input_shape = X->dims().transpose(dim_permute); + framework::Tensor transposed_input; + transposed_input.Resize(transposed_input_shape); + transposed_input.mutable_data(tensor_place); + TransCompute(ndim, ctx, *X, &transposed_input, + dim_permute); + + // Reshape batch dimensions into a single dimension + const int64_t signal_ndim = static_cast(dim.size()); + std::vector collapsed_input_shape(signal_ndim + 1); + + auto transposed_input_shape_ = phi::vectorize(transposed_input_shape); + const int64_t batch_dims = ndim - signal_ndim; + auto batch_size = + std::accumulate(transposed_input_shape_.begin(), + transposed_input_shape_.begin() + batch_dims, + static_cast(1), std::multiplies()); + collapsed_input_shape[0] = batch_size; + + std::copy(transposed_input_shape_.begin() + batch_dims, + transposed_input_shape_.end(), collapsed_input_shape.begin() + 1); + + framework::Tensor& collapsed_input = transposed_input; + collapsed_input.Resize(phi::make_ddim(collapsed_input_shape)); + + // make a collpased output 
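exec_fft above reorders the tensor so that all non-transformed (batch) axes come first, then flattens them into a single leading batch dimension before handing the data to cuFFT/hipFFT. A small standalone sketch of that bookkeeping for a hypothetical input of shape [2, 3, 4, 5] transformed over dim = {1, 3}:

#include <algorithm>
#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
  const std::vector<int64_t> x_dims = {2, 3, 4, 5};  // hypothetical input shape
  const std::vector<int64_t> dim = {1, 3};           // axes to transform
  const int64_t ndim = static_cast<int64_t>(x_dims.size());

  // Same permutation logic as exec_fft: batch axes first, transformed axes last.
  std::vector<int64_t> dim_permute(ndim);
  std::iota(dim_permute.begin(), dim_permute.end(), int64_t{0});
  std::vector<bool> is_transformed(ndim, false);
  for (auto d : dim) is_transformed[d] = true;
  auto batch_end = std::partition(dim_permute.begin(), dim_permute.end(),
                                  [&](int64_t d) { return !is_transformed[d]; });
  std::sort(dim_permute.begin(), batch_end);
  std::copy(dim.cbegin(), dim.cend(), batch_end);
  // dim_permute == {0, 2, 1, 3}; transposed shape == {2, 4, 3, 5}

  // Collapse the leading batch axes into one dimension.
  const int64_t signal_ndim = static_cast<int64_t>(dim.size());
  const int64_t batch_dims = ndim - signal_ndim;
  std::vector<int64_t> transposed(ndim);
  for (int64_t i = 0; i < ndim; ++i) transposed[i] = x_dims[dim_permute[i]];
  const int64_t batch_size =
      std::accumulate(transposed.begin(), transposed.begin() + batch_dims,
                      int64_t{1}, std::multiplies<int64_t>());
  std::vector<int64_t> collapsed(signal_ndim + 1);
  collapsed[0] = batch_size;
  std::copy(transposed.begin() + batch_dims, transposed.end(), collapsed.begin() + 1);

  // Prints: collapsed shape = 8 3 5  (a rank-2 FFT over 8 batched signals)
  std::cout << "collapsed shape =";
  for (auto v : collapsed) std::cout << " " << v;
  std::cout << "\n";
  return 0;
}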
+ const auto out_dims = phi::vectorize(out->dims()); + std::vector collapsed_output_shape(1 + signal_ndim); + collapsed_output_shape[0] = batch_size; + for (size_t i = 0; i < dim.size(); ++i) { + collapsed_output_shape[i + 1] = out_dims[dim[i]]; + } + framework::Tensor collapsed_output; + collapsed_output.Resize(phi::make_ddim(collapsed_output_shape)); + collapsed_output.mutable_data(tensor_place); + + FFTConfig* config = nullptr; + +#if defined(PADDLE_WITH_CUDA) + std::unique_ptr config_ = nullptr; + // create plan + FFTConfigKey key = + create_fft_configkey(collapsed_input, collapsed_output, signal_ndim); + bool using_cache = false; +#if !defined(CUFFT_VERSION) || (CUFFT_VERSION < 10200) + using_cache = true; +#endif + + if (using_cache) { + const int64_t device_id = static_cast( + reinterpret_cast(&collapsed_input.place()) + ->GetDeviceId()); + FFTConfigCache& plan_cache = get_fft_plan_cache(device_id); + std::unique_lock guard(plan_cache.mutex, std::defer_lock); + guard.lock(); + config = &(plan_cache.lookup(key)); + } else { + config_ = std::make_unique(key); + config = config_.get(); + } + + // prepare cufft for execution + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cufftSetStream(config->plan(), ctx.stream())); + framework::Tensor workspace_tensor; + workspace_tensor.mutable_data(tensor_place, config->workspace_size()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftSetWorkArea( + config->plan(), workspace_tensor.data())); + // execute transform plan + exec_cufft_plan(ctx, *config, &collapsed_input, + &collapsed_output, forward); + +#elif defined(PADDLE_WITH_HIP) + // create plan + FFTConfigKey key = + create_fft_configkey(collapsed_input, collapsed_output, signal_ndim); + const int64_t device_id = static_cast( + reinterpret_cast(&collapsed_input.place()) + ->GetDeviceId()); + FFTConfigCache& plan_cache = get_fft_plan_cache(device_id); + std::unique_lock guard(plan_cache.mutex, std::defer_lock); + guard.lock(); + config = &(plan_cache.lookup(key)); + + // prepare cufft for execution + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::hipfftSetStream(config->plan(), ctx.stream())); + framework::Tensor workspace_tensor; + workspace_tensor.mutable_data(tensor_place, config->workspace_size()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftSetWorkArea( + config->plan(), workspace_tensor.data())); + // execute transform plan + exec_hipfft_plan(ctx, *config, &collapsed_input, + &collapsed_output, forward); +#endif + + // Inverting output by reshape and transpose to original batch and dimension + auto transposed_out_shape = out->dims().transpose(dim_permute); + + collapsed_output.Resize(transposed_out_shape); + auto& transposed_output = collapsed_output; + + std::vector reverse_dim_permute(ndim); + for (size_t i = 0; i < ndim; i++) { + reverse_dim_permute[dim_permute[i]] = i; + } + + TransCompute(ndim, ctx, transposed_output, out, + reverse_dim_permute); +} + +// Use the optimized path to perform single R2C or C2R if transformation dim is +// supported by cuFFT +static bool use_optimized_fft_path(const std::vector& axes) { + // For performance reason, when axes starts with (0, 1), do not use the + // optimized path. 
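The comment above describes when a single C2R/R2C plan can be handed to cuFFT directly. A few concrete cases, checked against a standalone copy of the predicate that follows (kMaxFFTNdim is 3, as defined earlier in this header):

#include <cassert>
#include <cstdint>
#include <vector>

// Standalone copy of use_optimized_fft_path so the examples can be verified in isolation.
static bool UseOptimizedPath(const std::vector<int64_t>& axes) {
  constexpr size_t kMaxFFTNdim = 3;
  return !(axes.size() > kMaxFFTNdim ||
           (axes.size() >= 2 && axes[0] == 0 && axes[1] == 1));
}

int main() {
  assert(UseOptimizedPath({2}));            // single axis: direct R2C/C2R
  assert(UseOptimizedPath({1, 2}));         // up to 3 trailing signal axes are fine
  assert(!UseOptimizedPath({0, 1}));        // leading (0, 1) axes: take the fallback path
  assert(!UseOptimizedPath({0, 1, 2, 3}));  // more than kMaxFFTNdim axes: fallback
  return 0;
}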
+ if (axes.size() > kMaxFFTNdim || + (axes.size() >= 2 && axes[0] == 0 && axes[1] == 1)) { + return false; + } else { + return true; + } +} + +template +struct FFTC2CFunctor { + void operator()(const platform::CUDADeviceContext& ctx, const Tensor* X, + Tensor* out, const std::vector& axes, + FFTNormMode normalization, bool forward) { + if (axes.empty()) { + framework::TensorCopy(*X, ctx.GetPlace(), out); + return; + } + + framework::Tensor* p_out = out; + std::vector out_dims = phi::vectorize(X->dims()); + std::vector working_axes(axes.begin(), axes.end()); + std::vector first_dims; + size_t max_dims; + framework::Tensor working_tensor; + working_tensor.mutable_data(X->dims(), ctx.GetPlace()); + framework::Tensor* p_working_tensor = &working_tensor; + framework::TensorCopy(*X, ctx.GetPlace(), &working_tensor); + + while (true) { + max_dims = + std::min(static_cast(kMaxFFTNdim), working_axes.size()); + first_dims.assign(working_axes.end() - max_dims, working_axes.end()); + + exec_fft(ctx, p_working_tensor, + p_out, first_dims, forward); + working_axes.resize(working_axes.size() - max_dims); + first_dims.clear(); + + if (working_axes.empty()) { + break; + } + + std::swap(p_out, p_working_tensor); + } + exec_normalization( + ctx, p_out, out, normalization, out_dims, axes); + } +}; + +template +struct FFTC2RFunctor { + void operator()(const platform::CUDADeviceContext& ctx, const Tensor* X, + Tensor* out, const std::vector& axes, + FFTNormMode normalization, bool forward) { + std::vector in_dims = phi::vectorize(X->dims()); + std::vector out_dims = phi::vectorize(out->dims()); + + if (use_optimized_fft_path(axes)) { + framework::Tensor x_copy(X->type()); + x_copy.mutable_data(X->dims(), ctx.GetPlace()); + framework::TensorCopy(*X, ctx.GetPlace(), &x_copy); + exec_fft(ctx, &x_copy, out, axes, + forward); + } else { + framework::Tensor temp_tensor; + temp_tensor.mutable_data(X->dims(), ctx.GetPlace()); + const std::vector dims(axes.begin(), axes.end() - 1); + + FFTC2CFunctor c2c_functor; + c2c_functor(ctx, X, &temp_tensor, dims, FFTNormMode::none, forward); + + exec_fft(ctx, &temp_tensor, out, + {axes.back()}, forward); + } + exec_normalization( + ctx, out, out, normalization, out_dims, axes); + } +}; + +// n dimension real to complex FFT use cufft lib +template +struct FFTR2CFunctor { + void operator()(const platform::CUDADeviceContext& ctx, const Tensor* X, + Tensor* out, const std::vector& axes, + FFTNormMode normalization, bool forward) { + // Step1: R2C transform on the last dimension + framework::Tensor* r2c_out = out; + const std::vector last_dim{axes.back()}; + std::vector out_dims = phi::vectorize(out->dims()); + exec_fft(ctx, X, r2c_out, last_dim, + forward); + + // Step2: C2C transform on the remaining dimension + framework::Tensor c2c_out; + if (axes.size() > 1) { + c2c_out.mutable_data(out->dims(), ctx.GetPlace()); + std::vector remain_dim(axes.begin(), axes.end() - 1); + FFTC2CFunctor fft_c2c_func; + fft_c2c_func(ctx, r2c_out, &c2c_out, remain_dim, FFTNormMode::none, + forward); + } + + const auto in_sizes = phi::vectorize(X->dims()); + framework::Tensor* norm_tensor = axes.size() > 1 ? 
&c2c_out : r2c_out; + exec_normalization( + ctx, norm_tensor, out, normalization, in_sizes, axes); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/spectral_op.h b/paddle/fluid/operators/spectral_op.h index a60ec5a4df52b8275a17185a63c8a7d27dd8132b..71b54caf5ee79473e349b39130be673bf5a1e6bb 100644 --- a/paddle/fluid/operators/spectral_op.h +++ b/paddle/fluid/operators/spectral_op.h @@ -11,8 +11,11 @@ #pragma once #define NOMINMAX // to use std::min std::max correctly on windows +#include +#include #include #include +#include #include #include #include "paddle/fluid/framework/convert_utils.h" @@ -23,8 +26,10 @@ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/conj_op.h" #include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/funcs/complex_functors.h" #include "paddle/phi/kernels/funcs/padding.h" #if defined(__NVCC__) || defined(__HIPCC__) #include "thrust/device_vector.h" diff --git a/paddle/fluid/operators/stft_op.cc b/paddle/fluid/operators/stft_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ecbd9edd87dc6c12eda76b76ca9239b79d3aec9c --- /dev/null +++ b/paddle/fluid/operators/stft_op.cc @@ -0,0 +1,154 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
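The functors that close spectral_op.cu.h above handle transforms over arbitrarily many axes by splitting them into chunks of at most kMaxFFTNdim (the cuFFT/hipFFT limit) and calling exec_fft once per chunk, ping-ponging between two buffers; FFTR2CFunctor additionally performs the real-to-complex step on the last axis only and finishes with C2C transforms over the remaining axes. Before the STFT operator that consumes these functors, a small sketch of the chunking order for a hypothetical five-axis transform:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Hypothetical transform over five axes; kMaxFFTNdim is 3 in spectral_op.cu.h.
  std::vector<int64_t> working_axes = {0, 1, 2, 3, 4};
  constexpr size_t kMaxFFTNdim = 3;

  // Same chunking as FFTC2CFunctor: peel at most kMaxFFTNdim axes off the back
  // per exec_fft call.
  while (!working_axes.empty()) {
    const size_t max_dims = std::min(kMaxFFTNdim, working_axes.size());
    std::vector<int64_t> chunk(working_axes.end() - max_dims, working_axes.end());
    std::cout << "exec_fft over axes:";
    for (auto a : chunk) std::cout << " " << a;
    std::cout << "\n";
    working_axes.resize(working_axes.size() - max_dims);
  }
  // Prints:
  //   exec_fft over axes: 2 3 4
  //   exec_fft over axes: 0 1
  return 0;
}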
+ +#include "paddle/fluid/operators/stft_op.h" +#include "paddle/fluid/operators/spectral_helper.h" + +namespace paddle { +namespace operators { +class StftOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "frame"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "frame"); + + const int n_fft = ctx->Attrs().Get("n_fft"); + const int hop_length = ctx->Attrs().Get("hop_length"); + + const auto x_dims = ctx->GetInputDim("X"); + const int x_rank = x_dims.size(); + const bool onesided = ctx->Attrs().Get("onesided"); + + PADDLE_ENFORCE_EQ( + x_rank, 2, + platform::errors::InvalidArgument( + "Input(X) of StftOp should be a tensor with shape [N, T], " + "but got rank %s.", + x_rank)); + PADDLE_ENFORCE_GT( + hop_length, 0, + platform::errors::InvalidArgument( + "Attribute(hop_length) should be greater than 0, but got %s.", + hop_length)); + + int seq_length = x_dims[x_rank - 1]; + int n_frames = 1 + (seq_length - n_fft) / hop_length; + + PADDLE_ENFORCE_LE(n_fft, seq_length, + platform::errors::InvalidArgument( + "Attribute(frame_length) should be less equal than " + "sequence length, but got (%s) > (%s).", + n_fft, seq_length)); + + std::vector output_shape; + output_shape.push_back(x_dims[0]); + if (onesided) { + output_shape.push_back(n_fft / 2 + 1); + } else { + output_shape.push_back(n_fft); + } + output_shape.push_back(n_frames); + + ctx->SetOutputDim("Out", phi::make_ddim(output_shape)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + const auto in_dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + return framework::OpKernelType(in_dtype, ctx.GetPlace()); + } +}; + +class StftOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "Input waveforms with shape (N, T)"); + AddOutput("Out", + "The complex STFT output tensor with shape (N, n_fft, " + "num_frames) or (N, n_fft/2 + 1, num_frames)"); + AddAttr("n_fft", "The number of input samples to perform FFT"); + AddAttr("hop_length", "Number of samples between adjacent frames"); + AddAttr("normalized", + "Control whether to scale the output by 1/sqrt(n_fft)"); + AddAttr("onesided", + "Control whether to return half of the FFT output"); + AddComment(R"DOC( + Short-time Fourier transform (STFT). 
+ )DOC"); + } +}; + +template +class StftGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr grad_op) const override { + grad_op->SetType("stft_grad"); + grad_op->SetInput("X", this->Input("X")); + grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + grad_op->SetAttrMap(this->Attrs()); + } +}; + +class StftGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + const auto out_grad_name = framework::GradVarName("Out"); + OP_INOUT_CHECK(ctx->HasInput(out_grad_name), "Input", out_grad_name, + "stft_grad"); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "stft_grad"); + + const auto x_grad_name = framework::GradVarName("X"); + OP_INOUT_CHECK(ctx->HasOutput(x_grad_name), "Output", x_grad_name, + "stft_grad"); + + ctx->ShareDim("X", /*->*/ x_grad_name); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + const auto in_dtype = OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")); + const auto kernel_dtype = framework::ToRealType(in_dtype); + return framework::OpKernelType(kernel_dtype, ctx.GetPlace()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(stft, ops::StftOp, ops::StftOpMaker, + ops::StftGradOpMaker, + ops::StftGradOpMaker); + +REGISTER_OPERATOR(stft_grad, ops::StftGradOp); + +REGISTER_OP_CPU_KERNEL( + stft, ops::StftKernel, + ops::StftKernel); + +REGISTER_OP_CPU_KERNEL( + stft_grad, ops::StftGradKernel, + ops::StftGradKernel); diff --git a/paddle/fluid/operators/stft_op.cu b/paddle/fluid/operators/stft_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..5272be29c0c14de3b1fa00d23eff303c41528042 --- /dev/null +++ b/paddle/fluid/operators/stft_op.cu @@ -0,0 +1,26 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/spectral_op.cu.h" +#include "paddle/fluid/operators/stft_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + stft, ops::StftKernel, + ops::StftKernel); + +REGISTER_OP_CUDA_KERNEL( + stft_grad, ops::StftGradKernel, + ops::StftGradKernel); diff --git a/paddle/fluid/operators/stft_op.h b/paddle/fluid/operators/stft_op.h new file mode 100644 index 0000000000000000000000000000000000000000..4f0746ee143f9b53214b2b3ebb81a571bb008908 --- /dev/null +++ b/paddle/fluid/operators/stft_op.h @@ -0,0 +1,157 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" + +#include "paddle/fluid/operators/frame_op.h" +#include "paddle/fluid/operators/spectral_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class StftKernel : public framework::OpKernel { + public: + /* + Batch Signals (N, T) -> Frames (N, n_fft, num_frames) -> FFTR2C -> (N, + n_fft/2 + 1, num_frames) or (N, n_fft, num_frames) + */ + void Compute(const framework::ExecutionContext& ctx) const override { + using C = paddle::platform::complex; + const Tensor* x = ctx.Input("X"); + Tensor* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + const size_t x_rank = x->dims().size(); + const size_t out_rank = out->dims().size(); + + const int n_fft = ctx.Attr("n_fft"); + const int hop_length = ctx.Attr("hop_length"); + const bool normalized = ctx.Attr("normalized"); + const bool onesided = ctx.Attr("onesided"); + + const int n_frames = out->dims()[out_rank - 1]; + const int seq_length = x->dims()[x_rank - 1]; + + auto& dev_ctx = ctx.device_context(); + + std::vector axes = {1}; + + // Frame + Tensor frames; + framework::DDim frames_dims(out->dims()); + frames_dims.at(axes.back()) = n_fft; + frames.mutable_data(frames_dims, ctx.GetPlace()); + FrameFunctor()(dev_ctx, x, &frames, seq_length, n_fft, + n_frames, hop_length, /*is_grad*/ false); + + // FFTR2C + FFTNormMode normalization; + if (normalized) { + normalization = get_norm_from_string("ortho", true); + } else { + normalization = get_norm_from_string("backward", true); + } + FFTR2CFunctor fft_r2c_func; + + if (onesided) { + fft_r2c_func(dev_ctx, &frames, out, axes, normalization, true); + } else { + framework::DDim onesided_dims(out->dims()); + const int64_t onesided_axis_size = out->dims().at(axes.back()) / 2 + 1; + onesided_dims.at(axes.back()) = onesided_axis_size; + Tensor onesided_out; + onesided_out.mutable_data(onesided_dims, ctx.GetPlace()); + fft_r2c_func(dev_ctx, &frames, &onesided_out, axes, normalization, true); + fill_conj(dev_ctx, &onesided_out, out, axes); + } + } +}; + +template +class StftGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using C = paddle::platform::complex; + auto& dev_ctx = ctx.device_context(); + + const auto* dy = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + dx->mutable_data(ctx.GetPlace()); + + const size_t dy_rank = dy->dims().size(); + const size_t dx_rank = dx->dims().size(); + + const int n_fft = ctx.Attr("n_fft"); + const int hop_length = ctx.Attr("hop_length"); + const bool normalized = ctx.Attr("normalized"); + const bool onesided = ctx.Attr("onesided"); + const int n_frames = dy->dims()[dy_rank - 1]; + const int seq_length = dx->dims()[dx_rank - 1]; + + std::vector axes = {1}; + Tensor d_frames; + framework::DDim d_frames_dims(dy->dims()); + d_frames_dims.at(axes.back()) = n_fft; + d_frames.mutable_data(d_frames_dims, ctx.GetPlace()); + 
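For concreteness, the shape bookkeeping shared by StftOp::InferShape and StftKernel above, evaluated for hypothetical sizes (the same arithmetic determines the frame tensor and the one-sided output):

#include <cstdio>

int main() {
  // Hypothetical STFT configuration; values chosen for illustration only.
  const int N = 4;            // batch size
  const int T = 16000;        // samples per signal
  const int n_fft = 512;
  const int hop_length = 160;
  const bool onesided = true;

  // Same arithmetic as StftOp::InferShape / StftKernel.
  const int n_frames = 1 + (T - n_fft) / hop_length;       // 1 + 15488/160 = 97
  const int freq_bins = onesided ? n_fft / 2 + 1 : n_fft;  // 257

  std::printf("frames: [%d, %d, %d]\n", N, n_fft, n_frames);      // [4, 512, 97]
  std::printf("output: [%d, %d, %d]\n", N, freq_bins, n_frames);  // [4, 257, 97]
  return 0;
}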
+ Tensor complex_d_frames; + complex_d_frames.mutable_data(d_frames_dims, ctx.GetPlace()); + + // dy -> d_frames + FFTNormMode normalization; + if (normalized) { + normalization = get_norm_from_string("ortho", true); + } else { + normalization = get_norm_from_string("backward", true); + } + FFTC2CFunctor fft_c2c_func; + + if (!onesided) { + fft_c2c_func(dev_ctx, dy, &complex_d_frames, axes, normalization, false); + } else { + Tensor full_dy; + full_dy.mutable_data(d_frames_dims, ctx.GetPlace()); + auto zero_length = static_cast(full_dy.dims().at(axes.back()) - + dy->dims().at(axes.back())); + auto rank = dy->dims().size(); + + std::vector pads(rank * 2, 0); + pads[axes.back() * 2 + 1] = zero_length; + + phi::funcs::PaddingFunctor( + rank, ctx.template device_context(), pads, + static_cast(0), *dy, &full_dy); + fft_c2c_func(dev_ctx, &full_dy, &complex_d_frames, axes, normalization, + false); + } + framework::TransComplexToReal( + framework::TransToProtoVarType(d_frames.dtype()), + framework::TransToProtoVarType(complex_d_frames.dtype()), + complex_d_frames, &d_frames); + + // d_frames -> dx + FrameFunctor()(dev_ctx, &d_frames, dx, seq_length, n_fft, + n_frames, hop_length, /*is_grad*/ true); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 18ac979b48ef39fbab841927bf41c9c8579a5766..5605d326f2cfa53e5f3f8aba1b65d1a2cd3e8893 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -159,10 +159,8 @@ inline void EmplaceDeviceContext( cuda_ctx, platform::errors::InvalidArgument( "Failed to dynamic_cast dev_ctx into CUDADeviceContext.")); - // Note: A trick method to init context, why GetAllocator interface - // needs a stream parameter? 
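StftGradKernel above reconstructs a full n_fft-bin spectrum from the one-sided output gradient by zero-padding the high side of the frequency axis before running the inverse C2C transform. A small sketch of that pad computation with hypothetical shapes, assuming the (low, high) pair-per-axis layout that phi::funcs::PaddingFunctor consumes:

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Hypothetical gradient shapes: dOut is one-sided, so the frequency axis
  // (axes.back() == 1) holds n_fft/2 + 1 bins.
  const int64_t n_fft = 8;
  const std::vector<int64_t> dy_dims      = {4, n_fft / 2 + 1, 97};  // {4, 5, 97}
  const std::vector<int64_t> full_dy_dims = {4, n_fft,         97};  // {4, 8, 97}
  const std::vector<int64_t> axes = {1};

  // Same bookkeeping as the kernel: pad only the high side of the frequency axis
  // back to n_fft bins.
  const int64_t zero_length = full_dy_dims[axes.back()] - dy_dims[axes.back()];
  const int64_t rank = static_cast<int64_t>(dy_dims.size());
  std::vector<int> pads(rank * 2, 0);  // {low0, high0, low1, high1, low2, high2}
  pads[axes.back() * 2 + 1] = static_cast<int>(zero_length);

  std::cout << "pads =";               // pads = 0 0 0 3 0 0
  for (int p : pads) std::cout << " " << p;
  std::cout << "\n";
  return 0;
}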
dev_ctx->SetAllocator(memory::allocation::AllocatorFacade::Instance() - .GetAllocator(p, cuda_ctx->stream()) + .GetAllocator(p) .get()); cuda_ctx->PartialInitWithAllocator(); dev_ctx->SetGenerator( @@ -517,10 +515,10 @@ CUDAContext::~CUDAContext() { CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : phi::GPUContext(place) { phi::GPUContext::PartialInitWithoutAllocator(); cuda_stream_.reset(new stream::CUDAStream(phi::GPUContext::stream(), place)); - workspace_.reset(new phi::DnnWorkspaceHandle( - memory::allocation::AllocatorFacade::Instance() - .GetAllocator(place, phi::GPUContext::stream()) - .get())); + auto& instance = memory::allocation::AllocatorFacade::Instance(); + instance.SetDefaultStream(place, phi::GPUContext::stream()); + workspace_.reset( + new phi::DnnWorkspaceHandle(instance.GetAllocator(place).get())); } CUDADeviceContext::~CUDADeviceContext() = default; @@ -618,7 +616,7 @@ phi::DnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const { // return workspace_.get(); return phi::DnnWorkspaceHandle( memory::allocation::AllocatorFacade::Instance() - .GetAllocator(GetPlace(), phi::GPUContext::stream()) + .GetAllocator(GetPlace()) .get()); } return phi::GPUContext::cudnn_workspace_handle(); diff --git a/paddle/fluid/platform/profiler/cpu_utilization.cc b/paddle/fluid/platform/profiler/cpu_utilization.cc index ce2e49a1ccd39accb8830943759d361d15d12d9d..d507153d3f5b47ef072f9da0276073448127fb9c 100644 --- a/paddle/fluid/platform/profiler/cpu_utilization.cc +++ b/paddle/fluid/platform/profiler/cpu_utilization.cc @@ -118,8 +118,9 @@ float CpuUtilization::GetCpuUtilization() { float busy_time = (system_kernel_time_end - system_kernel_time_start) + (system_user_time_end - system_user_time_start); float idle_time = system_idle_time_end - system_idle_time_start; - cpu_utilization = busy_time / (busy_time + idle_time); - + if (busy_time + idle_time != 0) { + cpu_utilization = busy_time / (busy_time + idle_time); + } #elif defined(__linux__) float busy_time = (system_tms_end_.tms_utime - system_tms_start_.tms_utime) + (system_tms_end_.tms_stime - system_tms_start_.tms_stime) + @@ -127,7 +128,9 @@ float CpuUtilization::GetCpuUtilization() { (irq_end_ - irq_start_) + (softirq_end_ - softirq_start_) + (steal_end_ - steal_start_); float idle_time = (idle_end_ - idle_start_) + (iowait_end_ - iowait_start_); - cpu_utilization = busy_time / (busy_time + idle_time); + if (busy_time + idle_time != 0) { + cpu_utilization = busy_time / (busy_time + idle_time); + } #else LOG(WARNING) << "Current System is not supported to get system cpu utilization" @@ -148,13 +151,16 @@ float CpuUtilization::GetCpuCurProcessUtilization() { uint64_t end = FileTimeToUint64(end_); float busy_time = (process_kernel_time_end - process_kernel_time_start) + (process_user_time_end - process_user_time_start); - cpu_process_utilization = busy_time / (end - start); - LOG(INFO) << "Process Utilization = " << cpu_process_utilization << std::endl; + if (end - start != 0) { + cpu_process_utilization = busy_time / (end - start); + } #elif defined(__linux__) float busy_time = (process_tms_end_.tms_utime - process_tms_start_.tms_utime) + (process_tms_end_.tms_stime - process_tms_start_.tms_stime); - cpu_process_utilization = busy_time / (end_ - start_); + if (end_ - start_ != 0) { + cpu_process_utilization = busy_time / (end_ - start_); + } #else LOG(WARNING) << "Current System is not supported to get process cpu utilization" diff --git a/paddle/fluid/platform/profiler/profiler.cc 
b/paddle/fluid/platform/profiler/profiler.cc index 46cbb3358c6c4d6b2b17cfc1e549db6376931389..ac46fbed10a2022324438e4718261813a5c38b19 100644 --- a/paddle/fluid/platform/profiler/profiler.cc +++ b/paddle/fluid/platform/profiler/profiler.cc @@ -44,6 +44,14 @@ std::unique_ptr Profiler::Create(const ProfilerOptions& options) { return std::unique_ptr(new Profiler(options)); } +bool Profiler::IsCuptiSupported() { + bool supported = false; +#ifdef PADDLE_WITH_CUPTI + supported = true; +#endif + return supported; +} + Profiler::Profiler(const ProfilerOptions& options) { options_ = options; std::bitset<32> trace_switch(options_.trace_switch); diff --git a/paddle/fluid/platform/profiler/profiler.h b/paddle/fluid/platform/profiler/profiler.h index f9a8ece050492805226cccce001251c3cd2ad0c2..d24ee504bc6407230da875ab5e29251740d72822 100644 --- a/paddle/fluid/platform/profiler/profiler.h +++ b/paddle/fluid/platform/profiler/profiler.h @@ -43,6 +43,8 @@ class Profiler { public: static std::unique_ptr Create(const ProfilerOptions& options); + static bool IsCuptiSupported(); + void Prepare(); void Start(); diff --git a/paddle/fluid/platform/profiler/utils.cc b/paddle/fluid/platform/profiler/utils.cc index b43389866c7a8150846bef874f49bd72907f446f..de314d298c90ea7c70d9d244b78cbc46feae9a9c 100644 --- a/paddle/fluid/platform/profiler/utils.cc +++ b/paddle/fluid/platform/profiler/utils.cc @@ -18,7 +18,6 @@ limitations under the License. */ #include "glog/logging.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/dynload/cupti.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/profiler/utils.h b/paddle/fluid/platform/profiler/utils.h index cd56d343842686abc31343effc93cf1a4887411c..b471d6b79833a17eca35fe44c9d4917684aa8bcc 100644 --- a/paddle/fluid/platform/profiler/utils.h +++ b/paddle/fluid/platform/profiler/utils.h @@ -15,6 +15,7 @@ limitations under the License. */ #include #include +#include "paddle/fluid/platform/dynload/cupti.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/os_info.h" diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index cce663e410ddb5d4d987e58d16d578ec85ee6142..52a43c4ebe8d8811ceac406d4d68aa3f1963f7ce 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -25,6 +25,7 @@ limitations under the License. */ #include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/utils.h" #include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/python_headers.h" #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/enforce.h" @@ -32,12 +33,14 @@ limitations under the License. 
*/ #include "paddle/fluid/pybind/eager_utils.h" #include "paddle/fluid/pybind/exception.h" #include "paddle/fluid/pybind/slice_utils.h" +#include "paddle/fluid/pybind/tensor_py.h" #include "paddle/phi/api/include/api.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" +#include "pybind11/detail/internals.h" namespace paddle { namespace pybind { @@ -150,12 +153,22 @@ bool PyCheckTensor(PyObject* obj) { static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY - PADDLE_ENFORCE_EQ( - self->tensor.initialized(), true, - platform::errors::InvalidArgument( - "Tensor data of %s is Empty that indicates we have null tensor for " - "now, please check if it has no data and initialize it first.", - self->tensor.name())); + auto& api = pybind11::detail::npy_api::get(); + if (!self->tensor.impl()) { + Py_intptr_t py_dims[paddle::framework::DDim::kMaxRank]; + Py_intptr_t py_strides[paddle::framework::DDim::kMaxRank]; + py_dims[0] = 0; + py_strides[0] = 0; + + PyObject* array = api.PyArray_NewFromDescr_( + api.PyArray_Type_, + api.PyArray_DescrFromType_(pybind11::detail::npy_api::NPY_FLOAT_), 1, + py_dims, py_strides, nullptr, + pybind11::detail::npy_api::NPY_ARRAY_ALIGNED_ | + pybind11::detail::npy_api::NPY_ARRAY_WRITEABLE_, + nullptr); + return array; + } auto tensor_dims = self->tensor.shape(); auto numpy_dtype = TensorDtype2NumpyDtype(self->tensor.type()); auto sizeof_dtype = paddle::framework::DataTypeSize(self->tensor.type()); @@ -167,7 +180,7 @@ static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args, py_strides[i] = sizeof_dtype * numel; numel *= py_dims[i]; } - auto& api = pybind11::detail::npy_api::get(); + PyObject* array = api.PyArray_NewFromDescr_( api.PyArray_Type_, api.PyArray_DescrFromType_(numpy_dtype), tensor_dims.size(), py_dims, py_strides, nullptr, @@ -175,6 +188,10 @@ static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args, pybind11::detail::npy_api::NPY_ARRAY_WRITEABLE_, nullptr); + if (!self->tensor.impl()->initialized()) { + return array; + } + if (self->tensor.is_cpu() || self->tensor.is_gpu_pinned()) { auto dense_tensor = std::dynamic_pointer_cast(self->tensor.impl()); @@ -213,6 +230,20 @@ static PyObject* tensor_method__is_initialized(TensorObject* self, EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* tensor_method__is_dense_tensor_hold_allocation( + TensorObject* self, PyObject* args, PyObject* kwargs) { + EAGER_TRY + auto dense_tensor = + std::dynamic_pointer_cast(self->tensor.impl()); + if (dense_tensor) { + return ToPyObject(dense_tensor->IsInitialized()); + } else { + return ToPyObject(false); + } + + EAGER_CATCH_AND_THROW_RETURN_NULL +} + static PyObject* tensor_method__copy_to(TensorObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY @@ -552,10 +583,13 @@ static PyObject* tensor__getitem_index_not_tensor(TensorObject* self, } if (op_type == "slice") { out = slice_dygraph_function(self->tensor, paddle::experimental::Tensor(), - paddle::experimental::Tensor(), + paddle::experimental::Tensor(), {}, {}, std::move(attrs)); } else if (op_type == "strided_slice") { - out = strided_slice_dygraph_function(self->tensor, attrs); + out = strided_slice_dygraph_function( + self->tensor, paddle::experimental::Tensor(), + paddle::experimental::Tensor(), paddle::experimental::Tensor(), {}, + {}, {}, attrs); } else { 
PADDLE_THROW(platform::errors::InvalidArgument( "Slice is only support slice and strided_slice, but we got %s which " @@ -604,6 +638,7 @@ static PyObject* tensor__getitem_index_not_tensor(TensorObject* self, auto select_index = paddle::experimental::Tensor( egr::Controller::Instance().GenerateUniqueName()); auto idx_tensor = std::make_shared(); + select_index.set_impl(idx_tensor); auto* dev_ctx = platform::DeviceContextPool::Instance().Get( egr::Controller::Instance().GetExpectedPlace()); paddle::framework::TensorFromVector(list_select_idxs, *dev_ctx, @@ -617,6 +652,216 @@ static PyObject* tensor__getitem_index_not_tensor(TensorObject* self, EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* tensor_method__setitem_eager_tensor(TensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY + VLOG(4) << "Call __setitem_eager_tensor"; + + auto self_tensor = static_cast(self->tensor.impl().get()); + + PyObject* _index = PyTuple_GET_ITEM(args, 0); + PyObject* value_obj = PyTuple_GET_ITEM(args, 1); + // NOTE(zhiqiu): PyTuple_Pack increases refcount while PyTuple_New + // https://github.com/python/cpython/blob/24b63c695ae0a95b06379eaadace66735abac1e2/Objects/tupleobject.c#L251 + PyObject* index_ptr = + !PyTuple_Check(_index) ? PyTuple_Pack(1, _index) : _index; + DEFINE_PADDLE_SCOPE_GUARD([index_ptr, &_index]() { + if (!PyTuple_Check(_index)) { + Py_DECREF(index_ptr); + VLOG(4) << "Call Py_DECREF"; + } + }); + + // TODO(pangyoki) add inplace(BumpInplaceVersion) if need + + // 1. Check argumnets + bool parse_index = true; + + // Check whether _index can be parsed. + const int size = PyTuple_GET_SIZE(index_ptr); + for (int dim = 0; dim < size; ++dim) { + PyObject* slice_item = PyTuple_GetItem(index_ptr, dim); + if (!(PyCheckInteger(slice_item) || PySlice_Check(slice_item) || + slice_item == Py_Ellipsis || slice_item == Py_None)) { + parse_index = false; + break; + } + } + + // 2. Call op set_value to speed up if the condition is met, + // otherwise call TensorToPyArray. + // TODO(liym27): Try not to call TensorToPyArray because it always + // copys data to cpu place, which reduces performance. 
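Editor's note: the comment above describes the dispatch in __setitem_eager_tensor: when every index element is a plain integer, slice, Ellipsis, or None, the fast set_value op path is taken; anything else (for example a tensor or list index) falls back to the numpy-based assignment that copies through CPU. A minimal sketch of that decision, with a hypothetical IndexItem variant standing in for the PyObject* checks done above:

// Sketch only: classify __setitem__ index elements to pick the fast path.
// IndexItem/Slice/TensorIndex are illustrative stand-ins, not Paddle types.
#include <iostream>
#include <variant>
#include <vector>

struct Slice { long start, stop, step; };
struct EllipsisItem {};
struct NoneItem {};
struct TensorIndex {};  // e.g. a bool/int tensor used as an index

using IndexItem =
    std::variant<long, Slice, EllipsisItem, NoneItem, TensorIndex>;

// Mirrors the parse_index check: only basic items can be lowered to set_value.
bool CanUseSetValueOp(const std::vector<IndexItem>& index) {
  for (const auto& item : index) {
    if (std::holds_alternative<TensorIndex>(item)) return false;
  }
  return true;
}

int main() {
  std::vector<IndexItem> basic = {long{0}, Slice{0, 10, 2}, NoneItem{}};
  std::vector<IndexItem> fancy = {TensorIndex{}};
  std::cout << CanUseSetValueOp(basic) << "\n";  // 1 -> set_value op
  std::cout << CanUseSetValueOp(fancy) << "\n";  // 0 -> numpy fallback
  return 0;
}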
+ if (parse_index) { + std::vector axes, starts, ends, steps, decrease_axes, none_axes, + infer_flags, list_select_idxs; + // if index is a list, list_select_flag will be true + bool list_select_flag = false; + ParseIndexingSlice(self_tensor, index_ptr, &axes, &starts, &ends, &steps, + &decrease_axes, &none_axes, &infer_flags, + &list_select_idxs, &list_select_flag); + + framework::AttributeMap attrs = {{"axes", axes}, + {"starts", starts}, + {"ends", ends}, + {"steps", steps}, + {"decrease_axes", decrease_axes}, + {"none_axes", none_axes}}; + + if (egr::Controller::Instance().HasGrad()) { + PADDLE_ENFORCE_EQ( + egr::egr_utils_api::IsLeafTensor(self->tensor) && + !egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient(), + false, platform::errors::InvalidArgument( + "Leaf Tensor (%s) that doesn't stop gradient can't use " + "inplace strategy.", + self->tensor.name())); + } + + paddle::experimental::Tensor value_tensor; + + if (PyCheckTensor(value_obj)) { + value_tensor = reinterpret_cast(value_obj)->tensor; + + // pass the stop_gradient from value to tensor + if (!egr::EagerUtils::autograd_meta(&value_tensor)->StopGradient() && + egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient()) { + egr::EagerUtils::autograd_meta(&self->tensor)->SetStopGradient(false); + } + } else if (py::isinstance(value_obj)) { + paddle::experimental::Tensor value_tensor_tmp( + std::make_shared(), + egr::Controller::Instance().GenerateUniqueName()); + py::object value_obj_tmp(py::handle(value_obj), true); + py::object value = value_obj_tmp; + if (self->tensor.dtype() == paddle::experimental::DataType::FLOAT32) { + if (!py::isinstance>(value_obj_tmp)) { + value = pybind11::detail::CastNumpyArray(value_obj_tmp); + } + } else if (self->tensor.dtype() == + paddle::experimental::DataType::FLOAT64) { + if (!py::isinstance>(value_obj_tmp)) { + value = pybind11::detail::CastNumpyArray(value_obj_tmp); + } + } else if (self->tensor.dtype() == + paddle::experimental::DataType::INT32) { + if (!py::isinstance>(value_obj_tmp)) { + value = pybind11::detail::CastNumpyArray(value_obj_tmp); + } + } else if (self->tensor.dtype() == + paddle::experimental::DataType::INT64) { + if (!py::isinstance>(value_obj_tmp)) { + value = pybind11::detail::CastNumpyArray(value_obj_tmp); + } + } else if (self->tensor.dtype() == paddle::experimental::DataType::BOOL) { + if (!py::isinstance>(value_obj_tmp)) { + value = pybind11::detail::CastNumpyArray(value_obj_tmp); + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "When assign a numpy.np value to a paddle.Tensor, " + "the data type of the paddle.Tensor must be bool, " + "float32, int32 or int64, " + "please check the type of tensor.")); + } + + if (value_tensor_tmp.place() == paddle::PlaceType::kUNK) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + SetTensorFromPyArray( + static_cast(value_tensor_tmp.impl().get()), + value, platform::Place(platform::CUDAPlace(0)), false); +#else + SetTensorFromPyArray( + static_cast(value_tensor_tmp.impl().get()), + value, platform::Place(platform::CPUPlace()), false); +#endif + } else { + SetTensorFromPyArray( + static_cast(value_tensor_tmp.impl().get()), + value, value_tensor_tmp.inner_place(), false); + } + + value_tensor = value_tensor_tmp; + } else { + py::object value_obj_tmp(py::handle(value_obj), true); + // convert the value to self data type + if (py::isinstance(value_obj_tmp) || + py::isinstance(value_obj_tmp) || + py::isinstance(value_obj_tmp)) { + if (self->tensor.dtype() == 
paddle::experimental::DataType::FLOAT32) { + attrs["fp32_values"] = + std::vector{value_obj_tmp.cast()}; + } else if (self->tensor.dtype() == + paddle::experimental::DataType::FLOAT64) { + attrs["fp64_values"] = + std::vector{value_obj_tmp.cast()}; + } else if (self->tensor.dtype() == + paddle::experimental::DataType::INT32) { + attrs["int32_values"] = + std::vector{value_obj_tmp.cast()}; + } else if (self->tensor.dtype() == + paddle::experimental::DataType::INT64) { + attrs["int64_values"] = + std::vector{value_obj_tmp.cast()}; + } else if (self->tensor.dtype() == + paddle::experimental::DataType::BOOL) { + attrs["bool_values"] = std::vector{value_obj_tmp.cast()}; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "When assign a value to a paddle.Tensor, " + "the data type of the paddle.Tensor must be bool, " + "float32, int32 or int64, " + "please check the type of tensor.")); + } + attrs["shape"] = std::vector{1}; + + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Value type error. The assign value allows " + "numpy.ndarray, integer, float or bool, " + "but received %s.", + Py_TYPE(value_obj))); + } + } + + { + // Release gil and do tracing + py::gil_scoped_release release; + self->tensor = set_value_dygraph_function(self->tensor, value_tensor, {}, + {}, {}, attrs); + } + } else { + auto self_numpy = TensorToPyArray(*self_tensor); + VLOG(4) << "parse_index is false"; + if (PyCheckTensor(_index)) { + VLOG(4) << "index is tensor"; + auto index_tensor = static_cast( + reinterpret_cast(_index)->tensor.impl().get()); + auto index_numpy = TensorToPyArray(*index_tensor); + self_numpy[index_numpy] = py::object(py::handle(value_obj), true); + } else { + VLOG(4) << "index is not tensor"; + self_numpy[_index] = py::object(py::handle(value_obj), true); + } + if (self->tensor.place() == paddle::PlaceType::kUNK) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + SetTensorFromPyArray(self_tensor, self_numpy, + platform::Place(platform::CUDAPlace(0)), false); +#else + SetTensorFromPyArray(self_tensor, self_numpy, + platform::Place(platform::CPUPlace()), false); +#endif + } else { + SetTensorFromPyArray(self_tensor, self_numpy, self->tensor.inner_place(), + false); + } + } + Py_INCREF(Py_None); + return Py_None; + EAGER_CATCH_AND_THROW_RETURN_NULL +} + static PyObject* tensor_register_grad_hook(TensorObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY @@ -825,6 +1070,10 @@ PyMethodDef variable_methods[] = { {"_is_initialized", (PyCFunction)(void (*)(void))tensor_method__is_initialized, METH_VARARGS | METH_KEYWORDS, NULL}, + {"_is_dense_tensor_hold_allocation", + (PyCFunction)( + void (*)(void))tensor_method__is_dense_tensor_hold_allocation, + METH_VARARGS | METH_KEYWORDS, NULL}, {"_copy_to", (PyCFunction)(void (*)(void))tensor_method__copy_to, METH_VARARGS | METH_KEYWORDS, NULL}, {"copy_", (PyCFunction)(void (*)(void))tensor_method_copy_, @@ -857,6 +1106,9 @@ PyMethodDef variable_methods[] = { {"_getitem_index_not_tensor", (PyCFunction)(void (*)(void))tensor__getitem_index_not_tensor, METH_VARARGS | METH_KEYWORDS, NULL}, + {"__setitem_eager_tensor__", + (PyCFunction)(void (*)(void))tensor_method__setitem_eager_tensor, + METH_VARARGS | METH_KEYWORDS, NULL}, {"_register_grad_hook", (PyCFunction)(void (*)(void))tensor_register_grad_hook, METH_VARARGS | METH_KEYWORDS, NULL}, diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index ff8980d727e70a41223878f22f019353f8b71972..a610c31ee8946b9e3e9f3bfdf50c7448d4755c8d 
100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -52,6 +52,12 @@ PyObject* tensor_properties_get_type(TensorObject* self, void* closure) { EAGER_CATCH_AND_THROW_RETURN_NULL } +PyObject* tensor_properties_is_leaf(TensorObject* self, void* closure) { + EAGER_TRY + return ToPyObject(egr::egr_utils_api::IsLeafTensor(self->tensor)); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + int tensor_properties_set_name(TensorObject* self, PyObject* value, void* closure) { EAGER_TRY @@ -179,6 +185,7 @@ struct PyGetSetDef variable_properties[] = { nullptr}, {"dtype", (getter)tensor_properties_get_dtype, nullptr, nullptr, nullptr}, {"type", (getter)tensor_properties_get_type, nullptr, nullptr, nullptr}, + {"is_leaf", (getter)tensor_properties_is_leaf, nullptr, nullptr, nullptr}, {nullptr, nullptr, nullptr, nullptr, nullptr}}; } // namespace pybind diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 3a2c93309f34454ae0ce2d3419e3fce474f7c06b..7a00f91da2e36425e42e108176251093a9e9d982 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -386,46 +386,6 @@ GetVarBaseListFromPyHandle(const py::handle &handle) { return result; } -// cast numpy type form S to T, this may allocate new memory -template -static py::array_t CastNumpyType(py::array_t array) { - if (std::is_same::value) { - return array; - } - auto dim = array.ndim(); - std::vector result_shape(dim); - for (auto i = 0; i < dim; i++) { - result_shape[i] = array.shape(i); - } - - py::array_t result(result_shape); - - return py::vectorize([](S s) { return static_cast(s); })(array); -} - -template -static py::array_t CastNumpyArray(const py::object &array) { - if (py::isinstance>(array)) { - return CastNumpyType(array.cast>()); - } else if (py::isinstance>(array)) { - return CastNumpyType(array.cast>()); - } else if (py::isinstance>(array)) { - return CastNumpyType(array.cast>()); - } else if (py::isinstance>(array)) { - return CastNumpyType(array.cast>()); - } else if (py::isinstance>(array)) { - return CastNumpyType(array.cast>()); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Value type error. 
The assign numpy value allows integer, float, " - "double and bool, " - "but received %s.", - Py_TYPE(array.ptr())->tp_name)); - } - // can't reach here - return py::array_t(); -} - static imperative::NameVarBaseMap ConvertToNameVarBaseMap( const PyNameVarBaseMap &map) { imperative::NameVarBaseMap result; @@ -854,27 +814,29 @@ void BindImperative(py::module *m_ptr) { py::object value = value_obj; if (self->DataType() == framework::proto::VarType::FP32) { if (!py::isinstance>(value_obj)) { - value = CastNumpyArray(value_obj); + value = pybind11::detail::CastNumpyArray(value_obj); } } else if (self->DataType() == framework::proto::VarType::FP64) { if (!py::isinstance>(value_obj)) { - value = CastNumpyArray(value_obj); + value = pybind11::detail::CastNumpyArray(value_obj); } } else if (self->DataType() == framework::proto::VarType::INT32) { if (!py::isinstance>(value_obj)) { - value = CastNumpyArray(value_obj); + value = + pybind11::detail::CastNumpyArray(value_obj); } } else if (self->DataType() == framework::proto::VarType::INT64) { if (!py::isinstance>(value_obj)) { - value = CastNumpyArray(value_obj); + value = + pybind11::detail::CastNumpyArray(value_obj); } } else if (self->DataType() == framework::proto::VarType::BOOL) { if (!py::isinstance>(value_obj)) { - value = CastNumpyArray(value_obj); + value = pybind11::detail::CastNumpyArray(value_obj); } } else { PADDLE_THROW(platform::errors::InvalidArgument( diff --git a/paddle/fluid/pybind/op_function_generator.h b/paddle/fluid/pybind/op_function_generator.h index d8750c1d6c115a6de8a493cac4ccadbd47bc10fd..65b5beb865d1c267d50e3251d5a0e711942f737a 100644 --- a/paddle/fluid/pybind/op_function_generator.h +++ b/paddle/fluid/pybind/op_function_generator.h @@ -38,7 +38,15 @@ std::map> op_ins_map = { {"assign", {"X"}}, {"reshape2", {"X", "Shape"}}, {"expand", {"X", "ExpandTimes"}}, - {"slice", {"Input", "StartsTensor", "EndsTensor"}}, + {"slice", + {"Input", "StartsTensor", "EndsTensor", "StartsTensorList", + "EndsTensorList"}}, + {"strided_slice", + {"Input", "StartsTensor", "EndsTensor", "StridesTensor", + "StartsTensorList", "EndsTensorList", "StridesTensorList"}}, + {"set_value", + {"Input", "ValueTensor", "StartsTensorList", "EndsTensorList", + "StepsTensorList"}}, {"fake_quantize_dequantize_moving_average_abs_max", {"X", "InScale", "InAccum", "InState"}}, {"nll_loss", {"X", "Label", "Weight"}}, @@ -89,6 +97,7 @@ std::map> op_ins_map = { {"Input", "Label", "Weight", "Bias", "SampleWeight", "CustomDistProbs", "CustomDistAlias", "CustomDistAliasProbs"}}, {"check_finite_and_unscale", {"X", "Scale", "FloatStatus"}}, + {"group_norm", {"X", "Scale", "Bias"}}, }; // NOTE(zhiqiu): Like op_ins_map. diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index dcfad030a689c278b72a0061cfb170762d1a3156..f5c853fb4b8ee251edac8bc69cf64da87ac71189 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -3322,6 +3322,7 @@ All parameter, weight, gradient are variables in Paddle. 
py::class_(m, "_Profiler") .def("create", &paddle::platform::Profiler::Create, py::return_value_policy::take_ownership) + .def("is_cupti_supported", &paddle::platform::Profiler::IsCuptiSupported) .def("prepare", [](paddle::platform::Profiler *profiler) { platform::EnableHostEventRecorder(); diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 6849fcb039410f95d829b9bb793a856f1485bd6c..bf459bd46842168a31608e40fadf38721976ca6d 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -52,6 +52,46 @@ constexpr int NPY_UINT16_ = 4; constexpr int NPY_COMPLEX64 = 14; constexpr int NPY_COMPLEX128 = 15; +// cast numpy type form S to T, this may allocate new memory +template +static py::array_t CastNumpyType(py::array_t array) { + if (std::is_same::value) { + return array; + } + auto dim = array.ndim(); + std::vector result_shape(dim); + for (auto i = 0; i < dim; i++) { + result_shape[i] = array.shape(i); + } + + py::array_t result(result_shape); + + return py::vectorize([](S s) { return static_cast(s); })(array); +} + +template +static py::array_t CastNumpyArray(const py::object &array) { + if (py::isinstance>(array)) { + return CastNumpyType(array.cast>()); + } else if (py::isinstance>(array)) { + return CastNumpyType(array.cast>()); + } else if (py::isinstance>(array)) { + return CastNumpyType(array.cast>()); + } else if (py::isinstance>(array)) { + return CastNumpyType(array.cast>()); + } else if (py::isinstance>(array)) { + return CastNumpyType(array.cast>()); + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Value type error. The assign numpy value allows integer, float, " + "double and bool, " + "but received %s.", + Py_TYPE(array.ptr())->tp_name)); + } + // can't reach here + return py::array_t(); +} + // Note: Since float16 is not a builtin type in C++, we register // paddle::platform::float16 as numpy.float16. // Ref: https://github.com/pybind/pybind11/issues/1776 diff --git a/paddle/phi/kernels/cpu/pad3d_kernel.cc b/paddle/phi/kernels/cpu/pad3d_kernel.cc index 0dc01f485f3aa9ba6ff0b6d089887ff04847054c..68bd92168364d927455e86eb736a2134b2bff6af 100644 --- a/paddle/phi/kernels/cpu/pad3d_kernel.cc +++ b/paddle/phi/kernels/cpu/pad3d_kernel.cc @@ -15,6 +15,7 @@ #include "paddle/phi/kernels/pad3d_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { @@ -574,5 +575,13 @@ void Pad3dKernel(const Context& dev_ctx, } // namespace phi -PD_REGISTER_KERNEL( - pad3d, CPU, ALL_LAYOUT, phi::Pad3dKernel, float, double, int, int64_t) {} +PD_REGISTER_KERNEL(pad3d, + CPU, + ALL_LAYOUT, + phi::Pad3dKernel, + float, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/funcs/distribution_helper.h b/paddle/phi/kernels/funcs/distribution_helper.h index acc31d68b785908a88fea14e5826ccfde9ff5970..68e986c334ecb872739c332e32a565b5df13bb3d 100644 --- a/paddle/phi/kernels/funcs/distribution_helper.h +++ b/paddle/phi/kernels/funcs/distribution_helper.h @@ -50,11 +50,15 @@ struct exponential_transform { HOSTDEVICE inline T operator()(T val) const { #if defined(__NVCC__) || defined(__HIPCC__) - if (std::is_same::value) { - return static_cast(-1.0) / lambda_ * log(val); - } else { - return static_cast(-1.0) / lambda_ * __logf(val); + T log = -std::numeric_limits::epsilon() / 2; + if (val < static_cast(1.) 
- std::numeric_limits::epsilon() / 2) { + if (std::is_same::value) { + log = logf(val); + } else { + log = __logf(val); + } } + return static_cast(-1.0) / lambda_ * log; #else return static_cast(-1.0) / lambda_ * std::log(static_cast(1.0) - val); #endif @@ -114,13 +118,19 @@ struct normal_transform { namespace kps = phi::kps; /*********************** Distribution Function *************************/ -template -struct uniform_distribution; template struct normal_distribution; #if defined(__NVCC__) +template +struct uniform_distribution { + __device__ inline T operator()(curandStatePhilox4_32_10_t *state) const { + return static_cast(curand_uniform(state)); + } + static constexpr int kReturnsCount = 1; +}; + template <> struct uniform_distribution { __device__ inline float4 operator()(curandStatePhilox4_32_10_t *state) const { @@ -177,6 +187,14 @@ struct normal_distribution { }; #else +template +struct uniform_distribution { + __device__ inline T operator()(hiprandStatePhilox4_32_10_t *state) const { + return hiprand_uniform(state); + } + static constexpr int kReturnsCount = 1; +}; + template <> struct uniform_distribution { __device__ inline float4 operator()( diff --git a/paddle/phi/kernels/funcs/inclusive_scan.h b/paddle/phi/kernels/funcs/inclusive_scan.h new file mode 100644 index 0000000000000000000000000000000000000000..b285c5bdbbfc078a510797a13e07f726b970fa6e --- /dev/null +++ b/paddle/phi/kernels/funcs/inclusive_scan.h @@ -0,0 +1,274 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif + +#include +#include +#include "paddle/phi/common/type_traits.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/kernels/funcs/for_range.h" + +// See Note [ Why still include the fluid headers? 
] +#include "paddle/fluid/memory/malloc.h" + +namespace phi { +namespace funcs { + +template +struct IsComplex : public std::false_type {}; + +template <> +struct IsComplex<::phi::dtype::complex> : public std::true_type {}; + +template <> +struct IsComplex<::phi::dtype::complex> : public std::true_type {}; + +template +static void CubInclusiveScan(InputIterator x_iter, + OutputIterator y_iter, + size_t n, + BinaryOp op, + const phi::GPUContext &dev_ctx) { + paddle::memory::allocation::AllocationPtr allocation; + void *temp_storage = nullptr; + size_t temp_storage_bytes = 0; + for (size_t i = 0; i < 2; ++i) { + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceScan::InclusiveScan(temp_storage, + temp_storage_bytes, + x_iter, + y_iter, + op, + static_cast(n), + dev_ctx.stream())); + if (i == 0 && temp_storage_bytes > 0) { + allocation = + paddle::memory::Alloc(dev_ctx.GetPlace(), temp_storage_bytes); + temp_storage = allocation->ptr(); + } + } +} + +template +static auto MakeThrustReverseIterator(T *x) { + return thrust::reverse_iterator>( + thrust::device_pointer_cast(x)); +} + +template +struct InclusiveScanOuterOrMidDimFunctor { + HOSTDEVICE InclusiveScanOuterOrMidDimFunctor( + const T *x, T *y, size_t mid_dim, size_t inner_dim, T init, BinaryOp op) + : x_(x), + y_(y), + mid_dim_(mid_dim), + inner_dim_(inner_dim), + init_(init), + op_(op) {} + + HOSTDEVICE void operator()(size_t idx) const { + auto outer_idx = idx / inner_dim_; + auto inner_idx = idx % inner_dim_; + if (kReverse) { + idx = outer_idx * mid_dim_ * inner_dim_ + (mid_dim_ - 1) * inner_dim_ + + inner_idx; + } else { + idx = outer_idx * mid_dim_ * inner_dim_ + inner_idx; + } + + auto x_ptr = x_ + idx; + auto y_ptr = y_ + idx; + T acc_value = init_; + for (size_t i = 0; i < mid_dim_; ++i) { + acc_value = op_(acc_value, *x_ptr); + *y_ptr = acc_value; + if (kReverse) { + x_ptr -= inner_dim_; + y_ptr -= inner_dim_; + } else { + x_ptr += inner_dim_; + y_ptr += inner_dim_; + } + } + } + + private: + const T *x_; + T *y_; + size_t mid_dim_; + size_t inner_dim_; + T init_; + BinaryOp op_; +}; + +template +static __global__ void InclusiveScanInnerDimCUDAKernel( + const T *x, T *y, size_t num_rows, size_t row_size, T init, BinaryOp op) { + using RealT = phi::dtype::Real; + constexpr auto kSharedBufferSize = + IsComplex::value ? 
4 * kThreadNumX : 2 * kThreadNumX; + __shared__ RealT sbuf[kThreadNumY][kSharedBufferSize]; + T *row_buf = reinterpret_cast(sbuf[threadIdx.y]); + + size_t block_row = static_cast(blockIdx.x * kThreadNumY); + size_t block_row_stride = static_cast(gridDim.x * kThreadNumY); + for (; block_row < num_rows; block_row += block_row_stride) { + size_t row = block_row + threadIdx.y; + T block_total = init; + + const T *row_x = x + row * row_size; + T *row_y = y + row * row_size; + for (size_t block_col = 0; block_col < row_size; + block_col += 2 * kThreadNumX) { + size_t col1, col2; + if (kReverse) { + col1 = row_size - 1 - block_col - threadIdx.x; + col2 = col1 - kThreadNumX; + } else { + col1 = block_col + threadIdx.x; + col2 = col1 + kThreadNumX; + } + + if (row < num_rows) { + if (col1 < row_size) { + row_buf[threadIdx.x] = row_x[col1]; + } else { + row_buf[threadIdx.x] = init; + } + + if (col2 < row_size) { + row_buf[kThreadNumX + threadIdx.x] = row_x[col2]; + } else { + row_buf[kThreadNumX + threadIdx.x] = init; + } + + if (threadIdx.x == 0) { + row_buf[0] = op(row_buf[0], block_total); + } + } + __syncthreads(); + + for (size_t s = kThreadNumX, d = 1; s >= 1; s >>= 1, d <<= 1) { + if (row < num_rows && threadIdx.x < s) { + size_t offset = (2 * threadIdx.x + 1) * d - 1; + row_buf[offset + d] = op(row_buf[offset], row_buf[offset + d]); + } + __syncthreads(); + } + + for (size_t s = 2, d = kThreadNumX / 2; d >= 1; s <<= 1, d >>= 1) { + if (row < num_rows && threadIdx.x < s - 1) { + size_t offset = 2 * (threadIdx.x + 1) * d - 1; + row_buf[offset + d] = op(row_buf[offset], row_buf[offset + d]); + } + __syncthreads(); + } + + if (row < num_rows) { + if (col1 < row_size) row_y[col1] = row_buf[threadIdx.x]; + if (col2 < row_size) row_y[col2] = row_buf[kThreadNumX + threadIdx.x]; + } + block_total = row_buf[2 * kThreadNumX - 1]; + __syncthreads(); + } + } +} + +template +static void InclusiveScanInnerDim(const T *x, + T *y, + size_t outer_dim, + size_t inner_dim, + T init, + BinaryOp op, + bool reverse, + const phi::GPUContext &dev_ctx) { + constexpr size_t kThreadNumX = 16; + constexpr size_t kThreadNumY = 32; + + size_t grid_dim = (outer_dim + kThreadNumY - 1) / kThreadNumY; + grid_dim = std::min(grid_dim, dev_ctx.GetCUDAMaxGridDimSize()[0]); + dim3 thread_dims(kThreadNumX, kThreadNumY); + if (reverse) { + InclusiveScanInnerDimCUDAKernel< + T, + BinaryOp, + kThreadNumX, + kThreadNumY, + /*kReverse=*/true><<>>( + x, y, outer_dim, inner_dim, init, op); + } else { + InclusiveScanInnerDimCUDAKernel< + T, + BinaryOp, + kThreadNumX, + kThreadNumY, + /*kReverse=*/false><<>>( + x, y, outer_dim, inner_dim, init, op); + } +} + +template +void InclusiveScan(const T *x, + T *y, + size_t outer_dim, + size_t mid_dim, + size_t inner_dim, + T init, + BinaryOp op, + bool reverse, + const phi::GPUContext &dev_ctx) { + if (outer_dim == 0 || mid_dim == 0 || inner_dim == 0) return; + + if (outer_dim == 1 && inner_dim == 1) { + if (reverse) { + auto x_reverse_iter = MakeThrustReverseIterator(x + mid_dim); + auto y_reverse_iter = MakeThrustReverseIterator(y + mid_dim); + CubInclusiveScan(x_reverse_iter, y_reverse_iter, mid_dim, op, dev_ctx); + } else { + CubInclusiveScan(x, y, mid_dim, op, dev_ctx); + } + } else if (inner_dim != 1) { + phi::funcs::ForRange for_range(dev_ctx, + outer_dim * inner_dim); + if (reverse) { + for_range( + InclusiveScanOuterOrMidDimFunctor( + x, y, mid_dim, inner_dim, init, op)); + } else { + for_range( + InclusiveScanOuterOrMidDimFunctor( + x, y, mid_dim, inner_dim, init, op)); + } + } else { + 
InclusiveScanInnerDim( + x, y, outer_dim, mid_dim, init, op, reverse, dev_ctx); + } +} + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/masked_select_grad_kernel.cu b/paddle/phi/kernels/gpu/masked_select_grad_kernel.cu index 5d0097af2ca9abe4c7a4feb2d312068a5150ae1b..5a4ce3a2679b94a8d78aa4818c58728a15955bb6 100644 --- a/paddle/phi/kernels/gpu/masked_select_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/masked_select_grad_kernel.cu @@ -17,11 +17,10 @@ #include #include +#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/select_impl.cu.h" #include "paddle/phi/kernels/masked_select_grad_kernel.h" -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/kernel_registry.h" namespace phi { template @@ -50,7 +49,7 @@ void MaskedSelectGradKernel(const Context& dev_ctx, const DenseTensor& mask, DenseTensor* x_grad) { auto mask_size = mask.numel(); - auto* out_data = x_grad->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(x_grad); if (mask_size <= 0) return; using Functor = MaskedSelectGradFunctor; phi::funcs::SelectKernel( diff --git a/paddle/phi/kernels/gpu/multinomial_kernel.cu b/paddle/phi/kernels/gpu/multinomial_kernel.cu index 4918495ff7bed83d8fee7e811017927b53faf5f9..752a91fa48198c305763d66c0dbf8d2c9f2fa307 100644 --- a/paddle/phi/kernels/gpu/multinomial_kernel.cu +++ b/paddle/phi/kernels/gpu/multinomial_kernel.cu @@ -23,11 +23,32 @@ limitations under the License. */ #include #include -#include "paddle/fluid/platform/transform.h" +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif + #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/arg_min_max_kernel.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/distribution_helper.h" #include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/funcs/inclusive_scan.h" #include "paddle/phi/kernels/funcs/multinomial_functor.h" +#include "paddle/phi/kernels/top_k_kernel.h" + +// See Note [ Why still include the fluid headers? 
] +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/transform.h" + +DECLARE_bool(use_curand); namespace phi { @@ -57,12 +78,12 @@ template __global__ void GetCumulativeProbs(T* norm_probs_data, int64_t num_distributions, int64_t num_categories, - T* cumulative_probs) { + T* cumulative_probs_data) { int id = blockIdx.x; thrust::inclusive_scan(thrust::device, norm_probs_data + id * num_categories, norm_probs_data + (id + 1) * num_categories, - cumulative_probs + id * num_categories); + cumulative_probs_data + id * num_categories); } template @@ -80,7 +101,7 @@ struct RandomGeneratorCudaFunctor { }; template -__device__ int binarySearchFunctor(T* cumulative_probs, +__device__ int binarySearchFunctor(T* cumulative_probs_data, T* norm_probs_data, int num_categories, T rng_number) { @@ -90,7 +111,7 @@ __device__ int binarySearchFunctor(T* cumulative_probs, while (right - left > 0) { int mid = left + (right - left) / 2; - T temp_prob = cumulative_probs[mid]; + T temp_prob = cumulative_probs_data[mid]; if (temp_prob < rng_number) { left = mid + 1; } else { @@ -114,26 +135,35 @@ __global__ void sampleMultinomialWithReplacement( int64_t* out_data, const int64_t num_distributions, const int64_t num_categories, - T* cumulative_probs, - T* norm_probs_data) { + T* cumulative_probs_data, + T* norm_probs_data, + uint64_t seed, + uint64_t offset, + bool use_curand) { // use binary search to get the selected category sample id. - // let cumulative_probs[id-1] < rng_data < cumulative_probs[id]. + // let cumulative_probs_data[id-1] < rng_data < cumulative_probs_data[id]. + size_t idx = gridDim.x * blockDim.x * blockIdx.y + blockDim.x * blockIdx.x + + threadIdx.x; - // for every distribution - int dist = blockIdx.y; - // for every sample - int sample = blockIdx.x * blockDim.x + threadIdx.x; - if (sample < num_samples) { - T rng_number = rng_data[sample + dist * num_samples]; + curandStatePhilox4_32_10_t state; + curand_init(seed, idx, offset, &state); - // Find the bucket that a uniform random number lies in - int selected_category = - binarySearchFunctor(cumulative_probs + dist * num_categories, - norm_probs_data + dist * num_categories, - num_categories, - rng_number); + int sample = blockIdx.x * blockDim.x + threadIdx.x; + for (int dist = blockIdx.y; dist < num_distributions; dist += gridDim.y) { + if (sample < num_samples) { + T rng_number = rng_data[sample + dist * num_samples]; + if (use_curand) { + rng_number = static_cast(curand_uniform4(&state).x); + } + // Find the bucket that a uniform random number lies in + int selected_category = + binarySearchFunctor(cumulative_probs_data + dist * num_categories, + norm_probs_data + dist * num_categories, + num_categories, + rng_number); - out_data[sample + dist * num_samples] = selected_category; + out_data[sample + dist * num_samples] = selected_category; + } } } @@ -172,6 +202,54 @@ void MultinomialKernel(const Context& dev_ctx, in_data_numel * sizeof(T), cudaMemcpyDeviceToHost); #endif + if (FLAGS_use_curand) { + for (size_t i = 0; i < num_distributions; ++i) { + int zero_num = 0; + for (size_t j = 0; j < num_categories; ++j) { + T weight = cpu_in_data[i * num_distributions + j]; + PADDLE_ENFORCE_GE( + weight, + 0, + errors::InvalidArgument( + "Each element of multinomial'input must >= 0, but got %f.", + weight)); + if (weight == static_cast(0)) { + zero_num++; + } + } + int valid_samples = num_categories - zero_num; + PADDLE_ENFORCE_LE( + num_samples, + valid_samples, + errors::InvalidArgument("When replacement=False, 'num_samples' " 
+ "must less than or eaqual to the number of " + "positive item of input")); + } + + // Refer to [gumbel softmax algorithm] + DenseTensor rand = EmptyLike(dev_ctx, x); + T* rand_data = rand.data(); + funcs::uniform_distribution dist; + funcs::exponential_transform trans(1.0); + funcs::distribution_and_transform(dev_ctx, &rand, dist, trans); + + funcs::ForRange for_range(dev_ctx, x.numel()); + for_range([rand_data, in_data] __device__(size_t idx) { + rand_data[idx] = in_data[idx] / rand_data[idx]; + }); + + if (num_samples == 1) { + ArgMaxKernel( + dev_ctx, rand, -1, true, false, 3 /*proto::VarType::INT64*/, out); + } else { + std::vector out_dim_vec = vectorize(out->dims()); + DenseTensor value = + Empty(dev_ctx, ScalarArray(out_dim_vec)); + TopkKernel( + dev_ctx, rand, Scalar(num_samples), -1, true, true, &value, out); + } + return; + } funcs::MultinomialFunctor(dev_ctx, cpu_out_data, @@ -228,7 +306,8 @@ void MultinomialKernel(const Context& dev_ctx, auto* norm_probs_data = dev_ctx.template Alloc(&norm_probs_tensor); // number of threads in a block is min(num_categories, 512) - dim3 block_norm(num_categories < 512 ? num_categories : 512); + int block_size = num_categories < 512 ? num_categories : 512; + dim3 block_norm(block_size); dim3 grid_norm((num_distributions * num_categories - 1) / block_norm.x + 1); NormalizeProbability<<>>( norm_probs_data, @@ -238,16 +317,34 @@ void MultinomialKernel(const Context& dev_ctx, num_categories); // Get cumulative probability of each distribution. It's the same function - // of - // ``cumsum`` op. + // of ``cumsum`` op. DenseTensor cumulative_probs_tensor; cumulative_probs_tensor.Resize({num_distributions, num_categories}); - auto* cumulative_probs = dev_ctx.template Alloc(&cumulative_probs_tensor); - - dim3 block_cumsum(1); - dim3 grid_cumsum(num_distributions); - GetCumulativeProbs<<>>( - norm_probs_data, num_distributions, num_categories, cumulative_probs); + auto* cumulative_probs_data = + dev_ctx.template Alloc(&cumulative_probs_tensor); + + if (FLAGS_use_curand) { + // 'phi::funcs::InclusiveScan' has higher accuracy than + // 'thrust::inclusive_scan' + funcs::InclusiveScan>( + /*in*/ norm_probs_data, + /*out*/ cumulative_probs_data, + /*outer_dim*/ static_cast(num_distributions), + /*mid_dim*/ static_cast(num_categories), + /*inner_dim*/ static_cast(1), + /*init*/ static_cast(0), + std::plus(), + /*reverse=*/false, + dev_ctx); + } else { + dim3 block_cumsum(1); + dim3 grid_cumsum(num_distributions); + GetCumulativeProbs<<>>( + norm_probs_data, + num_distributions, + num_categories, + cumulative_probs_data); + } // Generate random number for each sample. std::random_device rd; @@ -266,16 +363,30 @@ void MultinomialKernel(const Context& dev_ctx, RandomGeneratorCudaFunctor(seed)); // Sample the multinomial distributions. 
- dim3 block_sample(128); - dim3 grid_sample((num_samples - 1) / block_sample.x + 1, num_distributions); - sampleMultinomialWithReplacement< - T><<>>(rng_data, - num_samples, - out_data, - num_distributions, - num_categories, - cumulative_probs, - norm_probs_data); + dim3 block(128); + int64_t device_id = dev_ctx.GetPlace().GetDeviceId(); + const auto& prop = phi::backends::gpu::GetDeviceProperties(device_id); + int grid_y = std::min(num_distributions, prop.maxGridSize[1]); + dim3 grid((num_samples - 1) / block.x + 1, grid_y); + + auto gen_cuda = dev_ctx.GetGenerator(); + size_t curand4_loop_times = + (num_distributions + 4 * grid_y - 1) / (4 * grid_y); + // 'increment' shoulde be multiple of 4 + uint64_t increment = curand4_loop_times * 4; + auto seed_offset = gen_cuda->IncrementOffset(increment); + + sampleMultinomialWithReplacement<<>>( + rng_data, + num_samples, + out_data, + num_distributions, + num_categories, + cumulative_probs_data, + norm_probs_data, + seed_offset.first, + seed_offset.second, + FLAGS_use_curand); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/pad3d_kernel.cu b/paddle/phi/kernels/gpu/pad3d_kernel.cu index 2cef77cc0eef96d910d1b4f8c1b0ba736034063a..8f7cf716e79cf70c4e99274383e8623dd039c47d 100644 --- a/paddle/phi/kernels/gpu/pad3d_kernel.cu +++ b/paddle/phi/kernels/gpu/pad3d_kernel.cu @@ -19,6 +19,7 @@ #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { @@ -585,4 +586,6 @@ PD_REGISTER_KERNEL(pad3d, float, double, int, - int64_t) {} + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index a0ae9bc29dabe2172bfec4853315b4d6eb20c15b..fdb7a3b2cb447904face736d52be665e3c6c91cc 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -13,7 +13,7 @@ # limitations under the License. 
from .spawn import spawn # noqa: F401 -from .fleet.launch import launch # noqa: F401 +from .launch.main import launch # noqa: F401 from .parallel import init_parallel_env # noqa: F401 from .parallel import get_rank # noqa: F401 diff --git a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py index c92142cf7384d2b0c76c1a5cb3b4e6ac257303a2..684db52a28d83e49e53790b0abd4db278a247865 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py @@ -1482,3 +1482,512 @@ register_distributed_operator_impl("matmul_v2", DistributedMatmulV2Impl1("row_parallel")) register_distributed_operator_impl( "matmul_v2", DistributedMatmulV2Impl2("replicate_parallel")) + + +class DistributedMul(DistributedOperatorImplContainer): + def __init__(self, op_type): + super(DistributedMul, self).__init__(op_type) + + +register_distributed_operator_impl_container(DistributedMul("mul")) + + +# ColumnParallel +class DistributedMulImpl0(DistributedOperatorImpl): + def __init__(self, name): + super(DistributedMulImpl0, self).__init__(name) + self._forward_implemented = True + self._backward_implemented = True + + def is_input_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + y_name = op_desc.input('Y')[0] + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name) + if is_dim_shard(x_dims_mapping[-1]): + return False + if is_dim_shard(y_dims_mapping[-2]) or is_dim_replicate(y_dims_mapping[ + -1]): + return False + for mapping in x_dims_mapping[1:-1]: + if is_dim_shard(mapping): + return False + return True + + def is_output_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + out_name = op_desc.output('Out')[0] + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + if is_dim_replicate(out_dims_mapping[-1]): + return False + for mapping in out_dims_mapping[1:-1]: + if is_dim_shard(mapping): + return False + return True + + def is_auto_compatible(self, dist_op): + if (not self.is_input_compatible(dist_op)) or \ + (not self.is_output_compatible(dist_op)): + return False + + if not _is_auto_compatible_for_matmul(dist_op): + return False + + return True + + def update_dims_mapping(self, dist_op): + changed = False + dim_changed = _update_dims_mapping_for_matmul(dist_op) + if dim_changed: + changed = True + return changed + + @staticmethod + def forward(ctx, *args, **kwargs): + """ + kwargs: inputname_mapping & outputname_mapping + """ + + dist_op_context = ctx.dist_op_context + main_block = dist_op_context.work_block + startup_block = dist_op_context.startup_block + src_op = dist_op_context.cur_src_op + rank_id = dist_op_context.rank_id + op_dist_attr = ctx.get_op_dist_attr_for_program(src_op) + assert op_dist_attr is not None, "backward op [{}] don't have dist attribute !".format( + str(src_op)) + + # FIXME (JZ-LIANG) Remove this hack to support any op mesh group for Pipeline Parallelism + if rank_id not in op_dist_attr.process_mesh.processes: + rank_id = _get_corresponding_rank(ctx, op_dist_attr.process_mesh, + rank_id) + + # check validation of inputs / outputs + for input_name in src_op.desc.input_names(): + assert input_name in kwargs, "input [{}] is not given".format( + input_name) + assert len(kwargs[input_name]) == len( + src_op.desc.input(input_name) 
+ ), "number of tensor for input [{}] is not match".format(input_name) + for output_name in src_op.desc.output_names(): + assert output_name in kwargs, "input [{}] is not given".format( + output_name) + assert len(kwargs[output_name]) == len( + src_op.desc.output(output_name) + ), "number of tensor for input [{}] is not match".format( + output_name) + + X_var = main_block.var(kwargs['X'][0]) + Weight_var = main_block._var_recursive(kwargs['Y'][0]) + Out_var = main_block.var(kwargs['Out'][0]) + + # TODO infer logic comm presentation + matmul_col_dim_mapping = op_dist_attr.get_input_dims_mapping( + Weight_var.name)[-1] + assert matmul_col_dim_mapping >= 0, "col_parallel_matmul's row should be divided by a specific mesh axis, but got [{}]".format( + matmul_col_dim_mapping) + process_mesh_shape = op_dist_attr.process_mesh.topology + process_mesh_group = op_dist_attr.process_mesh.processes + + parallel_axis = matmul_col_dim_mapping + group_ranks = _get_comm_group(process_mesh_group, process_mesh_shape, + parallel_axis, rank_id) + group = new_process_group(group_ranks) + + # infer new var shape with op dist attr + x_tensor_dist_attr = ctx.get_tensor_dist_attr_for_program(X_var) + assert x_tensor_dist_attr is not None + identity_var_dist_attr = op_dist_attr.get_input_dist_attr(X_var.name) + assert identity_var_dist_attr is not None + ref_shape_x = infer_shape(main_block, X_var, x_tensor_dist_attr, + identity_var_dist_attr) + # infer out var shape with op dist attr + out_tensor_dist_attr = ctx.get_tensor_dist_attr_for_program(Out_var) + assert out_tensor_dist_attr is not None + out_var_dist_attr = op_dist_attr.get_output_dist_attr(Out_var.name) + assert out_var_dist_attr is not None + ref_shape_out = infer_shape(main_block, Out_var, out_tensor_dist_attr, + out_var_dist_attr) + + intermediate_var_0 = main_block.create_var( + name=unique_name.generate_with_ignorable_key(".".join( + ["c_identity", 'tmp'])), + dtype=X_var.dtype, + shape=X_var.shape, + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + stop_gradient=X_var.stop_gradient) + # set intermediate_var_0's dist_attr with X_var's dist_attr + ctx.set_tensor_dist_attr_for_program(intermediate_var_0, + identity_var_dist_attr) + + check_variable_and_dtype( + X_var, 'tensor', + ['float16', 'float32', 'float64', 'int32', 'int64'], '_c_identity') + c_identity_op = main_block.append_op( + type='c_identity', + inputs={'X': [X_var]}, + outputs={'Out': intermediate_var_0}, + attrs={ + 'ring_id': group.id, + 'use_calc_stream': True, + 'use_model_parallel': True, + }) + if intermediate_var_0.shape != ref_shape_x: + intermediate_var_0.desc.set_shape(ref_shape_x) + + check_variable_and_dtype(intermediate_var_0, 'x', + ['float16', 'float32', 'float64'], 'linear') + check_dtype(intermediate_var_0.dtype, 'dtype', + ['float16', 'float32', 'float64'], 'linear') + # attrs = {'trans_x': False, 'trans_y': False} + attrs = { + "x_num_col_dims": src_op.desc.attr("x_num_col_dims"), + "y_num_col_dims": src_op.desc.attr("y_num_col_dims") + } + inputs = {'X': [intermediate_var_0], 'Y': [Weight_var]} + mul_op = main_block.append_op( + type='mul', inputs=inputs, outputs={'Out': Out_var}, attrs=attrs) + if Out_var.shape != ref_shape_out: + Out_var.desc.set_shape(ref_shape_out) + + # set dist op's dist_attr with serial op's dist_attr + # c_identity + identity_op_dist_attr = OperatorDistributedAttribute() + identity_op_dist_attr.process_mesh = op_dist_attr.process_mesh + identity_op_dist_attr.impl_type = op_dist_attr.impl_type + identity_op_dist_attr.impl_idx = 
op_dist_attr.impl_idx + # input + input_varname = c_identity_op.desc.input_arg_names()[0] + input_dist_attr = op_dist_attr.get_input_dist_attr(input_varname) + assert input_dist_attr is not None, "dist_attr is {}".format( + op_dist_attr) + identity_op_dist_attr.set_input_dist_attr(input_varname, + input_dist_attr) + # output + output_varname = c_identity_op.desc.output_arg_names()[0] + identity_op_dist_attr.set_output_dist_attr(output_varname, + input_dist_attr) + ctx.set_op_dist_attr_for_program(c_identity_op, identity_op_dist_attr) + + # matmulv2 + matmulv2_op_dist_attr = OperatorDistributedAttribute() + matmulv2_op_dist_attr.process_mesh = op_dist_attr.process_mesh + matmulv2_op_dist_attr.impl_type = op_dist_attr.impl_type + matmulv2_op_dist_attr.impl_idx = op_dist_attr.impl_idx + for input_varname in mul_op.desc.input_arg_names(): + if input_varname in src_op.desc.input_arg_names(): + input_dist_attr = op_dist_attr.get_input_dist_attr( + input_varname) + assert input_dist_attr is not None, "dist_attr is {}".format( + op_dist_attr) + matmulv2_op_dist_attr.set_input_dist_attr(input_varname, + input_dist_attr) + else: + input_var = main_block.var(input_varname) + tensor_dist_attr = ctx.get_tensor_dist_attr_for_program( + input_var) + matmulv2_op_dist_attr.set_input_dist_attr(input_varname, + tensor_dist_attr) + for output_varname in mul_op.desc.output_arg_names(): + output_dist_attr = op_dist_attr.get_output_dist_attr(output_varname) + assert output_dist_attr is not None, "dist_attr is {}".format( + op_dist_attr) + matmulv2_op_dist_attr.set_output_dist_attr(output_varname, + output_dist_attr) + ctx.set_op_dist_attr_for_program(mul_op, matmulv2_op_dist_attr) + + # init param sync + if Weight_var.is_parameter and not op_dist_attr.is_recompute: + _init_param_sync(Weight_var, dist_op_context, startup_block, ctx, + rank_id) + + @staticmethod + def backward(ctx, *args, **kwargs): + _right_operand_parameter_matmul_backward(ctx, *args, **kwargs) + + +# RowParallel +class DistributedMulImpl1(DistributedOperatorImpl): + def __init__(self, name): + super(DistributedMulImpl1, self).__init__(name) + self._forward_implemented = True + self._backward_implemented = True + + def is_input_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + y_name = op_desc.input('Y')[0] + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name) + if is_dim_replicate(x_dims_mapping[-1]): + return False + if is_dim_replicate(y_dims_mapping[-2]) or is_dim_shard(y_dims_mapping[ + -1]): + return False + # Other dimensions must be replicate except the batch dimension + for mapping in x_dims_mapping[1:-1]: + if is_dim_shard(mapping): + return False + return True + + def is_output_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + out_name = op_desc.output('Out')[0] + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + if is_dim_shard(out_dims_mapping[-1]): + return False + # Other dimensions must be replicate except the batch dimension + for mapping in out_dims_mapping[1:-1]: + if is_dim_shard(mapping): + return False + return True + + def is_auto_compatible(self, dist_op): + if (not self.is_input_compatible(dist_op)) or \ + (not self.is_output_compatible(dist_op)): + return False + + if not _is_auto_compatible_for_matmul(dist_op): + return False + + return True + + def update_dims_mapping(self, dist_op): + changed = False 
+ dim_changed = _update_dims_mapping_for_matmul(dist_op) + if dim_changed: + changed = True + return changed + + @staticmethod + def forward(ctx, *args, **kwargs): + """ + kwargs: inputname_mapping & outputname_mapping + """ + + dist_op_context = ctx.dist_op_context + main_block = dist_op_context.work_block + startup_block = dist_op_context.startup_block + src_op = dist_op_context.cur_src_op + rank_id = dist_op_context.rank_id + op_dist_attr = ctx.get_op_dist_attr_for_program(src_op) + assert op_dist_attr is not None, "backward op [{}] don't have dist attribute !".format( + str(src_op)) + + # FIXME (JZ-LIANG) Remove this hack to support any op mesh group for Pipeline Parallelism + if rank_id not in op_dist_attr.process_mesh.processes: + rank_id = _get_corresponding_rank(ctx, op_dist_attr.process_mesh, + rank_id) + + # check validation of inputs / outputs + for input_name in src_op.desc.input_names(): + assert input_name in kwargs, "input [{}] is not given".format( + input_name) + assert len(kwargs[input_name]) == len( + src_op.desc.input(input_name) + ), "number of tensor for input [{}] is not match".format(input_name) + for output_name in src_op.desc.output_names(): + assert output_name in kwargs, "input [{}] is not given".format( + output_name) + assert len(kwargs[output_name]) == len( + src_op.desc.output(output_name) + ), "number of tensor for input [{}] is not match".format( + output_name) + + X_var = main_block.var(kwargs['X'][0]) + Weight_var = main_block._var_recursive(kwargs['Y'][0]) + Out_var = main_block.var(kwargs['Out'][0]) + + # TODO infer logic comm presentation + matmul_row_dim_mapping = op_dist_attr.get_input_dims_mapping( + Weight_var.name)[-2] + assert matmul_row_dim_mapping >= 0, "row_parallel_matmul's row should be divided by a specific mesh axis, but got [{}]".format( + matmul_row_dim_mapping) + process_mesh_shape = op_dist_attr.process_mesh.topology + process_mesh_group = op_dist_attr.process_mesh.processes + + parallel_axis = matmul_row_dim_mapping + group_ranks = _get_comm_group(process_mesh_group, process_mesh_shape, + parallel_axis, rank_id) + group = new_process_group(group_ranks) + + check_variable_and_dtype(X_var, 'x', ['float16', 'float32', 'float64'], + 'linear') + check_dtype(X_var.dtype, 'dtype', ['float16', 'float32', 'float64'], + 'linear') + # attrs = {'trans_x': False, 'trans_y': False} + attrs = { + "x_num_col_dims": src_op.desc.attr("x_num_col_dims"), + "y_num_col_dims": src_op.desc.attr("y_num_col_dims") + } + inputs = {'X': X_var, 'Y': Weight_var} + + # infer out var shape with op dist attr + out_tensor_dist_attr = ctx.get_tensor_dist_attr_for_program(Out_var) + assert out_tensor_dist_attr is not None + out_var_dist_attr = op_dist_attr.get_output_dist_attr(Out_var.name) + assert out_var_dist_attr is not None + ref_shape = infer_shape(main_block, Out_var, out_tensor_dist_attr, + out_var_dist_attr) + + intermediate_var_0 = main_block.create_var( + shape=Out_var.shape, + dtype=Out_var.dtype, + type=Out_var.type, + lod_level=Out_var.lod_level, + persistable=False, + is_data=False, + need_check_feed=Out_var.desc.need_check_feed()) + # set intermediate_var_0's dist_attr with Out_var's dist_attr + ctx.set_tensor_dist_attr_for_program(intermediate_var_0, + out_var_dist_attr) + + mul_op = main_block.append_op( + type='mul', + inputs=inputs, + outputs={'Out': intermediate_var_0}, + attrs=attrs) + if intermediate_var_0.shape != ref_shape: + intermediate_var_0.desc.set_shape(ref_shape) + + c_allreduce_sum_op = main_block.append_op( + type='c_allreduce_sum', + 
inputs={'X': intermediate_var_0}, + outputs={'Out': Out_var}, + attrs={ + 'ring_id': group.id, + 'use_calc_stream': True, + 'use_model_parallel': True + }) + if Out_var.shape != ref_shape: + Out_var.desc.set_shape(ref_shape) + + # set dist op's dist_attr with serial op's dist_attr + # matmulv2 + matmulv2_op_dist_attr = OperatorDistributedAttribute() + matmulv2_op_dist_attr.process_mesh = op_dist_attr.process_mesh + matmulv2_op_dist_attr.impl_type = op_dist_attr.impl_type + matmulv2_op_dist_attr.impl_idx = op_dist_attr.impl_idx + for input_varname in mul_op.desc.input_arg_names(): + input_dist_attr = op_dist_attr.get_input_dist_attr(input_varname) + assert input_dist_attr is not None, "dist_attr is {}".format( + op_dist_attr) + matmulv2_op_dist_attr.set_input_dist_attr(input_varname, + input_dist_attr) + output_varname = mul_op.desc.output_arg_names()[0] + output_dist_attr = op_dist_attr.get_output_dist_attr(Out_var.name) + assert output_dist_attr is not None, "dist_attr is {}".format( + op_dist_attr) + matmulv2_op_dist_attr.set_output_dist_attr(output_varname, + output_dist_attr) + ctx.set_op_dist_attr_for_program(mul_op, matmulv2_op_dist_attr) + + # allreduce + allreduce_op_dist_attr = OperatorDistributedAttribute() + allreduce_op_dist_attr.process_mesh = op_dist_attr.process_mesh + allreduce_op_dist_attr.impl_type = op_dist_attr.impl_type + allreduce_op_dist_attr.impl_idx = op_dist_attr.impl_idx + for input_varname in c_allreduce_sum_op.desc.input_arg_names(): + input_var = main_block.var(input_varname) + tensor_dist_attr = ctx.get_tensor_dist_attr_for_program(input_var) + assert tensor_dist_attr is not None + allreduce_op_dist_attr.set_input_dist_attr(input_varname, + tensor_dist_attr) + for output_varname in c_allreduce_sum_op.desc.output_arg_names(): + output_dist_attr = op_dist_attr.get_output_dist_attr(output_varname) + assert output_dist_attr is not None, "dist_attr is {}".format( + op_dist_attr) + allreduce_op_dist_attr.set_output_dist_attr(output_varname, + output_dist_attr) + ctx.set_op_dist_attr_for_program(c_allreduce_sum_op, + allreduce_op_dist_attr) + + # init param sync + if Weight_var.is_parameter and not op_dist_attr.is_recompute: + _init_param_sync(Weight_var, dist_op_context, startup_block, ctx, + rank_id) + + @staticmethod + def backward(ctx, *args, **kwargs): + _right_operand_parameter_matmul_backward(ctx, *args, **kwargs) + + +# ReplicateParallel +class DistributedMulImpl2(DistributedOperatorImpl): + def __init__(self, name): + super(DistributedMulImpl2, self).__init__(name) + + def is_input_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + y_name = op_desc.input('Y')[0] + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name) + + if is_dim_shard(x_dims_mapping[-1]): + return False + if is_valid_list_index(x_dims_mapping, + -2) and is_dim_shard(x_dims_mapping[-2]): + return False + + if is_dim_shard(y_dims_mapping[-1]): + return False + if is_valid_list_index(y_dims_mapping, + -2) and is_dim_shard(y_dims_mapping[-2]): + return False + return True + + def is_output_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + out_name = op_desc.output('Out')[0] + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + + if is_dim_shard(out_dims_mapping[-1]): + return False + if 
is_valid_list_index(out_dims_mapping, + -2) and is_dim_shard(out_dims_mapping[-2]): + return False + + return True + + def is_auto_compatible(self, dist_op): + if (not self.is_input_compatible(dist_op)) or \ + (not self.is_output_compatible(dist_op)): + return False + + if not _is_auto_compatible_for_matmul(dist_op): + return False + + return True + + def update_dims_mapping(self, dist_op): + changed = False + dim_changed = _update_dims_mapping_for_matmul(dist_op) + if dim_changed: + changed = True + return changed + + @staticmethod + def forward(ctx, *args, **kwargs): + DistributedDefaultImpl0.forward(ctx, *args, **kwargs) + + @staticmethod + def backward(ctx, *args, **kwargs): + _right_operand_parameter_matmul_backward(ctx, *args, **kwargs) + + +register_distributed_operator_impl("mul", + DistributedMulImpl0("column_parallel")) +register_distributed_operator_impl("mul", DistributedMulImpl1("row_parallel")) +register_distributed_operator_impl("mul", + DistributedMulImpl2("replicate_parallel")) diff --git a/python/paddle/distributed/launch/__init__.py b/python/paddle/distributed/launch/__init__.py index f39bb76114345d7144e1f5b005eb42792a84cf14..4ce89fa36b06b25cfc7409c093ea227393ae3111 100644 --- a/python/paddle/distributed/launch/__init__.py +++ b/python/paddle/distributed/launch/__init__.py @@ -13,69 +13,3 @@ # limitations under the License. __all__ = [] -''' -Paddle distributed training entry ``python -m paddle.distributed.launch``. - -Help - -# for arg usage and explanation, try the following command -# python -m paddle.distributed.launch -h - -Collective Mode - -Case 1: 1 node - -use all visible devices -# python -m paddle.distributed.launch train.py - -use specified devices -# python -m paddle.distributed.launch --devices=0,1,2,3 train.py - -Case 2: multi-node, auto detect ip/port - -# python -m paddle.distributed.launch --nnodes 2 train.py -# auto print following command -# python -m paddle.distributed.launch --master 10.0.0.1:13538 --nnodes 2 demo.py -# then copy and paste above command to other nodes - -Case 3: multi-node, specified master/rendezvous server - -# python -m paddle.distributed.launch --nnodes 2 --master 10.0.0.1:2379 train.py -# the master ip must be one of the node and the port must available - -Parameter Server Mode - -Case 1.1: 1 node, 1 ps, 1 trainer - -# python -m paddle.distributed.launch --mode ps train.py -# python -m paddle.distributed.launch --server_num=1 --trainer_num=1 train.py - -Case 1.2: 1 node, 2 ps, 2 trainer - -# python -m paddle.distributed.launch --server_num=2 --trainer_num=2 train.py - -Case 2: 2 node, 2 ps, 2 trainer per node - -# python -m paddle.distributed.launch --server_num=2 --trainer_num=2 --nnodes 2 train.py -# auto print following command -# python -m paddle.distributed.launch --master 10.0.0.1:13538 --server_num=2 --trainer_num=2 --nnodes 2 train.py -# then copy and paste above command to other nodes - -Case 3: multi-node, specified master/rendezvous server - -# python -m paddle.distributed.launch --master 10.0.0.1:13538 --server_num=2 --trainer_num=2 --nnodes 2 train.py -# the master ip must be one of the node and the port must available - -Case 4: specified servers and trainers in each node - -python -m paddle.distributed.launch --servers 127.0.0.1:8900,127.0.0.1:8901 --trainers 127.0.0.1:8902,127.0.0.1:8903 train.py - - -Elastic Mode - -# run following command in 3 node to run immediately, or in 2 node to run after elastic_timeout -# python -m paddle.distributed.launch --master etcd://10.0.0.1:2379 --nnodes 2:3 train.py - -# once 
the peer number changes between 2:3, the strategy holds - -''' diff --git a/python/paddle/distributed/launch/__main__.py b/python/paddle/distributed/launch/__main__.py index 9cd6f4408c9897d5eaeaa0fd31602c4a0f6de09f..42f844ca71774868a40405db26afcb1b694df2e3 100644 --- a/python/paddle/distributed/launch/__main__.py +++ b/python/paddle/distributed/launch/__main__.py @@ -12,31 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .context import Context -from . import controllers +from .main import launch - -def launch(): - # initialize the context to run - ctx = Context() - - if ctx.is_legacy_mode(): - - # legacy mode - from paddle.distributed.fleet import launch - launch.launch() - - else: - - # initialize the selected controller - c = controllers.init(ctx) - - # run the pods - c.run() - - # manager or just wait pod - c.finalize() - - -if __name__ == "__main__": - launch() +launch() diff --git a/python/paddle/distributed/launch/context/__init__.py b/python/paddle/distributed/launch/context/__init__.py index e13bb2a5f0ba720037e488e92d1777a9a9111b68..510f49d8246f128c896712e9e0ad0776fa6f7626 100644 --- a/python/paddle/distributed/launch/context/__init__.py +++ b/python/paddle/distributed/launch/context/__init__.py @@ -82,6 +82,12 @@ class Context(object): logger.addHandler(ch) return logger + def continous_log(self) -> bool: + if self.args.log_level.upper() in ['DEBUG', 'ERROR']: + return True + else: + return False + def set_env_in_args(self): for k, v in env_args_mapping.items(): if k in self.envs: diff --git a/python/paddle/distributed/launch/context/args_envs.py b/python/paddle/distributed/launch/context/args_envs.py index d504a11e5f3d1cfc06c73cbb4b723e8758de52ce..b624281e44db38ccd1e52889dc1134ff929c7706 100644 --- a/python/paddle/distributed/launch/context/args_envs.py +++ b/python/paddle/distributed/launch/context/args_envs.py @@ -20,7 +20,7 @@ env_args_mapping = { 'PADDLE_MASTER': 'master', 'PADDLE_DEVICES': 'devices', 'PADDLE_NNODES': 'nnodes', - 'PADDLE_MODE': 'mode', + 'PADDLE_RUN_MODE': 'run_mode', 'PADDLE_LOG_LEVEL': 'log_level', 'PADDLE_NPROC_PER_NODE': 'nproc_per_node', 'PADDLE_JOB_ID': 'job_id', @@ -60,7 +60,7 @@ def parse_args(): "--legacy", type=bool, default=False, help="use legacy launch") base_group.add_argument( - "--rank", type=int, default=-1, help="the peer rank") + "--rank", type=int, default=-1, help="the node rank") base_group.add_argument( "--log_level", type=str, default="INFO", help="log level. Default INFO") @@ -69,7 +69,7 @@ def parse_args(): "--nnodes", type=str, default="1", - help="the number of peers, i.e. pod/node number") + help="the number of nodes, i.e. pod/node number") base_group.add_argument( "--nproc_per_node", @@ -83,7 +83,7 @@ def parse_args(): default="log", help="the path for each process's log. 
Default ./log") base_group.add_argument( - "--mode", + "--run_mode", type=str, default="collective", help="run mode of the job, collective/ps/ps-heter") @@ -146,6 +146,6 @@ def parse_args(): "--elastic_timeout", type=int, default=30, - help="seconds to wait before elastic perform training") + help="seconds to wait before elastic job begin to train") return parser.parse_known_args() diff --git a/python/paddle/distributed/launch/controllers/collective.py b/python/paddle/distributed/launch/controllers/collective.py index c3fa4e6e07de9bca8acd8db585be679e3cf0244c..0a6c1c4002abb3d291c47748eddad201fc0d2839 100644 --- a/python/paddle/distributed/launch/controllers/collective.py +++ b/python/paddle/distributed/launch/controllers/collective.py @@ -115,46 +115,6 @@ class CollectiveElasticController(CollectiveController): self.master.register_heartbeat(self.job.id, self.pod.name) - def watch(self) -> bool: - ''' - watch self and peer status, return true to exit - ''' - - self.ctx.logger.info("Watching {}".format(self.pod)) - while not self.ctx.status.is_done(): - # self status - status = self.pod.watch(timeout=2) - self.ctx.logger.debug("Pod status {}, Ctx status {}".format( - status, self.ctx.status.current())) - - # completed - if status == self.ctx.status.COMPLETED: - self.master.set_status(status) - self.ctx.status.complete() - self.ctx.logger.info("Pod complete {}".format(status)) - return True - - # self failure - elif status == self.ctx.status.FAILED: - self.master.set_status(status) - self.master.restart_peer() - self.ctx.logger.info("Pod failed {}".format(status)) - self.pod.stop() - - if self.ctx.args.elastic_level <= 0: - return True - else: - return False - - # peer failure - if self.ctx.status.is_restarting() and self.master.get_status( - ) != self.ctx.status.COMPLETED: - self.pod.stop() - return False - - #peers = self.master.fetch_peer_alive() - #print("peers {}".format(peers)) - def run(self): timeout = self.ctx.args.elastic_timeout if self.job.elastic else self.ctx.args.elastic_timeout * 10 @@ -164,6 +124,8 @@ class CollectiveElasticController(CollectiveController): self.build_job() + self.ctx.logger.info("Waiting peer ready...") + ok, replicas = self.master.wait_peer_ready( self.job.replicas_min, self.job.replicas_max, timeout) if ok: diff --git a/python/paddle/distributed/launch/controllers/controller.py b/python/paddle/distributed/launch/controllers/controller.py index 60e34b85a12bc46636f7edb9bbb4926eddfc73ba..08345a2a1f76b84cfde96667e6329bc1b28c18d4 100644 --- a/python/paddle/distributed/launch/controllers/controller.py +++ b/python/paddle/distributed/launch/controllers/controller.py @@ -40,7 +40,7 @@ class ControllerBase(object): self.master = Master.factory(self.ctx) self.job = Job(nnodes=self.ctx.args.nnodes, - mode=self.ctx.args.mode, + mode=self.ctx.args.run_mode, jid=self.ctx.args.job_id) self.pod = Pod() @@ -65,18 +65,51 @@ class ControllerBase(object): self.watch() def watch(self) -> bool: + ''' + watch self and peer status, return true to exit + ''' + #TODO(kuizhiqing) unify ctx.status and master status + self.ctx.logger.info("Watching {}".format(self.pod)) - status = self.pod.watch() + while not self.ctx.status.is_done(): + status = self.pod.watch(timeout=2) + + if self.ctx.continous_log(): + self.pod.logs() + + # completed + if status == self.ctx.status.COMPLETED: + self.ctx.status.complete() + + self.master.set_status(status) + + self.ctx.logger.info("Pod {}".format(status)) + return True + + # self failure + elif status == self.ctx.status.FAILED: + 
self.ctx.status.fail() + + self.master.set_status(status) + self.master.restart_peer() + + fc = self.pod.failed_container() + self.ctx.logger.info("Pod {}".format(status)) + self.ctx.logger.error("Container failed !!!\n{}".format(fc[0])) + fc[0].tail() + self.pod.stop() + + if self.ctx.args.elastic_level <= 0: + return True + else: + return False - if status == self.ctx.status.COMPLETED: - self.ctx.logger.info("Pod {}".format(status)) - elif status == self.ctx.status.FAILED: - fc = self.pod.failed_container() - self.ctx.logger.info("Pod {}".format(status)) - self.ctx.logger.error("Container failed !!!\n{}".format(fc[0])) - fc[0].tail() - self.pod.stop() + # peer failure + if self.ctx.status.is_restarting() and self.master.get_status( + ) != self.ctx.status.COMPLETED: + self.pod.stop() + return False def stop(self, sigint=None): self.ctx.logger.debug("Controller stop") diff --git a/python/paddle/distributed/launch/controllers/master.py b/python/paddle/distributed/launch/controllers/master.py index f9f484eb125eed264cb7e8658476255281a048cf..43eda4cdffa24e3f7e292f24274a87fa8b3d7fdb 100644 --- a/python/paddle/distributed/launch/controllers/master.py +++ b/python/paddle/distributed/launch/controllers/master.py @@ -43,6 +43,15 @@ class Master(object): def stop(self): raise NotImplementedError + def set_status(self, status): + pass + + def get_status(self): + return None + + def restart_peer(self): + pass + def sync_peers(self, prefix, key, value, size, rank=-1) -> (list, int): raise NotImplementedError @@ -122,7 +131,7 @@ class HTTPMaster(Master): if size < 2: return [value], 0 - self.ctx.logger.info("Waiting peer ready...") + self.ctx.logger.info("Waiting peer start...") self.lazy_init() @@ -184,7 +193,7 @@ class ETCDMaster(Master): if size < 2: return [value], 0 - self.ctx.logger.info("Waiting peer ready...") + self.ctx.logger.info("Waiting peer start...") path = "{}/{}/{}".format(prefix, key, rank) diff --git a/python/paddle/distributed/launch/controllers/ps.py b/python/paddle/distributed/launch/controllers/ps.py index d3d0ef59bfd2f8102fe92beecef100c2079b1698..6504f1240ee091eeffc0dfba39b38a393fea8fec 100644 --- a/python/paddle/distributed/launch/controllers/ps.py +++ b/python/paddle/distributed/launch/controllers/ps.py @@ -21,11 +21,11 @@ import os, shutil class PSController(Controller): @classmethod def enable(cls, ctx): - if ctx.args.mode == ControleMode.PS or ctx.args.server_num or len( + if ctx.args.run_mode == ControleMode.PS or ctx.args.server_num or len( ctx.args.servers) > 0 or ctx.args.trainer_num or len( ctx.args.trainers) > 0: ctx.logger.debug("{} enabled".format(cls.__name__)) - ctx.args.mode = ControleMode.PS + ctx.args.run_mode = ControleMode.PS return True else: return False diff --git a/python/paddle/distributed/launch/main.py b/python/paddle/distributed/launch/main.py new file mode 100644 index 0000000000000000000000000000000000000000..e6febff505e5248e6fc908c62293db2461b3eb32 --- /dev/null +++ b/python/paddle/distributed/launch/main.py @@ -0,0 +1,256 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .context import Context
+
+
+def launch():
+    """
+    Paddle distributed training entry ``python -m paddle.distributed.launch``.
+
+    Usage:
+        .. code-block:: bash
+            :name: code-block-bash1
+
+            python -m paddle.distributed.launch [-h] [--master MASTER] [--rank RANK]
+                   [--log_level LOG_LEVEL] [--nnodes NNODES]
+                   [--nproc_per_node NPROC_PER_NODE] [--log_dir LOG_DIR]
+                   [--run_mode RUN_MODE] [--job_id JOB_ID] [--devices DEVICES]
+                   [--host HOST] [--servers SERVERS] [--trainers TRAINERS]
+                   [--trainer_num TRAINER_NUM] [--server_num SERVER_NUM]
+                   [--gloo_port GLOO_PORT] [--with_gloo WITH_GLOO]
+                   [--max_restart MAX_RESTART] [--elastic_level ELASTIC_LEVEL]
+                   [--elastic_timeout ELASTIC_TIMEOUT]
+                   training_script ...
+
+
+    Base Parameters:
+        - ``--master``: The master/rendezvous server, supports http:// and etcd://, defaults to http://. e.g., ``--master=127.0.0.1:8080``. Default ``--master=None``.
+
+        - ``--rank``: The rank of the node, can be auto-assigned by the master. Default ``--rank=-1``.
+
+        - ``--log_level``: The log level to set for logging.setLevel. Default ``--log_level=INFO``.
+
+        - ``--nnodes``: The number of nodes for a distributed job; it can be a range in elastic mode, e.g., ``--nnodes=2:3``. Default ``--nnodes=1``.
+
+        - ``--nproc_per_node``: The number of processes to launch on a node. In GPU training, it should be less than or equal to the number of GPUs on your system. e.g., ``--nproc_per_node=8``
+
+        - ``--log_dir``: The path for each process's log. e.g., ``--log_dir=output_dir``. Default ``--log_dir=log``.
+
+        - ``--run_mode``: The run mode of the job, one of collective/ps/ps-heter. e.g., ``--run_mode=ps``. Default ``--run_mode=collective``.
+
+        - ``--job_id``: The unique job id; it affects the names of the log files. e.g., ``--job_id=job1``. Default ``--job_id=default``.
+
+        - ``--devices``: The selected accelerator devices on the nodes, can be gpu/xpu/npu/mlu etc.. e.g., ``--devices=0,1,2,3`` will launch four training processes, each bound to one device.
+
+        - ``training_script``: The full path to the single GPU training program/script to be launched in parallel, followed by all the arguments for the training script. e.g., ``train.py``
+
+        - ``training_script_args``: The args of training_script. e.g., ``--lr=0.1``
+
+    Collective Parameters:
+        - ``--ips``: [DEPRECATED] Paddle cluster nodes ips, e.g., ``--ips=192.168.0.16,192.168.0.17``. Default ``--ips=127.0.0.1``.
+
+    Parameter-Server Parameters:
+        - ``--servers``: User defined servers ip:port, e.g., ``--servers="192.168.0.16:6170,192.168.0.17:6170"``
+
+        - ``--trainers``: User defined trainers ip:port, e.g., ``--trainers="192.168.0.16:6171,192.168.0.16:6172,192.168.0.17:6171,192.168.0.17:6172"``
+
+        - ``--workers``: [DEPRECATED] The same as trainers.
+
+        - ``--trainer_num``: Number of trainers on each node, can be 0.
+
+        - ``--worker_num``: [DEPRECATED] The same as trainer_num.
+
+        - ``--server_num``: Number of servers on each node, can be 0.
+
+        - ``--heter_workers``: User defined heter workers ip1:port1;ip2:port2, e.g., ``--heter_workers="192.168.0.16:6172;192.168.0.17:6172"``
+
+        - ``--heter_worker_num``: Number of heter_workers in each stage (it is recommended to set this when emulating a distributed environment on a single node)
+
+        - ``--heter_devices``: Type of heter_device in each stage
+
+        - ``--gloo_port``: Gloo http port. Default ``--gloo_port=6767``.
+
+        - ``--with_gloo``: Whether to use gloo. Default ``--with_gloo=0``.
+
+    Elastic Parameters:
+        - ``--max_restart``: The maximum number of restarts for an elastic job. Default ``--max_restart=3``.
+
+        - ``--elastic_level``: The elastic level: -1: disable, 0: failed exit, peers hold, 1: internal restart. Default ``--elastic_level=-1``.
+
+        - ``--elastic_timeout``: Seconds to wait before the elastic job begins to train. Default ``--elastic_timeout=30``.
+
+
+    Returns:
+        ``None``
+
+    Examples 0 (master, ip/port auto detection):
+
+        # For training on multiple nodes, run the following command on one of the nodes
+
+        python -m paddle.distributed.launch --nnodes 2 train.py
+
+        # Then the following info will be printed
+
+        # Copy the following command to other nodes to run.
+        # --------------------------------------------------------------------------------
+        # python -m paddle.distributed.launch --master 10.0.0.1:38714 --nnodes 2 train.py
+        # --------------------------------------------------------------------------------
+
+        # Following the instruction above and pasting the command on the other nodes launches a multi-node training job.
+
+        # There are two ways to launch a job with the same command for multi-node training
+        # 1) use the following command on every node; make sure the ip belongs to one of the training nodes and the port is available on that node
+        # python -m paddle.distributed.launch --master 10.0.0.1:38714 --nnodes 2 train.py
+        # 2) use the following command on every node with an independent etcd service
+        # python -m paddle.distributed.launch --master etcd://10.0.0.1:2379 --nnodes 2 train.py
+
+        # This works well for both collective and ps mode, even with other arguments.
+
+
+    Examples 1 (collective, single node):
+        .. code-block:: bash
+            :name: code-block-example-bash1
+
+            # For training on a single node using 4 gpus.
+
+            python -m paddle.distributed.launch --devices=0,1,2,3 train.py --lr=0.01
+
+    Examples 2 (collective, multi node):
+        .. code-block:: bash
+            :name: code-block-example-bash2
+
+            # For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17
+
+            # On 192.168.0.16:
+
+            python -m paddle.distributed.launch --devices=0,1,2,3 --master=192.168.0.16:8090 train.py --lr=0.01
+
+            # On 192.168.0.17:
+            python -m paddle.distributed.launch --devices=0,1,2,3 --master=192.168.0.16:8090 train.py --lr=0.01
+
+    Examples 3 (ps, cpu, single node):
+        .. code-block:: bash
+            :name: code-block-example-bash3
+
+            # To simulate a distributed environment using a single node, e.g., 2 servers and 4 workers.
+
+            python -m paddle.distributed.launch --server_num=2 --worker_num=4 train.py --lr=0.01
+
+    Examples 4 (ps, cpu, multi node):
+        .. code-block:: bash
+            :name: code-block-example-bash4
+
+            # For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17 where each node runs 1 server and 2 workers.
+
+            # On 192.168.0.16:
+
+            python -m paddle.distributed.launch --servers="192.168.0.16:6170,192.168.0.17:6170" --workers="192.168.0.16:6171,192.168.0.16:6172,192.168.0.17:6171,192.168.0.17:6172" train.py --lr=0.01
+
+            # On 192.168.0.17:
+
+            python -m paddle.distributed.launch --servers="192.168.0.16:6170,192.168.0.17:6170" --workers="192.168.0.16:6171,192.168.0.16:6172,192.168.0.17:6171,192.168.0.17:6172" train.py --lr=0.01
+
+            # Or with a master, the following command runs 2 servers and 2 trainers on each node.
+
+            python -m paddle.distributed.launch --master 192.168.0.16:9090 --server_num=2 --trainer_num=2 --nnodes 2 train.py
+
+
+    Examples 5 (ps, gpu, single node):
+        ..
code-block:: bash + :name: code-block-example-bash5 + + # To simulate distributed environment using single node, e.g., 2 servers and 4 workers, each worker use single gpu. + + export CUDA_VISIBLE_DEVICES=0,1,2,3 + python -m paddle.distributed.launch --server_num=2 --worker_num=4 train.py --lr=0.01 + + Examples 6 (ps, gpu, multi node): + .. code-block:: bash + :name: code-block-example-bash6 + + # For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17 where each node with 1 server and 2 workers. + + # On 192.168.0.16: + + export CUDA_VISIBLE_DEVICES=0,1 + python -m paddle.distributed.launch --servers="192.168.0.16:6170,192.168.0.17:6170" --workers="192.168.0.16:6171,192.168.0.16:6172,192.168.0.17:6171,192.168.0.17:6172" train.py --lr=0.01 + + # On 192.168.0.17: + + export CUDA_VISIBLE_DEVICES=0,1 + python -m paddle.distributed.launch --servers="192.168.0.16:6170,192.168.0.17:6170" --workers="192.168.0.16:6171,192.168.0.16:6172,192.168.0.17:6171,192.168.0.17:6172" train.py --lr=0.01 + + Examples 7 (ps-heter, cpu + gpu, single node): + .. code-block:: bash + :name: code-block-example-bash7 + + # To simulate distributed environment using single node, e.g., 2 servers and 4 workers, two workers use gpu, two workers use cpu. + + export CUDA_VISIBLE_DEVICES=0,1 + python -m paddle.distributed.launch --server_num=2 --worker_num=2 --heter_worker_num=2 train.py --lr=0.01 + + Examples 8 (ps-heter, cpu + gpu, multi node): + .. code-block:: bash + :name: code-block-example-bash8 + + # For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17 where each node with 1 server, 1 gpu worker, 1 cpu worker. + + # On 192.168.0.16: + + export CUDA_VISIBLE_DEVICES=0 + python -m paddle.distributed.launch --servers="192.168.0.16:6170,192.168.0.17:6170" --workers="192.168.0.16:6171,192.168.0.17:6171" --heter_workers="192.168.0.16:6172,192.168.0.17:6172" train.py --lr=0.01 + + # On 192.168.0.17: + + export CUDA_VISIBLE_DEVICES=0 + python -m paddle.distributed.launch --servers="192.168.0.16:6170,192.168.0.17:6170" --workers="192.168.0.16:6171,192.168.0.17:6171" --heter_workers="192.168.0.16:6172,192.168.0.17:6172" train.py --lr=0.01 + + Examples 9 (elastic): + .. code-block:: bash + :name: code-block-example-bash9 + + # With the following command, the job will begin to run immediately if 4 nodes are ready, + # or it will run after elastic_timeout if only 2 or 3 nodes ready + python -m paddle.distributed.launch --master etcd://10.0.0.1:2379 --nnodes 2:4 train.py + + # once the number of nodes changes between 2:4 during training, the strategy holds + + """ + + # initialize the context to run + ctx = Context() + + if ctx.is_legacy_mode(): + + # legacy mode + from paddle.distributed.fleet import launch + launch.launch() + + else: + + from . import controllers + + # initialize the selected controller + c = controllers.init(ctx) + + # run the pods + c.run() + + # manager or just wait pod + c.finalize() + + +if __name__ == "__main__": + launch() diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py index 5385ac28b90f614fcd6003994b9a7000bc16702a..da66530f81b0a50ad432f72a10eeee354127c53a 100644 --- a/python/paddle/fluid/dataloader/dataloader_iter.py +++ b/python/paddle/fluid/dataloader/dataloader_iter.py @@ -30,6 +30,7 @@ from paddle.fluid.framework import _set_expected_place, _current_expected_place, import queue import paddle +import paddle.profiler as profiler from .. 
import core, layers from ..framework import in_dygraph_mode, _in_eager_mode from ..multiprocess_utils import _set_SIGCHLD_handler, MP_STATUS_CHECK_INTERVAL, CleanupFuncRegistrar @@ -250,6 +251,10 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase): self._exit_thread_expectedly() def __next__(self): + trace_event = profiler.RecordEvent( + name="_DataLoaderIterSingleProcess", + event_type=profiler.TracerEventType.Dataloader) + trace_event.begin() try: if in_dygraph_mode(): if _in_eager_mode(): @@ -283,6 +288,8 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase): self._reader.shutdown() self._try_shutdown_all() six.reraise(*sys.exc_info()) + finally: + trace_event.end() def _shutdown_thread(self): if self._thread: @@ -695,6 +702,10 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): self._try_shutdown_all(1) def __next__(self): + trace_event = profiler.RecordEvent( + name="_DataLoaderIterMultiProcess", + event_type=profiler.TracerEventType.Dataloader) + trace_event.begin() try: # _batches_outstanding here record the total batch data number # in 'from after _try_put_indices to beforeoutput data', this @@ -743,6 +754,8 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): self._reader.shutdown() self._try_shutdown_all() six.reraise(*sys.exc_info()) + finally: + trace_event.end() # python2 compatibility def next(self): diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index f4334085620f510e3d520f89332b754a93aa120a..37db9f8fce77a63773223888c8896822d56ba1e4 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -25,6 +25,7 @@ from copy import deepcopy import inspect import paddle +import paddle.profiler as profiler from . import parallel_helper from .. import unique_name @@ -905,7 +906,9 @@ class Layer(object): self._built = True - outputs = self.forward(*inputs, **kwargs) + with profiler.RecordEvent(self.full_name(), + profiler.TracerEventType.Forward): + outputs = self.forward(*inputs, **kwargs) for forward_post_hook in self._forward_post_hooks.values(): hook_result = forward_post_hook(self, inputs, outputs) diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 5bb1aef6d6e9b96a8492fe9fc76c7448a053e3bf..b41e3e0b502b591fe0c86fa2a48b99402cba68fe 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -2986,6 +2986,12 @@ class GroupNorm(layers.Layer): is_bias=True) def forward(self, input): + if in_dygraph_mode(): + attrs = ('epsilon', self._epsilon, 'groups', self._groups) + out, _, _ = _C_ops.group_norm(input, self.weight, self.bias, *attrs) + + return dygraph_utils._append_activation_in_dygraph(out, self._act) + inputs = {'X': input} if self.bias is not None: inputs['Bias'] = self.bias diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index af30b2b2444b44f1b27e8f277eb380557255517d..2ca923f8634878c7a110dd7fc711459295a42427 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -28,6 +28,7 @@ from .math_op_patch import monkey_patch_math_varbase from .parallel import scale_loss from paddle.fluid.data_feeder import convert_dtype, _PADDLE_DTYPE_2_NUMPY_DTYPE import paddle.utils.deprecated as deprecated +import paddle.profiler as profiler from paddle import _C_ops @@ -199,8 +200,8 @@ def monkey_patch_varbase(): You can clear gradient by ``Tensor.clear_grad()`` . 
Args: - grad_tensor(Tensor, optional): initial gradient values of the current Tensor. If `grad_tensor` is None, - the initial gradient values of the current Tensor would be Tensor filled with 1.0; + grad_tensor(Tensor, optional): initial gradient values of the current Tensor. If `grad_tensor` is None, + the initial gradient values of the current Tensor would be Tensor filled with 1.0; if `grad_tensor` is not None, it must have the same length as the current Tensor. Teh default value is None. @@ -243,6 +244,9 @@ def monkey_patch_varbase(): """ if framework.in_dygraph_mode(): + record_event = profiler.RecordEvent( + "Gradient Backward", profiler.TracerEventType.Backward) + record_event.begin() if grad_tensor is not None: if core._in_eager_mode(): assert isinstance( @@ -278,6 +282,7 @@ def monkey_patch_varbase(): core.dygraph_run_backward([self], [grad_tensor], retain_graph, framework._dygraph_tracer()) + record_event.end() else: raise ValueError( "Variable.backward() is only available in DyGraph mode") @@ -476,7 +481,7 @@ def monkey_patch_varbase(): def grad(self): """ .. warning:: - This API will return the tensor value of the gradient. If you want + This API will return the tensor value of the gradient. If you want to get the numpy value of the gradient, you can use :code:`x.grad.numpy()`. Get the Gradient of Current Tensor. @@ -515,7 +520,7 @@ def monkey_patch_varbase(): def item(self, *args): """ - Convert element at specific position in Tensor into Python scalars. If the position is not specified, the Tensor must be a + Convert element at specific position in Tensor into Python scalars. If the position is not specified, the Tensor must be a single-element Tensor. Args: @@ -526,7 +531,7 @@ def monkey_patch_varbase(): Raises: ValueError: If the Tensor has more than one element, there must be coordinates. - + Examples: .. code-block:: python @@ -588,7 +593,7 @@ def monkey_patch_varbase(): import paddle x = paddle.rand([2, 5]) print(x) - + # Tensor(shape=[2, 5], dtype=float32, place=CPUPlace, # [[0.30574632, 0.55739117, 0.30902600, 0.39413780, 0.44830436], # [0.79010487, 0.53972793, 0.09495186, 0.44267157, 0.72112119]]) @@ -611,7 +616,7 @@ def monkey_patch_varbase(): import copy x = paddle.to_tensor(2.) y = copy.deepcopy(x) - + print(x) # Tensor(shape=[1], dtype=float32, place=CPUPlace, stop_gradient=True, # [2.]) @@ -655,7 +660,7 @@ def monkey_patch_varbase(): def __array__(self, dtype=None): """ Returns a numpy array shows the value of current Tensor. - + Returns: ndarray: The numpy value of current Tensor. @@ -763,8 +768,11 @@ def monkey_patch_varbase(): return _setitem_impl_(self, item, value) else: - # Call c++ func __setitem_varbase__ to speedup. - return self.__setitem_varbase__(item, value) + if core._in_eager_mode(): + return self.__setitem_eager_tensor__(item, value) + else: + # Call c++ func __setitem_varbase__ to speedup. 
+ return self.__setitem_varbase__(item, value) @framework.dygraph_only def _grad_ivar(self): diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py index fff1a2270a5f293cb343d3ec958cdd98e37d8ac4..3226a8a6f016bd97dcb101861a11d7ff0da9c8b7 100755 --- a/python/paddle/fluid/layers/layer_function_generator.py +++ b/python/paddle/fluid/layers/layer_function_generator.py @@ -270,9 +270,10 @@ def generate_activation_fn(op_type): op_type) else: # abs exp square ops support dtype(int32, int64, float16, float32, float64) - check_variable_and_dtype( - x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'], - op_type) + check_variable_and_dtype(x, 'x', [ + 'int32', 'int64', 'float16', 'float32', 'float64', 'complex64', + 'complex128' + ], op_type) helper = LayerHelper(op_type, **locals()) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 221aba9a882e5d92a463fb6cc027a6915e6ca82b..5a4ea27dbcf934e48abc09e23a244371b6801bf5 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -5616,9 +5616,10 @@ def transpose(x, perm, name=None): out, _ = _C_ops.transpose2(x, 'axis', perm) return out - check_variable_and_dtype( - x, 'x', ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], - 'transpose') + check_variable_and_dtype(x, 'x', [ + 'bool', 'float16', 'float32', 'float64', 'int32', 'int64', 'complex64', + 'complex128' + ], 'transpose') check_type(perm, 'perm', (list, tuple), 'transpose') if isinstance(perm, tuple): perm = list(perm) @@ -6410,10 +6411,10 @@ def squeeze(input, axes, name=None): return out helper = LayerHelper("squeeze", **locals()) - check_variable_and_dtype( - input, 'input', - ['float16', 'float32', 'float64', 'bool', 'int8', 'int32', 'int64'], - 'squeeze') + check_variable_and_dtype(input, 'input', [ + 'float16', 'float32', 'float64', 'bool', 'int8', 'int32', 'int64', + 'complex64', 'complex128' + ], 'squeeze') check_type(axes, 'axis/axes', (list, tuple), 'squeeze') out = helper.create_variable_for_type_inference(dtype=input.dtype) x_shape = helper.create_variable_for_type_inference(dtype=input.dtype) @@ -6471,8 +6472,16 @@ def unsqueeze(input, axes, name=None): check_type(axes, 'axis/axes', (int, list, tuple, Variable), 'unsqueeze') check_variable_and_dtype(input, 'input', [ - 'float16', 'float32', 'float64', 'bool', 'int8', 'int16', 'int32', - 'int64' + 'float16', + 'float32', + 'float64', + 'bool', + 'int8', + 'int16', + 'int32', + 'int64', + 'complex64', + 'complex128', ], 'unsqueeze') helper = LayerHelper("unsqueeze2", **locals()) inputs = {"X": input} @@ -11180,8 +11189,8 @@ def slice(input, axes, starts, ends): ends_tensor.stop_gradient = True infer_flags = list(-1 for i in range(len(axes))) - return _C_ops.slice(input, starts_tensor, ends_tensor, 'axes', axes, - 'infer_flags', infer_flags, *attrs) + return _C_ops.slice(input, starts_tensor, ends_tensor, None, None, + 'axes', axes, 'infer_flags', infer_flags, *attrs) if not isinstance(starts, (list, tuple, Variable)): raise ValueError( diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index c63ad42288fd057d8456f31a675c9f1912bdc12f..c5accd9ada8f7db14a1b32fdacb85c3ef9525482 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -632,7 +632,7 @@ def assign(input, output=None): dtype = VarDesc.VarType.FP32 if dtype == VarDesc.VarType.BOOL: value_name = "bool_values" - values = [bool(v) for v in input.flat] + 
values = [int(v) for v in input.flat] elif dtype == VarDesc.VarType.FP32: value_name = "fp32_values" values = [float(v) for v in input.flat] @@ -756,7 +756,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): check_shape(shape) check_dtype(dtype, 'dtype', [ 'bool', 'float16', 'float32', 'float64', 'uint8', 'int16', 'int32', - 'int64' + 'int64', 'complex64', 'complex128' ], 'fill_constant') check_type(shape, 'shape', (Variable, list, tuple), 'fill_constant') diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py index 183a00bd70bdff1ec37767f06a5a3944aa9882e8..4d39d38853063616bced2b76f86c3f8e9b66aa48 100644 --- a/python/paddle/fluid/profiler.py +++ b/python/paddle/fluid/profiler.py @@ -20,6 +20,8 @@ import os import six import sys +from paddle.utils.deprecated import deprecated + __all__ = [ 'cuda_profiler', 'reset_profiler', 'profiler', 'start_profiler', 'stop_profiler' @@ -36,10 +38,16 @@ NVPROF_CONFIG = [ ] +@deprecated( + since="2.3.0", + update_to="paddle.profiler.Profiler", + level=1, + reason="Please use new profiler tool, this profiler tool is no longer maintained." +) @signature_safe_contextmanager def cuda_profiler(output_file, output_mode=None, config=None): """ - API cuda_profiler has been abandoned. If you have relevant requirements, you can use `paddle.utils.profiler.start_profiler` and `paddle.utils.profiler.stop_profiler`. + API cuda_profiler has been abandoned. If you have relevant requirements, you can use `paddle.utils.profiler.start_profiler` and `paddle.utils.profiler.stop_profiler`. The relevant reference documents are as follows: @@ -54,18 +62,18 @@ def cuda_profiler(output_file, output_mode=None, config=None): def npu_profiler(output_file, config=None): """ The NPU profiler. - + This fuctions is used to profile NPU program by NPU runtime application programming interface. The profiling result will be written into - `output_file`. The users can set set the NPU profiling config by `config` argument. - - After getting the profiling result file, users can use - `tools provided by Ascend `_ + `output_file`. The users can set set the NPU profiling config by `config` argument. + + After getting the profiling result file, users can use + `tools provided by Ascend `_ to load this output file to visualize results. Args: output_file (str) : The output file name, the result will be - written into this file. It should be absolute path. + written into this file. It should be absolute path. config (list, optional) : NPU profile config. For more details, please refer to `User Guide `_ . @@ -109,6 +117,12 @@ def npu_profiler(output_file, config=None): core.npu_prof_finalize() +@deprecated( + since="2.3.0", + update_to="paddle.profiler.Profiler", + level=1, + reason="Please use new profiler tool, this profiler tool is no longer maintained." +) def reset_profiler(): """ Clear the previous time record. It works for @@ -131,31 +145,38 @@ def reset_profiler(): core.reset_profiler() +@deprecated( + since="2.3.0", + update_to="paddle.profiler.Profiler", + level=1, + reason="Please use new profiler tool, this profiler tool is no longer maintained." +) def start_profiler(state, tracer_option='Default'): """ Enable the profiler. Uers can use `fluid.profiler.start_profiler` and - `fluid.profiler.stop_profiler` to profile, which is equal to the usage + `fluid.profiler.stop_profiler` to profile, which is equal to the usage of `fluid.profiler.profiler` interface. 
Args: state (str) : The profiling state, which should be one of 'CPU', 'GPU' or 'All'. 'CPU' means only profiling CPU; 'GPU' means profiling - both CPU and GPU; 'All' means profiling both CPU and GPU, and + both CPU and GPU; 'All' means profiling both CPU and GPU, and generates timeline as well. tracer_option (str, optional) : tracer_option can be one of ['Default', 'OpDetail', 'AllOpDetail'], it - can control the profile level and print the different level profile result. `Default` option print - the different Op type profiling result and the `OpDetail` option print the detail profiling - result of different op types such as compute and data transform, `AllOpDetail` option + can control the profile level and print the different level profile result. `Default` option print + the different Op type profiling result and the `OpDetail` option print the detail profiling + result of different op types such as compute and data transform, `AllOpDetail` option print the detail profiling result of different op name same as `OpDetail`. Raises: - ValueError: If `state` is not in ['CPU', 'GPU', 'All'] or `tracer_option` + ValueError: If `state` is not in ['CPU', 'GPU', 'All'] or `tracer_option` is not in ['Default', 'OpDetail', 'AllOpDetail']. Examples: .. code-block:: python + # required: gpu import paddle.fluid as fluid import paddle.fluid.profiler as profiler @@ -165,7 +186,7 @@ def start_profiler(state, tracer_option='Default'): profiler.reset_profiler() # except each iteration profiler.stop_profiler('total', '/tmp/profile') - + profiler.start_profiler('GPU', "OpDetail") for iter in range(10): if iter == 2: @@ -198,14 +219,20 @@ def start_profiler(state, tracer_option='Default'): core.enable_profiler(prof_state) +@deprecated( + since="2.3.0", + update_to="paddle.profiler.Profiler", + level=1, + reason="Please use new profiler tool, this profiler tool is no longer maintained." +) def stop_profiler(sorted_key=None, profile_path='/tmp/profile'): """ Stop the profiler. Uers can use `fluid.profiler.start_profiler` and - `fluid.profiler.stop_profiler` to profile, which is equal to the usage + `fluid.profiler.stop_profiler` to profile, which is equal to the usage of `fluid.profiler.profiler` interface. Args: - sorted_key (str, optional) : The order of profiling results, which + sorted_key (str, optional) : The order of profiling results, which should be one of None, 'calls', 'total', 'max', 'min' or 'ave'. Default is None, means the profiling results will be printed in the order of first end time of events. @@ -214,7 +241,7 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'): The `max` means sorting by the maximum execution time. The `min` means sorting by the minimum execution time. The `ave` means sorting by the average execution time. - and write it into `profile_path`. The default profile_path is `/tmp/profile`. + and write it into `profile_path`. The default profile_path is `/tmp/profile`. profile_path (str, optional) : If state == 'All', it will generate timeline, Raises: @@ -225,6 +252,7 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'): .. code-block:: python + # required: gpu import paddle.fluid as fluid import paddle.fluid.profiler as profiler @@ -254,6 +282,12 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'): core.disable_profiler(key_map[sorted_key], profile_path) +@deprecated( + since="2.3.0", + update_to="paddle.profiler.Profiler", + level=1, + reason="Please use new profiler tool, this profiler tool is no longer maintained." 
+) @signature_safe_contextmanager def profiler(state, sorted_key=None, @@ -265,9 +299,9 @@ def profiler(state, Args: state (str) : The profiling state, which should be one of 'CPU', 'GPU' or 'All'. 'CPU' means only profiling CPU; 'GPU' means profiling - both CPU and GPU; 'All' means profiling both CPU and GPU, and + both CPU and GPU; 'All' means profiling both CPU and GPU, and generates timeline as well. - sorted_key (str, optional) : The order of profiling results, which + sorted_key (str, optional) : The order of profiling results, which should be one of None, 'calls', 'total', 'max', 'min' or 'ave'. Default is None, means the profiling results will be printed in the order of first end time of events. @@ -277,11 +311,11 @@ def profiler(state, The `min` means sorting by the minimum execution time. The `ave` means sorting by the average execution time. profile_path (str, optional) : If state == 'All', it will generate timeline, - and write it into `profile_path`. The default profile_path is `/tmp/profile`. + and write it into `profile_path`. The default profile_path is `/tmp/profile`. tracer_option (str, optional) : tracer_option can be one of ['Default', 'OpDetail', 'AllOpDetail'], it - can control the profile level and print the different level profile result. `Default` option print - the different Op type profiling result and the `OpDetail` option print the detail profiling - result of different op types such as compute and data transform, `AllOpDetail` option + can control the profile level and print the different level profile result. `Default` option print + the different Op type profiling result and the `OpDetail` option print the detail profiling + result of different op types such as compute and data transform, `AllOpDetail` option print the detail profiling result of different op name same as `OpDetail`. Raises: @@ -319,7 +353,7 @@ def profiler(state, #### Examples Results #### #### 1) sorted_key = 'total', 'calls', 'max', 'min', 'ave' #### - # The only difference in 5 sorted_key results is the following sentence: + # The only difference in 5 sorted_key results is the following sentence: # "Sorted by number of xxx in descending order in the same thread." # The reason is that in this example, above 5 columns are already sorted. -------------------------> Profiling Report <------------------------- @@ -339,7 +373,7 @@ def profiler(state, #### 2) sorted_key = None #### # Since the profiling results are printed in the order of first end time of Ops, - # the printed order is feed->conv2d->elementwise_add + # the printed order is feed->conv2d->elementwise_add -------------------------> Profiling Report <------------------------- Place: CPU @@ -366,7 +400,7 @@ def _nvprof_range(iter_id, start, end, exit_after_prof=True): Examples: .. 
code-block:: python - + model = Model() for i in range(max_iter): paddle.fluid.profiler._nvprof_range(i, 10, 20): diff --git a/python/paddle/fluid/tests/unittests/npu/test_assign_value_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_assign_value_op_npu.py index d51976e1a1962d830a9f08021fcd7c3cafc174bf..71d4b45e61b186f4afcc798458af28e17722b9c1 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_assign_value_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_assign_value_op_npu.py @@ -71,7 +71,7 @@ class TestAssignValueNPUOp4(TestAssignValueNPUOp): def init_data(self): self.value = numpy.random.choice( a=[False, True], size=(2, 5)).astype(numpy.bool) - self.attrs["bool_values"] = [bool(v) for v in self.value.flat] + self.attrs["bool_values"] = [int(v) for v in self.value.flat] class TestAssignApi(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_assign_value_op.py b/python/paddle/fluid/tests/unittests/test_assign_value_op.py index adf238c43d21a8a6ca80bf27f57fb6cf96bb0467..2abdbdc5940f72640979716ec5ba09812248a452 100644 --- a/python/paddle/fluid/tests/unittests/test_assign_value_op.py +++ b/python/paddle/fluid/tests/unittests/test_assign_value_op.py @@ -58,7 +58,7 @@ class TestAssignValueOp4(TestAssignValueOp): def init_data(self): self.value = numpy.random.choice( a=[False, True], size=(2, 5)).astype(numpy.bool) - self.attrs["bool_values"] = [bool(v) for v in self.value.flat] + self.attrs["bool_values"] = [int(v) for v in self.value.flat] class TestAssignApi(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py index a43e56b0815a69d5f575df11092c0d1231d07cb1..a86758a9cb92b6ce5ffffb4bdbe4d8cff3014e78 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py @@ -24,6 +24,7 @@ from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16 class ElementwiseDivOp(OpTest): def setUp(self): self.op_type = "elementwise_div" + self.python_api = paddle.divide self.dtype = np.float64 self.init_dtype() """ Warning @@ -37,8 +38,11 @@ class ElementwiseDivOp(OpTest): } self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])} + def check_eager(self): + return (self.use_mkldnn == False and self.axis == -1) + def test_check_output(self): - self.check_output() + self.check_output(check_eager=False) def test_check_grad_normal(self): self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.05) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py index 44d73612b1cb5e0bce1e561ab25884355b083d77..39b79dd4ba26b6b375404d452789758dddeba89d 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py @@ -182,7 +182,7 @@ class TestImperativeAutoPrune(unittest.TestCase): self.func_auto_prune2() # TODO(jiabin): Support this when we support better split tensor - def test_auto_prune3(self): + def func_auto_prune3(self): with fluid.dygraph.guard(): case3 = AutoPruneLayer3(input_size=784) value1 = np.arange(784).reshape(1, 784).astype("float32") @@ -194,7 +194,12 @@ class TestImperativeAutoPrune(unittest.TestCase): self.assertTrue(case3.linear.weight._grad_ivar() is not None) self.assertTrue((part2.gradient() == 0).all()) - def test_auto_prune4(self): + def test_auto_prune3(self): + with 
_test_eager_guard(): + self.func_auto_prune3() + self.func_auto_prune3() + + def func_auto_prune4(self): with fluid.dygraph.guard(): case4 = AutoPruneLayer3(input_size=784) value1 = np.arange(784).reshape(1, 784).astype("float32") @@ -206,7 +211,12 @@ class TestImperativeAutoPrune(unittest.TestCase): self.assertTrue(case4.linear.weight._grad_ivar() is not None) self.assertTrue((part2.gradient() == 1).all()) - def test_auto_prune5(self): + def test_auto_prune4(self): + with _test_eager_guard(): + self.func_auto_prune4() + self.func_auto_prune4() + + def func_auto_prune5(self): with fluid.dygraph.guard(): case4 = AutoPruneLayer3(input_size=784) value1 = np.arange(784).reshape(1, 784).astype("float32") @@ -218,6 +228,11 @@ class TestImperativeAutoPrune(unittest.TestCase): self.assertTrue(case4.linear.weight._grad_ivar() is not None) self.assertTrue((part2.gradient() == 0).all()) + def test_auto_prune5(self): + with _test_eager_guard(): + self.func_auto_prune5() + self.func_auto_prune5() + def func_auto_prune6(self): with fluid.dygraph.guard(): value0 = np.arange(26).reshape(2, 13).astype("float32") diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 36038d656b7736afc94da32c29c56ce61b338cb4..bb244a20bd873d34c6f01a4ec5a8b87018d71668 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -1819,7 +1819,7 @@ class TestLayer(LayerTest): self.assertTrue(np.allclose(static_ret, static_ret2)) - def test_group_norm(self): + def func_group_norm(self): if core.is_compiled_with_cuda(): place = core.CUDAPlace(0) else: @@ -1873,7 +1873,6 @@ class TestLayer(LayerTest): with_lod=True)[0] with self.dynamic_graph(): - # TODO(wuweilong): Add with _test_eager_guard(): groupNorm = nn.GroupNorm( channels=shape[1], groups=2, @@ -1886,6 +1885,11 @@ class TestLayer(LayerTest): self.assertTrue(np.allclose(static_ret, dy_rlt_value)) self.assertTrue(np.allclose(static_ret, static_ret2)) + def test_group_norm(self): + with _test_eager_guard(): + self.func_group_norm() + self.func_group_norm() + def test_instance_norm(self): if core.is_compiled_with_cuda(): place = core.CUDAPlace(0) @@ -2348,7 +2352,7 @@ class TestLayer(LayerTest): with self.assertRaises(TypeError): layers.eye(num_rows=3, batch_shape=[-1]) - def test_while_loop(self): + def func_while_loop(self): with self.static_graph(): i = layers.fill_constant(shape=[1], dtype='int64', value=0) ten = layers.fill_constant(shape=[1], dtype='int64', value=10) @@ -2363,7 +2367,6 @@ class TestLayer(LayerTest): static_ret = self.get_static_graph_result(feed={}, fetch_list=out) with self.dynamic_graph(): - # TODO(wuweilong): Add with _test_eager_guard(): i = layers.fill_constant(shape=[1], dtype='int64', value=0) ten = layers.fill_constant(shape=[1], dtype='int64', value=10) @@ -2384,6 +2387,11 @@ class TestLayer(LayerTest): self.assertTrue(np.array_equal(static_ret[0], dy_ret[0].numpy())) + def test_while_loop(self): + with _test_eager_guard(): + self.func_while_loop() + self.func_while_loop() + def test_compare(self): value_a = np.arange(3) value_b = np.arange(3) diff --git a/python/paddle/fluid/tests/unittests/test_multinomial_op.py b/python/paddle/fluid/tests/unittests/test_multinomial_op.py index 81e8a168d98b3dc24d7b24cca4598edf1c36e2ca..d02f961c0d5d5b66d62a3fdf9e79b3960ab03cb0 100644 --- a/python/paddle/fluid/tests/unittests/test_multinomial_op.py +++ b/python/paddle/fluid/tests/unittests/test_multinomial_op.py @@ -21,6 +21,7 @@ from 
paddle.fluid import core from op_test import OpTest import numpy as np from paddle.fluid.framework import _test_eager_guard +import os def sample_output_one_dimension(out, dim): @@ -250,6 +251,60 @@ class TestMultinomialError(unittest.TestCase): self.assertRaises(ValueError, test_dim_less_than_1) +class TestRandomValue(unittest.TestCase): + def test_fixed_random_number(self): + # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t' + if not paddle.is_compiled_with_cuda(): + return + + # Different GPU generatte different random value. Only test V100 here. + if not "V100" in paddle.device.cuda.get_device_name(): + return + + if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None): + return + + print("Test Fixed Random number on V100 GPU------>") + paddle.disable_static() + paddle.set_device('gpu') + paddle.seed(100) + + x = paddle.randint(0, 100, [1024, 10000]).astype('float32') + y = paddle.multinomial(x, 1, replacement=False).numpy() + self.assertEqual(np.sum(y), 5187793) + self.assertEqual(np.mean(y), 5066.2041015625) + expect = [9982, 1655, 4741, 1323, 9319, 3298, 6473, 7477, 2507, 2628] + self.assertTrue(np.array_equal(y[100:110, :].flatten(), expect)) + + y = paddle.multinomial(x, 5000, replacement=False).numpy() + self.assertEqual(np.sum(y), 25603962316) + self.assertEqual(np.mean(y), 5000.77388984375) + expect = [7300, 6055, 8714, 5401, 7360, 161, 5035, 7002, 6788, 2916] + self.assertTrue(np.array_equal(y[100, 1000:1010], expect)) + + y = paddle.multinomial(x, 5000, replacement=False).numpy() + self.assertEqual(np.sum(y), 25592855710) + self.assertEqual(np.mean(y), 4998.604630859375) + expect = [5700, 6567, 4399, 5688, 7472, 545, 6894, 526, 2124, 385] + self.assertTrue(np.array_equal(y[300, 3000:3010], expect)) + + y = paddle.multinomial(x, 20000, replacement=True).numpy() + self.assertEqual(np.sum(y), 102371362581) + self.assertEqual(np.mean(y), 4998.60168852539) + self.assertEqual(np.std(y), 2886.316308500771) + expect = [7630, 8235, 8445, 3275, 5580, 4591, 1331, 342, 1662, 7156] + self.assertTrue(np.array_equal(y[100, 0:10], expect)) + + y = paddle.multinomial(x, 20000, replacement=True).numpy() + self.assertEqual(np.sum(y), 102400672117) + self.assertEqual(np.mean(y), 5000.032818212891) + self.assertEqual(np.std(y), 2886.913426124017) + expect = [4159, 7849, 9305, 5759, 4422, 122, 345, 2897, 5200, 5911] + self.assertTrue(np.array_equal(y[100, 0:10], expect)) + + paddle.enable_static() + + if __name__ == "__main__": paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_profiler_statistic.py b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py index 838ccae37cfa5fb7dbdedcb5d39655cb62ad429f..73b501c9c7eade28e94281b6d07ce21140b72c53 100644 --- a/python/paddle/fluid/tests/unittests/test_profiler_statistic.py +++ b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py @@ -56,7 +56,15 @@ class TestProfilerStatistic(unittest.TestCase): mobilenet_node = HostPythonNode( 'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001) yolonet_node = HostPythonNode( - 'Yolov3Net', profiler.TracerEventType.Forward, 50, 100, 1000, 1001) + 'Yolov3Net', profiler.TracerEventType.Forward, 50, 110, 1000, 1001) + + userdefined_node = HostPythonNode('Communication Time', + profiler.TracerEventType.UserDefined, + 100, 110, 1000, 1001) + + communication_node = HostPythonNode( + 'Communication', profiler.TracerEventType.Communication, 105, 110, + 1000, 1001) backward_node = HostPythonNode('Gradient 
Backward', profiler.TracerEventType.Backward, 120, 200, 1000, 1001) @@ -114,7 +122,9 @@ class TestProfilerStatistic(unittest.TestCase): optimization_node ]) mobilenet_node.children_node.append(conv2d_node) - yolonet_node.children_node.append(sync_batch_norm_node) + yolonet_node.children_node.extend( + [sync_batch_norm_node, userdefined_node]) + userdefined_node.children_node.append(communication_node) conv2d_node.children_node.extend( [conv2d_infer_shape, conv2d_compute, conv2d_MemCpy]) conv2d_compute.runtime_node.append(conv2d_launchkernel) @@ -145,7 +155,7 @@ class TestProfilerStatistic(unittest.TestCase): profiler.TracerEventType.ProfileStep), 400) self.assertEqual( time_range_summary.get_cpu_range_sum( - profiler.TracerEventType.Forward), 90) + profiler.TracerEventType.Forward), 100) self.assertEqual( time_range_summary.get_cpu_range_sum( profiler.TracerEventType.Backward), 80) @@ -169,15 +179,18 @@ class TestProfilerStatistic(unittest.TestCase): 0, profiler.TracerEventType.Memcpy), 60) self.assertEqual( time_range_summary.get_cpu_range_sum( - profiler.TracerEventType.UserDefined), 15) + profiler.TracerEventType.UserDefined), 25) + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.Communication), 5) self.assertEqual(len(event_summary.items), 2) - self.assertEqual(len(event_summary.userdefined_items), 0) + self.assertEqual(len(event_summary.userdefined_items), 1) self.assertEqual(len(event_summary.model_perspective_items), 3) self.assertEqual(len(event_summary.memory_manipulation_items), 1) self.assertEqual(event_summary.items['conv2d'].cpu_time, 15) self.assertEqual(event_summary.items['conv2d'].gpu_time, 25) self.assertEqual( - event_summary.model_perspective_items['Forward'].cpu_time, 90) + event_summary.model_perspective_items['Forward'].cpu_time, 100) self.assertEqual( event_summary.model_perspective_items['Forward'].gpu_time, 135) self.assertEqual( diff --git a/python/paddle/fluid/tests/unittests/test_run.py b/python/paddle/fluid/tests/unittests/test_run.py index 498aecf7c6e75cbb3dd1e73e4aae6151ed65f0db..a2f12fbf5809ba9f026b4160754e850f96182df6 100644 --- a/python/paddle/fluid/tests/unittests/test_run.py +++ b/python/paddle/fluid/tests/unittests/test_run.py @@ -116,7 +116,7 @@ class PS_Test(unittest.TestCase): return proc def test_ps_1(self): - args = "--mode ps" + args = "--run_mode ps" p = self.pdrun(args) p.wait() self.assertTrue(p.poll() == 0) diff --git a/python/paddle/fluid/tests/unittests/test_set_value_op.py b/python/paddle/fluid/tests/unittests/test_set_value_op.py index 42225468bc41c80bb9f095f3742a24fadfa045f4..f7b145d358ec9d725447d07b133c5b06bb34da56 100644 --- a/python/paddle/fluid/tests/unittests/test_set_value_op.py +++ b/python/paddle/fluid/tests/unittests/test_set_value_op.py @@ -22,6 +22,7 @@ import numpy as np import paddle from paddle.fluid.layer_helper import LayerHelper from functools import reduce +from paddle.fluid.framework import _test_eager_guard, _in_eager_mode class TestSetValueBase(unittest.TestCase): @@ -69,7 +70,7 @@ class TestSetValueApi(TestSetValueBase): paddle.enable_static() return out - def test_api(self): + def func_test_api(self): static_out = self._run_static() dynamic_out = self._run_dynamic() self._get_answer() @@ -82,6 +83,11 @@ class TestSetValueApi(TestSetValueBase): (self.data == dynamic_out).all(), msg=error_msg.format("dynamic", self.data, dynamic_out)) + def test_api(self): + with _test_eager_guard(): + self.func_test_api() + self.func_test_api() + # 1. 
Test different type of item: int, Python slice, Paddle Tensor # 1.1 item is int @@ -995,9 +1001,9 @@ class TestBackward(unittest.TestCase): fetch_list=[var.name + "@GRAD", z.name + "@GRAD"]) self.assertTrue((var_grad == z_grad[0, :]).all()) - - def test_dynamic(self): paddle.disable_static() + + def func_test_dynamic(self): model = Model() x = paddle.ones([1, 12, 3, 3]).astype("float32") y = paddle.ones([1, 12, 3, 3]).astype("float32") @@ -1006,11 +1012,18 @@ class TestBackward(unittest.TestCase): self.assertTrue(var.grad.shape == x.grad[0, :, 0, 0].shape) # - self.assertTrue((0 == x.grad[0, :, 0, 0]).all()) + # TODO(pangyoki) add inplace and delete if + if not _in_eager_mode(): + self.assertTrue((0 == x.grad[0, :, 0, 0]).all()) + + def test_dynamic(self): + with _test_eager_guard(): + self.func_test_dynamic() + self.func_test_dynamic() class TestGradientTruncated(unittest.TestCase): - def test_consistent_with_competitor(self): + def func_test_consistent_with_competitor(self): paddle.disable_static() def set_value(t, value): @@ -1182,6 +1195,11 @@ class TestGradientTruncated(unittest.TestCase): self.assertTrue(~x.stop_gradient) self.assertTrue(~x.is_leaf) + def test_consistent_with_competitor(self): + with _test_eager_guard(): + self.func_test_consistent_with_competitor() + self.func_test_consistent_with_competitor() + def test_static_graph(self): paddle.enable_static() @@ -1328,6 +1346,7 @@ class TestGradientTruncated(unittest.TestCase): self.assertTrue((numel(out1[0][0:5:3].shape) == out3[0]).all()) array = array[0] + paddle.disable_static() class TestSetValueInplace(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_stft_op.py b/python/paddle/fluid/tests/unittests/test_stft_op.py new file mode 100644 index 0000000000000000000000000000000000000000..64b8084a1651f156dfdd12606df81e69dfa256ec --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_stft_op.py @@ -0,0 +1,84 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
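# --- Illustrative sketch, not part of the patch ---------------------------------
# The test-suite changes above all follow one refactoring pattern: the original
# test body is moved into a func_* method, and the test_* wrapper runs it twice,
# once inside _test_eager_guard() (eager mode) and once more in legacy dygraph
# mode. The minimal test case below is hypothetical and only demonstrates that
# pattern; it is not taken from the patch.
import unittest

import paddle
from paddle.fluid.framework import _test_eager_guard


class TestEagerGuardPattern(unittest.TestCase):
    def func_add(self):
        # The body is written once and reused for both execution modes.
        x = paddle.to_tensor([1.0, 2.0])
        y = paddle.to_tensor([3.0, 4.0])
        self.assertEqual((x + y).numpy().tolist(), [4.0, 6.0])

    def test_add(self):
        with _test_eager_guard():
            self.func_add()  # first pass: eager mode
        self.func_add()      # second pass: legacy dygraph mode


if __name__ == "__main__":
    unittest.main()
# ---------------------------------------------------------------------------------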
+ +import numpy as np +from numpy.lib.stride_tricks import as_strided +import paddle +import unittest + +from op_test import OpTest + + +def frame_from_librosa(x, frame_length, hop_length, axis=-1): + if axis == -1 and not x.flags["C_CONTIGUOUS"]: + x = np.ascontiguousarray(x) + elif axis == 0 and not x.flags["F_CONTIGUOUS"]: + x = np.asfortranarray(x) + + n_frames = 1 + (x.shape[axis] - frame_length) // hop_length + strides = np.asarray(x.strides) + + if axis == -1: + shape = list(x.shape)[:-1] + [frame_length, n_frames] + strides = list(strides) + [hop_length * x.itemsize] + + elif axis == 0: + shape = [n_frames, frame_length] + list(x.shape)[1:] + strides = [hop_length * x.itemsize] + list(strides) + + else: + raise ValueError("Frame axis={} must be either 0 or -1".format(axis)) + + return as_strided(x, shape=shape, strides=strides) + + +def stft_np(x, n_fft, hop_length, **kwargs): + frames = frame_from_librosa(x, n_fft, hop_length) + res = np.fft.rfft(frames, axis=1) + return res + + +class TestStftOp(OpTest): + def setUp(self): + self.op_type = "stft" + self.shape, self.type, self.attrs = self.initTestCase() + self.inputs = { + 'X': np.random.random(size=self.shape).astype(self.type), + } + self.outputs = {'Out': stft_np(x=self.inputs['X'], **self.attrs)} + + def initTestCase(self): + input_shape = (2, 100) + input_type = 'float64' + attrs = { + 'n_fft': 50, + 'hop_length': 15, + 'normalized': False, + 'onesided': True, + } + return input_shape, input_type, attrs + + def test_check_output(self): + paddle.enable_static() + self.check_output() + paddle.disable_static() + + def test_check_grad_normal(self): + paddle.enable_static() + self.check_grad(['X'], 'Out') + paddle.disable_static() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py index 57a7f94bedce9fb3cd9981e6ae21f6d902fd04d9..4b3e935426f9f1d79b91b7301b5cd6725960fb54 100644 --- a/python/paddle/fluid/tests/unittests/test_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_var_base.py @@ -22,6 +22,7 @@ import copy import paddle import paddle.fluid as fluid import paddle.fluid.core as core +from paddle.fluid.framework import _test_eager_guard, _in_eager_mode class TestVarBase(unittest.TestCase): @@ -874,7 +875,7 @@ class TestVarBase(unittest.TestCase): col = np.array([2, 1, 3]) self.assertTrue(np.array_equal(array[row, col], x[row, col].numpy())) - def test_slice(self): + def func_test_slice(self): with fluid.dygraph.guard(): self._test_slice() self._test_slice_for_tensor_attr() @@ -899,6 +900,11 @@ class TestVarBase(unittest.TestCase): mask = np.array([1, 0, 1, 0], dtype=bool) var[paddle.to_tensor([0, 1]), mask] + def test_slice(self): + with _test_eager_guard(): + self.func_test_slice() + self.func_test_slice() + def test_var_base_to_np(self): with fluid.dygraph.guard(): var = fluid.dygraph.to_variable(self.array) @@ -1125,7 +1131,6 @@ class TestVarBase(unittest.TestCase): class TestVarBaseSetitem(unittest.TestCase): def setUp(self): - paddle.disable_static() self.set_dtype() self.tensor_x = paddle.to_tensor(np.ones((4, 2, 3)).astype(self.dtype)) self.np_value = np.random.random((2, 3)).astype(self.dtype) @@ -1135,12 +1140,13 @@ class TestVarBaseSetitem(unittest.TestCase): self.dtype = "int32" def _test(self, value): - paddle.disable_static() - self.assertEqual(self.tensor_x.inplace_version, 0) + if not _in_eager_mode(): + self.assertEqual(self.tensor_x.inplace_version, 0) id_origin = id(self.tensor_x) 
self.tensor_x[0] = value - self.assertEqual(self.tensor_x.inplace_version, 1) + if not _in_eager_mode(): + self.assertEqual(self.tensor_x.inplace_version, 1) if isinstance(value, (six.integer_types, float)): result = np.zeros((2, 3)).astype(self.dtype) + value @@ -1152,27 +1158,47 @@ class TestVarBaseSetitem(unittest.TestCase): self.assertEqual(id_origin, id(self.tensor_x)) self.tensor_x[1:2] = value - self.assertEqual(self.tensor_x.inplace_version, 2) + if not _in_eager_mode(): + self.assertEqual(self.tensor_x.inplace_version, 2) self.assertTrue(np.array_equal(self.tensor_x[1].numpy(), result)) self.assertEqual(id_origin, id(self.tensor_x)) self.tensor_x[...] = value - self.assertEqual(self.tensor_x.inplace_version, 3) + if not _in_eager_mode(): + self.assertEqual(self.tensor_x.inplace_version, 3) self.assertTrue(np.array_equal(self.tensor_x[3].numpy(), result)) self.assertEqual(id_origin, id(self.tensor_x)) - def test_value_tensor(self): - paddle.disable_static() + def func_test_value_tensor(self): self._test(self.tensor_value) - def test_value_numpy(self): - paddle.disable_static() + def test_value_tensor(self): + with _test_eager_guard(): + self.setUp() + self.func_test_value_tensor() + self.setUp() + self.func_test_value_tensor() + + def func_test_value_numpy(self): self._test(self.np_value) - def test_value_int(self): - paddle.disable_static() + def test_value_numpy(self): + with _test_eager_guard(): + self.setUp() + self.func_test_value_numpy() + self.setUp() + self.func_test_value_numpy() + + def func_test_value_int(self): self._test(10) + def test_value_int(self): + with _test_eager_guard(): + self.setUp() + self.func_test_value_int() + self.setUp() + self.func_test_value_int() + class TestVarBaseSetitemInt64(TestVarBaseSetitem): def set_dtype(self): diff --git a/python/paddle/fluid/variable_index.py b/python/paddle/fluid/variable_index.py index f3763cb447f39688409b141a8137d89e3a7f236a..1c7e4fb5f1ad04b4216f6c8ded1ebaa9a46e3134 100644 --- a/python/paddle/fluid/variable_index.py +++ b/python/paddle/fluid/variable_index.py @@ -382,7 +382,7 @@ def _getitem_impl_(var, item): idx = assign(np.array(slice_item).astype("int32")) return index_select(var, index=idx, axis=0) - elif isinstance(slice_item, (Variable)): + elif isinstance(slice_item, (Variable, core.eager.Tensor)): if len(item) == 1: from ..tensor import index_select, gather_nd @@ -636,7 +636,7 @@ def _setitem_impl_(var, item, value): shape = list(value.shape) if dtype == core.VarDesc.VarType.BOOL: value_name = "bool_values" - values = [bool(v) for v in value.flat] + values = [int(v) for v in value.flat] elif dtype == core.VarDesc.VarType.FP32: value_name = "fp32_values" values = [float(v) for v in value.flat] @@ -657,7 +657,7 @@ def _setitem_impl_(var, item, value): attrs[value_name] = values attrs["shape"] = shape - elif isinstance(value, Variable): + elif isinstance(value, (Variable, core.eager.Tensor)): inputs["ValueTensor"] = value else: raise TypeError( @@ -665,7 +665,9 @@ def _setitem_impl_(var, item, value): "paddle.Tensor to a paddle.Tensor, but received {}".format( type(value))) - if paddle.fluid.framework.in_dygraph_mode(): + if paddle.fluid.framework.in_dygraph_mode( + ) and not paddle.fluid.framework._in_eager_mode(): + # TODO(pangyoki) add inplace(BumpInplaceVersion) if need var._bump_inplace_version() cur_block = default_main_program().current_block() diff --git a/python/paddle/profiler/__init__.py b/python/paddle/profiler/__init__.py index 
4999e703f2a5a31be2cd5c20b70bc7b9dfb7e60a..ae190b8a7846cd3c0d765f1831914df2ab98c77f 100644 --- a/python/paddle/profiler/__init__.py +++ b/python/paddle/profiler/__init__.py @@ -20,7 +20,7 @@ from .utils import RecordEvent, load_profiler_result from .profiler_statistic import SortedKeys __all__ = [ - 'ProfilerState', 'ProfilerTarget', 'TracerEventType', 'make_scheduler', + 'ProfilerState', 'ProfilerTarget', 'make_scheduler', 'export_chrome_tracing', 'export_protobuf', 'Profiler', 'RecordEvent', 'load_profiler_result', 'SortedKeys' ] diff --git a/python/paddle/profiler/profiler.py b/python/paddle/profiler/profiler.py index dc637bf983046b8025962257744b0e1bb4763b4b..efbe88583b776d623b757628998e583ac65f6179 100644 --- a/python/paddle/profiler/profiler.py +++ b/python/paddle/profiler/profiler.py @@ -1,11 +1,11 @@ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -24,7 +24,7 @@ from paddle.fluid.core import (_Profiler, _ProfilerResult, ProfilerOptions, TracerEventType) from .utils import RecordEvent, wrap_optimizers -from .profiler_statistic import SortedKeys +from .profiler_statistic import StatisticData, _build_table, SortedKeys class ProfilerState(Enum): @@ -32,21 +32,28 @@ class ProfilerState(Enum): Profiler state that can be specified to control profiler action. CLOSED: The profilers are closed. + READY: The profilers are open, but the data will not be recorded. - This state is used for reducing overhead influence when profilers start. + This state is used for reducing overhead influence when profilers start. + RECORD: The profilers are open, and the data will be recorded. - RECORD_AND_RETURN: The profilers are open, and at the last batch of current profiler period, - the collected data will be returned. + + RECORD_AND_RETURN: The profilers are open, and at the last batch of current profiler period, + the collected data will be returned. """ CLOSED = 0 READY = 1 RECORD = 2 - RECORD_AND_RETURN = 3 # the last step of RECORD + RECORD_AND_RETURN = 3 # the last step of RECORD class ProfilerTarget(Enum): r""" Target device for profiling. + + CPU: Profile events on CPU. + + GPU: Profile events on GPU. """ CPU = 0 GPU = 1 @@ -62,17 +69,19 @@ def make_scheduler(*, Return a scheduler function, which scheduler the state according to the setting. The state transform confirms to: - (CLOSED) (CLOSED) (CLOSED) (READY) (RECORD,last RETURN) (CLOSED) - START -> skip_first -> closed -> ready -> record -> END - | | - | | (if has_repeated < repeat) - - - - - - - - - - - - - - Note that repeat <= 0 means the cycle will continue until the profiler exits. + .. code-block:: text + + (CLOSED) (CLOSED) (CLOSED) (READY) (RECORD,last RETURN) (CLOSED) + START -> skip_first -> closed -> ready -> record -> END + | | + | | (if has_repeated < repeat) + - - - - - - - - - - - - + Note that repeat <= 0 means the cycle will continue until the profiler exits. Parameters: closed(int): The number of steps in state ProfilerState.CLOSED. - ready(int): The number of steps in state ProfilerState.READY. - record(int): The number of steps in state ProfilerState.RECORD. 
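# --- Illustrative sketch, not part of the patch ---------------------------------
# The CLOSED -> READY -> RECORD (-> RECORD_AND_RETURN) cycle documented above can
# be read as a plain function from a step number to a ProfilerState. The function
# below is a simplified, hypothetical re-implementation for explanation only (the
# `repeat` limit is ignored); it is not the scheduler that make_scheduler returns.
from paddle.profiler import ProfilerState


def toy_scheduler(step, closed=1, ready=1, record=4, skip_first=1):
    if step < skip_first:
        return ProfilerState.CLOSED            # warm-up steps are dropped
    offset = (step - skip_first) % (closed + ready + record)
    if offset < closed:
        return ProfilerState.CLOSED            # tracing off
    if offset < closed + ready:
        return ProfilerState.READY             # tracing on, data discarded
    if offset < closed + ready + record - 1:
        return ProfilerState.RECORD            # tracing on, data kept
    return ProfilerState.RECORD_AND_RETURN     # last record step hands data back


# With the defaults above (skip_first=1) this reproduces the documented range
# [3, 6]: step 0 skipped, step 1 CLOSED, step 2 READY, steps 3-5 RECORD,
# step 6 RECORD_AND_RETURN, then the cycle starts again.
# ---------------------------------------------------------------------------------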
+ ready(int): The number of steps in state ProfilerState.READY. + record(int): The number of steps in state ProfilerState.RECORD. repeat(int): The number of cycles to repeat above state transform. skip_first(int): The number of first steps to drop, not participate in the state transform. @@ -81,13 +90,23 @@ def make_scheduler(*, Examples: 1. profiling range [2, 5] + batch 0: closed, batch 1: ready, batch [2, 5] record - .. code-block:: python - make_scheduler(closed=1, ready=1, record=4, repeat=1) + + .. code-block:: python + + import paddle.profiler as profiler + profiler.make_scheduler(closed=1, ready=1, record=4, repeat=1) + + 2. profiling range [3,6], [9,12], [15,18]... + batch 0: skiped, batch 1: closed, batch 2: ready, batch [3,6]: record, repeat - .. code-block:: python - make_scheduler(closed=1, ready=1, record=4, skip_first=1) + + .. code-block:: python + + import paddle.profiler as profiler + profiler.make_scheduler(closed=1, ready=1, record=4, skip_first=1) """ def getScheduleState(step: int) -> ProfilerState: @@ -138,15 +157,16 @@ def export_chrome_tracing(dir_name: str, Examples: .. code-block:: python - import paddle.profiler as profiler - with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU, - profiler.ProfilerTarget.GPU], - scheduler = (3, 10), - on_trace_ready = profiler.export_chrome_tracing('./log') - ) as p: - for iter in range(N): - train() - p.step() + + # required: gpu + import paddle.profiler as profiler + with profiler.Profiler( + targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU], + scheduler = (3, 10), + on_trace_ready=profiler.export_protobuf('./log')) as p: + for iter in range(10): + #train() + p.step() """ if not os.path.exists(dir_name): try: @@ -181,15 +201,16 @@ def export_protobuf(dir_name: str, worker_name: Optional[str]=None) -> Callable: Examples: .. code-block:: python - import paddle.profiler as profiler - with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU, - profiler.ProfilerTarget.GPU], - scheduler = (3, 10), - on_trace_ready = profiler.export_protobuf('./log') - ) as p: - for iter in range(N): - train() - p.step() + + # required: gpu + import paddle.profiler as profiler + with profiler.Profiler( + targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU], + scheduler = (3, 10), + on_trace_ready = profiler.export_protobuf('./log')) as p: + for iter in range(10): + #train() + p.step() """ if not os.path.exists(dir_name): try: @@ -216,7 +237,7 @@ def _get_supported_targets() -> Iterable[ProfilerTarget]: r""" Get the current supported profiler target in the system. """ - if paddle.device.is_compiled_with_cuda(): + if _Profiler.is_cupti_supported(): return [ProfilerTarget.CPU, ProfilerTarget.GPU] return [ProfilerTarget.CPU] @@ -226,48 +247,56 @@ class Profiler: Profiler context manager, user interface to manage profile process. Parameters: - targets (iterable): list of tracing targets, currently supported values: - ``paddle.profiler.ProfilerTarget.CPU``, - ``paddle.profiler.ProfilerTarget.GPU``. - scheduler (callable or tuple): If it is a callable object, it takes a step number as parameter and return the corresponding ``ProfilerState``. - If not provided, the default sheduler will keep tracing until the profiler exits. If it is a tuple, it has two values start_batch and end_batch, + targets (iterable): list of tracing targets, currently supported values, ``ProfilerTarget.CPU``, ``ProfilerTarget.GPU`` . 
+ scheduler (callable or tuple): If it is a callable object, it takes a step number as parameter and return the corresponding ``ProfilerState``. + If not provided, the default scheduler will keep tracing until the profiler exits. If it is a tuple, it has two values start_batch and end_batch, which means profiling range [start_batch, end_batch). on_trace_ready (callable): callable object, takes the Profiler object as parameter, which provides a way for users to do post-processing. - This callable object will be called when ``sheduler`` returns ``ProfilerState.RECORD_AND_RETURN``. - + This callable object will be called when ``scheduler`` returns ``ProfilerState.RECORD_AND_RETURN``. + Examples: 1. profiling range [2, 5) - .. code-block:: python - import paddle.profiler as profiler - with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU, - profiler.ProfilerTarget.GPU], - scheduler = (2, 5), - on_trace_ready = profiler.export_chrome_tracing('./log') - ) as p: - for iter in range(N): - train() - p.step() + + .. code-block:: python + + # required: gpu + import paddle.profiler as profiler + with profiler.Profiler( + targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU], + scheduler = (2, 5), + on_trace_ready = profiler.export_chrome_tracing('./log')) as p: + for iter in range(10): + #train() + p.step() + 2. profiling range [2,4], [7, 9], [11,13] - .. code-block:: python - import paddle.profiler as profiler - with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU, - profiler.ProfilerTarget.GPU], - scheduler = profiler.make_scheduler(closed=1, ready=1, record=3, repeat=3), - on_trace_ready = profiler.export_chrome_tracing('./log') - ) as p: - for iter in range(N): - train() - p.step() + + .. code-block:: python + + # required: gpu + import paddle.profiler as profiler + with profiler.Profiler( + targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU], + scheduler = profiler.make_scheduler(closed=1, ready=1, record=3, repeat=3), + on_trace_ready = profiler.export_chrome_tracing('./log')) as p: + for iter in range(10): + #train() + p.step() + 3. Use profiler without context manager, and use default parameters - .. code-block:: python - import paddle.profiler as profiler - p = profiler.Profiler() - p.start() - for iter in range(N): - train() - p.step() - p.stop() - p.summary() + + .. code-block:: python + + # required: gpu + import paddle.profiler as profiler + p = profiler.Profiler() + p.start() + for iter in range(10): + #train() + p.step() + p.stop() + p.summary() + """ def __init__( @@ -334,7 +363,22 @@ class Profiler: def start(self): r''' Start profiler and enter the first profiler step(0). - State transformed from CLOSED to self.current_state and trigger corresponding action. + State transformed from CLOSED to self.current_state and trigger corresponding action. + + Examples: + .. code-block:: python + + # required: gpu + import paddle.profiler as profiler + prof = profiler.Profiler( + targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU], + scheduler = (1, 9), + on_trace_ready = profiler.export_chrome_tracing('./log')) + prof.start() + for iter in range(10): + #train() + prof.step() + prof.stop() ''' # CLOSED -> self.current_state if self.current_state == ProfilerState.READY: @@ -354,6 +398,21 @@ class Profiler: r''' Stop profiler and State transformed from self.current_state to CLOSED. Trigger corresponding action and post-process profiler result using self.on_trace_ready if result exists. + + Examples: + .. 
code-block:: python + + # required: gpu + import paddle.profiler as profiler + prof = profiler.Profiler( + targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU], + scheduler = (1, 7), + on_trace_ready = profiler.export_chrome_tracing('./log')) + prof.start() + for iter in range(10): + #train() + prof.step() + prof.stop() ''' # self.current_state -> CLOSED # In this situation, RECORD state is regarded as RECORD_AND_RETURN @@ -375,6 +434,22 @@ class Profiler: r""" Signals the profiler that the next profiling step has started. Get the new ProfilerState and trigger corresponding action. + + Examples: + .. code-block:: python + + # required: gpu + import paddle.profiler as profiler + prof = profiler.Profiler( + targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU], + scheduler = (3, 7), + on_trace_ready = profiler.export_chrome_tracing('./log')) + + prof.start() + for iter in range(10): + #train() + prof.step() + prof.stop() """ if self.record_event: self.record_event.end() @@ -448,6 +523,21 @@ class Profiler: def export(self, path="", format="json"): r""" Exports the tracing data in Chrome tracing data format. + + Examples: + .. code-block:: python + + # required: gpu + import paddle.profiler as profiler + prof = profiler.Profiler( + targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU], + scheduler = (3, 7)) + prof.start() + for iter in range(10): + #train() + prof.step() + prof.stop() + prof.export(path="./profiler_data.json", format="json") """ if self.profiler_result: self.profiler_result.save(path, format) @@ -461,9 +551,35 @@ class Profiler: Print the Summary table. Parameters: - sorted_by: how to rank the op table items. - detail: expand each operator detail information. - thread_sep: print op table each thread. - time_unit: can be chosen form ['s', 'ms', 'us', 'ns'] + sorted_by(SortedKeys): how to rank the op table items. + op_detail(bool): expand each operator detail information. + thread_sep(bool): print op table each thread. + time_unit(str): can be chosen form ['s', 'ms', 'us', 'ns'] + + Examples: + .. code-block:: python + + # required: gpu + import paddle.profiler as profiler + prof = profiler.Profiler( + targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU], + scheduler = (3, 7), + on_trace_ready = profiler.export_chrome_tracing('./log')) + prof.start() + for iter in range(10): + #train() + prof.step() + prof.stop() + prof.summary(sorted_by=profiler.SortedKeys.CPUTotal, op_detail=True, thread_sep=False, time_unit='ms') """ - pass + if self.profiler_result: + statistic_data = StatisticData( + self.profiler_result.get_data(), + self.profiler_result.get_extra_info()) + print( + _build_table( + statistic_data, + sorted_by=sorted_by, + op_detail=op_detail, + thread_sep=thread_sep, + time_unit=time_unit)) diff --git a/python/paddle/profiler/profiler_statistic.py b/python/paddle/profiler/profiler_statistic.py index 7400f21e91365efeaef6a03d008691bdc837131b..a0bbd6b633ef017dc983c8458eb5551494425989 100755 --- a/python/paddle/profiler/profiler_statistic.py +++ b/python/paddle/profiler/profiler_statistic.py @@ -1,11 +1,11 @@ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -34,6 +34,22 @@ _CommunicationOpName = ['reduce', 'broadcast', 'rpc'] class SortedKeys(Enum): r""" Sorted keys for printing summary table. + + CPUTotal: Sorted by CPU total time. + + CPUAvg: Sorted by CPU average time. + + CPUMax: Sorted by CPU max time. + + CPUMin: Sorted by CPU min time. + + GPUTotal: Sorted by GPU total time. + + GPUAvg: Sorted by GPU average time. + + GPUMax: Sorted by GPU max time. + + GPUMin: Sorted by GPU min time. """ CPUTotal = 0 CPUAvg = 1 @@ -642,6 +658,171 @@ def _build_table(statistic_data, append('') append('') + ###### Print Model Summary Report ###### + model_perspective_items = statistic_data.event_summary.model_perspective_items + if model_perspective_items: + headers = [ + 'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)', + 'GPU Total / Avg / Max / Min / Ratio(%)' + ] + row_format_list = [""] + header_sep_list = [""] + line_length_list = [-SPACING_SIZE] + name_column_width = 15 + add_column(name_column_width) + add_column(6) + add_column(40) + add_column(40) + + row_format = row_format_list[0] + header_sep = header_sep_list[0] + line_length = line_length_list[0] + + # construct table string + append(add_title(line_length, "Model Summary")) + append('Time unit: {}'.format(time_unit)) + append(header_sep) + append(row_format.format(*headers)) + append(header_sep) + accmulation_time = 0 + row_values = [ + 'Total Time', '-', '{} / - / - / - / {}'.format( + format_time( + total_time, unit=time_unit), format_ratio(1)), + '- / - / - / -/ -' + ] + append(row_format.format(*row_values)) + for name in ['Dataloader', 'Forward', 'Backward', 'Optimization']: + if name in model_perspective_items: + item = model_perspective_items[name] + row_values = [ + ' {}'.format(name), item.call, + '{} / {} / {} / {} / {}'.format( + format_time( + item.cpu_time, unit=time_unit), + format_time( + item.avg_cpu_time, unit=time_unit), + format_time( + item.max_cpu_time, unit=time_unit), + format_time( + item.min_cpu_time, unit=time_unit), + format_ratio(float(item.cpu_time) / total_time)), + '{} / {} / {} / {} / {}'.format( + format_time( + item.gpu_time, unit=time_unit), + format_time( + item.avg_gpu_time, unit=time_unit), + format_time( + item.max_gpu_time, unit=time_unit), + format_time( + item.min_gpu_time, unit=time_unit), + format_ratio(float(item.gpu_time) / total_time)) + ] + append(row_format.format(*row_values)) + accmulation_time += item.cpu_time + + other_time = total_time - accmulation_time + row_values = [ + ' Others', '-', '{} / - / - / - / {}'.format( + format_time( + other_time, unit=time_unit), + format_ratio(float(other_time) / total_time)), + '- / - / - / - / -' + ] + append(row_format.format(*row_values)) + append(header_sep) + append('') + append('') + + ###### Print Distribution Summary Report ###### + if TracerEventType.Communication in statistic_data.time_range_summary.CPUTimeRange: + headers = [ + 'Name', + 'Total Time', + 'Ratio (%)', + ] + row_format_list = [""] + header_sep_list = [""] + line_length_list = [-SPACING_SIZE] + + DEFAULT_COLUMN_WIDTH = 20 + for _ in headers: + add_column(DEFAULT_COLUMN_WIDTH) + + row_format = row_format_list[0] + header_sep = header_sep_list[0] + line_length = line_length_list[0] + + # construct table string 
+ append(add_title(line_length, "Distribution Summary")) + append('Time unit: {}'.format(time_unit)) + append(header_sep) + append(row_format.format(*headers)) + append(header_sep) + cpu_communication_time_range = [] + gpu_communication_time_range = [] + cpu_communication_time_range = merge_ranges( + statistic_data.time_range_summary.CPUTimeRange[ + TracerEventType.Communication], cpu_communication_time_range) + kernel_time_range = [] + for device_id, device_time_ranges in statistic_data.time_range_summary.GPUTimeRange.items( + ): + kernel_time_range = merge_ranges( + device_time_ranges[TracerEventType.Kernel], + kernel_time_range, + is_sorted=True) + gpu_communication_time_range = merge_ranges( + device_time_ranges[TracerEventType.Communication], + gpu_communication_time_range, + is_sorted=True) + communication_time_range = merge_ranges( + cpu_communication_time_range, + gpu_communication_time_range, + is_sorted=True) + computation_time_range = subtract_ranges(kernel_time_range, + gpu_communication_time_range) + overlap_time_range = intersection_ranges(communication_time_range, + computation_time_range) + communication_time = sum_ranges(communication_time_range) + computation_time = sum_ranges(computation_time_range) + overlap_time = sum_ranges(overlap_time_range) + row_values = [ + 'Communication', format_time( + communication_time, unit=time_unit), + format_ratio(float(communication_time) / total_time) + ] + append(row_format.format(*row_values)) + + row_values = [ + 'Computation', format_time( + computation_time, unit=time_unit), + format_ratio(float(computation_time) / total_time) + ] + append(row_format.format(*row_values)) + + row_values = [ + 'Overlap', format_time( + overlap_time, unit=time_unit), + format_ratio(float(overlap_time) / total_time) + ] + append(row_format.format(*row_values)) + append(header_sep) + append( + "Note:\nCommunication time: Communication Op time and its kernel time on gpu.\n" + "Computation time: Kernel time, substract kernels belong to communication op.\n" + "Overlap time: Communication time intersect with computation time.\n" + "Example:\n" + "Communication:\n" + " CPU: |_________________|\n" + " GPU: |______________|\n" + " Total: |_________________| |______________|\n" + "Computation time(Kernel):\n" + " GPU: |________________|\n" + "Overlap time: |___________|\n") + append('-' * line_length) + append('') + append('') + ###### Print Operator Summary Report ###### if statistic_data.event_summary.items: headers = [ @@ -708,11 +889,6 @@ def _build_table(statistic_data, sorted_items = sorted( items.items(), key=lambda x: x[1].min_gpu_time) - total_cpu_time = 0 - total_gpu_time = 0 - for name, item in sorted_items: - total_cpu_time += item.cpu_time - total_gpu_time += item.gpu_time for name, item in sorted_items: row_values = [ name, item.call, '{} / {} / {} / {} / {}'.format( @@ -724,7 +900,7 @@ def _build_table(statistic_data, item.max_cpu_time, unit=time_unit), format_time( item.min_cpu_time, unit=time_unit), - format_ratio(float(item.cpu_time) / total_cpu_time)), + format_ratio(float(item.cpu_time) / total_time)), '{} / {} / {} / {} / {}'.format( format_time( item.gpu_time, unit=time_unit), @@ -734,7 +910,7 @@ def _build_table(statistic_data, item.max_gpu_time, unit=time_unit), format_time( item.min_gpu_time, unit=time_unit), - format_ratio(float(item.gpu_time) / total_gpu_time)) + format_ratio(float(item.gpu_time) / total_time)) ] append(row_format.format(*row_values)) if op_detail: @@ -752,8 +928,7 @@ def _build_table(statistic_data, format_time( 
innerop_node.min_cpu_time, unit=time_unit), format_ratio( - float(innerop_node.cpu_time) / - total_cpu_time)), + float(innerop_node.cpu_time) / total_time)), '{} / {} / {} / {} / {}'.format( format_time( innerop_node.gpu_time, unit=time_unit), @@ -764,8 +939,7 @@ def _build_table(statistic_data, format_time( innerop_node.min_gpu_time, unit=time_unit), format_ratio( - float(innerop_node.gpu_time) / - total_gpu_time)) + float(innerop_node.gpu_time) / total_time)) ] append(row_format.format(*row_values)) for device_node_name, devicenode in innerop_node.devices.items( @@ -792,7 +966,7 @@ def _build_table(statistic_data, unit=time_unit), format_ratio( float(devicenode.gpu_time) / - total_gpu_time)) + total_time)) ] append(row_format.format(*row_values)) for device_node_name, device_node in item.devices.items(): @@ -814,11 +988,160 @@ def _build_table(statistic_data, format_time( devicenode.min_gpu_time, unit=time_unit), format_ratio( - float(devicenode.gpu_time) / - total_gpu_time)) + float(devicenode.gpu_time) / total_time)) ] append(row_format.format(*row_values)) append(header_sep) append('') append('') + + ###### Print Memory Manipulation Summary Report ###### + if statistic_data.event_summary.memory_manipulation_items: + headers = [ + 'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)', + 'GPU Total / Avg / Max / Min / Ratio(%)' + ] + row_format_list = [""] + header_sep_list = [""] + line_length_list = [-SPACING_SIZE] + name_column_width = 30 + add_column(name_column_width) + add_column(6) + add_column(40) + add_column(40) + + row_format = row_format_list[0] + header_sep = header_sep_list[0] + line_length = line_length_list[0] + + # construct table string + append(add_title(line_length, "Memory Manipulation Summary")) + append('Time unit: {}'.format(time_unit)) + append(header_sep) + append(row_format.format(*headers)) + append(header_sep) + memory_manipulation_items = statistic_data.event_summary.memory_manipulation_items + for name, item in memory_manipulation_items.items(): + row_values = [ + name, + item.call, + '{} / {} / {} / {} / {}'.format( + format_time( + item.cpu_time, unit=time_unit), + format_time( + item.avg_cpu_time, unit=time_unit), + format_time( + item.max_cpu_time, unit=time_unit), + format_time( + item.min_cpu_time, unit=time_unit), + format_ratio(float(item.cpu_time) / total_time)), + '{} / {} / {} / {} / {}'.format( + format_time( + item.gpu_time, unit=time_unit), + format_time( + item.avg_gpu_time, unit=time_unit), + format_time( + item.max_gpu_time, unit=time_unit), + format_time( + item.min_gpu_time, unit=time_unit), + format_ratio(float(item.gpu_time) / total_time)), + ] + append(row_format.format(*row_values)) + append(header_sep) + append('') + append('') + ###### Print UserDefined Summary Report ###### + if statistic_data.event_summary.userdefined_items: + headers = [ + 'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)', + 'GPU Total / Avg / Max / Min / Ratio(%)' + ] + row_format_list = [""] + header_sep_list = [""] + line_length_list = [-SPACING_SIZE] + name_column_width = 30 + add_column(name_column_width) + add_column(6) + add_column(40) + add_column(40) + + row_format = row_format_list[0] + header_sep = header_sep_list[0] + line_length = line_length_list[0] + + # construct table string + append(add_title(line_length, "UserDefined Summary")) + append('Time unit: {}'.format(time_unit)) + append(header_sep) + append(row_format.format(*headers)) + append(header_sep) + if thread_sep == True: + userdefined_thread_items = 
statistic_data.event_summary.userdefined_thread_items + else: + userdefined_thread_items = { + 'All threads merged': + statistic_data.event_summary.userdefined_items + } + for thread_id, items in userdefined_thread_items.items(): + append(add_title(line_length, "Thread: {}".format(thread_id))) + if sorted_by == SortedKeys.CPUTotal: + sorted_items = sorted( + items.items(), key=lambda x: x[1].cpu_time, reverse=True) + elif sorted_by == SortedKeys.CPUAvg: + sorted_items = sorted( + items.items(), + key=lambda x: x[1].avg_cpu_time, + reverse=True) + elif sorted_by == SortedKeys.CPUMax: + sorted_items = sorted( + items.items(), + key=lambda x: x[1].max_cpu_time, + reverse=True) + elif sorted_by == SortedKeys.CPUMin: + sorted_items = sorted( + items.items(), key=lambda x: x[1].min_cpu_time) + elif sorted_by == SortedKeys.GPUTotal: + sorted_items = sorted( + items.items(), key=lambda x: x[1].gpu_time, reverse=True) + elif sorted_by == SortedKeys.GPUAvg: + sorted_items = sorted( + items.items(), + key=lambda x: x[1].avg_gpu_time, + reverse=True) + elif sorted_by == SortedKeys.GPUMax: + sorted_items = sorted( + items.items(), + key=lambda x: x[1].max_gpu_time, + reverse=True) + elif sorted_by == SortedKeys.GPUMin: + sorted_items = sorted( + items.items(), key=lambda x: x[1].min_gpu_time) + + for name, item in sorted_items: + row_values = [ + name, + item.call, + '{} / {} / {} / {} / {}'.format( + format_time( + item.cpu_time, unit=time_unit), + format_time( + item.avg_cpu_time, unit=time_unit), + format_time( + item.max_cpu_time, unit=time_unit), + format_time( + item.min_cpu_time, unit=time_unit), + format_ratio(float(item.cpu_time) / total_time)), + '{} / {} / {} / {} / {}'.format( + format_time( + item.gpu_time, unit=time_unit), + format_time( + item.avg_gpu_time, unit=time_unit), + format_time( + item.max_gpu_time, unit=time_unit), + format_time( + item.min_gpu_time, unit=time_unit), + format_ratio(float(item.gpu_time) / total_time)), + ] + append(row_format.format(*row_values)) + append(header_sep) return ''.join(result) diff --git a/python/paddle/profiler/utils.py b/python/paddle/profiler/utils.py index 642001dfbfc5a307d5064860136034ba7b3bdbc5..7fa7a27bad7bf5ffbefdddb28d67e2d65e319e6d 100644 --- a/python/paddle/profiler/utils.py +++ b/python/paddle/profiler/utils.py @@ -1,24 +1,25 @@ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from paddle.fluid.core import (_RecordEvent, TracerEventType, - load_profiler_result) from typing import Any from warnings import warn import functools from contextlib import ContextDecorator +from paddle.fluid.core import (_RecordEvent, TracerEventType) +import paddle.fluid.core as core + _AllowedEventTypeList = [ TracerEventType.Dataloader, TracerEventType.ProfileStep, TracerEventType.UserDefined, TracerEventType.Forward, @@ -32,14 +33,28 @@ class RecordEvent(ContextDecorator): Interface for recording a time range. 
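# --- Illustrative sketch, not part of the patch ---------------------------------
# Because RecordEvent derives from contextlib.ContextDecorator, the same object can
# presumably also be applied as a function decorator, wrapping every call of the
# decorated function in a begin()/end() pair. This decorator usage is an assumption
# based on the base class; the docstring below only documents the context-manager
# and explicit begin()/end() styles.
import paddle
import paddle.profiler as profiler


@profiler.RecordEvent("record_matmul")
def run_matmul():
    a = paddle.randn(shape=[16, 16])
    b = paddle.randn(shape=[16, 16])
    return paddle.matmul(a, b)


# Only recorded while a Profiler is running and in the RECORD state.
run_matmul()
# ---------------------------------------------------------------------------------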
Parameters: - name(str): Name of the record event - event_type(TracerEventType): Type of the record event, can be used for statistics. + name(str): Name of the record event Examples: .. code-block:: python - import paddle.profiler as profiler - with profiler.RecordEvent(name='op1', event_type=TracerEventType=TracerEventType.UserDefined): - op1() + + import paddle + import paddle.profiler as profiler + # method1: using context manager + with profiler.RecordEvent("record_add"): + data1 = paddle.randn(shape=[3]) + data2 = paddle.randn(shape=[3]) + result = data1 + data2 + # method2: call begin() and end() + record_event = profiler.RecordEvent("record_add") + record_event.begin() + data1 = paddle.randn(shape=[3]) + data2 = paddle.randn(shape=[3]) + result = data1 + data2 + record_event.end() + + Note: + RecordEvent will take effect only when profiler is on and at the state of RECORD. """ def __init__(self, @@ -57,6 +72,20 @@ class RecordEvent(ContextDecorator): self.end() def begin(self): + r""" + Record the time of begining. + + .. code-block:: python + + import paddle + import paddle.profiler as profiler + record_event = profiler.RecordEvent("record_sub") + record_event.begin() + data1 = paddle.randn(shape=[3]) + data2 = paddle.randn(shape=[3]) + result = data1 - data2 + record_event.end() + """ if self.event_type not in _AllowedEventTypeList: warn("Only TracerEvent Type in [{}, {}, {}, {}, {}, {},{}]\ can be recorded.".format(*_AllowedEventTypeList)) @@ -67,10 +96,51 @@ class RecordEvent(ContextDecorator): self.event = _RecordEvent(self.name, self.event_type) def end(self): + r''' + Record the time of ending. + + .. code-block:: python + + import paddle + import paddle.profiler as profiler + record_event = profiler.RecordEvent("record_mul") + record_event.begin() + data1 = paddle.randn(shape=[3]) + data2 = paddle.randn(shape=[3]) + result = data1 * data2 + record_event.end() + ''' if self.event: self.event.end() +def load_profiler_result(filename: str): + r""" + Load dumped profiler data back to memory. + + Parameters: + filename(str): Name of the exported protobuf file of profiler data. + + Returns: + ProfilerResult object. + + Examples: + .. code-block:: python + + # required: gpu + import paddle.profiler as profiler + with profiler.Profiler( + targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU], + scheduler = (3, 10)) as p: + for iter in range(10): + #train() + p.step() + p.export('test_export_protobuf.pb', format='pb') + profiler_result = profiler.load_profiler_result('test_export_protobuf.pb') + """ + return core.load_profiler_result(filename) + + def wrap_optimizers(): def optimizer_warpper(func): @functools.wraps(func) diff --git a/python/paddle/signal.py b/python/paddle/signal.py index cd8ba2b58a8c939acc43a93b0ea6ca5a617b35d1..f5b225bc6da2d9e93b8c697d600183a9217d0e08 100644 --- a/python/paddle/signal.py +++ b/python/paddle/signal.py @@ -119,10 +119,11 @@ def frame(x, frame_length, hop_length, axis=-1, name=None): f'Unexpected hop_length: {hop_length}. It should be an positive integer.' 
) - if frame_length > x.shape[axis]: - raise ValueError( - f'Attribute frame_length should be less equal than sequence length, ' - f'but got ({frame_length}) > ({x.shape[axis]}).') + if in_dygraph_mode(): + if frame_length > x.shape[axis]: + raise ValueError( + f'Attribute frame_length should be less equal than sequence length, ' + f'but got ({frame_length}) > ({x.shape[axis]}).') op_type = 'frame' @@ -306,8 +307,7 @@ def stft(x, y1 = stft(x, n_fft=512, center=False, onesided=False) # [8, 512, 372] """ check_variable_and_dtype( - x, 'x', ['float16', 'float32', 'float64', 'complex64', 'complex128'], - 'stft') + x, 'x', ['float32', 'float64', 'complex64', 'complex128'], 'stft') x_rank = len(x.shape) assert x_rank in [1, 2], \ @@ -325,8 +325,9 @@ def stft(x, if win_length is None: win_length = n_fft - assert 0 < n_fft <= x.shape[-1], \ - f'n_fft should be in (0, seq_length({x.shape[-1]})], but got {n_fft}.' + if in_dygraph_mode(): + assert 0 < n_fft <= x.shape[-1], \ + f'n_fft should be in (0, seq_length({x.shape[-1]})], but got {n_fft}.' assert 0 < win_length <= n_fft, \ f'win_length should be in (0, n_fft({n_fft})], but got {win_length}.' @@ -359,7 +360,7 @@ def stft(x, x_frames = x_frames.transpose( perm=[0, 2, 1]) # switch n_fft to last dim, egs: (batch, num_frames, n_fft) - x_frames = x_frames * window + x_frames = paddle.multiply(x_frames, window) norm = 'ortho' if normalized else 'backward' if is_complex(x_frames): @@ -495,18 +496,22 @@ def istft(x, n_frames = x.shape[-1] fft_size = x.shape[-2] - if onesided: - assert (fft_size == n_fft // 2 + 1), \ - 'fft_size should be equal to n_fft // 2 + 1({}) when onesided is True, but got {}.'.format(n_fft // 2 + 1, fft_size) - else: - assert (fft_size == n_fft), \ - 'fft_size should be equal to n_fft({}) when onesided is False, but got {}.'.format(n_fft, fft_size) + if in_dygraph_mode(): + if onesided: + assert (fft_size == n_fft // 2 + 1), \ + 'fft_size should be equal to n_fft // 2 + 1({}) when onesided is True, but got {}.'.format(n_fft // 2 + 1, fft_size) + else: + assert (fft_size == n_fft), \ + 'fft_size should be equal to n_fft({}) when onesided is False, but got {}.'.format(n_fft, fft_size) if window is not None: assert len(window.shape) == 1 and len(window) == win_length, \ 'expected a 1D window tensor of size equal to win_length({}), but got window with shape {}.'.format(win_length, window.shape) else: - window = paddle.ones(shape=(win_length, )) + window_dtype = paddle.float32 if x.dtype in [ + paddle.float32, paddle.complex64 + ] else paddle.float64 + window = paddle.ones(shape=(win_length, ), dtype=window_dtype) if win_length < n_fft: pad_left = (n_fft - win_length) // 2 @@ -534,15 +539,15 @@ def istft(x, x = x[:, :, :n_fft // 2 + 1] out = fft_c2r(x=x, n=None, axis=-1, norm=norm, forward=False, name=None) + out = paddle.multiply(out, window).transpose( + perm=[0, 2, 1]) # (batch, n_fft, num_frames) out = overlap_add( - x=(out * window).transpose( - perm=[0, 2, 1]), # (batch, n_fft, num_frames) - hop_length=hop_length, - axis=-1) # (batch, seq_length) + x=out, hop_length=hop_length, axis=-1) # (batch, seq_length) window_envelop = overlap_add( x=paddle.tile( - x=window * window, repeat_times=[n_frames, 1]).transpose( + x=paddle.multiply(window, window).unsqueeze(0), + repeat_times=[n_frames, 1]).transpose( perm=[1, 0]), # (n_fft, num_frames) hop_length=hop_length, axis=-1) # (seq_length, ) @@ -561,7 +566,7 @@ def istft(x, window_envelop = window_envelop[start:start + length] # Check whether the Nonzero Overlap Add (NOLA) constraint 
is met. - if window_envelop.abs().min().item() < 1e-11: + if in_dygraph_mode() and window_envelop.abs().min().item() < 1e-11: raise ValueError( 'Abort istft because Nonzero Overlap Add (NOLA) condition failed. For more information about NOLA constraint please see `scipy.signal.check_NOLA`(https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.check_NOLA.html).' ) diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 28fd30e926c6d1b3d1df92a58ffafd017d3a9fec..ce95f4500ed397da78308d682e38ea7c3b85cf50 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -147,7 +147,9 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None): var_names = {'x': x, 'y': y} for name, val in var_names.items(): check_variable_and_dtype( - val, name, ['float16', 'float32', 'float64'], 'matmul') + val, name, + ['float16', 'float32', 'float64', 'complex64', 'complex128'], + 'matmul') __check_input(x, y) diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index da383db0effea95abc1afebbfcadc6f4fc3a3a45..ff8265795457bb1189488fb6d9925a3a7c79de70 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -243,8 +243,8 @@ def add(x, y, name=None): """ if paddle.in_dynamic_mode(): - #if _in_eager_mode(): - #return _C_ops.final_state_add(x, y) + if _in_eager_mode(): + return _C_ops.final_state_add( x, y) return _C_ops.elementwise_add(x, y) return _elementwise_op(LayerHelper('elementwise_add', **locals())) @@ -324,8 +324,8 @@ def subtract(x, y, name=None): axis = -1 act = None if paddle.in_dynamic_mode(): - # if _in_eager_mode(): - # return _C_ops.final_state_subtract( x, y) + if _in_eager_mode(): + return _C_ops.final_state_subtract(x, y) return _elementwise_op_in_dygraph( x, y, axis=axis, act=act, op_name=op_type) return _elementwise_op(LayerHelper(op_type, **locals())) @@ -383,6 +383,8 @@ def divide(x, y, name=None): axis = -1 act = None if paddle.in_dynamic_mode(): + if _in_eager_mode(): + return _C_ops.final_state_divide( x, y) return _elementwise_op_in_dygraph( x, y, axis=axis, act=act, op_name=op_type) @@ -512,6 +514,8 @@ def multiply(x, y, name=None): axis = -1 if paddle.in_dynamic_mode(): + if _in_eager_mode(): + return _C_ops.final_state_multiply(x, y) return _elementwise_op_in_dygraph( x, y, axis=axis, act=act, op_name=op_type) @@ -3801,13 +3805,13 @@ def diff(x, n=1, axis=-1, prepend=None, append=None, name=None): attrs_1 += ('starts', starts_1) ends_1 = [dim_len - 1] attrs_1 += ('ends', ends_1) - input_front = _C_ops.slice(new_input, None, None, 'axes', axes, \ + input_front = _C_ops.slice(new_input, None, None, None, None, 'axes', axes, \ 'infer_flags', infer_flags, *attrs_1) starts_2 = [1] attrs_2 += ('starts', starts_2) ends_2 = [dim_len] attrs_2 += ('ends', ends_2) - input_back = _C_ops.slice(new_input, None, None, 'axes', axes, \ + input_back = _C_ops.slice(new_input, None, None, None, None, 'axes', axes, \ 'infer_flags', infer_flags, *attrs_2) if x.dtype == paddle.bool: diff --git a/python/paddle/tensor/to_string.py b/python/paddle/tensor/to_string.py index f164bbc466f18da9b7145533c32369a85d6124df..91e5cfe97c6cdb503fba343a8a8d16a956aaffaf 100644 --- a/python/paddle/tensor/to_string.py +++ b/python/paddle/tensor/to_string.py @@ -317,7 +317,7 @@ def tensor_to_string(tensor, prefix='Tensor'): _template = "{prefix}(shape={shape}, dtype={dtype}, place={place}, stop_gradient={stop_gradient},\n{indent}{data})" - if not tensor._is_initialized(): + if not 
tensor._is_dense_tensor_hold_allocation(): return "Tensor(Not initialized)" if tensor.is_sparse(): diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index ee6503a39c6f126b52db23d302a59b2ef1196f74..6033e2b588cf4dc48d39fb673bf1502b7d77fc97 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -5,7 +5,7 @@ func : ElementwiseInferMeta kernel : func : add - # backward : add_grad + backward : add_grad - api : cast args : (Tensor x, DataType out_dtype) @@ -47,6 +47,7 @@ func : ElementwiseInferMeta kernel : func : divide + backward : divide_grad - api : dot args : (Tensor x, Tensor y) @@ -136,6 +137,7 @@ func : ElementwiseInferMeta kernel : func : multiply + backward : multiply_grad - api : ones_like args : (Tensor x, DataType dtype=DataType::UNDEFINED, Place place={}) @@ -208,6 +210,7 @@ func : ElementwiseInferMeta kernel : func : subtract + backward : subtract_grad - api : sum args : (Tensor x, int64[] axis={}, DataType dtype=DataType::UNDEFINED, bool keep_dim=false) @@ -1314,7 +1317,7 @@ func : AdamaxInferMeta kernel : func : adamax - + - api : where @@ -1370,7 +1373,7 @@ func : CompareInferMeta kernel : func : equal - + - api : not_equal args : (Tensor x, Tensor y, int axis = -1) output : Tensor diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index 938a381bf2a4ff9981c71aaf6b0844b7377860a7..fde756a09c67199a82a0eb73f8940bb11de0d7fe 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -25,10 +25,9 @@ output : Tensor(x_grad) invoke : scale(out_grad, scale, bias, bias_after_scale) - - backward_api : add_grad forward : add (Tensor x, Tensor y) -> Tensor(out) - args : (Tensor x, Tensor y, Tensor out_grad) + args : (Tensor x, Tensor y, Tensor out_grad, int axis = -1) output : Tensor(x_grad), Tensor(y_grad) infer_meta : func : GeneralBinaryGradInferMeta @@ -36,6 +35,37 @@ kernel : func : add_grad +- backward_api : subtract_grad + forward : subtract (Tensor x, Tensor y) -> Tensor(out) + args : (Tensor x, Tensor y, Tensor out_grad, int axis = -1) + output : Tensor(x_grad), Tensor(y_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, y] + kernel : + func : subtract_grad + +- backward_api : multiply_grad + forward : multiply (Tensor x, Tensor y) -> Tensor(out) + args : (Tensor x, Tensor y, Tensor out_grad, int axis = -1) + output : Tensor(x_grad), Tensor(y_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, y] + kernel : + func : multiply_grad + +- backward_api : divide_grad + forward : divide (Tensor x, Tensor y) -> Tensor(out) + args : (Tensor x, Tensor y, Tensor out_grad, int axis = -1) + output : Tensor(x_grad), Tensor(y_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, y] + kernel : + func : divide_grad + + - backward_api : digamma_grad forward : digamma (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) @@ -490,7 +520,7 @@ # param : [out, out_grad, axis] # kernel : # func : gumbel_softmax_grad - + - backward_api : transpose_grad forward : transpose (Tensor x, int[] axis) -> Tensor(out) @@ -501,7 +531,7 @@ param : [out_grad, axis] kernel : func : transpose_grad - + # - backward_api : lerp_grad # forward : transpose (Tensor x, Tensor y, Tensor weight) -> Tensor(out) # args : (Tensor x, Tensor y, Tensor weight, Tensor out, Tensor out_grad) diff --git a/python/setup.py.in b/python/setup.py.in index 
0a10e9dcc698d7433ceab5dd35ee4d5fa1729636..2dbefb20bb6e63c9457d67346aa43ae9d67df07e 100755 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -733,7 +733,7 @@ with redirect_stdout(): }, entry_points={ 'console_scripts': [ - 'fleetrun = paddle.distributed.launch.__main__:launch' + 'fleetrun = paddle.distributed.launch.main:launch' ] }, classifiers=[