diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bd09715e0a7077e140c78dceb1fc8bae39de4582..816882418964313e8062ef14a7bc108e63dcaf78 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -61,7 +61,7 @@ repos: - id: black files: (.*\.(py|pyi|bzl)|BUILD|.*\.BUILD|WORKSPACE)$ - repo: https://github.com/pycqa/isort - rev: 5.10.1 + rev: 5.11.5 hooks: - id: isort - repo: https://github.com/PyCQA/flake8 diff --git a/cmake/external/cinn.cmake b/cmake/external/cinn.cmake index 3ec194a6bfb37791f8aae7c65bfe57718dfbbadf..5e23a0f36f04acdf8aa2c480816ac87df0bf697c 100644 --- a/cmake/external/cinn.cmake +++ b/cmake/external/cinn.cmake @@ -40,7 +40,10 @@ set(CINN_OPTIONAL_ARGS -DWITH_MKL_CBLAS=${WITH_MKL} -DWITH_MKLDNN=${WITH_MKL} -DPUBLISH_LIBS=ON - -DWITH_TESTING=ON) + -DWITH_TESTING=ON + -DPYTHON_EXECUTABLE=${PYTHON_EXECUTABLE} + -DPYTHON_INCLUDE_DIR=${PYTHON_INCLUDE_DIR} + -DPYTHON_LIBRARIES=${PYTHON_LIBRARIES}) set(CINN_BUILD_COMMAND ${CMAKE_COMMAND} --build . --target cinnapi -j) set(CINN_BINARY_DIR ${CINN_PREFIX_DIR}/src/external_cinn-build) set(CINN_LIB_NAME "libcinnapi.so") diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 8e08eb84b9f3577dc072f8ffe31d231e03cfea90..fcdaa8da1f30eda555b742563ad8007903d90ab8 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -411,6 +411,17 @@ function(op_library TARGET) set(pybind_flag 1) endif() + # pybind USE_OP_ITSELF + set(op_name "") + # Add PHI Kernel Registry Message + find_register(${cc_src} "REGISTER_ACTIVATION_OP" op_name) + if(NOT ${op_name} EQUAL "") + file(APPEND ${pybind_file} "USE_OP_ITSELF(${op_name});\n") + # hack: for example, the target in conv_transpose_op.cc is conv2d_transpose, used in mkldnn + set(TARGET ${op_name}) + set(pybind_flag 1) + endif() + set(op_name "") find_register(${cc_src} "REGISTER_OP_WITHOUT_GRADIENT" op_name) if(NOT ${op_name} EQUAL "") diff --git a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc index 9aedaa131400f3bfd6be24953050071e8970a557..a03ac900e9f66ea1d15aee318a01cf8b87d072d5 100644 --- a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc @@ -50,14 +50,17 @@ void ComputeInterceptor::IncreaseReady(int64_t up_id) { auto max_ready_size = it->second.first; auto ready_size = it->second.second; ready_size += 1; - PADDLE_ENFORCE_LE(ready_size, - max_ready_size, - platform::errors::OutOfRange( - "upstream=%lld ready_size must <= max_ready_size, but " - "now ready_size=%lld, max_ready_size=%lld", - up_id, - ready_size, - max_ready_size)); + if (max_ready_size != INFINITE_BUFFER_SIZE) { + PADDLE_ENFORCE_LE( + ready_size, + max_ready_size, + platform::errors::OutOfRange( + "upstream=%lld ready_size must <= max_ready_size, but " + "now ready_size=%lld, max_ready_size=%lld", + up_id, + ready_size, + max_ready_size)); + } it->second.second = ready_size; } @@ -96,6 +99,9 @@ bool ComputeInterceptor::CanWriteOutput() { for (auto& outs : out_buffs_) { auto max_buffer_size = outs.second.first; auto used_size = outs.second.second; + if (max_buffer_size == INFINITE_BUFFER_SIZE) { + continue; + } // full, return false if (used_size == max_buffer_size) { VLOG(3) << "Interceptor " << GetInterceptorId() @@ -112,15 +118,17 @@ void ComputeInterceptor::SendDataReadyToDownStream() { auto max_buff_size = outs.second.first; auto used_size = outs.second.second; used_size += 1; - PADDLE_ENFORCE_LE( - used_size, - max_buff_size, - 
platform::errors::OutOfRange("downstream=%lld used buff size must <= " - "max_buff_size, but now used_size=%lld, " - "max_buff_size=%lld", - down_id, - used_size, - max_buff_size)); + if (max_buff_size != INFINITE_BUFFER_SIZE) { + PADDLE_ENFORCE_LE( + used_size, + max_buff_size, + platform::errors::OutOfRange("downstream=%lld used buff size must <= " + "max_buff_size, but now used_size=%lld, " + "max_buff_size=%lld", + down_id, + used_size, + max_buff_size)); + } outs.second.second = used_size; InterceptorMessage ready_msg; diff --git a/paddle/fluid/distributed/fleet_executor/compute_interceptor.h b/paddle/fluid/distributed/fleet_executor/compute_interceptor.h index 9709cd4437f1019fea80cf04ecce5a38f74bb463..eade47fd8787e61999a8c627af316d221e5aba48 100644 --- a/paddle/fluid/distributed/fleet_executor/compute_interceptor.h +++ b/paddle/fluid/distributed/fleet_executor/compute_interceptor.h @@ -22,6 +22,8 @@ namespace paddle { namespace distributed { +const int64_t INFINITE_BUFFER_SIZE = -1; + class ComputeInterceptor : public Interceptor { public: ComputeInterceptor(int64_t interceptor_id, TaskNode* node); diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc index 88363696ede257492b6f703c2a8ddaa97d5b5b15..ae3776d2c5beacbccc7d63f05aff7882a9b2440a 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc @@ -111,21 +111,22 @@ void FleetExecutor::Init( task_node->SetUnusedVars(unused_vars); if (task_node->type() == "Cond") { std::vector while_block_vars; - std::vector vars_in_parent; - std::vector vars_in_sub; - for (auto& var : program_desc.Block(0).AllVars()) { - vars_in_parent.emplace_back(var->Name()); - } + VLOG(3) << "Vars in while sub block:"; for (auto& var : program_desc.Block(1).AllVars()) { - vars_in_sub.emplace_back(var->Name()); + VLOG(3) << var->Name(); + while_block_vars.emplace_back(var->Name()); + } + for (const auto& pair : unused_vars) { + if (pair.first->Type() == "while") { + for (const auto& var_name : pair.second) { + while_block_vars.emplace_back(var_name); + } + } + } + VLOG(3) << "Vars below will be removed after while:"; + for (const auto& name : while_block_vars) { + VLOG(3) << name; } - std::sort(vars_in_parent.begin(), vars_in_parent.end()); - std::sort(vars_in_sub.begin(), vars_in_sub.end()); - std::set_difference(vars_in_sub.begin(), - vars_in_sub.end(), - vars_in_parent.begin(), - vars_in_parent.end(), - std::back_inserter(while_block_vars)); task_node->SetWhileBlockVars(while_block_vars); } int64_t interceptor_id = task_node->task_id(); diff --git a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py index 650bf0626f1ad22bdda64403fd76dd639fc5efce..3497a1217cfcba95d40482c3e158f3862c16e23b 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py @@ -330,7 +330,7 @@ NODE_CC_FILE_TEMPLATE = """ #include "paddle/fluid/eager/nan_inf_utils.h" #include "paddle/phi/api/include/sparse_api.h" #include "paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h" -#include "paddle/fluid/prim/api/manual/backward/composite_backward_api.h" +#include "paddle/fluid/prim/api/composite_backward/composite_backward_api.h" #include "paddle/fluid/prim/api/all.h" #include "paddle/fluid/prim/utils/utils.h" DECLARE_bool(check_nan_inf); diff --git 
a/paddle/fluid/framework/feed_fetch_type.h b/paddle/fluid/framework/feed_fetch_type.h index 571667bff47eb454c0366398faa6d04d84448219..e51f22a2c3c1818ccf96ab2888bb2df38a8b9f3c 100644 --- a/paddle/fluid/framework/feed_fetch_type.h +++ b/paddle/fluid/framework/feed_fetch_type.h @@ -26,6 +26,11 @@ namespace framework { using FeedType = paddle::variant; +template <> +struct PhiVectorType { + const char *type_name = "PhiVectorFeedType"; +}; + using FeedList = paddle::framework::PhiVector; using FetchType = paddle::variantLinksFrom({out_linear_ele_add_out_node}) .LinksTo({out_linear_dropout_mask_node, out_linear_dropout_out_node}); - if (!add_residual && !post_layer_norm) { + if (!add_residual && pre_layer_norm) { return out_linear_dropout_out_node; } @@ -276,7 +275,7 @@ PDNode* FusedAttentionPattern::operator()(PDNode* x, residual_ele_add_node->LinksFrom({x, out_linear_dropout_out_node}) .LinksTo({residual_ele_add_out_node}); - if (!post_layer_norm) { + if (pre_layer_norm) { return residual_ele_add_out_node; } } @@ -323,13 +322,12 @@ PDNode* FusedAttentionPattern::operator()(PDNode* x, PDNode* FusedAttentionGradPattern::operator()(PDNode* x, bool pre_layer_norm, - bool post_layer_norm, bool has_attn_mask, bool do_dropout, bool add_residual) { // post layer norm PDNode* post_layer_norm_grad_out_node{nullptr}; - if (post_layer_norm) { + if (!pre_layer_norm) { auto* post_layer_norm_grad_node = pattern->NewNode(post_layer_norm_grad_op_repr()) ->assert_is_op("layer_norm_grad"); @@ -375,7 +373,7 @@ PDNode* FusedAttentionGradPattern::operator()(PDNode* x, PDNode* residual_ele_add_grad_x_grad_node{nullptr}; if (add_residual) { PDNode* ele_add_grad_input = x; - if (post_layer_norm) { + if (!pre_layer_norm) { ele_add_grad_input = post_layer_norm_grad_out_node; } auto* residual_ele_add_grad_node = @@ -404,7 +402,7 @@ PDNode* FusedAttentionGradPattern::operator()(PDNode* x, // get the real input x for dropout grad PDNode* out_linear_grad_input_node = x; - if (post_layer_norm && !add_residual) { + if (!pre_layer_norm && !add_residual) { out_linear_grad_input_node = post_layer_norm_grad_out_node; } else if (add_residual) { out_linear_grad_input_node = residual_ele_add_grad_out_node; @@ -768,12 +766,15 @@ PDNode* FusedAttentionGradPattern::operator()(PDNode* x, void FusedAttentionsPass::ApplyImpl(Graph* graph) const { FusePassBase::Init(name_scope_, graph); + FusedAttentionPassCache cache; - graph = PreMaskDropResPostFwd(graph); - graph = PreMaskDropResPostBwd(graph); + graph = PreMaskDropResFwd(graph, &cache); + graph = PreMaskDropResBwd(graph, &cache); + cache.ResetCache(); } -ir::Graph* FusedAttentionsPass::PreMaskDropResPostFwd(Graph* graph) const { +ir::Graph* FusedAttentionsPass::PreMaskDropResFwd( + Graph* graph, FusedAttentionPassCache* cache) const { GraphPatternDetector gpd; auto* x = gpd.mutable_pattern() ->NewNode(patterns::PDNodeName(name_scope_, "x")) @@ -784,7 +785,6 @@ ir::Graph* FusedAttentionsPass::PreMaskDropResPostFwd(Graph* graph) const { fused_attention_pattern(x, /* pre_layer_norm */ true, - /* post_layer_norm */ true, /* has_attn_mask */ true, /* do_dropout */ true, /* add_residual */ true); @@ -795,6 +795,8 @@ ir::Graph* FusedAttentionsPass::PreMaskDropResPostFwd(Graph* graph) const { Graph* g) { VLOG(3) << "handle FusedMultiHeadAttention pass's fusion"; + int block_id = g->GetBlockId(); + GET_IR_NODE_FROM_SUBGRAPH( pre_layer_norm_op_node, pre_layer_norm_op, fused_attention_pattern); GET_IR_NODE_FROM_SUBGRAPH( @@ -835,10 +837,258 @@ ir::Graph* 
FusedAttentionsPass::PreMaskDropResPostFwd(Graph* graph) const { fused_attention_pattern); GET_IR_NODE_FROM_SUBGRAPH( residual_ele_add_op_node, residual_ele_add_op, fused_attention_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + fuse_qkv_matmul_w_node, fuse_qkv_matmul_w, fused_attention_pattern); + std::string cache_anchor_name = fuse_qkv_matmul_w_node->Var()->Name(); + + OpDesc fused_attention_op_desc(pre_layer_norm_op_node->Op()->Block()); + fused_attention_op_desc.SetType("fused_attention"); + fused_attention_op_desc.SetInput("X", {subgraph.at(x)->Name()}); + cache->InsertIntoCache(GenerateCacheKey(cache_anchor_name, "X", block_id), + subgraph.at(x)); + + fused_attention_op_desc.SetAttr("pre_layer_norm", true); + GET_IR_NODE_FROM_SUBGRAPH(pre_layer_norm_scale_node, + pre_layer_norm_scale, + fused_attention_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + pre_layer_norm_bias_node, pre_layer_norm_bias, fused_attention_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + pre_layer_norm_out_node, pre_layer_norm_out, fused_attention_pattern); GET_IR_NODE_FROM_SUBGRAPH( - post_layer_norm_op_node, post_layer_norm_op, fused_attention_pattern); + pre_layer_norm_mean_node, pre_layer_norm_mean, fused_attention_pattern); + GET_IR_NODE_FROM_SUBGRAPH(pre_layer_norm_variance_node, + pre_layer_norm_variance, + fused_attention_pattern); + fused_attention_op_desc.SetInput("LnScale", + {pre_layer_norm_scale_node->Name()}); + fused_attention_op_desc.SetInput("LnBias", + {pre_layer_norm_bias_node->Name()}); + fused_attention_op_desc.SetOutput("LnOut", + {pre_layer_norm_out_node->Name()}); + fused_attention_op_desc.SetOutput("LnMean", + {pre_layer_norm_mean_node->Name()}); + fused_attention_op_desc.SetOutput("LnVariance", + {pre_layer_norm_variance_node->Name()}); + cache->InsertIntoCache( + GenerateCacheKey(cache_anchor_name, "LnScale", block_id), + pre_layer_norm_scale_node); + cache->InsertIntoCache( + GenerateCacheKey(cache_anchor_name, "LnBias", block_id), + pre_layer_norm_bias_node); + cache->InsertIntoCache( + GenerateCacheKey(cache_anchor_name, "LnOut", block_id), + pre_layer_norm_out_node); + cache->InsertIntoCache( + GenerateCacheKey(cache_anchor_name, "LnMean", block_id), + pre_layer_norm_mean_node); + cache->InsertIntoCache( + GenerateCacheKey(cache_anchor_name, "LnVariance", block_id), + pre_layer_norm_variance_node); + fused_attention_op_desc.SetAttr( + "epsilon", + PADDLE_GET_CONST(float, + pre_layer_norm_op_node->Op()->GetAttr("epsilon"))); + + fused_attention_op_desc.SetAttr("transpose_qkv_wb", true); + std::vector shape = PADDLE_GET_CONST( + std::vector, fuse_qkv_reshape_op_node->Op()->GetAttr("shape")); + fused_attention_op_desc.SetAttr("num_heads", shape[2]); + GET_IR_NODE_FROM_SUBGRAPH( + fuse_qkv_matmul_out_node, fuse_qkv_matmul_out, fused_attention_pattern); + GET_IR_NODE_FROM_SUBGRAPH(fuse_qkv_ele_add_bias_node, + fuse_qkv_ele_add_bias, + fused_attention_pattern); + GET_IR_NODE_FROM_SUBGRAPH(fuse_qkv_ele_add_out_node, + fuse_qkv_ele_add_out, + fused_attention_pattern); + GET_IR_NODE_FROM_SUBGRAPH(fuse_qkv_transpose_out_node, + fuse_qkv_transpose_out, + fused_attention_pattern); + fused_attention_op_desc.SetInput("QKVW", {fuse_qkv_matmul_w_node->Name()}); + fused_attention_op_desc.SetInput("QKVBias", + {fuse_qkv_ele_add_bias_node->Name()}); + fused_attention_op_desc.SetOutput("QKVOut", + {fuse_qkv_matmul_out_node->Name()}); + fused_attention_op_desc.SetOutput("QKVBiasOut", + {fuse_qkv_ele_add_out_node->Name()}); + fused_attention_op_desc.SetOutput("TransposeOut2", + {fuse_qkv_transpose_out_node->Name()}); + 
cache->InsertIntoCache( + GenerateCacheKey(cache_anchor_name, "QKVW", block_id), + fuse_qkv_matmul_w_node); + cache->InsertIntoCache( + GenerateCacheKey(cache_anchor_name, "QKVBias", block_id), + fuse_qkv_ele_add_bias_node); + cache->InsertIntoCache( + GenerateCacheKey(cache_anchor_name, "QKVOut", block_id), + fuse_qkv_matmul_out_node); + cache->InsertIntoCache( + GenerateCacheKey(cache_anchor_name, "QKVBiasOut", block_id), + fuse_qkv_ele_add_out_node); + cache->InsertIntoCache( + GenerateCacheKey(cache_anchor_name, "TransposeOut2", block_id), + fuse_qkv_transpose_out_node); - // TODO(Yuang Liu): finish the handler + GET_IR_NODE_FROM_SUBGRAPH( + qk_matmul_out_node, qk_matmul_out, fused_attention_pattern); + GET_IR_NODE_FROM_SUBGRAPH(add_mask_ele_add_mask_node, + add_mask_ele_add_mask, + fused_attention_pattern); + GET_IR_NODE_FROM_SUBGRAPH(add_mask_ele_add_out_node, + add_mask_ele_add_out, + fused_attention_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + qk_softmax_out_node, qk_softmax_out, fused_attention_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + attn_dropout_out_node, attn_dropout_out, fused_attention_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + attn_dropout_mask_node, attn_dropout_mask, fused_attention_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + qkv_matmul_out_node, qkv_matmul_out, fused_attention_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + qkv_reshape_out_node, qkv_reshape_out, fused_attention_pattern); + fused_attention_op_desc.SetOutput("QKOut", {qk_matmul_out_node->Name()}); + cache->InsertIntoCache( + GenerateCacheKey(cache_anchor_name, "QKOut", block_id), + qk_matmul_out_node); + fused_attention_op_desc.SetInput("SrcMask", + {add_mask_ele_add_mask_node->Name()}); + fused_attention_op_desc.SetOutput("SrcMaskOut", + {add_mask_ele_add_out_node->Name()}); + fused_attention_op_desc.SetOutput("SoftmaxOut", + {qk_softmax_out_node->Name()}); + cache->InsertIntoCache( + GenerateCacheKey(cache_anchor_name, "SrcMask", block_id), + add_mask_ele_add_mask_node); + cache->InsertIntoCache( + GenerateCacheKey(cache_anchor_name, "SrcMaskOut", block_id), + add_mask_ele_add_out_node); + cache->InsertIntoCache( + GenerateCacheKey(cache_anchor_name, "SoftmaxOut", block_id), + qk_softmax_out_node); + fused_attention_op_desc.SetAttr( + "attn_dropout_rate", + PADDLE_GET_CONST(float, + attn_dropout_op_node->Op()->GetAttr("dropout_prob"))); + fused_attention_op_desc.SetAttr( + "is_test", + PADDLE_GET_CONST(bool, attn_dropout_op_node->Op()->GetAttr("is_test"))); + fused_attention_op_desc.SetAttr( + "attn_dropout_fix_seed", + PADDLE_GET_CONST(bool, + attn_dropout_op_node->Op()->GetAttr("fix_seed"))); + fused_attention_op_desc.SetAttr( + "attn_dropout_seed", + PADDLE_GET_CONST(int, attn_dropout_op_node->Op()->GetAttr("seed"))); + fused_attention_op_desc.SetAttr( + "attn_dropout_implementation", + PADDLE_GET_CONST( + std::string, + attn_dropout_op_node->Op()->GetAttr("dropout_implementation"))); + fused_attention_op_desc.SetOutput("AttnDropoutMaskOut", + {attn_dropout_mask_node->Name()}); + fused_attention_op_desc.SetOutput("AttnDropoutOut", + {attn_dropout_out_node->Name()}); + fused_attention_op_desc.SetOutput("QKTVOut", {qkv_matmul_out_node->Name()}); + fused_attention_op_desc.SetOutput("FMHAOut", + {qkv_reshape_out_node->Name()}); + cache->InsertIntoCache( + GenerateCacheKey(cache_anchor_name, "AttnDropoutMaskOut", block_id), + attn_dropout_mask_node); + cache->InsertIntoCache( + GenerateCacheKey(cache_anchor_name, "AttnDropoutOut", block_id), + attn_dropout_out_node); + cache->InsertIntoCache( + 
GenerateCacheKey(cache_anchor_name, "QKTVOut", block_id), + qkv_matmul_out_node); + cache->InsertIntoCache( + GenerateCacheKey(cache_anchor_name, "FMHAOut", block_id), + qkv_reshape_out_node); + + GET_IR_NODE_FROM_SUBGRAPH( + out_linear_matmul_w_node, out_linear_matmul_w, fused_attention_pattern); + GET_IR_NODE_FROM_SUBGRAPH(out_linear_matmul_out_node, + out_linear_matmul_out, + fused_attention_pattern); + GET_IR_NODE_FROM_SUBGRAPH(out_linear_ele_add_bias_node, + out_linear_ele_add_bias, + fused_attention_pattern); + fused_attention_op_desc.SetInput("OutLinearW", + {out_linear_matmul_w_node->Name()}); + fused_attention_op_desc.SetInput("OutLinearBias", + {out_linear_ele_add_bias_node->Name()}); + fused_attention_op_desc.SetOutput("OutLinearOut", + {out_linear_matmul_out_node->Name()}); + cache->InsertIntoCache( + GenerateCacheKey(cache_anchor_name, "OutLinearW", block_id), + out_linear_matmul_w_node); + cache->InsertIntoCache( + GenerateCacheKey(cache_anchor_name, "OutLinearBias", block_id), + out_linear_ele_add_bias_node); + cache->InsertIntoCache( + GenerateCacheKey(cache_anchor_name, "OutLinearOut", block_id), + out_linear_matmul_out_node); + GET_IR_NODE_FROM_SUBGRAPH(out_linear_dropout_mask_node, + out_linear_dropout_mask, + fused_attention_pattern); + fused_attention_op_desc.SetAttr( + "dropout_rate", + PADDLE_GET_CONST( + float, out_linear_dropout_op_node->Op()->GetAttr("dropout_prob"))); + fused_attention_op_desc.SetAttr( + "dropout_fix_seed", + PADDLE_GET_CONST( + bool, out_linear_dropout_op_node->Op()->GetAttr("fix_seed"))); + fused_attention_op_desc.SetAttr( + "dropout_seed", + PADDLE_GET_CONST(int, + out_linear_dropout_op_node->Op()->GetAttr("seed"))); + fused_attention_op_desc.SetAttr( + "dropout_implementation", + PADDLE_GET_CONST(std::string, + out_linear_dropout_op_node->Op()->GetAttr( + "dropout_implementation"))); + fused_attention_op_desc.SetOutput("DropoutMaskOut", + {out_linear_dropout_mask_node->Name()}); + cache->InsertIntoCache( + GenerateCacheKey(cache_anchor_name, "DropoutMaskOut", block_id), + out_linear_dropout_mask_node); + + GET_IR_NODE_FROM_SUBGRAPH(residual_ele_add_out_node, + residual_ele_add_out, + fused_attention_pattern); + fused_attention_op_desc.SetAttr("add_residual", true); + fused_attention_op_desc.SetOutput("Y", {residual_ele_add_out_node->Name()}); + + auto fused_attention_node = g->CreateOpNode(&fused_attention_op_desc); + + IR_NODE_LINK_TO(subgraph.at(x), fused_attention_node); + IR_NODE_LINK_TO(pre_layer_norm_scale_node, fused_attention_node); + IR_NODE_LINK_TO(pre_layer_norm_bias_node, fused_attention_node); + IR_NODE_LINK_TO(fuse_qkv_matmul_w_node, fused_attention_node); + IR_NODE_LINK_TO(fuse_qkv_ele_add_bias_node, fused_attention_node); + IR_NODE_LINK_TO(add_mask_ele_add_mask_node, fused_attention_node); + IR_NODE_LINK_TO(out_linear_matmul_w_node, fused_attention_node); + IR_NODE_LINK_TO(out_linear_ele_add_bias_node, fused_attention_node); + + IR_NODE_LINK_TO(fused_attention_node, pre_layer_norm_out_node); + IR_NODE_LINK_TO(fused_attention_node, pre_layer_norm_mean_node); + IR_NODE_LINK_TO(fused_attention_node, pre_layer_norm_variance_node); + IR_NODE_LINK_TO(fused_attention_node, fuse_qkv_matmul_out_node); + IR_NODE_LINK_TO(fused_attention_node, fuse_qkv_ele_add_out_node); + IR_NODE_LINK_TO(fused_attention_node, fuse_qkv_transpose_out_node); + IR_NODE_LINK_TO(fused_attention_node, qk_matmul_out_node); + IR_NODE_LINK_TO(fused_attention_node, add_mask_ele_add_out_node); + IR_NODE_LINK_TO(fused_attention_node, qk_softmax_out_node); + 
IR_NODE_LINK_TO(fused_attention_node, attn_dropout_mask_node); + IR_NODE_LINK_TO(fused_attention_node, attn_dropout_out_node); + IR_NODE_LINK_TO(fused_attention_node, qkv_matmul_out_node); + IR_NODE_LINK_TO(fused_attention_node, qkv_reshape_out_node); + IR_NODE_LINK_TO(fused_attention_node, out_linear_matmul_out_node); + IR_NODE_LINK_TO(fused_attention_node, out_linear_dropout_mask_node); + IR_NODE_LINK_TO(fused_attention_node, residual_ele_add_out_node); GraphSafeRemoveNodes(g, {pre_layer_norm_op_node, @@ -858,8 +1108,8 @@ ir::Graph* FusedAttentionsPass::PreMaskDropResPostFwd(Graph* graph) const { out_linear_matmul_op_node, out_linear_ele_add_op_node, out_linear_dropout_op_node, - residual_ele_add_op_node, - post_layer_norm_op_node}); + residual_ele_add_op_node}); + found_fused_attention++; }; @@ -869,18 +1119,18 @@ ir::Graph* FusedAttentionsPass::PreMaskDropResPostFwd(Graph* graph) const { return graph; } -ir::Graph* FusedAttentionsPass::PreMaskDropResPostBwd(Graph* graph) const { +ir::Graph* FusedAttentionsPass::PreMaskDropResBwd( + Graph* graph, FusedAttentionPassCache* cache) const { GraphPatternDetector gpd; auto* x = gpd.mutable_pattern() ->NewNode(patterns::PDNodeName(name_scope_, "x")) ->AsInput() - ->assert_is_op_input("layer_norm_grad", "Y@GRAD"); + ->assert_is_op_input("elementwise_add_grad", "Out@GRAD"); patterns::FusedAttentionGradPattern fused_attention_grad_pattern( gpd.mutable_pattern(), "fused_attention_grad_pattern"); fused_attention_grad_pattern(x, /* pre_layer_norm */ true, - /* post_layer_norm */ true, /* has_attn_mask */ true, /* do_dropout */ true, /* add_residual */ true); @@ -891,9 +1141,8 @@ ir::Graph* FusedAttentionsPass::PreMaskDropResPostBwd(Graph* graph) const { Graph* g) { VLOG(3) << "handle FusedMultiHeadAttention backward pass's fusion"; - GET_IR_NODE_FROM_SUBGRAPH(post_layer_norm_grad_op_node, - post_layer_norm_grad_op, - fused_attention_grad_pattern); + int block_id = g->GetBlockId(); + GET_IR_NODE_FROM_SUBGRAPH(residual_ele_add_grad_op_node, residual_ele_add_grad_op, fused_attention_grad_pattern); @@ -951,19 +1200,323 @@ ir::Graph* FusedAttentionsPass::PreMaskDropResPostBwd(Graph* graph) const { grad_accumulation_sum_op, fused_attention_grad_pattern); - // TODO(Yuang Liu): finish the handler + OpDesc fused_attention_grad_op_desc( + residual_ele_add_grad_op_node->Op()->Block()); + fused_attention_grad_op_desc.SetType("fused_attention_grad"); + fused_attention_grad_op_desc.SetInput("Y@GRAD", {subgraph.at(x)->Name()}); + GET_IR_NODE_FROM_SUBGRAPH(fuse_qkv_matmul_grad_w_node, + fuse_qkv_matmul_grad_w, + fused_attention_grad_pattern); + std::string cache_anchor_name = fuse_qkv_matmul_grad_w_node->Var()->Name(); + + auto* x_node = cache->GetNodeFromCache( + GenerateCacheKey(cache_anchor_name, "X", block_id)); + auto* attn_dropout_mask_out_node = cache->GetNodeFromCache( + GenerateCacheKey(cache_anchor_name, "AttnDropoutMaskOut", block_id)); + auto* attn_dropout_out_node = cache->GetNodeFromCache( + GenerateCacheKey(cache_anchor_name, "AttnDropoutOut", block_id)); + auto* dropout_mask_out_node = cache->GetNodeFromCache( + GenerateCacheKey(cache_anchor_name, "DropoutMaskOut", block_id)); + auto* fmha_out_node = cache->GetNodeFromCache( + GenerateCacheKey(cache_anchor_name, "FMHAOut", block_id)); + auto* ln_bias_node = cache->GetNodeFromCache( + GenerateCacheKey(cache_anchor_name, "LnBias", block_id)); + auto* ln_mean_node = cache->GetNodeFromCache( + GenerateCacheKey(cache_anchor_name, "LnMean", block_id)); + auto* ln_out_node = cache->GetNodeFromCache( + 
GenerateCacheKey(cache_anchor_name, "LnOut", block_id)); + auto* ln_scale_node = cache->GetNodeFromCache( + GenerateCacheKey(cache_anchor_name, "LnScale", block_id)); + auto* ln_variance_node = cache->GetNodeFromCache( + GenerateCacheKey(cache_anchor_name, "LnVariance", block_id)); + auto* out_linear_bias_node = cache->GetNodeFromCache( + GenerateCacheKey(cache_anchor_name, "OutLinearBias", block_id)); + auto* out_linear_out_node = cache->GetNodeFromCache( + GenerateCacheKey(cache_anchor_name, "OutLinearOut", block_id)); + auto* out_linear_w_node = cache->GetNodeFromCache( + GenerateCacheKey(cache_anchor_name, "OutLinearW", block_id)); + auto* qk_out_node = cache->GetNodeFromCache( + GenerateCacheKey(cache_anchor_name, "QKOut", block_id)); + auto* qktv_out_node = cache->GetNodeFromCache( + GenerateCacheKey(cache_anchor_name, "QKTVOut", block_id)); + auto* qkv_bias_node = cache->GetNodeFromCache( + GenerateCacheKey(cache_anchor_name, "QKVBias", block_id)); + auto* qkv_bias_out_node = cache->GetNodeFromCache( + GenerateCacheKey(cache_anchor_name, "QKVBiasOut", block_id)); + auto* qkv_out_node = cache->GetNodeFromCache( + GenerateCacheKey(cache_anchor_name, "QKVOut", block_id)); + auto* qkv_w_node = cache->GetNodeFromCache( + GenerateCacheKey(cache_anchor_name, "QKVW", block_id)); + auto* softmax_out_node = cache->GetNodeFromCache( + GenerateCacheKey(cache_anchor_name, "SoftmaxOut", block_id)); + auto* src_mask_node = cache->GetNodeFromCache( + GenerateCacheKey(cache_anchor_name, "SrcMask", block_id)); + auto* src_mask_out_node = cache->GetNodeFromCache( + GenerateCacheKey(cache_anchor_name, "SrcMaskOut", block_id)); + auto* transpose_out_2_node = cache->GetNodeFromCache( + GenerateCacheKey(cache_anchor_name, "TransposeOut2", block_id)); + fused_attention_grad_op_desc.SetInput("X", {x_node->Name()}); + fused_attention_grad_op_desc.SetInput("AttnDropoutMaskOut", + {attn_dropout_mask_out_node->Name()}); + fused_attention_grad_op_desc.SetInput("AttnDropoutOut", + {attn_dropout_out_node->Name()}); + fused_attention_grad_op_desc.SetInput("DropoutMaskOut", + {dropout_mask_out_node->Name()}); + fused_attention_grad_op_desc.SetInput("FMHAOut", {fmha_out_node->Name()}); + fused_attention_grad_op_desc.SetInput("LnBias", {ln_bias_node->Name()}); + fused_attention_grad_op_desc.SetInput("LnMean", {ln_mean_node->Name()}); + fused_attention_grad_op_desc.SetInput("LnOut", {ln_out_node->Name()}); + fused_attention_grad_op_desc.SetInput("LnScale", {ln_scale_node->Name()}); + fused_attention_grad_op_desc.SetInput("LnVariance", + {ln_variance_node->Name()}); + fused_attention_grad_op_desc.SetInput("OutLinearBias", + {out_linear_bias_node->Name()}); + fused_attention_grad_op_desc.SetInput("OutLinearOut", + {out_linear_out_node->Name()}); + fused_attention_grad_op_desc.SetInput("OutLinearW", + {out_linear_w_node->Name()}); + fused_attention_grad_op_desc.SetInput("QKOut", {qk_out_node->Name()}); + fused_attention_grad_op_desc.SetInput("QKTVOut", {qktv_out_node->Name()}); + fused_attention_grad_op_desc.SetInput("QKVBias", {qkv_bias_node->Name()}); + fused_attention_grad_op_desc.SetInput("QKVBiasOut", + {qkv_bias_out_node->Name()}); + fused_attention_grad_op_desc.SetInput("QKVOut", {qkv_out_node->Name()}); + fused_attention_grad_op_desc.SetInput("QKVW", {qkv_w_node->Name()}); + fused_attention_grad_op_desc.SetInput("SoftmaxOut", + {softmax_out_node->Name()}); + fused_attention_grad_op_desc.SetInput("SrcMask", {src_mask_node->Name()}); + fused_attention_grad_op_desc.SetInput("SrcMaskOut", + 
{src_mask_out_node->Name()}); + fused_attention_grad_op_desc.SetInput("TransposeOut2", + {transpose_out_2_node->Name()}); + + fused_attention_grad_op_desc.SetAttr("add_residual", true); + fused_attention_grad_op_desc.SetAttr( + "attn_dropout_rate", + PADDLE_GET_CONST( + float, attn_dropout_grad_op_node->Op()->GetAttr("dropout_prob"))); + fused_attention_grad_op_desc.SetAttr( + "is_test", + PADDLE_GET_CONST(bool, + attn_dropout_grad_op_node->Op()->GetAttr("is_test"))); + fused_attention_grad_op_desc.SetAttr( + "attn_dropout_fix_seed", + PADDLE_GET_CONST(bool, + attn_dropout_grad_op_node->Op()->GetAttr("fix_seed"))); + fused_attention_grad_op_desc.SetAttr( + "attn_dropout_seed", + PADDLE_GET_CONST(int, + attn_dropout_grad_op_node->Op()->GetAttr("seed"))); + fused_attention_grad_op_desc.SetAttr( + "attn_dropout_implementation", + PADDLE_GET_CONST(std::string, + attn_dropout_grad_op_node->Op()->GetAttr( + "dropout_implementation"))); + fused_attention_grad_op_desc.SetAttr( + "dropout_rate", + PADDLE_GET_CONST( + float, + out_linear_dropout_grad_op_node->Op()->GetAttr("dropout_prob"))); + fused_attention_grad_op_desc.SetAttr( + "dropout_fix_seed", + PADDLE_GET_CONST( + bool, out_linear_dropout_grad_op_node->Op()->GetAttr("fix_seed"))); + fused_attention_grad_op_desc.SetAttr( + "dropout_seed", + PADDLE_GET_CONST( + int, out_linear_dropout_grad_op_node->Op()->GetAttr("seed"))); + fused_attention_grad_op_desc.SetAttr( + "dropout_implementation", + PADDLE_GET_CONST(std::string, + out_linear_dropout_grad_op_node->Op()->GetAttr( + "dropout_implementation"))); + fused_attention_grad_op_desc.SetAttr( + "epsilon", + PADDLE_GET_CONST( + float, pre_layer_norm_grad_op_node->Op()->GetAttr("epsilon"))); + std::vector shape = + PADDLE_GET_CONST(std::vector, + fuse_qkv_reshape_grad_op_node->Op()->GetAttr("shape")); + fused_attention_grad_op_desc.SetAttr("num_heads", shape[2]); + fused_attention_grad_op_desc.SetAttr("pre_layer_norm", true); + fused_attention_grad_op_desc.SetAttr("transpose_qkv_wb", true); + + // forward op will use default value + // but backward op has to set these redundant attrs + fused_attention_grad_op_desc.SetAttr( + "ln_epsilon", + PADDLE_GET_CONST( + float, pre_layer_norm_grad_op_node->Op()->GetAttr("epsilon"))); + fused_attention_grad_op_desc.SetAttr("ring_id", -1); + + GET_IR_NODE_FROM_SUBGRAPH(qkv_matmul_grad_x_grad_node, + qkv_matmul_grad_x_grad, + fused_attention_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(out_linear_matmul_grad_x_grad_node, + out_linear_matmul_grad_x_grad, + fused_attention_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(pre_layer_norm_grad_bias_grad_node, + pre_layer_norm_grad_bias_grad, + fused_attention_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(fuse_qkv_matmul_grad_x_grad_node, + fuse_qkv_matmul_grad_x_grad, + fused_attention_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(pre_layer_norm_grad_scale_grad_node, + pre_layer_norm_grad_scale_grad, + fused_attention_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(out_linear_ele_add_grad_bias_grad_node, + out_linear_ele_add_grad_bias_grad, + fused_attention_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(out_linear_ele_add_grad_x_grad_node, + out_linear_ele_add_grad_x_grad, + fused_attention_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(out_linear_matmul_grad_w_grad_node, + out_linear_matmul_grad_w_grad, + fused_attention_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(qk_scale_grad_out_node, + qk_scale_grad_out, + fused_attention_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(qkv_transpose_grad_out_node, + qkv_transpose_grad_out, + 
fused_attention_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(fuse_qkv_ele_add_grad_bias_grad_node, + fuse_qkv_ele_add_grad_bias_grad, + fused_attention_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(fuse_qkv_reshape_grad_out_node, + fuse_qkv_reshape_grad_out, + fused_attention_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(fuse_qkv_ele_add_grad_x_grad_node, + fuse_qkv_ele_add_grad_x_grad, + fused_attention_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(fuse_qkv_matmul_grad_w_grad_node, + fuse_qkv_matmul_grad_w_grad, + fused_attention_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(attn_dropout_grad_out_node, + attn_dropout_grad_out, + fused_attention_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(qk_softmax_grad_out_node, + qk_softmax_grad_out, + fused_attention_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(fuse_qkv_split_grad_out_node, + fuse_qkv_split_grad_out, + fused_attention_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(grad_accumulation_out_node, + grad_accumulation_out, + fused_attention_grad_pattern); + fused_attention_grad_op_desc.SetOutput( + "AttnDropoutOut@GRAD", {qkv_matmul_grad_x_grad_node->Name()}); + fused_attention_grad_op_desc.SetOutput( + "FMHAOut@GRAD", {out_linear_matmul_grad_x_grad_node->Name()}); + fused_attention_grad_op_desc.SetOutput( + "LnBias@GRAD", {pre_layer_norm_grad_bias_grad_node->Name()}); + fused_attention_grad_op_desc.SetOutput( + "LnOut@GRAD", {fuse_qkv_matmul_grad_x_grad_node->Name()}); + fused_attention_grad_op_desc.SetOutput( + "LnScale@GRAD", {pre_layer_norm_grad_scale_grad_node->Name()}); + fused_attention_grad_op_desc.SetOutput( + "OutLinearBias@GRAD", {out_linear_ele_add_grad_bias_grad_node->Name()}); + fused_attention_grad_op_desc.SetOutput( + "OutLinearOut@GRAD", {out_linear_ele_add_grad_x_grad_node->Name()}); + fused_attention_grad_op_desc.SetOutput( + "OutLinearW@GRAD", {out_linear_matmul_grad_w_grad_node->Name()}); + fused_attention_grad_op_desc.SetOutput("QKOut@GRAD", + {qk_scale_grad_out_node->Name()}); + fused_attention_grad_op_desc.SetOutput( + "QKTVOut@GRAD", {qkv_transpose_grad_out_node->Name()}); + fused_attention_grad_op_desc.SetOutput( + "QKVBias@GRAD", {fuse_qkv_ele_add_grad_bias_grad_node->Name()}); + fused_attention_grad_op_desc.SetOutput( + "QKVBiasOut@GRAD", {fuse_qkv_reshape_grad_out_node->Name()}); + fused_attention_grad_op_desc.SetOutput( + "QKVOut@GRAD", {fuse_qkv_ele_add_grad_x_grad_node->Name()}); + fused_attention_grad_op_desc.SetOutput( + "QKVW@GRAD", {fuse_qkv_matmul_grad_w_grad_node->Name()}); + fused_attention_grad_op_desc.SetOutput( + "SoftmaxOut@GRAD", {attn_dropout_grad_out_node->Name()}); + fused_attention_grad_op_desc.SetOutput("SrcMaskOut@GRAD", + {qk_softmax_grad_out_node->Name()}); + fused_attention_grad_op_desc.SetOutput( + "TransposeOut2@GRAD", {fuse_qkv_split_grad_out_node->Name()}); + fused_attention_grad_op_desc.SetOutput( + "X@GRAD", {grad_accumulation_out_node->Name()}); + + auto fused_attention_grad_node = + g->CreateOpNode(&fused_attention_grad_op_desc); + + IR_NODE_LINK_TO(fused_attention_grad_node, qkv_matmul_grad_x_grad_node); + IR_NODE_LINK_TO(fused_attention_grad_node, + out_linear_matmul_grad_x_grad_node); + IR_NODE_LINK_TO(fused_attention_grad_node, + pre_layer_norm_grad_bias_grad_node); + IR_NODE_LINK_TO(fused_attention_grad_node, + fuse_qkv_matmul_grad_x_grad_node); + IR_NODE_LINK_TO(fused_attention_grad_node, + pre_layer_norm_grad_scale_grad_node); + IR_NODE_LINK_TO(fused_attention_grad_node, + out_linear_ele_add_grad_bias_grad_node); + IR_NODE_LINK_TO(fused_attention_grad_node, + out_linear_ele_add_grad_x_grad_node); 
+ IR_NODE_LINK_TO(fused_attention_grad_node, + out_linear_matmul_grad_w_grad_node); + IR_NODE_LINK_TO(fused_attention_grad_node, qk_scale_grad_out_node); + IR_NODE_LINK_TO(fused_attention_grad_node, qkv_transpose_grad_out_node); + IR_NODE_LINK_TO(fused_attention_grad_node, + fuse_qkv_ele_add_grad_bias_grad_node); + IR_NODE_LINK_TO(fused_attention_grad_node, fuse_qkv_reshape_grad_out_node); + IR_NODE_LINK_TO(fused_attention_grad_node, + fuse_qkv_ele_add_grad_x_grad_node); + IR_NODE_LINK_TO(fused_attention_grad_node, + fuse_qkv_matmul_grad_w_grad_node); + IR_NODE_LINK_TO(fused_attention_grad_node, attn_dropout_grad_out_node); + IR_NODE_LINK_TO(fused_attention_grad_node, qk_softmax_grad_out_node); + IR_NODE_LINK_TO(fused_attention_grad_node, fuse_qkv_split_grad_out_node); + IR_NODE_LINK_TO(fused_attention_grad_node, grad_accumulation_out_node); + + IR_NODE_LINK_TO(subgraph.at(x), fused_attention_grad_node); + IR_NODE_LINK_TO(x_node, fused_attention_grad_node); + IR_NODE_LINK_TO(attn_dropout_mask_out_node, fused_attention_grad_node); + IR_NODE_LINK_TO(attn_dropout_out_node, fused_attention_grad_node); + IR_NODE_LINK_TO(dropout_mask_out_node, fused_attention_grad_node); + IR_NODE_LINK_TO(fmha_out_node, fused_attention_grad_node); + IR_NODE_LINK_TO(ln_bias_node, fused_attention_grad_node); + IR_NODE_LINK_TO(ln_mean_node, fused_attention_grad_node); + IR_NODE_LINK_TO(ln_out_node, fused_attention_grad_node); + IR_NODE_LINK_TO(ln_scale_node, fused_attention_grad_node); + IR_NODE_LINK_TO(ln_variance_node, fused_attention_grad_node); + IR_NODE_LINK_TO(out_linear_bias_node, fused_attention_grad_node); + IR_NODE_LINK_TO(out_linear_out_node, fused_attention_grad_node); + IR_NODE_LINK_TO(out_linear_w_node, fused_attention_grad_node); + IR_NODE_LINK_TO(qk_out_node, fused_attention_grad_node); + IR_NODE_LINK_TO(qktv_out_node, fused_attention_grad_node); + IR_NODE_LINK_TO(qkv_bias_node, fused_attention_grad_node); + IR_NODE_LINK_TO(qkv_bias_out_node, fused_attention_grad_node); + IR_NODE_LINK_TO(qkv_out_node, fused_attention_grad_node); + IR_NODE_LINK_TO(qkv_w_node, fused_attention_grad_node); + IR_NODE_LINK_TO(softmax_out_node, fused_attention_grad_node); + IR_NODE_LINK_TO(src_mask_node, fused_attention_grad_node); + IR_NODE_LINK_TO(src_mask_out_node, fused_attention_grad_node); + IR_NODE_LINK_TO(transpose_out_2_node, fused_attention_grad_node); - GraphSafeRemoveNodes( - g, {post_layer_norm_grad_op_node, residual_ele_add_grad_op_node, - out_linear_dropout_grad_op_node, out_linear_ele_add_grad_op_node, - out_linear_matmul_grad_op_node, qkv_reshape_grad_op_node, - qkv_transpose_grad_op_node, qkv_matmul_grad_op_node, - attn_dropout_grad_op_node, qk_softmax_grad_op_node, - add_mask_ele_add_grad_op_node, qk_scale_grad_op_node, - qk_matmul_grad_op_node, fuse_qkv_split_grad_op_node, - fuse_qkv_transpose_grad_op_node, fuse_qkv_reshape_grad_op_node, - fuse_qkv_ele_add_grad_op_node, fuse_qkv_matmul_grad_op_node, - pre_layer_norm_grad_op_node, grad_accumulation_sum_op_node}); + GraphSafeRemoveNodes(g, + {residual_ele_add_grad_op_node, + out_linear_dropout_grad_op_node, + out_linear_ele_add_grad_op_node, + out_linear_matmul_grad_op_node, + qkv_reshape_grad_op_node, + qkv_transpose_grad_op_node, + qkv_matmul_grad_op_node, + attn_dropout_grad_op_node, + qk_softmax_grad_op_node, + add_mask_ele_add_grad_op_node, + qk_scale_grad_op_node, + qk_matmul_grad_op_node, + fuse_qkv_split_grad_op_node, + fuse_qkv_transpose_grad_op_node, + fuse_qkv_reshape_grad_op_node, + fuse_qkv_ele_add_grad_op_node, + 
                         fuse_qkv_matmul_grad_op_node,
+                         pre_layer_norm_grad_op_node,
+                         grad_accumulation_sum_op_node});
 
     found_fused_attention++;
   };
diff --git a/paddle/fluid/framework/ir/fused_attention_pass.h b/paddle/fluid/framework/ir/fused_attention_pass.h
index d360f7f6520d102a4331b165880d1ba996dd93dc..222900860a7bd80bcd9dec8c456fcfbcd7a99199 100644
--- a/paddle/fluid/framework/ir/fused_attention_pass.h
+++ b/paddle/fluid/framework/ir/fused_attention_pass.h
@@ -16,6 +16,7 @@
 
 #include <memory>
 #include <string>
+#include <unordered_map>
 
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
@@ -28,7 +29,7 @@ namespace patterns {
 
 // Declare patterns for multi head attention.
 // Can detect:
-// 1. Pre layer norm, post layer norm or sandwich layer norm.
+// 1. Pre layer norm or post layer norm.
 // 2. Add attn mask for qk product before the softmax or not.
 // 3. Do attn dropout or not.
 // 4. Add residual to the out linear result or not.
@@ -37,11 +38,10 @@ struct FusedAttentionPattern : public PatternBase {
       : PatternBase(pattern, name_scope, "fused_attention_pattern") {}
 
   PDNode* operator()(PDNode* x,
-                     bool pre_layer_norm,   // do pre ln or not
-                     bool post_layer_norm,  // do post ln or not
-                     bool has_attn_mask,    // add attn mask to qk or not
-                     bool do_dropout,       // dropout the softmax(qk) or not
-                     bool add_residual);    // add residual to out linear or not
+                     bool pre_layer_norm,  // do pre ln or not
+                     bool has_attn_mask,   // add attn mask to qk or not
+                     bool do_dropout,      // dropout the softmax(qk) or not
+                     bool add_residual);   // add residual to out linear or not
 
   // pre layer norm
   PATTERN_DECL_NODE(pre_layer_norm_op);
@@ -134,11 +134,10 @@ struct FusedAttentionGradPattern : public PatternBase {
       : PatternBase(pattern, name_scope, "fused_attention_pattern") {}
 
   PDNode* operator()(PDNode* x,
-                     bool pre_layer_norm,   // pre ln
-                     bool post_layer_norm,  // post ln
-                     bool has_attn_mask,    // add attn mask to qk or not
-                     bool do_dropout,       // dropout the softmax(qk) or not
-                     bool add_residual);    // add residual to out linear or not
+                     bool pre_layer_norm,  // pre ln
+                     bool has_attn_mask,   // add attn mask to qk or not
+                     bool do_dropout,      // dropout the softmax(qk) or not
+                     bool add_residual);   // add residual to out linear or not
 
   // post layer norm grad
   PATTERN_DECL_NODE(post_layer_norm_grad_op);
@@ -254,6 +253,31 @@ struct FusedAttentionGradPattern : public PatternBase {
 
 }  // namespace patterns
 
+class FusedAttentionPassCache {
+ public:
+  ir::Node* GetNodeFromCache(const std::string name) {
+    if (var_name_to_ir_node_cache_.count(name)) {
+      return var_name_to_ir_node_cache_.find(name)->second;
+    }
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "The key (%d) of FusedAttentionCache does not exist.", name));
+  }
+
+  void InsertIntoCache(const std::string name, ir::Node* node) {
+    if (!var_name_to_ir_node_cache_.count(name)) {
+      var_name_to_ir_node_cache_.insert({name, node});
+    } else {
+      PADDLE_THROW(platform::errors::AlreadyExists(
+          "The key (%d) of FusedAttentionCache already exist.", name));
+    }
+  }
+
+  void ResetCache() { var_name_to_ir_node_cache_.clear(); }
+
+ private:
+  std::unordered_map<std::string, ir::Node*> var_name_to_ir_node_cache_;
+};
+
 class FusedAttentionsPass : public FusePassBase {
  public:
   virtual ~FusedAttentionsPass() {}
@@ -275,9 +299,17 @@ class FusedAttentionsPass : public FusePassBase {
   // If true, the function name will have an abbreviation part.
   // If false, the function name won't contain an abbreviation for it.
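An aside on the cache machinery this header introduces: the forward handler (PreMaskDropResFwd) anchors each matched attention block on the name of its QKV weight variable, stores the matched IR nodes under keys of the form anchor_blockid_varname (built by the GenerateCacheKey helper declared just below), and the backward handler (PreMaskDropResBwd) retrieves those nodes by rebuilding the same keys instead of re-matching the forward subgraph; ApplyImpl clears the cache once both passes have run. The standalone sketch below mirrors that insert-once / lookup-or-throw contract with plain strings in place of ir::Node* so it compiles on its own; the variable name and values in main() are made up for illustration and are not Paddle's.

```cpp
// Standalone illustration of the cache contract used by the fused attention
// pass above. Values are std::string instead of ir::Node* so the example is
// self-contained; the key layout matches GenerateCacheKey in the diff.
#include <iostream>
#include <stdexcept>
#include <string>
#include <unordered_map>

class NodeCache {
 public:
  // Insert exactly once; a duplicate key would mean two forward matches collided.
  void InsertIntoCache(const std::string& key, const std::string& node) {
    if (!cache_.emplace(key, node).second) {
      throw std::runtime_error("key already exists: " + key);
    }
  }
  // Lookup must succeed; a miss means the backward match has no forward twin.
  const std::string& GetNodeFromCache(const std::string& key) const {
    auto it = cache_.find(key);
    if (it == cache_.end()) {
      throw std::runtime_error("key does not exist: " + key);
    }
    return it->second;
  }
  void ResetCache() { cache_.clear(); }

 private:
  std::unordered_map<std::string, std::string> cache_;
};

// Same layout as GenerateCacheKey: anchor + "_" + block_id + "_" + var_name.
std::string GenerateCacheKey(const std::string& anchor,
                             const std::string& var_name, int block_id) {
  return anchor + "_" + std::to_string(block_id) + "_" + var_name;
}

int main() {
  NodeCache cache;
  // Forward pass: anchor on a (made-up) QKV weight variable name.
  const std::string anchor = "layer_0_qkv_w";
  cache.InsertIntoCache(GenerateCacheKey(anchor, "LnOut", /*block_id=*/0),
                        "ln_out_node");
  // Backward pass: rebuild the same key and fetch the forward node.
  std::cout << cache.GetNodeFromCache(GenerateCacheKey(anchor, "LnOut", 0))
            << "\n";
  cache.ResetCache();
  return 0;
}
```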
- ir::Graph* PreMaskDropResPostFwd(Graph* graph) const; + ir::Graph* PreMaskDropResFwd(Graph* graph, + FusedAttentionPassCache* cache) const; + + ir::Graph* PreMaskDropResBwd(Graph* graph, + FusedAttentionPassCache* cache) const; - ir::Graph* PreMaskDropResPostBwd(Graph* graph) const; + const std::string GenerateCacheKey(const std::string anchor, + const std::string var_name, + int block_id) const { + return anchor + "_" + std::to_string(block_id) + "_" + var_name; + } }; } // namespace ir diff --git a/paddle/fluid/framework/ir/groupnorm_act_pass.cc b/paddle/fluid/framework/ir/groupnorm_act_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..397a7437757cc27d21f4f39592a6afe266632c71 --- /dev/null +++ b/paddle/fluid/framework/ir/groupnorm_act_pass.cc @@ -0,0 +1,167 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/groupnorm_act_pass.h" + +#include + +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +namespace ir { +class Node; +} // namespace ir +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +struct GroupNormAct : public PatternBase { + GroupNormAct(PDPattern *pattern, const std::string &name_scope) + : PatternBase(pattern, name_scope, "groupnorm_act") {} + + void operator()(PDNode *x); + // declare operator node's name + PATTERN_DECL_NODE(group_norm); + // declare variable node's name + PATTERN_DECL_NODE(elementwise_out); + + PATTERN_DECL_NODE(group_norm_bias); + PATTERN_DECL_NODE(group_norm_scale); + PATTERN_DECL_NODE(group_norm_out); + PATTERN_DECL_NODE(act); + PATTERN_DECL_NODE(act_out); +}; + +void GroupNormAct::operator()(PDNode *x) { + // Create nodes for group_norm op. + auto *group_norm = + pattern->NewNode(group_norm_repr())->assert_is_op("group_norm"); + auto *group_norm_bias_var = pattern->NewNode(group_norm_bias_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("group_norm", "Bias"); + + auto *group_norm_scale_var = pattern->NewNode(group_norm_scale_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("group_norm", "Scale"); + + auto *group_norm_out_var = pattern->NewNode(group_norm_out_repr()) + ->AsOutput() + ->assert_is_op_output("group_norm", "Y") + ->assert_is_op_input("silu", "X"); + + // Add links for group_norm op. 
+ group_norm->LinksFrom({x, group_norm_bias_var, group_norm_scale_var}) + .LinksTo({group_norm_out_var}); + + auto *act = pattern->NewNode(act_repr())->assert_is_op("silu"); + auto *act_out = pattern->NewNode(act_out_repr()) + ->AsOutput() + ->assert_is_op_output("silu", "Out"); + + act->LinksFrom({group_norm_out_var}).LinksTo({act_out}); +} + +} // namespace patterns + +int GroupNormActFusePass::ApplyGNSiluPattern(ir::Graph *graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::PreconditionNotMet("graph should not be null.")); + FusePassBase::Init("groupnorm_silu_fuse", graph); + + int found_subgraph_count = 0; + + GraphPatternDetector gpd; + PDNode *x = nullptr; + + x = gpd.mutable_pattern() + ->NewNode("groupnorm_act_fuse/x") + ->AsInput() + ->assert_var_not_persistable() + ->assert_is_op_input("group_norm", "X"); + + patterns::GroupNormAct fused_pattern(gpd.mutable_pattern(), + "groupnorm_act_fuse"); + fused_pattern(x); + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *graph) { + if (subgraph.count(x) <= 0) { + LOG(WARNING) << "The subgraph is empty."; + return; + } + + VLOG(4) << "handle groupnorm act fuse"; + + GET_IR_NODE_FROM_SUBGRAPH(group_norm, group_norm, fused_pattern); + GET_IR_NODE_FROM_SUBGRAPH(group_norm_bias, group_norm_bias, fused_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + group_norm_scale, group_norm_scale, fused_pattern); + GET_IR_NODE_FROM_SUBGRAPH(group_norm_out, group_norm_out, fused_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act, act, fused_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, fused_pattern); + + if (!IsCompat(subgraph, graph)) { + LOG(WARNING) << "groupnorm act pass in op compat failed."; + return; + } + + std::unordered_set del_node_set; + // Create an skip_groupnorm_act op node + OpDesc new_desc(*group_norm->Op()); + new_desc.SetAttr("with_silu", true); + new_desc.SetOutput("Y", {act_out->Name()}); + new_desc.Flush(); + + auto fused_node = graph->CreateOpNode(&new_desc); // OpDesc will be copied. + + del_node_set.insert(group_norm); + del_node_set.insert(group_norm_out); + del_node_set.insert(act); + GraphSafeRemoveNodes(graph, del_node_set); + + IR_NODE_LINK_TO(subgraph.at(x), fused_node); + IR_NODE_LINK_TO(group_norm_scale, fused_node); + IR_NODE_LINK_TO(group_norm_bias, fused_node); + IR_NODE_LINK_TO(fused_node, act_out); + found_subgraph_count++; + }; + + gpd(graph, handler); + return found_subgraph_count; +} + +void GroupNormActFusePass::ApplyImpl(ir::Graph *graph) const { + FusePassBase::Init("groupnorm_act_fuse_pass", graph); + int found_subgraph_count = ApplyGNSiluPattern(graph); + AddStatis(found_subgraph_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(groupnorm_act_pass, paddle::framework::ir::GroupNormActFusePass); +REGISTER_PASS_CAPABILITY(groupnorm_act_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("silu", 0) + .EQ("group_norm", 0)); diff --git a/paddle/fluid/framework/ir/groupnorm_act_pass.h b/paddle/fluid/framework/ir/groupnorm_act_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..16e4d332d29f07ab56419e6da47645b05e8fa164 --- /dev/null +++ b/paddle/fluid/framework/ir/groupnorm_act_pass.h @@ -0,0 +1,81 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { +// +// | | +// group_norm group_norm +// | -> | +// silu +// | + +class Graph; + +class GroupNormActFusePass : public FusePassBase { + public: + GroupNormActFusePass() { + AddOpCompat(OpCompat("group_norm")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Scale") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .End() + .AddOutput("Y") + .IsTensor() + .End() + .AddOutput("Mean") + .IsTensor() + .End() + .AddOutput("Variance") + .IsTensor() + .End() + .AddAttr("epsilon") + .IsNumGE(0.0f) + .IsNumLE(1.0f) + .End() + .AddAttr("groups") + .IsNumGE(1) + .End() + .AddAttr("data_layout") + .IsStringIn({"NCHW"}) + .End(); + AddOpCompat(OpCompat("silu")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End(); + } + + virtual ~GroupNormActFusePass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; + int ApplyGNSiluPattern(ir::Graph* graph) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc deleted file mode 100644 index 1cd1d0325ae446b9d2aa3eb39e6db5d41324fe9e..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc +++ /dev/null @@ -1,187 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
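Before the deleted oneDNN pass testers below, a brief aside on the groupnorm_act_pass added above: the pass only rewires the graph, folding a group_norm -> silu chain into a single group_norm op that carries with_silu = true and writes directly to the activation's output, so whatever kernel consumes that attribute has to compute silu(group_norm(x)). The snippet below is a plain NCHW reference of that composed math, useful for checking equivalence; the shapes, epsilon and group count are example values, and it makes no claim about how Paddle's actual fused kernel is organised.

```cpp
// Reference math for the fused pattern: y = silu(group_norm(x)), NCHW layout.
#include <cmath>
#include <cstdio>
#include <vector>

void group_norm_silu(const std::vector<float>& x, std::vector<float>& y,
                     int n, int c, int h, int w, int groups,
                     const std::vector<float>& scale,
                     const std::vector<float>& bias, float eps = 1e-5f) {
  const int cpg = c / groups;  // channels per group (assumes divisibility)
  const int spatial = h * w;
  for (int ni = 0; ni < n; ++ni) {
    for (int g = 0; g < groups; ++g) {
      // Mean/variance over one group's channels and all spatial positions.
      double sum = 0.0, sq = 0.0;
      const int cnt = cpg * spatial;
      for (int ci = g * cpg; ci < (g + 1) * cpg; ++ci)
        for (int s = 0; s < spatial; ++s) {
          const float v = x[(ni * c + ci) * spatial + s];
          sum += v;
          sq += static_cast<double>(v) * v;
        }
      const float mean = static_cast<float>(sum / cnt);
      const float var = static_cast<float>(sq / cnt) - mean * mean;
      const float inv_std = 1.0f / std::sqrt(var + eps);
      // Normalize, apply per-channel scale/bias, then SiLU: v * sigmoid(v).
      for (int ci = g * cpg; ci < (g + 1) * cpg; ++ci)
        for (int s = 0; s < spatial; ++s) {
          const int idx = (ni * c + ci) * spatial + s;
          const float v = (x[idx] - mean) * inv_std * scale[ci] + bias[ci];
          y[idx] = v / (1.0f + std::exp(-v));  // silu(v)
        }
    }
  }
}

int main() {
  const int n = 1, c = 4, h = 2, w = 2, groups = 2;
  std::vector<float> x(n * c * h * w, 1.5f), y(x.size());
  std::vector<float> scale(c, 1.0f), bias(c, 0.0f);
  group_norm_silu(x, y, n, c, h, w, groups, scale, bias);
  std::printf("y[0] = %f\n", y[0]);
  return 0;
}
```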
- -#include - -#include "paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h" -#include "paddle/fluid/framework/naive_executor.h" -#include "paddle/fluid/framework/op_proto_maker.h" -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/imperative/type_defs.h" -#include "paddle/phi/common/place.h" - -namespace paddle { -namespace framework { -namespace ir { - -void SetOp(ProgramDesc* prog, - const std::string& type, - const std::string& name, - const std::vector& inputs, - const std::vector& outputs) { - auto* op = prog->MutableBlock(0)->AppendOp(); - op->SetType(type); - if (type == "conv2d") { - const std::vector strides({1, 1}); - const std::vector paddings({0, 0}); - const std::vector dilations({1, 1}); - op->SetAttr("use_mkldnn", true); - op->SetAttr("name", name); - op->SetAttr("strides", strides); - op->SetAttr("groups", 1); - op->SetAttr("paddings", paddings); - op->SetAttr("padding_algorithm", std::string("EXPLICIT")); - op->SetAttr("dilations", dilations); - op->SetAttr("data_format", std::string("NCHW")); - - op->SetOutput("Output", outputs); - op->SetInput("Input", {inputs[0]}); - op->SetInput("Filter", {inputs[1]}); - if (inputs.size() > 2) - op->SetInput("Bias", {inputs[2]}); - else - op->SetInput("Bias", {}); - } else if (type == "elementwise_add") { - op->SetAttr("use_mkldnn", true); - op->SetAttr("axis", 1); - op->SetInput("X", {inputs[0]}); - op->SetInput("Y", {inputs[1]}); - op->SetOutput("Out", outputs); - } - op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), - static_cast(OpRole::kForward)); -} - -// (c, weights)->conv->f -// (f)->elementwise_add->g -ProgramDesc BuildProgramDesc(bool convWithExistingBias) { - ProgramDesc prog; - std::vector nodes{"c", "weights", "f", "eltwise_bias", "g"}; - if (convWithExistingBias) nodes.push_back("conv_bias"); - for (auto& v : nodes) { - auto* var = prog.MutableBlock(0)->Var(v); - var->SetType(proto::VarType::LOD_TENSOR); - if (v == "weights" || v == "conv_bias" || v == "eltwise_bias") { - var->SetPersistable(true); - } - } - - // conv+bias, both with MKL-DNN - if (convWithExistingBias) { - SetOp(&prog, - "conv2d", - "conv", - std::vector({"c", "weights", "conv_bias"}), - std::vector({"f"})); - } else { - SetOp(&prog, - "conv2d", - "conv", - std::vector({"c", "weights"}), - std::vector({"f"})); - } - SetOp(&prog, - "elementwise_add", - "eltwise", - std::vector({"f", "eltwise_bias"}), - std::vector({"g"})); - - return prog; -} - -void InitTensorHolder(Scope* scope, - const paddle::platform::Place& place, - const char* var_name) { - auto x = scope->Var(var_name); - auto tensor = x->GetMutable(); - tensor->mutable_data( - place, framework::TransToPhiDataType(proto::VarType::FP32), 1); -} - -void MainTest(bool convWithExistingBias) { - auto prog = BuildProgramDesc(convWithExistingBias); - std::unique_ptr graph(new ir::Graph(prog)); - auto place = phi::CPUPlace(); - NaiveExecutor exe{place}; - Scope scope; - // Init scope, as it is used in pass - exe.CreateVariables(prog, 0, true, &scope); - if (convWithExistingBias) { - InitTensorHolder(&scope, place, "conv_bias"); - InitTensorHolder(&scope, place, "eltwise_bias"); - } - graph->SetNotOwned(kParamScopeAttr, &scope); - - auto pass = PassRegistry::Instance().Get("conv_bias_mkldnn_fuse_pass"); - - int original_nodes_num = graph->Nodes().size(); - - graph.reset(pass->Apply(graph.release())); - - int current_nodes_num = graph->Nodes().size(); - - // Remove 3 Nodes: Conv, Bias, conv_out - // Add 1 Node: ConvBias - EXPECT_EQ(original_nodes_num - 2, 
current_nodes_num); - - // Assert conv_bias op in newly generated graph - int conv_bias_count = 0; - - for (auto* node : graph->Nodes()) { - if (node->IsOp() && (node->Op()->Type() == "conv2d" || - node->Op()->Type() == "fused_conv2d")) { - auto* op = node->Op(); - ASSERT_TRUE(op->HasAttr("use_mkldnn")); - EXPECT_TRUE(PADDLE_GET_CONST(bool, op->GetAttr("use_mkldnn"))); - // check if "conv" convolution is fused - auto op_name = PADDLE_GET_CONST(std::string, op->GetAttr("name")); - if (op_name == "conv") { - auto input_names = op->InputNames(); - ASSERT_TRUE(std::find(input_names.begin(), input_names.end(), "Bias") != - input_names.end()); - auto bias = op->Input("Bias"); - if (bias.size()) { - ++conv_bias_count; - } - } - } - } - EXPECT_EQ(conv_bias_count, 1); -} - -TEST(ConvBiasFusePass, bias_free_conv) { MainTest(false); } - -TEST(ConvBiasFusePass, conv_with_existing_bias) { MainTest(true); } - -TEST(ConvBiasFusePass, conv3d) { - Conv3DBiasFusePass pass; - ASSERT_EQ(pass.type(), std::string("conv3d")); -} - -TEST(ConvBiasFusePass, conv2d_transpose) { - Conv2DTransposeBiasFusePass pass; - ASSERT_EQ(pass.type(), std::string("conv2d_transpose")); -} - -TEST(ConvBiasFusePass, pass_op_version_check) { - ASSERT_TRUE( - paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance() - .IsPassCompatible("conv_bias_mkldnn_fuse_pass")); -} - -} // namespace ir -} // namespace framework -} // namespace paddle - -USE_PASS(conv_bias_mkldnn_fuse_pass); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc deleted file mode 100644 index 6ab8708c7ae1f8f31f1b5d1e0edcd3ca9383cd2e..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc +++ /dev/null @@ -1,171 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include - -#include "paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h" -#include "paddle/fluid/framework/op_proto_maker.h" - -namespace paddle { -namespace framework { -namespace ir { - -void SetOp(ProgramDesc* prog, - const std::string& type, - const std::vector& inputs, - const std::vector& outputs, - bool use_mkldnn = true) { - auto* op = prog->MutableBlock(0)->AppendOp(); - op->SetType(type); - if (type == "conv2d") { - op->SetAttr("use_mkldnn", use_mkldnn); - op->SetAttr("fuse_activation", std::string("")); - op->SetInput("Input", {inputs[0]}); - op->SetInput("Filter", {inputs[1]}); - if (inputs.size() > 2) { - op->SetInput("Bias", {inputs[2]}); - } - op->SetOutput("Output", outputs); - } else if (type == "relu") { - op->SetAttr("use_mkldnn", use_mkldnn); - op->SetInput("X", inputs); - op->SetOutput("Out", outputs); - } else if (type == "pool2d") { - op->SetAttr("use_mkldnn", use_mkldnn); - op->SetInput("X", inputs); - op->SetOutput("Out", outputs); - } else if (type == "concat") { - op->SetAttr("use_mkldnn", use_mkldnn); - op->SetAttr("axis", 0); - op->SetInput("X", inputs); - op->SetOutput("Out", outputs); - } - op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), - static_cast(OpRole::kForward)); -} - -// (a1,w1)->conv1->c1 -// (a2,w2,b2)->conv2->c2 -// if put_only_convs_before_concat=true -// (a3,w3)->conv3->c3 -// else -// a3->pool1->c3 -// -// (c1,c2,c3)->concat1->d -// d->relu1->e -ProgramDesc BuildProgramDesc(bool put_only_convs_before_concat, - bool all_convs_use_mkldnn) { - ProgramDesc prog; - for (auto& v : std::initializer_list({"a1", - "w1", - "c1", - "a2", - "w2", - "b2", - "c2", - "a3", - "w3", - "c3", - "d", - "e"})) { - auto* var = prog.MutableBlock(0)->Var(v); - var->SetType(proto::VarType::SELECTED_ROWS); - if (v.find("w") == 0 || v.find("b") == 0) { - var->SetPersistable(true); - } - } - - SetOp(&prog, "conv2d", {"a1", "w1", "b1"}, {"c1"}, all_convs_use_mkldnn); - SetOp(&prog, "conv2d", {"a2", "w2", "b2"}, {"c2"}); - if (put_only_convs_before_concat) { - SetOp(&prog, "conv2d", {"a3", "w3", "b3"}, {"c3"}); - } else { - SetOp(&prog, "pool2d", {"a3"}, {"c3"}); - } - SetOp(&prog, "concat", {"c1", "c2", "c3"}, {"d"}); - SetOp(&prog, "relu", {"d"}, {"e"}); - - return prog; -} - -void MainTest(const ProgramDesc& prog, bool fuse_relu) { - std::unique_ptr graph(new ir::Graph(prog)); - - int original_nodes_num = graph->Nodes().size(); - - auto pass = PassRegistry::Instance().Get("conv_activation_mkldnn_fuse_pass"); - graph.reset(pass->Apply(graph.release())); - - int current_nodes_num = graph->Nodes().size(); - - if (fuse_relu) { - // Remove 2 nodes: concat_out, relu - EXPECT_EQ(original_nodes_num - 2, current_nodes_num); - } else { - EXPECT_EQ(original_nodes_num, current_nodes_num); - } - - int relu_count = 0; - for (auto* node : graph->Nodes()) { - if (node->IsOp()) { - auto* op = node->Op(); - if (op->Type() == "conv2d") { - ASSERT_TRUE(op->HasAttr("fuse_activation")); - bool fuse_relu_attr = - (PADDLE_GET_CONST(std::string, op->GetAttr("fuse_activation")) == - "relu"); - EXPECT_EQ(fuse_relu, fuse_relu_attr); - } else if (op->Type() == "relu") { - relu_count++; - } - } - } - EXPECT_EQ(relu_count, fuse_relu ? 
0 : 1); -} - -TEST(ConvConcatReLUFusePass, only_convs_before_concat) { - bool all_convs_use_mkldnn = true; - bool put_only_convs_before_concat = true; - auto prog = - BuildProgramDesc(put_only_convs_before_concat, all_convs_use_mkldnn); - - bool expect_relu_fuse = true; - MainTest(prog, expect_relu_fuse); -} - -TEST(ConvConcatReLUFusePass, only_convs_before_concat_but_one_non_mkldnn) { - bool all_convs_use_mkldnn = false; - bool put_only_convs_before_concat = true; - auto prog = - BuildProgramDesc(put_only_convs_before_concat, all_convs_use_mkldnn); - - bool expect_relu_fuse = false; - MainTest(prog, expect_relu_fuse); -} - -TEST(ConvConcatReLUFusePass, convs_and_pool_before_concat) { - bool all_convs_use_mkldnn = true; - bool put_only_convs_before_concat = false; - auto prog = - BuildProgramDesc(put_only_convs_before_concat, all_convs_use_mkldnn); - - bool expect_relu_fuse = false; - MainTest(prog, expect_relu_fuse); -} - -} // namespace ir -} // namespace framework -} // namespace paddle - -USE_PASS(conv_activation_mkldnn_fuse_pass); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc deleted file mode 100644 index 173211e0a2ec9e9d3a5dab452ab63e3106fc4152..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc +++ /dev/null @@ -1,254 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include - -#include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h" -#include "paddle/fluid/framework/ir/pass_test_util.h" -#include "paddle/fluid/framework/op_proto_maker.h" -#include "paddle/fluid/framework/op_version_registry.h" - -namespace paddle { -namespace framework { -namespace ir { - -constexpr int nodes_removed = 3; -constexpr int nodes_added = 1; - -OpDesc* Create_Op_con2d(ProgramDesc* prog, - const std::string& op_type_name, - const std::vector& inputs, - const std::vector& outputs, - const bool use_mkldnn = true) { - auto* op = prog->MutableBlock(0)->AppendOp(); - const std::vector strides({1, 1}); - const std::vector paddings({0, 0}); - const std::vector dilations({1, 1}); - op->SetType(op_type_name); - op->SetAttr("use_mkldnn", use_mkldnn); - op->SetAttr("strides", strides); - op->SetAttr("groups", 1); - op->SetAttr("paddings", paddings); - op->SetAttr("padding_algorithm", std::string("EXPLICIT")); - op->SetAttr("dilations", dilations); - op->SetAttr("data_format", std::string("NCHW")); - - for (const auto& input : inputs) { - op->SetInput(input.first, {input.second}); - } - for (const auto& output : outputs) { - op->SetOutput(output.first, {output.second}); - } - - op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), - static_cast(OpRole::kForward)); - return op; -} - -OpDesc* Create_Op_elemntwise_add( - ProgramDesc* prog, - const std::string& op_type_name, - const std::vector& inputs, - const std::vector& outputs, - bool use_mkldnn = true) { - auto* op = prog->MutableBlock(0)->AppendOp(); - op->SetType(op_type_name); - op->SetAttr("use_mkldnn", use_mkldnn); - op->SetAttr("axis", -1); - - for (const auto& input : inputs) { - op->SetInput(input.first, {input.second}); - } - for (const auto& output : outputs) { - op->SetOutput(output.first, {output.second}); - } - - op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), - static_cast(OpRole::kForward)); - return op; -} - -TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionAsYWithElementwiseAddRelu) { - auto prog = - test::BuildProgramDesc({"a", "b", "c", "d", "e"}, {"bias", "weights"}); - - test::CreateOp(&prog, "sigmoid", {{"X", "a"}}, {{"Out", "b"}}); - Create_Op_con2d(&prog, - "conv2d", - {{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}}, - {{"Output", "c"}}); - Create_Op_elemntwise_add( - &prog, "elementwise_add", {{"X", "a"}, {"Y", "c"}}, {{"Out", "d"}}); - test::CreateOp(&prog, "relu", {{"X", "d"}}, {{"Out", "e"}}); - - Graph graph(prog); - - EXPECT_TRUE(test::RunPassAndAssert(&graph, - "conv_elementwise_add_mkldnn_fuse_pass", - "a", - "relu", - nodes_removed, - nodes_added)); - EXPECT_TRUE(test::AssertOpsCount( - graph, {{"fused_conv2d", 1}, {"elementwise_add", 0}})); -} - -TEST(ConvElementwiseAddMKLDNNFusePass, - ConvolutionProjectionAsYWithElementwiseAddRelu) { - auto prog = test::BuildProgramDesc({"a", "b", "c", "d", "e", "f"}, - {"bias", "weights", "bias2", "weights2"}); - - test::CreateOp(&prog, "sigmoid", {{"X", "a"}}, {{"Out", "b"}}); - // right branch - Create_Op_con2d(&prog, - "conv2d", - {{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}}, - {{"Output", "c"}}); - - // left branch - Create_Op_con2d(&prog, - "conv2d", - {{"Input", "a"}, {"Bias", "bias2"}, {"Filter", "weights2"}}, - {{"Output", "f"}}); - - Create_Op_elemntwise_add( - &prog, "elementwise_add", {{"X", "f"}, {"Y", "c"}}, {{"Out", "d"}}); - test::CreateOp(&prog, "relu", {{"X", "d"}}, {{"Out", "e"}}); - - Graph graph(prog); - - EXPECT_TRUE(test::RunPassAndAssert(&graph, - 
"conv_elementwise_add_mkldnn_fuse_pass", - "a", - "relu", - nodes_removed, - nodes_added)); - EXPECT_TRUE(test::AssertOpsCount( - graph, {{"conv2d", 1}, {"fused_conv2d", 1}, {"elementwise_add", 0}})); -} - -TEST(ConvElementwiseAddMKLDNNFusePass, - ConvolutionAsYWithElementwiseAddReluNoBias) { - auto prog = test::BuildProgramDesc({"a", "b", "c", "d", "e"}, {"weights"}); - - test::CreateOp(&prog, "sigmoid", {{"X", "a"}}, {{"Out", "b"}}); - Create_Op_con2d(&prog, - "conv2d", - {{"Input", "b"}, {"Filter", "weights"}}, - {{"Output", "c"}}); - Create_Op_elemntwise_add( - &prog, "elementwise_add", {{"X", "a"}, {"Y", "c"}}, {{"Out", "d"}}); - test::CreateOp(&prog, "relu", {{"X", "d"}}, {{"Out", "e"}}); - - Graph graph(prog); - - EXPECT_TRUE(test::RunPassAndAssert(&graph, - "conv_elementwise_add_mkldnn_fuse_pass", - "a", - "relu", - nodes_removed, - nodes_added)); - EXPECT_TRUE(test::AssertOpsCount( - graph, {{"fused_conv2d", 1}, {"elementwise_add", 0}})); -} - -TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionAsXWithElementwiseAddRelu) { - auto prog = - test::BuildProgramDesc({"a", "b", "c", "d", "e"}, {"bias", "weights"}); - - test::CreateOp(&prog, "sigmoid", {{"X", "a"}}, {{"Out", "b"}}); - Create_Op_con2d(&prog, - "conv2d", - {{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}}, - {{"Output", "c"}}); - - Create_Op_elemntwise_add( - &prog, "elementwise_add", {{"X", "c"}, {"Y", "a"}}, {{"Out", "d"}}); - test::CreateOp(&prog, "relu", {{"X", "d"}}, {{"Out", "e"}}); - - Graph graph(prog); - - EXPECT_TRUE(test::RunPassAndAssert(&graph, - "conv_elementwise_add_mkldnn_fuse_pass", - "a", - "relu", - nodes_removed, - nodes_added)); - EXPECT_TRUE(test::AssertOpsCount( - graph, {{"fused_conv2d", 1}, {"elementwise_add", 0}})); -} - -TEST(ConvElementwiseAddMKLDNNFusePass, - ConvolutionAsXWithElementwiseAddReluNoBias) { - auto prog = test::BuildProgramDesc({"a", "b", "c", "d", "e"}, {"weights"}); - - test::CreateOp(&prog, "sigmoid", {{"X", "a"}}, {{"Out", "b"}}); - Create_Op_con2d(&prog, - "conv2d", - {{"Input", "b"}, {"Filter", "weights"}}, - {{"Output", "c"}}); - Create_Op_elemntwise_add( - &prog, "elementwise_add", {{"X", "c"}, {"Y", "a"}}, {{"Out", "d"}}); - test::CreateOp(&prog, "relu", {{"X", "d"}}, {{"Out", "e"}}); - - Graph graph(prog); - - EXPECT_TRUE(test::RunPassAndAssert(&graph, - "conv_elementwise_add_mkldnn_fuse_pass", - "a", - "relu", - nodes_removed, - nodes_added)); - EXPECT_TRUE(test::AssertOpsCount( - graph, {{"fused_conv2d", 1}, {"elementwise_add", 0}})); -} - -TEST(ConvElementwiseAddMKLDNNFusePass, NoFusion) { - auto prog = - test::BuildProgramDesc({"a", "b", "c", "d", "e", "f", "g"}, {"weights"}); - - test::CreateOp(&prog, "sigmoid", {{"X", "a"}}, {{"Out", "b"}}); - Create_Op_con2d(&prog, - "conv2d", - {{"Input", "b"}, {"Filter", "weights"}}, - {{"Output", "c"}}); - - Create_Op_con2d(&prog, - "conv2d", - {{"Input", "d"}, {"Filter", "weights"}}, - {{"Output", "e"}}); - - Create_Op_elemntwise_add( - &prog, "elementwise_add", {{"X", "c"}, {"Y", "e"}}, {{"Out", "f"}}); - test::CreateOp(&prog, "relu", {{"X", "f"}}, {{"Out", "g"}}); - - Graph graph(prog); - - EXPECT_TRUE(test::RunPassAndAssert( - &graph, "conv_elementwise_add_mkldnn_fuse_pass", "a", "g", 0, 0)); - EXPECT_TRUE( - test::AssertOpsCount(graph, {{"conv2d", 2}, {"elementwise_add", 1}})); -} - -TEST(ConvElementwiseAddMKLDNNFusePass, pass_op_version_check) { - ASSERT_TRUE( - paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance() - .IsPassCompatible("conv_elementwise_add_mkldnn_fuse_pass")); -} - -} // 
namespace ir -} // namespace framework -} // namespace paddle - -USE_PASS(conv_elementwise_add_mkldnn_fuse_pass); diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.cc index 50db74e46d1d6929b84b9fb89b11f48c485a8e25..61bd888715c702fe8974dc93a36626a65a715497 100644 --- a/paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -26,7 +26,7 @@ using string::PrettyLogDetail; void MatmulActivationMkldnnFusePass::ApplyImpl(Graph* graph) const { auto act_types = GetSupportedActivations(); - auto matmul_types = {"matmul", "matmul_v2"}; + auto matmul_types = {"fused_matmul", "matmul", "matmul_v2"}; for (const auto& matmul_type : matmul_types) for (auto& act_type : act_types) { @@ -61,8 +61,17 @@ void MatmulActivationMkldnnFusePass::FuseMatmulAct( GET_IR_NODE_FROM_SUBGRAPH( activation_out, activation_out, matmul_act_pattern); - SetActivationAttrs(matmul->Op(), activation->Op(), act_type); - matmul->Op()->SetOutput("Out", {activation_out->Name()}); + OpDesc* matmul_op = matmul->Op(); + + matmul_op->SetType("fused_matmul"); + if (matmul_type == "matmul") { + matmul_op->SetAttr("trans_x", matmul_op->GetAttr("transpose_X")); + matmul_op->SetAttr("trans_y", matmul_op->GetAttr("transpose_Y")); + matmul_op->SetAttr("matmul_alpha", matmul_op->GetAttr("alpha")); + } + + SetActivationAttrs(matmul_op, activation->Op(), act_type); + matmul_op->SetOutput("Out", {activation_out->Name()}); IR_NODE_LINK_TO(matmul, activation_out); GraphSafeRemoveNodes(graph, {activation, matmul_out}); @@ -88,11 +97,6 @@ MatmulActivationMkldnnFusePass::MatmulActivationMkldnnFusePass() { .AddInput("Y") .IsTensor() .End() - .AddInput( - "ResidualData") // Extra tensor used in matmul+elementwise_add fuse - .IsTensor() - .IsOptional() - .End() .AddOutput("Out") .IsTensor() .End() @@ -113,8 +117,24 @@ MatmulActivationMkldnnFusePass::MatmulActivationMkldnnFusePass() { .AddInput("Y") .IsTensor() .End() - .AddInput( - "ResidualData") // Extra tensor used in matmul+elementwise_add fuse + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("trans_x") + .IsType() + .End() + .AddAttr("trans_y") + .IsType() + .End(); + + AddOpCompat(OpCompat("fused_matmul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddInput("ResidualData") .IsTensor() .IsOptional() .End() @@ -126,6 +146,50 @@ MatmulActivationMkldnnFusePass::MatmulActivationMkldnnFusePass() { .End() .AddAttr("trans_y") .IsType() + .End() + .AddAttr("matmul_alpha") + .IsType() + .IsOptional() + .End() + .AddAttr("fuse_activation") + .IsType() + .IsOptional() + .End() + .AddAttr("fuse_alpha") + .IsType() + .IsOptional() + .End() + .AddAttr("fuse_beta") + .IsType() + .IsOptional() + .End() + .AddAttr("fused_output_scale") + .IsType() + .IsOptional() + .End() + .AddAttr("fused_reshape_X") + .IsType>() + .IsOptional() + .End() + .AddAttr("fused_transpose_X") + .IsType>() + .IsOptional() + .End() + .AddAttr("fused_reshape_Y") + .IsType>() + .IsOptional() + .End() + .AddAttr("fused_transpose_Y") + .IsType>() + .IsOptional() + .End() + .AddAttr("fused_reshape_Out") + .IsType>() + 
.IsOptional() + .End() + .AddAttr("fused_transpose_Out") + .IsType>() + .IsOptional() .End(); AddOpCompat(OpCompat("abs")) @@ -279,6 +343,7 @@ REGISTER_PASS(matmul_activation_mkldnn_fuse_pass, REGISTER_PASS_CAPABILITY(matmul_activation_mkldnn_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("fused_matmul", 0) .LE("matmul", 1) .EQ("matmul_v2", 0) .EQ("abs", 0) diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/matmul_elementwise_add_mkldnn_fuse_pass.cc index f045377465e0322207d2d5ebdb888f74878e8d43..680600a403251548dc47a416d2786653e19bf630 100644 --- a/paddle/fluid/framework/ir/mkldnn/matmul_elementwise_add_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/matmul_elementwise_add_mkldnn_fuse_pass.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -25,7 +25,7 @@ namespace ir { using string::PrettyLogDetail; void MatmulElementwiseAddMKLDNNFusePass::ApplyImpl(Graph* graph) const { - auto matmul_types = {"matmul", "matmul_v2"}; + auto matmul_types = {"fused_matmul", "matmul", "matmul_v2"}; auto matmul_as_x = {true, false}; for (const auto& matmul_type : matmul_types) @@ -65,6 +65,12 @@ void MatmulElementwiseAddMKLDNNFusePass::FuseMatmulElementwiseAdd( return; } + matmul->Op()->SetType("fused_matmul"); + if (matmul_type == "matmul") { + matmul->Op()->SetAttr("trans_x", matmul->Op()->GetAttr("transpose_X")); + matmul->Op()->SetAttr("trans_y", matmul->Op()->GetAttr("transpose_Y")); + matmul->Op()->SetAttr("matmul_alpha", matmul->Op()->GetAttr("alpha")); + } matmul->Op()->SetInput("ResidualData", {elementwise_addend->Name()}); matmul->Op()->SetOutput("Out", {elementwise_add_out->Name()}); @@ -125,6 +131,71 @@ MatmulElementwiseAddMKLDNNFusePass::MatmulElementwiseAddMKLDNNFusePass() { .IsType() .End(); + AddOpCompat(OpCompat("fused_matmul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddInput("ResidualData") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("trans_x") + .IsType() + .End() + .AddAttr("trans_y") + .IsType() + .End() + .AddAttr("matmul_alpha") + .IsType() + .IsOptional() + .End() + .AddAttr("fuse_activation") + .IsType() + .IsOptional() + .End() + .AddAttr("fuse_alpha") + .IsType() + .IsOptional() + .End() + .AddAttr("fuse_beta") + .IsType() + .IsOptional() + .End() + .AddAttr("fused_output_scale") + .IsType() + .IsOptional() + .End() + .AddAttr("fused_reshape_X") + .IsType>() + .IsOptional() + .End() + .AddAttr("fused_transpose_X") + .IsType>() + .IsOptional() + .End() + .AddAttr("fused_reshape_Y") + .IsType>() + .IsOptional() + .End() + .AddAttr("fused_transpose_Y") + .IsType>() + .IsOptional() + .End() + .AddAttr("fused_reshape_Out") + .IsType>() + .IsOptional() + .End() + .AddAttr("fused_transpose_Out") + .IsType>() + .IsOptional() + .End(); + AddOpCompat(OpCompat("elementwise_add")) .AddInput("X") .IsTensor() @@ -149,6 +220,7 @@ REGISTER_PASS(matmul_elementwise_add_mkldnn_fuse_pass, REGISTER_PASS_CAPABILITY(matmul_elementwise_add_mkldnn_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("fused_matmul", 0) .LE("matmul", 1) .EQ("matmul_v2", 0) .LE("elementwise_add", 1)); diff --git 
a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass.cc index 40dbaa03a0615f1456c6530ed1340741d443f193..779c39834c6e3a1c04bd60610208d2ae56fbf252 100644 --- a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -24,7 +24,7 @@ namespace ir { using string::PrettyLogDetail; void MatmulTransposeReshapeMKLDNNPass::ApplyImpl(Graph *graph) const { - auto matmul_types = {"matmul", "matmul_v2"}; + auto matmul_types = {"fused_matmul", "matmul", "matmul_v2"}; for (const auto &matmul_type : matmul_types) { Fuse(graph, matmul_type); @@ -84,6 +84,12 @@ void MatmulTransposeReshapeMKLDNNPass::Fuse( } OpDesc *matmul_desc = matmul_op->Op(); + matmul_desc->SetType("fused_matmul"); + if (matmul_type == "matmul") { + matmul_desc->SetAttr("trans_x", matmul_desc->GetAttr("transpose_X")); + matmul_desc->SetAttr("trans_y", matmul_desc->GetAttr("transpose_Y")); + matmul_desc->SetAttr("matmul_alpha", matmul_desc->GetAttr("alpha")); + } matmul_desc->SetOutput("Out", {reshape_out->Name()}); matmul_desc->SetAttr("fused_reshape_Out", reshape_shape); matmul_desc->SetAttr("fused_transpose_Out", transpose_axis); @@ -149,6 +155,71 @@ MatmulTransposeReshapeMKLDNNPass::MatmulTransposeReshapeMKLDNNPass() { .IsType() .End(); + AddOpCompat(OpCompat("fused_matmul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddInput("ResidualData") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("trans_x") + .IsType() + .End() + .AddAttr("trans_y") + .IsType() + .End() + .AddAttr("matmul_alpha") + .IsType() + .IsOptional() + .End() + .AddAttr("fuse_activation") + .IsType() + .IsOptional() + .End() + .AddAttr("fuse_alpha") + .IsType() + .IsOptional() + .End() + .AddAttr("fuse_beta") + .IsType() + .IsOptional() + .End() + .AddAttr("fused_output_scale") + .IsType() + .IsOptional() + .End() + .AddAttr("fused_reshape_X") + .IsType>() + .IsOptional() + .End() + .AddAttr("fused_transpose_X") + .IsType>() + .IsOptional() + .End() + .AddAttr("fused_reshape_Y") + .IsType>() + .IsOptional() + .End() + .AddAttr("fused_transpose_Y") + .IsType>() + .IsOptional() + .End() + .AddAttr("fused_reshape_Out") + .IsType>() + .IsOptional() + .End() + .AddAttr("fused_transpose_Out") + .IsType>() + .IsOptional() + .End(); + AddOpCompat(OpCompat("transpose2")) .AddInput("X") .IsTensor() @@ -189,6 +260,7 @@ REGISTER_PASS(matmul_transpose_reshape_mkldnn_fuse_pass, REGISTER_PASS_CAPABILITY(matmul_transpose_reshape_mkldnn_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("fused_matmul", 0) .LE("matmul", 1) .EQ("matmul_v2", 0) .EQ("transpose2", 0) diff --git a/paddle/fluid/framework/ir/mkldnn/operator_scale_onednn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/operator_scale_onednn_fuse_pass.cc index cb06f6eb1205e94d0a1861183014edfc1a67de02..579764355d86cde4de363b9300e25f5b058d8a15 100644 --- a/paddle/fluid/framework/ir/mkldnn/operator_scale_onednn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/operator_scale_onednn_fuse_pass.cc @@ -1,4 +1,4 @@ 
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -27,6 +27,7 @@ using string::PrettyLogDetail; void FuseOperatorScaleOneDNNPass::ApplyImpl(Graph *graph) const { const std::vector fusable_ops{ "fc", + "fused_matmul", "matmul", "matmul_v2", "elementwise_add", @@ -85,6 +86,19 @@ void FuseOperatorScaleOneDNNPass::FuseScale(Graph *graph, scale = *(scale_tensor->data()); } + if (op_type == "matmul") { + operator_op->Op()->SetType("fused_matmul"); + operator_op->Op()->SetAttr("trans_x", + operator_op->Op()->GetAttr("transpose_X")); + operator_op->Op()->SetAttr("trans_y", + operator_op->Op()->GetAttr("transpose_Y")); + operator_op->Op()->SetAttr("matmul_alpha", + operator_op->Op()->GetAttr("alpha")); + } + if (op_type == "matmul_v2") { + operator_op->Op()->SetType("fused_matmul"); + } + operator_op->Op()->SetAttr("fused_output_scale", scale); operator_op->Op()->SetOutput("Out", {scale_out->Name()}); @@ -111,6 +125,7 @@ REGISTER_PASS_CAPABILITY(operator_scale_onednn_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() .EQ("fc", 0) + .EQ("fused_matmul", 0) .LE("matmul", 1) .EQ("matmul_v2", 0) .LE("elementwise_add", 1) diff --git a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc index 25a79509b53f531ce53cd354bea1e16f9680f5c0..508cad94e8136eca50afa0e6c27503aa9335511c 100644 --- a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -22,7 +22,7 @@ namespace framework { namespace ir { void ReshapeTransposeMatmulMkldnnFusePass::ApplyImpl(Graph *graph) const { - auto matmul_types = {"matmul", "matmul_v2"}; + auto matmul_types = {"matmul", "matmul_v2", "fused_matmul"}; for (const auto &matmul_type : matmul_types) { Fuse(graph, @@ -102,6 +102,25 @@ void ReshapeTransposeMatmulMkldnnFusePass::Fuse( matmul_type + " encountered."); } + // Return if input of fused_matmul is already fused + if (matmul_type == "fused_matmul") { + auto is_already_fused_X = + matmul_desc->HasAttr("fused_reshape_X") + ? !(PADDLE_GET_CONST(std::vector, + matmul_desc->GetAttr("fused_reshape_X")) + .empty()) + : false; + if (is_already_fused_X && matmul_input_name == "X") return; + + auto is_already_fused_Y = + matmul_desc->HasAttr("fused_reshape_Y") + ? 
!(PADDLE_GET_CONST(std::vector, + matmul_desc->GetAttr("fused_reshape_Y")) + .empty()) + : false; + if (is_already_fused_Y && matmul_input_name == "Y") return; + } + auto reshape_shape = paddle::get>(reshape_op->Op()->GetAttr("shape")); auto transpose_axis = @@ -123,6 +142,12 @@ void ReshapeTransposeMatmulMkldnnFusePass::Fuse( return; } + matmul_desc->SetType("fused_matmul"); + if (matmul_type == "matmul") { + matmul_desc->SetAttr("trans_x", matmul_desc->GetAttr("transpose_X")); + matmul_desc->SetAttr("trans_y", matmul_desc->GetAttr("transpose_Y")); + matmul_desc->SetAttr("matmul_alpha", matmul_desc->GetAttr("alpha")); + } matmul_desc->SetInput(matmul_input_name, {(reshape_in)->Name()}); matmul_desc->SetAttr("fused_reshape_" + matmul_input_name, reshape_shape); matmul_desc->SetAttr("fused_transpose_" + matmul_input_name, @@ -220,6 +245,71 @@ ReshapeTransposeMatmulMkldnnFusePass::ReshapeTransposeMatmulMkldnnFusePass() { .AddAttr("trans_y") .IsType() .End(); + + AddOpCompat(OpCompat("fused_matmul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddInput("ResidualData") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("trans_x") + .IsType() + .End() + .AddAttr("trans_y") + .IsType() + .End() + .AddAttr("matmul_alpha") + .IsType() + .IsOptional() + .End() + .AddAttr("fuse_activation") + .IsType() + .IsOptional() + .End() + .AddAttr("fuse_alpha") + .IsType() + .IsOptional() + .End() + .AddAttr("fuse_beta") + .IsType() + .IsOptional() + .End() + .AddAttr("fused_output_scale") + .IsType() + .IsOptional() + .End() + .AddAttr("fused_reshape_X") + .IsType>() + .IsOptional() + .End() + .AddAttr("fused_transpose_X") + .IsType>() + .IsOptional() + .End() + .AddAttr("fused_reshape_Y") + .IsType>() + .IsOptional() + .End() + .AddAttr("fused_transpose_Y") + .IsType>() + .IsOptional() + .End() + .AddAttr("fused_reshape_Out") + .IsType>() + .IsOptional() + .End() + .AddAttr("fused_transpose_Out") + .IsType>() + .IsOptional() + .End(); } } // namespace ir @@ -234,5 +324,6 @@ REGISTER_PASS_CAPABILITY(reshape_transpose_matmul_mkldnn_fuse_pass) paddle::framework::compatible::OpVersionComparatorCombination() .EQ("reshape2", 0) .EQ("transpose2", 0) + .EQ("fused_matmul", 0) .EQ("matmul", 1) .EQ("matmul_v2", 0)); diff --git a/paddle/fluid/framework/ir/preln_residual_bias_fuse_pass.cc b/paddle/fluid/framework/ir/preln_residual_bias_fuse_pass.cc index 13b7b4ac72f96b80ec66459b1f6763332d4aa8be..48baf1f4b102fc4376e39e11dbdede539f02c3a5 100644 --- a/paddle/fluid/framework/ir/preln_residual_bias_fuse_pass.cc +++ b/paddle/fluid/framework/ir/preln_residual_bias_fuse_pass.cc @@ -129,6 +129,24 @@ void PrelnResidualBias::operator()(PDNode *x, PDNode *y) { } // namespace patterns +void setIntermediateOut(OpDesc *desc, + const std::string &out_name, + const std::string &scope_name) { + std::string new_name = scope_name + "/at." + out_name + ".new"; + desc->SetOutput(out_name, {new_name}); +} + +void addIntermediateOut(Node *op_node, + const std::string &out_name, + const std::string &scope_name, + Graph *graph) { + std::string new_name = scope_name + "/at." 
+ out_name + ".new"; + VarDesc out_var(new_name); + out_var.SetPersistable(false); + auto *node_var = graph->CreateVarNode(&out_var); + IR_NODE_LINK_TO(op_node, node_var); +} + int PrelnResidualBiasFusePass::ApplyPattern(ir::Graph *graph, bool with_bias) const { PADDLE_ENFORCE_NOT_NULL( @@ -207,7 +225,7 @@ int PrelnResidualBiasFusePass::ApplyPattern(ir::Graph *graph, // on each other, so we make below check to ensure only one // PrelnResidualBias pattern is delalted with. for (auto op : elementwise1_out->inputs) { - if (op->Name() == "preln_residual_bias") return; + if (op->Name() == "fused_bias_dropout_residual_layer_norm") return; } if (!IsCompat(subgraph, graph)) { @@ -218,31 +236,37 @@ int PrelnResidualBiasFusePass::ApplyPattern(ir::Graph *graph, std::unordered_set del_node_set; // Create an PrelnResidualBias op node OpDesc new_desc; - new_desc.SetType("preln_residual_bias"); + new_desc.SetType("fused_bias_dropout_residual_layer_norm"); // inputs new_desc.SetInput("X", {subgraph.at(x)->Name()}); - new_desc.SetInput("Y", {subgraph.at(y)->Name()}); - new_desc.SetInput("Scale", {layer_norm_scale->Name()}); - new_desc.SetInput("Bias", {layer_norm_bias->Name()}); + new_desc.SetInput("Residual", {subgraph.at(y)->Name()}); + new_desc.SetInput("LnScale", {layer_norm_scale->Name()}); + new_desc.SetInput("LnBias", {layer_norm_bias->Name()}); if (with_bias) { - new_desc.SetInput("EleBias", {elementwise_bias->Name()}); + new_desc.SetInput("Bias", {elementwise_bias->Name()}); } // outputs - new_desc.SetOutput("Out_0", {layer_norm_out->Name()}); - new_desc.SetOutput("Out_1", {elementwise1_out->Name()}); + new_desc.SetOutput("Y", {layer_norm_out->Name()}); + new_desc.SetOutput("BiasDropoutResidualOut", {elementwise1_out->Name()}); + new_desc.SetOutput("LnMean", {layer_norm_mean->Name()}); + new_desc.SetOutput("LnVariance", {layer_norm_variance->Name()}); + setIntermediateOut(&new_desc, "DropoutMaskOut", "preln_residual_bias_fuse"); // attrs - new_desc.SetAttr("epsilon", layer_norm->Op()->GetAttr("epsilon")); + new_desc.SetAttr("ln_epsilon", layer_norm->Op()->GetAttr("epsilon")); + new_desc.SetAttr("dropout_rate", 0.0f); + new_desc.SetAttr("is_test", true); new_desc.SetAttr("begin_norm_axis", layer_norm->Op()->GetAttr("begin_norm_axis")); auto fused_node = graph->CreateOpNode(&new_desc); // OpDesc will be copied. 
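// Editor's annotation (illustrative summary, not part of the patch): the hunk
// above retargets the matched PrelnResidualBias pattern onto the existing
// fused_bias_dropout_residual_layer_norm op with dropout disabled. Based only
// on the changes shown here, the IO/attribute remapping is roughly:
//
//   old preln_residual_bias         new fused_bias_dropout_residual_layer_norm
//   ------------------------        ------------------------------------------
//   X                          ->   X
//   Y                          ->   Residual
//   EleBias (optional)         ->   Bias
//   Scale / Bias (layer norm)  ->   LnScale / LnBias
//   Out_0 / Out_1              ->   Y / BiasDropoutResidualOut
//   epsilon                    ->   ln_epsilon, plus dropout_rate = 0.0f and
//                                   is_test = true
//
// LnMean/LnVariance are now kept and linked to the fused node, and
// DropoutMaskOut gets a fresh intermediate variable created by the
// setIntermediateOut()/addIntermediateOut() helpers added above.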
+ addIntermediateOut( + fused_node, "DropoutMaskOut", "preln_residual_bias_fuse", graph); + if (with_bias) { del_node_set.insert(elementwise0); del_node_set.insert(elementwise0_out); } del_node_set.insert(elementwise1); del_node_set.insert(layer_norm); - del_node_set.insert(layer_norm_mean); - del_node_set.insert(layer_norm_variance); GraphSafeRemoveNodes(graph, del_node_set); IR_NODE_LINK_TO(subgraph.at(x), fused_node); IR_NODE_LINK_TO(subgraph.at(y), fused_node); @@ -253,6 +277,9 @@ int PrelnResidualBiasFusePass::ApplyPattern(ir::Graph *graph, IR_NODE_LINK_TO(layer_norm_bias, fused_node); IR_NODE_LINK_TO(fused_node, layer_norm_out); IR_NODE_LINK_TO(fused_node, elementwise1_out); + IR_NODE_LINK_TO(fused_node, layer_norm_mean); + IR_NODE_LINK_TO(fused_node, layer_norm_variance); + found_subgraph_count++; }; @@ -261,6 +288,8 @@ int PrelnResidualBiasFusePass::ApplyPattern(ir::Graph *graph, } void PrelnResidualBiasFusePass::ApplyImpl(ir::Graph *graph) const { + VLOG(1) << "Fuse PrelnResidualBias into " + "fused_bias_dropout_residual_layer_norm op with dropout rate = 0"; PADDLE_ENFORCE_NOT_NULL( graph, platform::errors::PreconditionNotMet("graph should not be null.")); FusePassBase::Init("preln_residual_bias_fuse", graph); diff --git a/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.cc index db023746ac4c7931ea1badf959fdc16d0e0800e0..18ea8850dc5bfb15afa5584a2d6241ba2da8e9ed 100644 --- a/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.cc @@ -170,7 +170,7 @@ void TrtSkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const { // attrs new_desc.SetAttr("epsilon", layer_norm->Op()->GetAttr("epsilon")); - if (new_desc.HasAttr("begin_norm_axis")) { + if (layer_norm->Op()->HasAttr("begin_norm_axis")) { int32_t begin_norm_axis = PADDLE_GET_CONST( int32_t, layer_norm->Op()->GetAttr("begin_norm_axis")); int32_t input_rank = diff --git a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc index ec6ed90d08a389d575b7765d1160563cdb9bfd33..babac78146f406a7911656dbefc35549313b3a31 100644 --- a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc +++ b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc @@ -69,9 +69,15 @@ const std::map>& DependencyBuilder::Build( instructions_ = &instructions; op_num_ = instructions_->size(); + ops_before_.assign(op_num_, {}); + ops_behind_.assign(op_num_, {}); + op_happens_before_.assign(op_num_, std::vector(op_num_, false)); + BuildDownstreamMap(); - BuildOpHappensBefore(); + VLOG(6) << "Finish BuildDownstreamMap"; + ShrinkDownstreamMap(); + VLOG(6) << "Finish ShrinkDownstreamMap"; if (FLAGS_new_executor_sequential_run) { AddDependencyForSequentialRun(); @@ -81,18 +87,22 @@ const std::map>& DependencyBuilder::Build( if (FLAGS_add_dependency_for_communication_op) { AddDependencyForCommunicationOp(); + VLOG(6) << "Finish AddDependencyForSequentialRun"; } AddDependencyForRandomOp(); - AddDependencyForReadOp(); + VLOG(6) << "Finish AddDependencyForRandomOp"; - is_build_ = true; + AddDependencyForReadOp(); + VLOG(6) << "Finish AddDependencyForReadOp"; - VLOG(8) << "Finish build dependency"; + VLOG(6) << "Finish build dependency"; VLOG(8) << "downstream count: " << CountDownstreamMap(op_downstream_map_); VLOG(8) << "downstream_map: " << std::endl << StringizeDownstreamMap(op_downstream_map_); + is_build_ = true; + return 
op_downstream_map_; } @@ -106,15 +116,6 @@ const std::map>& DependencyBuilder::OpDownstreamMap() return op_downstream_map_; } -bool DependencyBuilder::OpHappensBefore(size_t prior_op_idx, - size_t posterior_op_idx) const { - PADDLE_ENFORCE_GE( - op_happens_before_.size(), - 0, - phi::errors::Unavailable("op_happen_before is not yet built")); - return op_happens_before_.at(prior_op_idx).at(posterior_op_idx); -} - void DependencyBuilder::AddDependencyForCoalesceTensorOp() { for (size_t op_idx = 0; op_idx < op_num_; ++op_idx) { if (instructions_->at(op_idx).OpBase()->Type() == kCoalesceTensor) { @@ -287,7 +288,7 @@ void DependencyBuilder::AddDependencyForReadOp() { for (size_t read_op_idx : read_ops) { for (size_t downstream_op_idx : startup_ops) { if (read_op_idx != downstream_op_idx && - !op_happens_before_[downstream_op_idx][read_op_idx]) { + !OpHappensBefore(downstream_op_idx, read_op_idx)) { AddDownstreamOp(read_op_idx, downstream_op_idx); } } @@ -308,42 +309,56 @@ void DependencyBuilder::AddDependencyForSequentialRun() { void DependencyBuilder::AddDownstreamOp(size_t prior_op_idx, size_t posterior_op_idx) { - std::set& downstream_ops = op_downstream_map_[prior_op_idx]; + PADDLE_ENFORCE_EQ( + OpHappensBefore(posterior_op_idx, prior_op_idx), + false, + phi::errors::Unavailable( + "Can not add dependency %d->%d because %d is run before %d", + prior_op_idx, + posterior_op_idx, + posterior_op_idx, + prior_op_idx)); - if (op_happens_before_.size() != 0) { - PADDLE_ENFORCE_EQ( - op_happens_before_[posterior_op_idx][prior_op_idx], - false, - phi::errors::Unavailable( - "Can not add dependency %d->%d because %d is run before %d", - prior_op_idx, - posterior_op_idx, - posterior_op_idx, - prior_op_idx)); - - for (size_t op_idx : downstream_ops) { - if (op_happens_before_[op_idx][posterior_op_idx]) { - VLOG(7) << "Find dependencies " << prior_op_idx << "->" << op_idx - << "->" << posterior_op_idx << ", skip adding " << prior_op_idx - << "->" << posterior_op_idx; - return; - } + std::set& downstream_ops = op_downstream_map_[prior_op_idx]; + // NOTE(Ruibiao): Here the downstream map shrinking is best-effort, therefore + // ShrinkDownstreamMap after BuildDownstreamMap is still helpful. For example, + // a->c will not be shrinked in the following case: AddDownstreamOp(a, b) -> + // AddDownstreamOp(a, c) -> AddDownstreamOp(b, c), it should be shrinked by + // ShrinkDownstreamMap. 
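// Editor's annotation (illustrative trace, not part of the patch): with three
// hypothetical ops a, b, c, the best-effort shrinking described in the NOTE
// above plays out as:
//
//   AddDownstreamOp(a, b);  // op_downstream_map_[a] = {b}
//   AddDownstreamOp(a, c);  // OpHappensBefore(b, c) is still false here, so
//                           // op_downstream_map_[a] = {b, c}
//   AddDownstreamOp(b, c);  // op_downstream_map_[b] = {c}; a->c is now
//                           // transitively implied but stays in the map until
//                           // ShrinkDownstreamMap() prunes it.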
+ for (size_t op_idx : downstream_ops) { + if (OpHappensBefore(op_idx, posterior_op_idx)) { + VLOG(7) << "Find dependencies " << prior_op_idx << "->" << op_idx << "->" + << posterior_op_idx << ", skip adding " << prior_op_idx << "->" + << posterior_op_idx; + return; } } - downstream_ops.insert(posterior_op_idx); - if (op_happens_before_.size() != 0) { - for (size_t op_idx = 0; op_idx < op_num_; ++op_idx) { - if (op_happens_before_[op_idx][prior_op_idx]) { - op_happens_before_[op_idx][posterior_op_idx] = true; - } + std::vector prior_of_prior = ops_before_[prior_op_idx]; + std::vector posterior_of_posterior = ops_behind_[posterior_op_idx]; - if (op_happens_before_[posterior_op_idx][op_idx]) { - op_happens_before_[prior_op_idx][op_idx] = true; - } + auto update_op_happen_before = [this](size_t prior_op_idx, + size_t posterior_op_idx) { + if (!op_happens_before_[prior_op_idx][posterior_op_idx]) { + op_happens_before_[prior_op_idx][posterior_op_idx] = true; + ops_before_[posterior_op_idx].push_back(prior_op_idx); + ops_behind_[prior_op_idx].push_back(posterior_op_idx); } + }; + + update_op_happen_before(prior_op_idx, posterior_op_idx); + + // All ops before prior-op are also before posterior-op + for (size_t op_idx : prior_of_prior) { + update_op_happen_before(op_idx, posterior_op_idx); + } + + // All ops after posterior-op are also after prior-op + for (size_t op_idx : posterior_of_posterior) { + update_op_happen_before(prior_op_idx, op_idx); } + VLOG(8) << prior_op_idx << "->" << posterior_op_idx; VLOG(8) << "Add dependency from " << instructions_->at(prior_op_idx).OpBase()->Type() << "(" @@ -468,46 +483,6 @@ void DependencyBuilder::BuildDownstreamMap() { } } -void DependencyBuilder::BuildOpHappensBefore() { - // happens_before[i][j] means i should be executed before j - op_happens_before_.assign(op_num_, std::vector(op_num_, false)); - - // bfs to get all next ops - auto bfs = [&](size_t op_idx) { - std::queue q; - std::vector visited(op_num_, false); - q.push(op_idx); - while (!q.empty()) { - size_t op = q.front(); - q.pop(); - visited[op] = true; - if (!op_downstream_map_.count(op)) { - continue; - } - for (auto next : op_downstream_map_.at(op)) { - if (!visited[next]) { - PADDLE_ENFORCE_EQ(op_happens_before_[next][op_idx], - false, - paddle::platform::errors::AlreadyExists( - "There exists circle in graph, expected " - "%d->%d, but already got %d->%d", - op_idx, - next, - next, - op_idx)); - op_happens_before_[op_idx][next] = true; - VLOG(10) << "happens before: " << op_idx << " " << next; - q.push(next); - } - } - } - }; - - for (size_t i = 0; i < op_num_; ++i) { - bfs(i); - } -} - void DependencyBuilder::ShrinkDownstreamMap() { // remove unnecessary downstream ops // for example, a->b->c @@ -529,7 +504,7 @@ void DependencyBuilder::ShrinkDownstreamMap() { bool not_after_any = true; // find the op that is not executed after any for (size_t other_item : op_downstream_map_.at(i)) { - if (op_happens_before_[other_item][item]) { + if (OpHappensBefore(other_item, item)) { VLOG(8) << "happens_before: " << other_item << "->" << item << ", so skip " << item; not_after_any = false; @@ -541,6 +516,8 @@ void DependencyBuilder::ShrinkDownstreamMap() { minumum_nexts.insert(item); } } + // NOTE(Ruibiao): op_happens_before will not be changed when shrink + // dowstream map op_downstream_map_.at(i) = minumum_nexts; } VLOG(8) << "Finish shrink downstream map"; diff --git a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h 
b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h index ec1119e701da3dcdedc6f3cca942f2206723f87c..0c405c52d7abe0355952747a0871f6f5e0bc76b7 100644 --- a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h +++ b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h @@ -40,7 +40,13 @@ class DependencyBuilder { const std::map>& OpDownstreamMap() const; - bool OpHappensBefore(size_t prior_op_idx, size_t posterior_op_idx) const; + bool OpHappensBefore(size_t prior_op_idx, size_t posterior_op_idx) const { + PADDLE_ENFORCE_GE( + op_happens_before_.size(), + 0, + phi::errors::Unavailable("op_happen_before is not yet built")); + return op_happens_before_.at(prior_op_idx).at(posterior_op_idx); + } private: void AddDependencyForCoalesceTensorOp(); @@ -53,21 +59,27 @@ class DependencyBuilder { void BuildDownstreamMap(); - void BuildOpHappensBefore(); - void ShrinkDownstreamMap(); bool is_build_; const std::vector* instructions_; // not_own size_t op_num_; - // op_happens_before_[i][j] == true means op[i] happens before op[j] - std::vector> op_happens_before_; + // ops_behind_ is the adjacency list about op to its posterior-ops, that is to + // say, op_behind_[i] == {a, b, c} means op[a], op[b] and op[c] depend on + // op[i] directly or indirectly. ops_before_ is the revered adjacency list of + // ops_behind_. + std::vector> ops_before_; + std::vector> ops_behind_; // op_downstream_map_ is the mapping from op to its downstream-op set, that is // to say, op_downstream_map_[i] == {a, b, c} means op[a], op[b] and op[c] - // should be dispatched after op[i] + // depend on op[i] directly. std::map> op_downstream_map_; + + // op_happens_before_ is a matrix form of ops_before_ and ops_behind_, it is + // used to speed up the query. + std::vector> op_happens_before_; }; } // namespace interpreter diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h index b3a26781fac811efc767af9744a7eb43ea0b1017..50a4f99f81c0bb4c8f050c5234b45f5ba15fac91 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h @@ -22,6 +22,7 @@ #include #include +#include "cinn/common/target.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/paddle2cinn/cinn_cache_key.h" @@ -75,8 +76,8 @@ class CinnCompiler { const CinnCompiledObject& Compile( const ir::Graph& graph, - const std::map& input_tensors, - const ::cinn::common::Target& target, + const std::map& input_tensors = {}, + const ::cinn::common::Target& target = ::cinn::common::DefaultTarget(), void* stream = nullptr); const CinnCompiledObject& Compile( diff --git a/paddle/fluid/framework/string_array.h b/paddle/fluid/framework/string_array.h index 9fd245ff91765893971558795dcd67d6e63f1533..2ff52e5d078b86f8bca62425352da2fe049f31e0 100644 --- a/paddle/fluid/framework/string_array.h +++ b/paddle/fluid/framework/string_array.h @@ -102,6 +102,14 @@ class Vocab : public phi::ExtendedTensor, // Kernel. It can be used when you define a non-tensor type that needs to be // stored in a vector as PHI kernel argument. +template +struct PhiVectorType; + +template <> +struct PhiVectorType { + const char* type_name = "PhiVectorString"; +}; + template class PhiVector : public phi::ExtendedTensor, public phi::TypeInfoTraits> { @@ -129,9 +137,7 @@ class PhiVector : public phi::ExtendedTensor, public: /// \brief Returns the name of the class for type traits. 
/// \return The name of the class. - static const char* name() { - return (std::string("PhiVector_") + std::string(typeid(T).name())).c_str(); - } + static const char* name() { return PhiVectorType().type_name; } size_t size() const { return data_.size(); } diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index e89bcfa2c6a9909b590d04de2cedb490e97d10d1..bd49153f6b85e98fa74653855971a370b974dc70 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1086,6 +1086,7 @@ bool AnalysisPredictor::GetFetch(std::vector *outputs, } void AnalysisPredictor::PrepareArgument() { + VLOG(3) << "AnalysisPredictor::PrepareArgument"; // Init std::unique_ptr argument_. argument_.reset(new Argument); argument_->SetUseGPU(config_.use_gpu()); @@ -2246,10 +2247,12 @@ AnalysisPredictor::~AnalysisPredictor() { } std::unique_ptr AnalysisPredictor::Clone(void *stream) { + VLOG(3) << "AnalysisPredictor::Clone"; std::lock_guard lk(clone_mutex_); auto *x = new AnalysisPredictor(config_); x->status_is_cloned_ = true; x->root_predictor_id_ = this->root_predictor_id_; + x->config_.apply_optim_ = false; if (config_.use_external_stream_ && stream == nullptr) { PADDLE_THROW(platform::errors::InvalidArgument( "config has been configured to use external stream, but the Clone " @@ -2461,7 +2464,7 @@ USE_TRT_CONVERTER(rsqrt); USE_TRT_CONVERTER(fused_preln_embedding_eltwise_layernorm) USE_TRT_CONVERTER(fused_embedding_eltwise_layernorm); USE_TRT_CONVERTER(preln_skip_layernorm) -USE_TRT_CONVERTER(preln_residual_bias) +USE_TRT_CONVERTER(fused_bias_dropout_residual_layer_norm) USE_TRT_CONVERTER(c_allreduce_sum) USE_TRT_CONVERTER(roll) USE_TRT_CONVERTER(strided_slice) diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 9f28343525c129da1f0386c39ae3d5652c69a478..b5582518eacd2103b25e7d78c6bd3d0ffd9abc92 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -136,6 +136,7 @@ const std::vector kTRTSubgraphPasses({ #else "elementwise_groupnorm_act_pass", // "preln_elementwise_groupnorm_act_pass", // + "groupnorm_act_pass", // #endif "tensorrt_subgraph_pass", // "conv_bn_fuse_pass", // diff --git a/paddle/fluid/inference/tensorrt/convert/group_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/group_norm_op.cc index 2afc86dfc815d0ba5be988b2bc1ba01863648d6a..4384f7d2b3cb95b04263e814c051c4821b1af1ff 100644 --- a/paddle/fluid/inference/tensorrt/convert/group_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/group_norm_op.cc @@ -46,6 +46,11 @@ class GroupNormOpConverter : public OpConverter { std::string scale_name = op_desc.Input("Scale").front(); std::string bias_name = op_desc.Input("Bias").front(); + bool with_silu = false; + if (op_desc.HasAttr("with_silu")) { + with_silu = PADDLE_GET_CONST(bool, op_desc.GetAttr("with_silu")); + } + // get the presistable var's data auto GetWeight = [&](const std::string& var_name, framework::DDim* dims) -> TensorRTEngine::Weight { @@ -77,6 +82,7 @@ class GroupNormOpConverter : public OpConverter { groups, mean_shape, variance_shape, + with_silu, with_fp16); nvinfer1::ILayer* groupnorm_layer = engine_->AddDynamicPlugin(&input_itensor, 1, plugin); diff --git a/paddle/fluid/inference/tensorrt/convert/preln_residual_bias.cc b/paddle/fluid/inference/tensorrt/convert/preln_residual_bias.cc index 
28847aa5b7a3075b8dcaa6f4b9d476ea0ab213a2..85f9106b0114883ea6db9d23f878c3a4d00a3c0c 100644 --- a/paddle/fluid/inference/tensorrt/convert/preln_residual_bias.cc +++ b/paddle/fluid/inference/tensorrt/convert/preln_residual_bias.cc @@ -26,16 +26,12 @@ class PrelnResidualBiasOpConverter : public OpConverter { void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { - VLOG(4) << "convert fused preln_residual_bias op to tensorrt layer"; - if (!engine_->with_dynamic_shape()) { - PADDLE_THROW( - platform::errors::Fatal("Unsupported static graph mode. Please set " - "dynamic shape of inputs.")); - } + VLOG(4) << "convert fused_bias_dropout_residual_layer_norm op with " + "drop_rate = 0 to preln_residual_bias tensorrt layer"; framework::OpDesc op_desc(op, nullptr); // Declare inputs auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]); - auto* input2 = engine_->GetITensor(op_desc.Input("Y")[0]); + auto* input2 = engine_->GetITensor(op_desc.Input("Residual")[0]); std::vector inputs; inputs.push_back(input1); inputs.push_back(input2); @@ -50,18 +46,18 @@ class PrelnResidualBiasOpConverter : public OpConverter { return temp_data; }; framework::DDim bias_dims, scale_dims, ele_bias_dims; - auto* bias = get_persistable_data("Bias", &bias_dims); - auto* scale = get_persistable_data("Scale", &scale_dims); + auto* bias = get_persistable_data("LnBias", &bias_dims); + auto* scale = get_persistable_data("LnScale", &scale_dims); auto const& vars = op_desc.Inputs(false); - bool has_bias = vars.find("EleBias") != vars.end(); + bool has_bias = vars.find("Bias") != vars.end(); float* ele_bias = - has_bias ? get_persistable_data("EleBias", &ele_bias_dims) : nullptr; + has_bias ? get_persistable_data("Bias", &ele_bias_dims) : nullptr; int bias_size = phi::product(bias_dims); int scale_size = phi::product(scale_dims); int ele_bias_size = has_bias ? 
phi::product(ele_bias_dims) : 0; - float epsilon = PADDLE_GET_CONST(float, op_desc.GetAttr("epsilon")); + float epsilon = PADDLE_GET_CONST(float, op_desc.GetAttr("ln_epsilon")); bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); if (engine_->precision() == AnalysisConfig::Precision::kInt8) { with_fp16 = true; @@ -102,8 +98,8 @@ class PrelnResidualBiasOpConverter : public OpConverter { plugin_inputs.emplace_back(input2); layer = engine_->AddDynamicPlugin(plugin_inputs.data(), 2, plugin); std::vector output_names; - output_names.push_back(op_desc.Output("Out_0")[0]); - output_names.push_back(op_desc.Output("Out_1")[0]); + output_names.push_back(op_desc.Output("Y")[0]); + output_names.push_back(op_desc.Output("BiasDropoutResidualOut")[0]); RreplenishLayerAndOutput( layer, "preln_residual_bias", output_names, test_mode); } @@ -113,4 +109,5 @@ class PrelnResidualBiasOpConverter : public OpConverter { } // namespace inference } // namespace paddle -REGISTER_TRT_OP_CONVERTER(preln_residual_bias, PrelnResidualBiasOpConverter); +REGISTER_TRT_OP_CONVERTER(fused_bias_dropout_residual_layer_norm, + PrelnResidualBiasOpConverter); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 0075c64759333ef23e66b599d205947cb47de043..e9c34408bb6bfdbff52d39fd486a5a4af8937e47 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -1495,7 +1495,21 @@ struct SimpleOpTypeSetTeller : public Teller { return false; } } - + if (op_type == "fused_bias_dropout_residual_layer_norm") { + if (!with_dynamic_shape) { + VLOG(3) << "fused_bias_dropout_residual_layer_norm should run on " + "dynamic shape mode."; + return false; + } + float dropout_rate = + PADDLE_GET_CONST(float, desc.GetAttr("dropout_rate")); + if (dropout_rate != 0.0f) { + VLOG(4) << "preln_residual_bias trt layer can not work with " + "fused_bias_dropout_residual_layer_norm op in which the " + "dropout_rate != 0, stop convert"; + return false; + } + } if (op_type == "fused_preln_embedding_eltwise_layernorm") { if (!with_dynamic_shape) { VLOG(3) << "fused_preln_embedding_eltwise_layernorm should run on " @@ -2594,7 +2608,7 @@ struct SimpleOpTypeSetTeller : public Teller { "slice", "strided_slice", "fused_preln_embedding_eltwise_layernorm", - "preln_residual_bias", + "fused_bias_dropout_residual_layer_norm", "c_allreduce_sum", "c_allreduce_min", "c_allreduce_max", @@ -2744,7 +2758,7 @@ struct SimpleOpTypeSetTeller : public Teller { "strided_slice", "fused_preln_embedding_eltwise_layernorm", "preln_skip_layernorm", - "preln_residual_bias", + "fused_bias_dropout_residual_layer_norm", "c_allreduce_sum", "c_allreduce_min", "c_allreduce_max", diff --git a/paddle/fluid/inference/tensorrt/plugin/common/groupNormPluginCommon.h b/paddle/fluid/inference/tensorrt/plugin/common/groupNormPluginCommon.h index 81d507e866a1c2a1fa74b0a06c1af5ed10a00538..915ee1b5e23acd85732a107cb342c28cbc0b54a7 100644 --- a/paddle/fluid/inference/tensorrt/plugin/common/groupNormPluginCommon.h +++ b/paddle/fluid/inference/tensorrt/plugin/common/groupNormPluginCommon.h @@ -49,8 +49,8 @@ struct GroupNormNHWCParams { int32_t c; // The number of groups. int32_t groups; - // Do we apply the Swish activation function? - bool withSwish; + // Do we apply the Silu activation function? + bool withSilu; // Precomputed values and parameters to control the execution of the kernels. 
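// Editor's annotation (illustrative, not part of the patch): "Silu" in these
// plugin changes is the same x * sigmoid(x) activation the code previously
// labelled "Swish"; only the naming and the per-instance with_silu flag change.
// A scalar reference of what the kernels compute when params.withSilu is set
// (silu_ref is a name introduced here; the device code uses its own sigmoid
// helper):
//
//   inline float silu_ref(float x) { return x * (1.f / (1.f + expf(-x))); }
//
// The group_norm converter reads the optional "with_silu" attribute
// (presumably set by the new groupnorm_act_pass) and forwards it into
// GroupNormPluginDynamic, which now serializes it alongside with_fp16_.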
diff --git a/paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.cu index 77c00d47d4cea3ee9fdf574172ea6d2b4793b076..fc139a9734b30af701ca9cbfeb7cb9e7a1c803ed 100644 --- a/paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.cu @@ -247,8 +247,8 @@ __global__ void groupNormNHWCScaleKernel(const GroupNormNHWCParams params) { f2.x = gammaF2.x * f2.x + betaF2.x; f2.y = gammaF2.y * f2.y + betaF2.y; - // Apply Swish if needed. - if (params.withSwish) { + // Apply Silu if needed. + if (params.withSilu) { f2.x = f2.x * sigmoid(f2.x); f2.y = f2.y * sigmoid(f2.y); } @@ -457,7 +457,7 @@ bool GroupNormPluginDynamic::supportsFormatCombination( if (pos == 0) { if (with_fp16_) { return ((in.type == nvinfer1::DataType::kHALF) && - (in.format == nvinfer1::PluginFormat::kLINEAR || + ((!with_silu_ && in.format == nvinfer1::PluginFormat::kLINEAR) || in.format == nvinfer1::PluginFormat::kHWC8)); } else { return (in.type == nvinfer1::DataType::kFLOAT) && @@ -624,7 +624,7 @@ int GroupNormPluginDynamic::enqueue( cPerBlock = 8; } - params_.withSwish = false; + params_.withSilu = with_silu_; params_.dst = static_cast(outputs[0]); params_.srcX = static_cast(inputs[0]); params_.gamma = scale_gpu_; diff --git a/paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.h index 1fa505c077ea81ffe1dcdded3363d21504f2c499..3feb35e0708bc6721824a12faf63abe83734084b 100644 --- a/paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.h @@ -164,11 +164,13 @@ class GroupNormPluginDynamic : public DynamicPluginTensorRT { int groups, std::vector mean_shape, std::vector variance_shape, + bool with_silu, bool with_fp16) : groups_(groups), eps_(eps), mean_shape_(mean_shape), variance_shape_(variance_shape), + with_silu_(with_silu), with_fp16_(with_fp16) { scale_.resize(scale_num); bias_.resize(bias_num); @@ -183,6 +185,7 @@ class GroupNormPluginDynamic : public DynamicPluginTensorRT { DeserializeValue(&serialData, &serialLength, &groups_); DeserializeValue(&serialData, &serialLength, &mean_shape_); DeserializeValue(&serialData, &serialLength, &variance_shape_); + DeserializeValue(&serialData, &serialLength, &with_silu_); DeserializeValue(&serialData, &serialLength, &with_fp16_); } nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override { @@ -194,6 +197,7 @@ class GroupNormPluginDynamic : public DynamicPluginTensorRT { groups_, mean_shape_, variance_shape_, + with_silu_, with_fp16_); ptr->scale_gpu_ = scale_gpu_; ptr->bias_gpu_ = bias_gpu_; @@ -210,7 +214,7 @@ class GroupNormPluginDynamic : public DynamicPluginTensorRT { return SerializedSize(scale_) + SerializedSize(bias_) + SerializedSize(eps_) + SerializedSize(groups_) + SerializedSize(mean_shape_) + SerializedSize(variance_shape_) + - SerializedSize(with_fp16_); + SerializedSize(with_silu_) + SerializedSize(with_fp16_); } void serialize(void* buffer) const TRT_NOEXCEPT override { SerializeValue(&buffer, scale_); @@ -219,6 +223,7 @@ class GroupNormPluginDynamic : public DynamicPluginTensorRT { SerializeValue(&buffer, groups_); SerializeValue(&buffer, mean_shape_); SerializeValue(&buffer, variance_shape_); + SerializeValue(&buffer, with_silu_); SerializeValue(&buffer, with_fp16_); } nvinfer1::DimsExprs getOutputDimensions( @@ -277,6 +282,7 @@ class GroupNormPluginDynamic : public 
DynamicPluginTensorRT { std::vector mean_shape_; std::vector variance_shape_; GroupNormNHWCParams params_; + bool with_silu_; bool with_fp16_; }; class GroupNormPluginDynamicCreator : public TensorRTPluginCreator { diff --git a/paddle/fluid/inference/tensorrt/plugin/preln_groupnorm_act_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/preln_groupnorm_act_op_plugin.cu index a756a826bfb1554588dc8ebf38734fc20b367b01..d3ca36770a4d22d34267e5179700a8a180fa9dcf 100644 --- a/paddle/fluid/inference/tensorrt/plugin/preln_groupnorm_act_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/preln_groupnorm_act_op_plugin.cu @@ -330,8 +330,8 @@ __global__ void prelnGroupNormNHWCScaleKernel(GroupNormNHWCParams params) { f2.x = gammaF2.x * f2.x + betaF2.x; f2.y = gammaF2.y * f2.y + betaF2.y; - // Apply Swish if needed. - if (params.withSwish) { + // Apply Silu if needed. + if (params.withSilu) { f2.x = f2.x * sigmoid(f2.x); f2.y = f2.y * sigmoid(f2.y); } @@ -431,7 +431,7 @@ int PrelnGroupnormActPluginDynamic::enqueue( if (cPerBlock > input_desc[0].dims.d[1]) { cPerBlock = 8; } - params_.withSwish = with_silu_; + params_.withSilu = with_silu_; params_.dst = static_cast(outputs[1]); params_.eleOut = static_cast(outputs[0]); params_.srcX = static_cast(inputs[0]); diff --git a/paddle/fluid/inference/tensorrt/plugin/skip_groupnorm_act_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/skip_groupnorm_act_op_plugin.cu index adba9324472a29e3804d335276f27c54151328ff..997205e9189366d5ee2798c96b817ac22002734b 100644 --- a/paddle/fluid/inference/tensorrt/plugin/skip_groupnorm_act_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/skip_groupnorm_act_op_plugin.cu @@ -340,8 +340,8 @@ __global__ void skipGroupNormNHWCScaleKernel(GroupNormNHWCParams params) { f2.x = gammaF2.x * f2.x + betaF2.x; f2.y = gammaF2.y * f2.y + betaF2.y; - // Apply Swish if needed. - if (params.withSwish) { + // Apply Silu if needed. 
+ if (params.withSilu) { f2.x = f2.x * sigmoid(f2.x); f2.y = f2.y * sigmoid(f2.y); } @@ -439,7 +439,7 @@ int SkipGroupnormActPluginDynamic::enqueue( if (cPerBlock > input_desc[0].dims.d[1]) { cPerBlock = 8; } - params_.withSwish = true; + params_.withSilu = true; params_.dst = static_cast(outputs[0]); params_.srcX = static_cast(inputs[0]); params_.srcY = static_cast(inputs[1]); diff --git a/paddle/fluid/jit/compilation_unit.cc b/paddle/fluid/jit/compilation_unit.cc index 0f241d864fe07ecd6998f187b0661c5c3a3f7868..1a2351048f90a65e76f2e76ad4573001f486b9a3 100644 --- a/paddle/fluid/jit/compilation_unit.cc +++ b/paddle/fluid/jit/compilation_unit.cc @@ -38,5 +38,13 @@ void CompilationUnit::SetEngine(const std::string &name, const jit::EngineMap &CompilationUnit::EngineMap() const { return engine_map_; } +std::shared_ptr CompilationUnit::Clone(void *stream) { + auto x = std::make_shared(); + for (auto &it : engine_map_) { + x->SetEngine(it.first, std::move(it.second->Clone(stream))); + } + return x; +} + } // namespace jit } // namespace paddle diff --git a/paddle/fluid/jit/compilation_unit.h b/paddle/fluid/jit/compilation_unit.h index b862faa23f9785bfd839cc563afd1d5c61e5356d..25e725fe57b9ee8203f0f7f1756877b665764b71 100644 --- a/paddle/fluid/jit/compilation_unit.h +++ b/paddle/fluid/jit/compilation_unit.h @@ -36,6 +36,8 @@ class CompilationUnit { const jit::EngineMap &EngineMap() const; + std::shared_ptr Clone(void *stream = nullptr); + private: jit::EngineMap engine_map_; }; diff --git a/paddle/fluid/jit/engine/base_engine.h b/paddle/fluid/jit/engine/base_engine.h index eaf3c1221c8a20243b3fea810e2a2ecc4628d0d6..b6571d7ebdd41107ca339093453701fcdadb8139 100644 --- a/paddle/fluid/jit/engine/base_engine.h +++ b/paddle/fluid/jit/engine/base_engine.h @@ -29,6 +29,8 @@ class BaseEngine { virtual std::vector operator()(const std::vector &inputs) = 0; + virtual std::unique_ptr Clone(void *stream = nullptr) = 0; + virtual ~BaseEngine() {} }; diff --git a/paddle/fluid/jit/engine/interpreter_engine.cc b/paddle/fluid/jit/engine/interpreter_engine.cc index 410fd4dc01bed1f34d446bfe8ef020e0d2d21d7c..b16d0c98dee81b94e0ce2ded1a6086856c8248af 100644 --- a/paddle/fluid/jit/engine/interpreter_engine.cc +++ b/paddle/fluid/jit/engine/interpreter_engine.cc @@ -25,17 +25,18 @@ namespace paddle { namespace jit { -InterpreterEngine::InterpreterEngine(const std::shared_ptr &info, - const VariableMap ¶ms_dict, - const phi::Place &place) - : info_(info), place_(place) { +InterpreterEngine::InterpreterEngine( + const std::shared_ptr &info, + const std::shared_ptr ¶ms_dict, + const phi::Place &place) + : info_(info), params_dict_(params_dict), place_(place) { info_->RemoveDescFeedFetch(); PADDLE_ENFORCE_GT( static_cast(info_->ProgramDesc().Block(0).OpSize()), 0, platform::errors::PreconditionNotMet( "There is no operator in ProgramDesc.")); - utils::ShareParamsIntoScope(info_->ParamNames(), params_dict, &scope_); + utils::ShareParamsIntoScope(info_->ParamNames(), params_dict_, &scope_); VLOG(6) << framework::GenScopeTreeDebugInfo(&scope_); CreateInterpreterCore(); } @@ -98,5 +99,10 @@ const std::shared_ptr &InterpreterEngine::Info() const { return info_; } +std::unique_ptr InterpreterEngine::Clone(void *stream) { + auto *x = new InterpreterEngine(info_, params_dict_, place_); + return std::unique_ptr(x); +} + } // namespace jit } // namespace paddle diff --git a/paddle/fluid/jit/engine/interpreter_engine.h b/paddle/fluid/jit/engine/interpreter_engine.h index 
8c7f43f297d224a8de24d0215041a613c394485b..367bc1b86dcc6e8df5348c6ca8c1aecc12abe573 100644 --- a/paddle/fluid/jit/engine/interpreter_engine.h +++ b/paddle/fluid/jit/engine/interpreter_engine.h @@ -36,21 +36,25 @@ using InterpreterCore = framework::InterpreterCore; class InterpreterEngine : public BaseEngine { public: InterpreterEngine(const std::shared_ptr &info, - const VariableMap ¶ms_dict, + const std::shared_ptr ¶ms_dict, const phi::Place &place); ~InterpreterEngine() noexcept {} void CreateInterpreterCore(); - std::vector operator()(const std::vector &inputs); + std::vector operator()(const std::vector &inputs) override; - std::vector operator()(const std::vector &inputs); + std::vector operator()( + const std::vector &inputs) override; const std::shared_ptr &Info() const; + std::unique_ptr Clone(void *stream = nullptr) override; + private: std::shared_ptr info_; + std::shared_ptr params_dict_; framework::Scope scope_; phi::Place place_; std::shared_ptr inner_interpreter_; diff --git a/paddle/fluid/jit/engine/predictor_engine.cc b/paddle/fluid/jit/engine/predictor_engine.cc index 6a44c192c16f729224810ca945d4888ef8491aab..d18f4f487dbe2ebf302e0357777649077ba83d6a 100644 --- a/paddle/fluid/jit/engine/predictor_engine.cc +++ b/paddle/fluid/jit/engine/predictor_engine.cc @@ -27,11 +27,15 @@ static bool PaddleTensorToDenseTensor(const PaddleTensor &pt, DenseTensor *t, const platform::Place &place); -PredictorEngine::PredictorEngine(const std::shared_ptr &info, - const VariableMap ¶ms_dict, - const phi::Place &place) - : info_(info), scope_(new framework::Scope()), place_(place) { - utils::ShareParamsIntoScope(info_->ParamNames(), params_dict, scope_.get()); +PredictorEngine::PredictorEngine( + const std::shared_ptr &info, + const std::shared_ptr ¶ms_dict, + const phi::Place &place) + : info_(info), + params_dict_(params_dict), + scope_(new framework::Scope()), + place_(place) { + utils::ShareParamsIntoScope(info_->ParamNames(), params_dict_, scope_.get()); VLOG(6) << framework::GenScopeTreeDebugInfo(scope_.get()); // TODO(Aurelius84): Expose AnalysisConfig to user. 
@@ -55,6 +59,23 @@ PredictorEngine::PredictorEngine(const std::shared_ptr &info, scope_, std::make_shared(info_->ProgramDesc())); } +PredictorEngine::PredictorEngine( + const std::shared_ptr &info, + const std::shared_ptr &scope, + const phi::Place &place, + const std::shared_ptr &predictor) + : info_(info), + scope_(scope), + place_(place), + predictor_(std::dynamic_pointer_cast( + predictor)) {} + +std::unique_ptr PredictorEngine::Clone(void *stream) { + auto *x = new PredictorEngine( + info_, scope_, place_, std::move(predictor_->Clone(stream))); + return std::unique_ptr(x); +} + std::vector PredictorEngine::operator()( const std::vector &inputs) { auto dense_tensors = utils::ToDenseTensors(inputs); diff --git a/paddle/fluid/jit/engine/predictor_engine.h b/paddle/fluid/jit/engine/predictor_engine.h index 026b012cbfb02fd0afc4cb75d4162246f4d7c6c6..b2da6f4210a37a7223e1d8cf07a5cd2cf38f4184 100644 --- a/paddle/fluid/jit/engine/predictor_engine.h +++ b/paddle/fluid/jit/engine/predictor_engine.h @@ -20,6 +20,7 @@ namespace paddle { class AnalysisPredictor; +class PaddlePredictor; namespace framework { class Scope; @@ -30,17 +31,26 @@ namespace jit { class PredictorEngine : public BaseEngine { public: PredictorEngine(const std::shared_ptr &info, - const VariableMap ¶ms_dict, + const std::shared_ptr ¶ms_dict, const phi::Place &place); + PredictorEngine(const std::shared_ptr &info, + const std::shared_ptr &scope, + const phi::Place &place, + const std::shared_ptr &predictor); + ~PredictorEngine() noexcept {} - std::vector operator()(const std::vector &inputs); + std::vector operator()(const std::vector &inputs) override; + + std::vector operator()( + const std::vector &inputs) override; - std::vector operator()(const std::vector &inputs); + std::unique_ptr Clone(void *stream = nullptr) override; private: std::shared_ptr info_; + std::shared_ptr params_dict_; std::shared_ptr scope_; phi::Place place_; std::shared_ptr predictor_; diff --git a/paddle/fluid/jit/function_utils.cc b/paddle/fluid/jit/function_utils.cc index b67b5ba5b0518d8d7e6367ebaf7f155326c34c45..3bd8c234113157276338ac0d1ed89cfa3cfae687 100644 --- a/paddle/fluid/jit/function_utils.cc +++ b/paddle/fluid/jit/function_utils.cc @@ -71,18 +71,18 @@ void ShareIntoScope(const std::vector &ordered_input_names, } void ShareParamsIntoScope(const std::vector ¶m_names, - const VariableMap ¶ms_dict, + const std::shared_ptr ¶ms_dict, framework::Scope *scope) { for (size_t i = 0; i < param_names.size(); ++i) { std::string name = param_names[i]; - PADDLE_ENFORCE_EQ(params_dict.count(name), + PADDLE_ENFORCE_EQ(params_dict->count(name), 1, phi::errors::InvalidArgument( "Parameter named %s is not existed in params_dict. 
" "Please check that your model was saved correctly", name)); - auto ¶m = params_dict.find(name)->second; + auto ¶m = params_dict->find(name)->second; auto &dense_tensor = param->Get(); auto *var = scope->Var(name); auto *dst_tensor = var->GetMutable(); diff --git a/paddle/fluid/jit/function_utils.h b/paddle/fluid/jit/function_utils.h index d61b720cec88fd1ea8877b4bcc8eb4c2757f1894..5daa5ada200f48adf595d5b62218fa305fcfd95d 100644 --- a/paddle/fluid/jit/function_utils.h +++ b/paddle/fluid/jit/function_utils.h @@ -51,14 +51,14 @@ void ShareIntoScope(const std::vector &ordered_input_names, framework::Scope *scope); void ShareParamsIntoScope(const std::vector ¶m_names, - const VariableMap ¶ms_dict, + const std::shared_ptr ¶ms_dict, framework::Scope *scope); void RemoveFeedFetch(framework::ProgramDesc *program_desc); template std::shared_ptr MakeEngine(const std::shared_ptr &info, - const VariableMap ¶ms_dict, + const std::shared_ptr ¶ms_dict, const phi::Place &place) { return std::make_shared(info, params_dict, place); } diff --git a/paddle/fluid/jit/layer.cc b/paddle/fluid/jit/layer.cc index 75a7e282e6be8020514d22ef62648eda52a786b0..2e8dba0f5a731b9e4e6ace901e08accba58ec481 100644 --- a/paddle/fluid/jit/layer.cc +++ b/paddle/fluid/jit/layer.cc @@ -26,11 +26,14 @@ namespace paddle { namespace jit { -Layer::Layer(const VariableMap& params_map, - const VariableMap& attrs_map, +Layer::Layer(const std::shared_ptr& params_map, + const std::shared_ptr& attrs_map, const FunctionInfoMap& info_map, const phi::Place& place) - : params_map_(params_map), attrs_map_(attrs_map), info_map_(info_map) { + : params_map_(params_map), + attrs_map_(attrs_map), + info_map_(info_map), + place_(place) { unit_.reset(new CompilationUnit()); } @@ -77,12 +80,12 @@ std::vector Layer::FunctionNames() const { #define PD_SPECIALZE_ATTRIBUTE_TYPE(T) \ template <> \ T Layer::Attribute(const std::string& name) const { \ - if (attrs_map_.find(name) == attrs_map_.end()) { \ + if (attrs_map_->find(name) == attrs_map_->end()) { \ PADDLE_THROW(phi::errors::NotFound( \ "Attribute can not found %s, please check if it exists.")); \ return T(); \ } \ - auto var = attrs_map_.at(name); \ + auto var = attrs_map_->at(name); \ T ret = var->Get(); \ return ret; \ } @@ -94,5 +97,12 @@ PD_SPECIALZE_ATTRIBUTE_TYPE(std::vector) PD_SPECIALZE_ATTRIBUTE_TYPE(std::vector) PD_SPECIALZE_ATTRIBUTE_TYPE(std::vector) +std::shared_ptr Layer::Clone(void* stream) { + std::shared_ptr x = + std::make_shared(params_map_, attrs_map_, info_map_, place_); + x->unit_ = unit_->Clone(stream); + return x; +} + } // namespace jit } // namespace paddle diff --git a/paddle/fluid/jit/layer.h b/paddle/fluid/jit/layer.h index dd5ff5d9f91cd9fa27bd03ae220b201b7982c407..4f76a41d06f3ea3d2d26aea30fb3ed8db6791c93 100644 --- a/paddle/fluid/jit/layer.h +++ b/paddle/fluid/jit/layer.h @@ -43,8 +43,8 @@ using FunctionInfoMap = class Layer { public: - Layer(const VariableMap& params_map, - const VariableMap& attrs_map_, + Layer(const std::shared_ptr& params_map, + const std::shared_ptr& attrs_map_, const FunctionInfoMap& info_map, const phi::Place& place); @@ -67,10 +67,13 @@ class Layer { std::vector FunctionNames() const; + std::shared_ptr Clone(void* stream = nullptr); + private: - VariableMap params_map_; - VariableMap attrs_map_; + std::shared_ptr params_map_; + std::shared_ptr attrs_map_; FunctionInfoMap info_map_; + phi::Place place_; std::shared_ptr unit_; }; diff --git a/paddle/fluid/jit/layer_test.cc b/paddle/fluid/jit/layer_test.cc index 
4e367d8cc1b510e5723f5c634b6b5b2486e85595..c163f3c50d9dd3a0eab0baa90711a22e484c7f2b 100644 --- a/paddle/fluid/jit/layer_test.cc +++ b/paddle/fluid/jit/layer_test.cc @@ -20,6 +20,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/timer.h" #include "paddle/phi/api/include/api.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" @@ -78,7 +79,11 @@ TEST(CpuLayerTest, Function) { TEST(CpuLayerTest, Construct) { auto place = phi::CPUPlace(); std::string path = "./multi_program_load/export"; + paddle::platform::Timer timer; + timer.Start(); auto layer = jit::Load(path, place); + timer.Pause(); + std::cout << "jit::Load cost " << timer.ElapsedMS() << " ms" << std::endl; float fbias = layer.Attribute("fbias"); EXPECT_FLOAT_EQ(fbias, 1.4); @@ -119,6 +124,41 @@ TEST(CpuLayerTest, Construct) { EXPECT_NEAR(out_data[0], pow(1.41562390, 2.0), 1e-6); } +TEST(CpuLayerTest, Clone) { + auto place = phi::CPUPlace(); + std::string path = "./multi_program_load/export"; + + paddle::platform::Timer timer; + timer.Start(); + auto layer = jit::Load(path, place); + timer.Pause(); + std::cout << "jit::Load cost " << timer.ElapsedMS() << " ms" << std::endl; + + timer.Start(); + auto layer2 = layer.Clone(); + timer.Pause(); + std::cout << "jit::Layer::Clone cost " << timer.ElapsedMS() << " ms" + << std::endl; + + float fbias = layer2->Attribute("fbias"); + EXPECT_FLOAT_EQ(fbias, 1.4); + + auto inputs = PrepareInputs(place); + auto outs = layer2->forward(inputs); + auto out_data = outs[0].data(); + EXPECT_NEAR(out_data[0], 0.02194316, 1e-6); + + auto func = layer2->Function("infer"); + EXPECT_TRUE(func.IsValid()); + outs = func(inputs); + out_data = outs[0].data(); + EXPECT_NEAR(out_data[0], 1.41562390, 1e-6); + auto pow_out = + paddle::experimental::pow(outs[0], paddle::experimental::Scalar(2)); + out_data = pow_out.data(); + EXPECT_NEAR(out_data[0], pow(1.41562390, 2.0), 1e-6); +} + #if defined(PADDLE_WITH_CUDA) TEST(GpuLayerTest, Construct) { auto place = phi::GPUPlace(); @@ -147,6 +187,22 @@ TEST(GpuLayerTest, Construct) { out_data = cpu_tensor.data(); EXPECT_NEAR(out_data[0], sqrt(1.41562390), 1e-6); } + +TEST(GpuLayerTest, Clone) { + auto place = phi::GPUPlace(); + + std::string path = "./multi_program_load/export"; + auto layer = jit::Load(path, place); + auto inputs = PrepareInputs(place); + + auto layer2 = layer.Clone(); + auto outs = layer2->forward(inputs); + auto gpu_tensor = outs[0]; + auto cpu_tensor = + paddle::experimental::copy_to(gpu_tensor, phi::CPUPlace(), true); + auto out_data = cpu_tensor.data(); + EXPECT_NEAR(out_data[0], 0.02194316, 1e-6); +} #endif } // namespace jit diff --git a/paddle/fluid/jit/serializer.cc b/paddle/fluid/jit/serializer.cc index 0a7fdc0e3525a6a3180cae05749d2934f41abbd7..21a187ad6710052d4d5c5085616bf34fba45db0c 100644 --- a/paddle/fluid/jit/serializer.cc +++ b/paddle/fluid/jit/serializer.cc @@ -30,8 +30,10 @@ DECLARE_string(jit_engine_type); namespace paddle { namespace jit { + using FunctionInfoMap = std::unordered_map>; + Layer Deserializer::operator()(const std::string& path, const phi::Place& place) { const auto& pdmodel_paths = utils::PdmodelFilePaths(path); @@ -56,12 +58,12 @@ Layer Deserializer::operator()(const std::string& path, info_map[func_name]->SetProgramFilePath(it.second); } - VariableMap params_dict; - VariableMap attrs_dict; - ReadTensorData(path + PDPARAMS_SUFFIX, param_names_set, place, &params_dict); + auto params_dict = std::make_shared(); + auto
attrs_dict = std::make_shared(); + ReadTensorData(path + PDPARAMS_SUFFIX, param_names_set, place, params_dict); if (utils::FileExists(path + PROPERTY_SUFFIX)) { - ReadAttributeData(path + PROPERTY_SUFFIX, &attrs_dict); + ReadAttributeData(path + PROPERTY_SUFFIX, attrs_dict); VLOG(3) << "Read Property Success!"; } @@ -88,10 +90,11 @@ Layer Deserializer::operator()(const std::string& path, return layer; } -void Deserializer::ReadTensorData(const std::string& file_name, - const std::set& var_name, - const phi::Place& place, - VariableMap* params_dict) const { +void Deserializer::ReadTensorData( + const std::string& file_name, + const std::set& var_name, + const phi::Place& place, + std::shared_ptr params_dict) const { VLOG(3) << "ReadTensorData from: " << file_name; std::ifstream fin(file_name, std::ios::binary); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -106,12 +109,15 @@ void Deserializer::ReadTensorData(const std::string& file_name, } } -void Deserializer::ReadAttributeData(const std::string& file_path, - VariableMap* attrs_dict) const { +void Deserializer::ReadAttributeData( + const std::string& file_path, + std::shared_ptr attrs_dict) const { VLOG(3) << "ReadPropertyData from: " << file_path; Property p; p.Deserialization(file_path); - *attrs_dict = static_cast(p.Values()); + for (auto& it : p.Values()) { + attrs_dict->emplace(it.first, it.second); + } return; } diff --git a/paddle/fluid/jit/serializer.h b/paddle/fluid/jit/serializer.h index b93eaa44fe63268b13bfd86b66e998ac77da3392..926e9a6afda3718626c6629664867d01c72afa92 100644 --- a/paddle/fluid/jit/serializer.h +++ b/paddle/fluid/jit/serializer.h @@ -55,11 +55,11 @@ class Deserializer { void ReadTensorData(const std::string& file_name, const std::set& var_name, const phi::Place& place, - VariableMap* params_dict) const; + std::shared_ptr params_dict) const; // property pb void ReadAttributeData(const std::string& file_path, - VariableMap* attrs_dict) const; + std::shared_ptr attrs_dict) const; // void ReadExtraInfo(const std::string& file_name) const; diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 9d895edc96538c315eca9167fb63fca56aa95522..77c8554ad39abb55d5a1e21dbca63cf8309f027f 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -177,26 +177,6 @@ $$out = \min(\max(0, x), threshold)$$ } }; -class PowOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "Input of Pow operator"); - AddInput("FactorTensor", - "(Tensor, optional). If provided, pow will use this" - "The shape of FactorTensor MUST BE [1]." - "it has higher priority than attr(factor).") - .AsDispensable(); - AddOutput("Out", "Output of Pow operator"); - AddAttr("factor", "The exponential factor of Pow").SetDefault(1.0f); - AddComment(R"DOC( -Pow Activation Operator. 
- -$$out = x^{factor}$$ - -)DOC"); - } -}; - class STanhOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -403,138 +383,6 @@ DECLARE_INPLACE_OP_INFERER(ActivationDoubleGradOpInplaceInferer, DECLARE_INPLACE_OP_INFERER(ActivationTripleGradOpInplaceInferer, {"DDX", "D_DOut"}); -template -class PowGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("pow_grad"); - op->SetInput("X", this->Input("X")); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetOutput(framework ::GradVarName("X"), this->InputGrad("X")); - op->SetInput("FactorTensor", this->Input("FactorTensor")); - op->SetAttrMap(this->Attrs()); - } -}; -template -class PowDoubleGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("pow_double_grad"); - op->SetInput("X", this->Input("X")); - op->SetInput("DOut", this->Input(framework::GradVarName("Out"))); - op->SetInput("DDX", this->OutputGrad(framework ::GradVarName("X"))); - op->SetOutput("DX", this->InputGrad("X")); - op->SetOutput("DDOut", this->InputGrad(framework::GradVarName("Out"))); - op->SetInput("FactorTensor", this->Input("FactorTensor")); - op->SetAttrMap(this->Attrs()); - } -}; -template -class PowTripleGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("pow_triple_grad"); - op->SetInput("X", this->Input("X")); - op->SetInput("DOut", this->Input("DOut")); - op->SetInput("DDX", this->Input("DDX")); - op->SetInput("D_DX", this->OutputGrad("DX")); - op->SetInput("D_DDOut", this->OutputGrad("DDOut")); - op->SetOutput("D_X", this->InputGrad("X")); - op->SetOutput("D_DOut", this->InputGrad("DOut")); - op->SetOutput("D_DDX", this->InputGrad("DDX")); - op->SetInput("FactorTensor", this->Input("FactorTensor")); - op->SetAttrMap(this->Attrs()); - } -}; -class PowOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - ctx->ShareDim("X", /*->*/ "Out"); - ctx->ShareLoD("X", /*->*/ "Out"); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return GetKernelType(ctx, *this, "X"); - } - - phi::KernelKey GetKernelTypeForVar( - const std::string& var_name, - const phi::DenseTensor& tensor, - const phi::KernelKey& expected_kernel_type) const override { - if (var_name == "FactorTensor") { - return phi::KernelKey(phi::Backend::ALL_BACKEND, - expected_kernel_type.layout(), - expected_kernel_type.dtype()); - } - return phi::KernelKey( - tensor.place(), tensor.layout(), expected_kernel_type.dtype()); - } -}; - -class PowOpGrad : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - auto out_grad_name = framework::GradVarName("Out"); - ctx->ShareDim(out_grad_name, framework::GradVarName("X")); - ctx->ShareLoD(out_grad_name, framework::GradVarName("X")); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return GetKernelType(ctx, 
*this, framework::GradVarName("Out")); - } - - phi::KernelKey GetKernelTypeForVar( - const std::string& var_name, - const phi::DenseTensor& tensor, - const phi::KernelKey& expected_kernel_type) const override { - if (var_name == "FactorTensor") { - return phi::KernelKey(phi::Backend::ALL_BACKEND, - expected_kernel_type.layout(), - expected_kernel_type.dtype()); - } - return phi::KernelKey( - tensor.place(), tensor.layout(), expected_kernel_type.dtype()); - } -}; - -class PowOpDoubleGrad : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return GetKernelType(ctx, *this, "X"); - } -}; - -class PowOpTripleGrad : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return GetKernelType(ctx, *this, "X"); - } -}; DECLARE_INPLACE_OP_INFERER(ActFwdInplaceInferer, {"X", "Out"}); } // namespace operators } // namespace paddle @@ -582,40 +430,6 @@ REGISTER_ACTIVATION_OP(hard_swish, HardSwishGradFunctor); REGISTER_ACTIVATION_OP(swish, Swish, SwishFunctor, SwishGradFunctor); -/* ========================== pow register ============================ */ -DECLARE_INFER_SHAPE_FUNCTOR(pow_double_grad, - PowDoubleGradInferShapeFunctor, - PD_INFER_META(phi::GeneralBinaryGradInferMeta)); -DECLARE_INFER_SHAPE_FUNCTOR(pow_triple_grad, - PowTripleGradInferShapeFunctor, - PD_INFER_META(phi::GeneralTernaryGradInferMeta)); - -REGISTER_OPERATOR( - pow, - ops::PowOp, - ops::PowOpMaker, - ops::ActivationOpInferVarType, - ops::PowGradOpMaker, - ops::PowGradOpMaker, - std::conditional>(), - ops::ActFwdInplaceInferer, - void>::type); -REGISTER_OPERATOR(pow_grad, - ops::PowOpGrad, - ops::ActivationGradOpInplaceInferer, - ops::PowDoubleGradOpMaker, - ops::PowDoubleGradOpMaker); -REGISTER_OPERATOR(pow_double_grad, - ops::PowOpDoubleGrad, - ops::ActivationDoubleGradOpInplaceInferer, - ops::PowTripleGradOpMaker, - ops::PowTripleGradOpMaker, - PowDoubleGradInferShapeFunctor); -REGISTER_OPERATOR(pow_triple_grad, - ops::PowOpTripleGrad, - PowTripleGradInferShapeFunctor); -/* ========================================================================== */ - /* ========================== register checkpoint ===========================*/ REGISTER_OP_VERSION(leaky_relu) .AddCheckpoint( diff --git a/paddle/fluid/operators/broadcast_tensors_op.cc b/paddle/fluid/operators/broadcast_tensors_op.cc deleted file mode 100644 index 6d924644192c9526cebbda27fc7423c778474c5e..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/broadcast_tensors_op.cc +++ /dev/null @@ -1,192 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/var_type_inference.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/multiary.h" - -namespace paddle { -namespace operators { -using framework::DDim; - -class BroadcastTensorsOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - // Broadcast semantics enforces all input variables having the same - // DataType/VarType - // This condition is also checked during VarType Inference - // Here we simply copy input type to output - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } -}; - -class BroadcastTensorsOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput( - "X", - "A Varaible list. The shape and data type of the list elements" - "should be consistent. Variable can be multi-dimensional Tensor" - "or phi::DenseTensor, and data types can be: bool, float16, float32, " - "float64, int32, " - "int64.") - .AsDuplicable(); - AddOutput("Out", - "the sum of input :code:`x`. its shape and data types are " - "consistent with :code:`x`.") - .AsDuplicable(); - AddComment( - R"DOC(This OP is used to broadcast a vector of inputs - with phi::DenseTensor type, following broadcast semantics.)DOC"); - } -}; - -class BroadcastTensorsOpVarTypeInference : public framework::VarTypeInference { - public: - void operator()(framework::InferVarTypeContext* ctx) const override { - // We need at least two tensors to satisfy broadcast semantics - size_t input_size = ctx->InputSize("X"); - PADDLE_ENFORCE_GT( - input_size, - 0, - platform::errors::InvalidArgument( - "BroadcastTensorsOp should have at least one input variables," - "but only received %d ", - input_size)); - - // BroadcastTensorsOp takes a vector of variables named "X" - // Here we loop through input variables, - // and check if their DataType/VarType are the same - auto var_type = ctx->GetInputType("X", 0); - auto data_type = ctx->GetInputDataType("X", 0); - for (size_t ind = 1; ind < input_size; ind++) { - auto cur_var_type = ctx->GetInputType("X", ind); - PADDLE_ENFORCE_EQ( - var_type, - cur_var_type, - platform::errors::InvalidArgument( - "inputs to BroadcastTensorsOp should have the same variable type," - "but detected %d v.s %d ", - framework::ToTypeName(var_type), - framework::ToTypeName(cur_var_type))); - - auto cur_data_type = ctx->GetInputDataType("X", ind); - PADDLE_ENFORCE_EQ( - data_type, - cur_data_type, - platform::errors::InvalidArgument( - "inputs to BroadcastTensorsOp should have the same data type," - "but detected %d v.s %d ", - framework::ToTypeName(var_type), - framework::ToTypeName(cur_var_type))); - } - - // Outputs having the same DataType/VarType as inputs - ctx->SetOutputType("Out", var_type, framework::ALL_ELEMENTS); - ctx->SetOutputDataType("Out", data_type, framework::ALL_ELEMENTS); - } -}; - -/* ------ BroadcastTensorsGradOp ------ */ -class BroadcastTensorsGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasOutputs(framework::GradVarName("X")), - "Output", - "X@grad", - "broadcast_tensors"); - OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", 
"X", "broadcast_tensors"); - OP_INOUT_CHECK(ctx->HasInputs(framework::GradVarName("Out")), - "Input", - "Out@grad", - "broadcast_tensors"); - - const auto& forward_input_dims = ctx->GetInputsDim("X"); - ctx->SetOutputsDim(framework::GradVarName("X"), forward_input_dims); - ctx->ShareAllLoD("X", /*->*/ framework::GradVarName("X")); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.device_context().GetPlace()); - } -}; - -template -class BroadcastTensorsGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("broadcast_tensors_grad"); - // We need "X" only for backward shape inference - grad_op->SetInput("X", this->Input("X")); - grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - grad_op->SetOutput(framework::GradVarName("X"), - this->InputGrad("X", /* drop_empty_grad */ false)); - grad_op->SetAttrMap(this->Attrs()); - } -}; - -class BroadcastTensorsGradOpVarTypeInference - : public framework::VarTypeInference { - public: - void operator()(framework::InferVarTypeContext* ctx) const override { - auto var_type = ctx->GetInputType("X", 0); - auto data_type = ctx->GetInputDataType("X", 0); - - ctx->SetOutputType( - framework::GradVarName("X"), var_type, framework::ALL_ELEMENTS); - ctx->SetOutputDataType( - framework::GradVarName("X"), data_type, framework::ALL_ELEMENTS); - } -}; - -DECLARE_NO_NEED_BUFFER_VARS_INFERER(BroadcastTensorsGradNoNeedBufVarsInferer, - "X"); - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -DECLARE_INFER_SHAPE_FUNCTOR(broadcast_tensors, - BroadcastTensorsInferShapeFunctor, - PD_INFER_META(phi::BroadcastTensorsInferMeta)); - -REGISTER_OPERATOR(broadcast_tensors, - ops::BroadcastTensorsOp, - ops::BroadcastTensorsOpMaker, - ops::BroadcastTensorsGradOpMaker, - ops::BroadcastTensorsGradOpMaker, - ops::BroadcastTensorsOpVarTypeInference, - BroadcastTensorsInferShapeFunctor); - -REGISTER_OPERATOR(broadcast_tensors_grad, - ops::BroadcastTensorsGradOp, - ops::BroadcastTensorsGradOpVarTypeInference, - ops::BroadcastTensorsGradNoNeedBufVarsInferer); diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.cc b/paddle/fluid/operators/cinn/cinn_launch_context.cc index af429e0f01e336c65b0183a08ef4acfb319006c8..0b999ccab016f24431a2511c4d44ad8d055f5e02 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_context.cc @@ -119,12 +119,16 @@ CinnLaunchContext::CinnLaunchContext(const framework::ir::Graph& graph, // collect variables name list to be skipped in GC skip_eager_vars_.reserve(input_var_names.size() + output_var_names.size()); auto add_skip_var_fn = [&outer_varinfo, this](const std::string& var_name) { + // Always consider Input/Output of Graph as skip_gc_vars, because + // InterpreterCore has no eager_deletion_op to deal with it. 
+ + VLOG(4) << "Append a skip_gc_var for InterpreterCore:" << var_name; + skip_gc_vars_.insert(var_name); // if a var exists at the outer_varinfo map, that means it will be // erased by the following eager_deletion_op of current cinn_launch op if (!outer_varinfo.count(var_name)) { skip_eager_vars_.emplace_back(var_name); - skip_gc_vars_.insert(var_name); - VLOG(4) << "Append a skip_gc_var:" << var_name; + VLOG(4) << "Append a skip_gc_var for PE:" << var_name; } }; std::for_each( diff --git a/paddle/fluid/operators/compat/fused_matmul.pbtxt b/paddle/fluid/operators/compat/fused_matmul.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..1a858da2e72eeddcaea1619bec7bd4b3159d2fea --- /dev/null +++ b/paddle/fluid/operators/compat/fused_matmul.pbtxt @@ -0,0 +1,93 @@ +type: "fused_matmul" +def { + inputs { + name: "X" + } + inputs { + name: "Y" + } + inputs { + name: "ResidualData" + } + outputs { + name: "Out" + } + attrs { + name: "trans_x" + type: BOOLEAN + } + attrs { + name: "trans_y" + type: BOOLEAN + } +} +extra { + attrs { + name: "matmul_alpha" + type: FLOAT + } + attrs { + name: "fuse_activation" + type: STRING + } + attrs { + name: "fuse_alpha" + type: FLOAT + } + attrs { + name: "fuse_beta" + type: FLOAT + } + attrs { + name: "fused_output_scale" + type: FLOAT + } + attrs { + name: "fused_reshape_X" + type: INTS + } + attrs { + name: "fused_transpose_X" + type: INTS + } + attrs { + name: "fused_reshape_Y" + type: INTS + } + attrs { + name: "fused_transpose_Y" + type: INTS + } + attrs { + name: "fused_reshape_Out" + type: INTS + } + attrs { + name: "fused_transpose_Out" + type: INTS + } + attrs { + name: "mkldnn_data_type" + type: STRING + } + attrs { + name: "Scale_x" + type: FLOAT + } + attrs { + name: "Scale_y" + type: FLOAT + } + attrs { + name: "Scale_in_eltwise" + type: FLOAT + } + attrs { + name: "Scale_out" + type: FLOAT + } + attrs { + name: "force_fp32_output" + type: BOOLEAN + } +} diff --git a/paddle/fluid/operators/compat/matmul_v2.pbtxt b/paddle/fluid/operators/compat/matmul_v2.pbtxt index cefb964a59f71286cbc4c685f6bf8e8fe8b2f672..5f43e1f8bf0e0c502566a2cc783b8927e5df56cc 100644 --- a/paddle/fluid/operators/compat/matmul_v2.pbtxt +++ b/paddle/fluid/operators/compat/matmul_v2.pbtxt @@ -39,28 +39,4 @@ extra { name: "op_device" type: STRING } - attrs { - name: "fused_reshape_X" - type: INTS - } - attrs { - name: "fused_reshape_Y" - type: INTS - } - attrs { - name: "fused_transpose_X" - type: INTS - } - attrs { - name: "fused_transpose_Y" - type: INTS - } - attrs { - name: "fused_reshape_Out" - type: INTS - } - attrs { - name: "fused_transpose_Out" - type: INTS - } } diff --git a/paddle/fluid/operators/controlflow/feed_op.cc b/paddle/fluid/operators/controlflow/feed_op.cc index 09684b8d737bae2c2677e026f534581ec7ba881f..9d266b81d0babcef6f6cc152fbb29781296dbc0f 100644 --- a/paddle/fluid/operators/controlflow/feed_op.cc +++ b/paddle/fluid/operators/controlflow/feed_op.cc @@ -267,44 +267,6 @@ PD_REGISTER_GENERAL_KERNEL( ALL_LAYOUT, paddle::operators::FeedStringsKernel, ALL_DTYPE) {} -#elif defined(PADDLE_WITH_ASCEND_CL) -PD_REGISTER_GENERAL_KERNEL( - feed_dense_tensor, - npu, - ALL_LAYOUT, - paddle::operators::FeedDenseTensorKernel, - ALL_DTYPE) {} -PD_REGISTER_GENERAL_KERNEL( - feed_sparse_coo_tensor, - npu, - ALL_LAYOUT, - paddle::operators::FeedSparseCooTensorKernel, - ALL_DTYPE) {} -PD_REGISTER_GENERAL_KERNEL( - feed_strings, - npu, - ALL_LAYOUT, - paddle::operators::FeedStringsKernel, - ALL_DTYPE) {} -#elif defined(PADDLE_WITH_MLU) 
-PD_REGISTER_GENERAL_KERNEL( - feed_dense_tensor, - CustomMLU, - ALL_LAYOUT, - paddle::operators::FeedDenseTensorKernel, - ALL_DTYPE) {} -PD_REGISTER_GENERAL_KERNEL( - feed_sparse_coo_tensor, - CustomMLU, - ALL_LAYOUT, - paddle::operators::FeedSparseCooTensorKernel, - ALL_DTYPE) {} -PD_REGISTER_GENERAL_KERNEL( - feed_strings, - CustomMLU, - ALL_LAYOUT, - paddle::operators::FeedStringsKernel, - ALL_DTYPE) {} #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc index 48a5d2e433a100061c4d8a903ea045a21828cf84..c122a07c9b1d4999694889b4aa94c51700dbd762 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/prim/api/manual/backward/composite_backward_api.h" +#include "paddle/fluid/prim/api/composite_backward/composite_backward_api.h" #include "paddle/fluid/prim/utils/static/composite_grad_desc_maker.h" #include "paddle/fluid/prim/utils/static/desc_tensor.h" namespace paddle { @@ -67,7 +67,7 @@ class ElementwiseAddCompositeGradOpMaker auto dy_ptr = this->GetOutputPtr(&dy); std::string dy_name = this->GetOutputName(dy); int axis = static_cast(this->Attr("axis")); - VLOG(3) << "Runing add_grad composite func"; + VLOG(6) << "Runing add_grad composite func"; prim::add_grad(x, y, out_grad, axis, dx_ptr, dy_ptr); this->RecoverOutputName(dx, dx_name); this->RecoverOutputName(dy, dy_name); diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cc b/paddle/fluid/operators/elementwise/elementwise_div_op.cc index 41549ede1ebc6184b56db0026afe179b290e4281..97941aa82f3954c34c871f49f9175e639fdd47da 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cc @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/prim/api/manual/backward/composite_backward_api.h" +#include "paddle/fluid/prim/api/composite_backward/composite_backward_api.h" #include "paddle/fluid/prim/utils/static/composite_grad_desc_maker.h" #include "paddle/fluid/prim/utils/static/desc_tensor.h" namespace paddle { @@ -84,7 +84,7 @@ class ElementwiseDivCompositeGradOpMaker auto dy_ptr = this->GetOutputPtr(&dy); std::string dy_name = this->GetOutputName(dy); int axis = static_cast(this->Attr("axis")); - VLOG(3) << "Runing div_grad composite func"; + VLOG(6) << "Runing div_grad composite func"; prim::divide_grad( x, y, out, out_grad, axis, dx_ptr, dy_ptr); this->RecoverOutputName(dx, dx_name); diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc index 740c9381d92e233ceb2be3de156a9a62e1ac22f5..9821cc226128323d48254f020f3470e919469b80 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc @@ -19,7 +19,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/prim/api/manual/backward/composite_backward_api.h" +#include "paddle/fluid/prim/api/composite_backward/composite_backward_api.h" #include "paddle/fluid/prim/utils/static/composite_grad_desc_maker.h" #include "paddle/fluid/prim/utils/static/desc_tensor.h" @@ -88,7 +88,7 @@ class ElementwiseMulCompositeGradOpMaker static_cast(this->Attr("axis")), x_grad_p, y_grad_p); - VLOG(3) << "Runing mul_grad composite func"; + VLOG(6) << "Runing mul_grad composite func"; this->RecoverOutputName(x_grad, x_grad_name); this->RecoverOutputName(y_grad, y_grad_name); } diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc index 2a9e14867acf1f3caf105a6b31c69d31f073df39..a7244062632699992533f851277563edce450998 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/prim/api/manual/backward/composite_backward_api.h" +#include "paddle/fluid/prim/api/composite_backward/composite_backward_api.h" #include "paddle/fluid/prim/utils/static/composite_grad_desc_maker.h" #include "paddle/fluid/prim/utils/static/desc_tensor.h" namespace paddle { @@ -70,7 +70,7 @@ class ElementwiseSubCompositeGradOpMaker auto dy_ptr = this->GetOutputPtr(&dy); std::string dy_name = this->GetOutputName(dy); int axis = static_cast(this->Attr("axis")); - VLOG(3) << "Runing sub_grad composite func"; + VLOG(6) << "Runing sub_grad composite func"; prim::subtract_grad(x, y, out_grad, axis, dx_ptr, dy_ptr); this->RecoverOutputName(dx, dx_name); this->RecoverOutputName(dy, dy_name); diff --git a/paddle/fluid/operators/expand_v2_op.cc b/paddle/fluid/operators/expand_v2_op.cc index 3c05ab9295c6769bc7b949bc55bcd2321c063ba4..6df6422f7173c9cf0fcde2624d402484de85b322 100644 --- a/paddle/fluid/operators/expand_v2_op.cc +++ b/paddle/fluid/operators/expand_v2_op.cc @@ -20,7 +20,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/prim/api/manual/backward/composite_backward_api.h" +#include "paddle/fluid/prim/api/composite_backward/composite_backward_api.h" #include "paddle/fluid/prim/utils/static/composite_grad_desc_maker.h" #include "paddle/fluid/prim/utils/static/desc_tensor.h" #include "paddle/phi/core/infermeta_utils.h" @@ -206,7 +206,7 @@ class ExpandV2CompositeGradOpMaker : public prim::CompositeGradOpMakerBase { auto shape = this->Attr>("shape"); prim::expand_grad( x, out_grad, paddle::experimental::IntArray(shape), x_grad_p); - VLOG(3) << "Runing expand_v2 composite func"; + VLOG(6) << "Runing expand_v2 composite func"; this->RecoverOutputName(x_grad, x_grad_name); } }; diff --git a/paddle/fluid/operators/fused/fused_attention_op.cc b/paddle/fluid/operators/fused/fused_attention_op.cc index 7d00dda19452c0f231260295f97fc35b8949c82d..347d1ba25215081eedb44c552e965ed65d1d921f 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cc +++ b/paddle/fluid/operators/fused/fused_attention_op.cc @@ -375,7 +375,7 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("BiasDropoutResidualOut", "Result of residual + dropout(src + bias).") .AsIntermediate(); - AddOutput("CacheKVOut", "The udpated cache KV."); + AddOutput("CacheKVOut", "The udpated cache KV.").AsDispensable(); AddOutput("Y", "Result after attention."); AddAttr("num_heads", "The number head for multi_head_attention.") diff --git a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc index a6fa80a4939728043b311127ee10db6c50325d56..7f877867050ed44af96fd5c897873279192e9843 100644 --- a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc +++ b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc @@ -35,16 +35,17 @@ class FusedBiasDropoutResidualLnOp : public framework::OperatorWithKernel { "Output", "LnVariance", "FusedBiasDropoutResidualLnOp"); - OP_INOUT_CHECK(ctx->HasOutput("BiasDropoutResidualOut"), - "Output", - "BiasDropoutResidualOut", - "FusedBiasDropoutResidualLnOp"); OP_INOUT_CHECK(ctx->HasOutput("DropoutMaskOut"), "Output", "DropoutMaskOut", "FusedBiasDropoutResidualLnOp"); + OP_INOUT_CHECK(ctx->HasOutput("BiasDropoutResidualOut"), + "Output", + "BiasDropoutResidualOut", + "FusedBiasDropoutResidualLnOp"); OP_INOUT_CHECK( ctx->HasOutput("Y"), "Output", "Y", "FusedBiasDropoutResidualLnOp"); + auto x_dim = ctx->GetInputDim("X"); int left = 1; for (int i = 0; i < x_dim.size() - 1; i++) { diff --git a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu index 2562c2cc225756c12241f8bb14fb7ac4508edb47..01a233950b279325437f54ca126c4e43eea2c5f0 100644 --- a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu +++ b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu @@ -54,8 +54,12 @@ class FusedBiasDropoutResidualLnOpKernel : public framework::OpKernel { auto *ln_mean_data = dev_ctx.Alloc(ln_mean, ln_mean->numel() * sizeof(U)); auto *ln_var_data = dev_ctx.Alloc(ln_var, ln_var->numel() * sizeof(U)); - auto *dropout_mask_out_data = dev_ctx.Alloc( - dropout_mask_out, dropout_mask_out->numel() * sizeof(uint8_t)); + auto *dropout_mask_out_data = + (dropout_mask_out == nullptr) + ? 
nullptr + : dev_ctx.Alloc( + dropout_mask_out, + dropout_mask_out->numel() * sizeof(uint8_t)); auto *y_data = dev_ctx.Alloc(y, y->numel() * sizeof(T)); const auto input_x_dims = input_x->dims(); diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h index c65364d2818d1aab645d5780778ab289370579c7..0c4e10fa156f9f03fbb1e87c7ab26dde35bb9787 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h @@ -854,9 +854,10 @@ void LaunchLayernormResidualDropoutBias( residual, rows * cols * sizeof(T), ctx.stream()); - PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync( - mask_data, 0, rows * cols * sizeof(MaskType), ctx.stream())); - + if (mask_data != nullptr) { + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync( + mask_data, 0, rows * cols * sizeof(MaskType), ctx.stream())); + } // call layernorm forward switch (GetDesiredBlockDim(cols)) { FIXED_BLOCK_DIM_CASE( diff --git a/paddle/fluid/operators/fused/fused_matmul_op.cc b/paddle/fluid/operators/fused/fused_matmul_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..bd204b53c1bd307ee6a90800cad1d9d31170ecd3 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_matmul_op.cc @@ -0,0 +1,206 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/matmul_v2_op.h" + +namespace paddle { +namespace operators { + +static std::vector GetInputShape(phi::DDim dim, + std::vector shape, + std::vector axis) { + PADDLE_ENFORCE_GT(dim.size(), + 0, + phi::errors::InvalidArgument( + "The Input(%s) has not been initialized properly. The " + "shape of Input(%s) = [%s].", + dim)); + + auto is_input_fused = (!shape.empty() && !axis.empty()); + if (is_input_fused) { + dim = dim.reshape(shape).transpose(axis); + } + return phi::vectorize(dim); +} + +class FusedMatmulOp : public MatMulV2Op { + public: + using MatMulV2Op::MatMulV2Op; + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "fused_matmul"); + OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "fused_matmul"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "fused_matmul"); + bool trans_x = ctx->Attrs().Get("trans_x"); + bool trans_y = ctx->Attrs().Get("trans_y"); + + std::vector dims_x = + GetInputShape(ctx->GetInputDim("X"), + ctx->Attrs().Get>("fused_reshape_X"), + ctx->Attrs().Get>("fused_transpose_X")); + std::vector dims_y = + GetInputShape(ctx->GetInputDim("Y"), + ctx->Attrs().Get>("fused_reshape_Y"), + ctx->Attrs().Get>("fused_transpose_Y")); + + auto ndims_x = dims_x.size(); + auto ndims_y = dims_y.size(); + PADDLE_ENFORCE_GT(ndims_x, + 0, + phi::errors::InvalidArgument( + "The Input(X) dims size must be greater than 0," + " but received dims size is 0. 
")); + PADDLE_ENFORCE_GT(ndims_y, + 0, + phi::errors::InvalidArgument( + "The Input(Y) dims size must be greater than 0," + " but received dims size is 0. ")); + + bool x_broadcasted = false; + bool y_broadcasted = false; + + if (ndims_x == 1) { + dims_x.insert(dims_x.begin(), 1); + ndims_x = 2; + x_broadcasted = true; + } + + if (ndims_y == 1) { + dims_y.push_back(1); + ndims_y = 2; + y_broadcasted = true; + } + + size_t M, N; + if (trans_x) { + M = dims_x[ndims_x - 1]; + } else { + M = dims_x[ndims_x - 2]; + } + if (trans_y) { + N = dims_y[ndims_y - 2]; + } else { + N = dims_y[ndims_y - 1]; + } + + std::vector new_dims; + if (ndims_x > ndims_y) { + new_dims.assign(dims_x.begin(), dims_x.end() - 2); + } else if (ndims_x < ndims_y) { + new_dims.assign(dims_y.begin(), dims_y.end() - 2); + } else { + new_dims.reserve(ndims_x); + for (size_t i = 0; i < ndims_x - 2; ++i) { + new_dims.push_back(std::max(dims_x[i], dims_y[i])); + } + } + if (!x_broadcasted) { + new_dims.push_back(M); + } + if (!y_broadcasted) { + new_dims.push_back(N); + } + if (x_broadcasted && y_broadcasted) { + new_dims.push_back(1); + } + + auto ddim_out = phi::make_ddim(new_dims); + + auto shape = ctx->Attrs().Get>("fused_reshape_Out"); + auto axis = ctx->Attrs().Get>("fused_transpose_Out"); + + auto is_output_fused = (!shape.empty() && !axis.empty()); + if (is_output_fused) { + ddim_out = ddim_out.transpose(axis).reshape(shape); + } + + ctx->SetOutputDim("Out", ddim_out); + ctx->ShareLoD("X", "Out"); + } +}; + +class FusedMatmulOpMaker : public MatMulV2OpMaker { + protected: + void Apply() override { + AddInput("ResidualData", + "Extra input from matmul_elementwise_add_mkldnn_fuse_pass") + .AsDispensable() + .AsExtra(); + AddAttr("matmul_alpha", "Output scale used in matmul_v1") + .SetDefault(1.0f); + AddAttr( + "fuse_activation", + "Activation type from matmul_activation_mkldnn_fuse_pass") + .SetDefault(""); + AddAttr("fuse_alpha", + "Activation alpha from matmul_activation_mkldnn_fuse_pass") + .SetDefault(0.0f); + AddAttr("fuse_beta", + "Activation beta from matmul_activation_mkldnn_fuse_pass") + .SetDefault(0.0f); + AddAttr("fused_output_scale", + "Output scale from operator_scale_onednn_fuse_pass") + .SetDefault(1.0f); + AddAttr>("fused_reshape_X", + "Reshape's shape attribute from " + "reshape_transpose_matmul_mkldnn_fuse_pass") + .SetDefault({}); + AddAttr>("fused_transpose_X", + "Transpose's axis attribute from " + "reshape_transpose_matmul_mkldnn_fuse_pass") + .SetDefault({}); + AddAttr>("fused_reshape_Y", + "Reshape's shape attribute from " + "reshape_transpose_matmul_mkldnn_fuse_pass") + .SetDefault({}); + AddAttr>("fused_transpose_Y", + "Transpose's axis attribute from " + "reshape_transpose_matmul_mkldnn_fuse_pass") + .SetDefault({}); + AddAttr>("fused_reshape_Out", + "Reshape's shape attribute from " + "matmul_transpose_reshape_mkldnn_fuse_pass") + .SetDefault({}); + AddAttr>("fused_transpose_Out", + "Transpose's axis attribute from " + "matmul_transpose_reshape_mkldnn_fuse_pass") + .SetDefault({}); + AddAttr("mkldnn_data_type", "oneDNN operator data type") + .SetDefault("float32") + .InEnum({"float32", "int8", "bfloat16"}); + AddAttr("Scale_x", "Matmul X input quantization scale") + .SetDefault(1.0f); + AddAttr("Scale_y", "Matmul Y input quantization scale") + .SetDefault(1.0f); + AddAttr("Scale_in_eltwise", "Matmul ResidualData quantization scale") + .SetDefault(0.0f); + AddAttr("Scale_out", "Matmul output quantization scale") + .SetDefault(1.0f); + AddAttr("force_fp32_output", + "Flag determining if output 
should be converted to FP32") + .SetDefault(false); + AddComment( + R"DOC(Matrix multiplication extended with oneDNN-specific fusion logic.)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR( + fused_matmul, + ops::FusedMatmulOp, + ops::FusedMatmulOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/generator/templates/op.c.j2 b/paddle/fluid/operators/generator/templates/op.c.j2 index 2339822af280fb2050d3e84dea3daa22395913e3..f54f91073da158fc1f144a7bb161dab583f59b09 100644 --- a/paddle/fluid/operators/generator/templates/op.c.j2 +++ b/paddle/fluid/operators/generator/templates/op.c.j2 @@ -5,7 +5,7 @@ #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/prim/api/manual/backward/composite_backward_api.h" +#include "paddle/fluid/prim/api/composite_backward/composite_backward_api.h" #include "paddle/fluid/prim/utils/static/composite_grad_desc_maker.h" #include "paddle/fluid/prim/utils/static/desc_tensor.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/fluid/operators/generator/templates/operator_utils.c.j2 b/paddle/fluid/operators/generator/templates/operator_utils.c.j2 index a471efaa562b4cb579ae9ddd1f38193567f7d392..63392bb786f0c90befbfe2312cdec9b6d6d59660 100644 --- a/paddle/fluid/operators/generator/templates/operator_utils.c.j2 +++ b/paddle/fluid/operators/generator/templates/operator_utils.c.j2 @@ -665,7 +665,7 @@ class {{op_name | to_composite_grad_opmaker_name}} : public prim::CompositeGradO {%- endmacro %} {% macro call_composite_backward_api(composite_func_info) %} - VLOG(3) << "Runing {{composite_func_info["func_name"]}} composite func"; + VLOG(6) << "Runing {{composite_func_info["func_name"]}} composite func"; prim::{{composite_func_info["func_name"]}}({{composite_func_info["func_args"]}}); {%- endmacro %} diff --git a/paddle/fluid/operators/matmul_v2_op.cc b/paddle/fluid/operators/matmul_v2_op.cc index c52fc08c91d5258d3c52b44234f74b3b3474b442..dee182ca1034cbba566622fd6aba31a76f91ed82 100644 --- a/paddle/fluid/operators/matmul_v2_op.cc +++ b/paddle/fluid/operators/matmul_v2_op.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -24,168 +24,131 @@ namespace paddle { namespace operators { -static framework::DDim GetDimForInput(const framework::InferShapeContext& ctx, - const std::string input_name) { - auto shape = ctx.Attrs().Get>("fused_reshape_" + input_name); - auto axis = - ctx.Attrs().Get>("fused_transpose_" + input_name); - auto dim = ctx.GetInputDim(input_name); - - PADDLE_ENFORCE_GT(dim.size(), +void MatMulV2Op::InferShape(framework::InferShapeContext* ctx) const { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "matmul_v2"); + OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "matmul_v2"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "matmul_v2"); + bool trans_x = ctx->Attrs().Get("trans_x"); + bool trans_y = ctx->Attrs().Get("trans_y"); + + std::vector dims_x = phi::vectorize(ctx->GetInputDim("X")); + std::vector dims_y = phi::vectorize(ctx->GetInputDim("Y")); + auto ndims_x = dims_x.size(); + auto ndims_y = dims_y.size(); + PADDLE_ENFORCE_GT(ndims_x, 0, - platform::errors::InvalidArgument( - "The Input(%s) has not been initialized properly. The " - "shape of Input(%s) = [%s].", - dim)); - - if (!shape.empty() && !axis.empty()) { - dim = dim.reshape(shape).transpose(axis); - } - return dim; -} - -class MatMulV2Op : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "matmul_v2"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "matmul_v2"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "matmul_v2"); - bool trans_x = ctx->Attrs().Get("trans_x"); - bool trans_y = ctx->Attrs().Get("trans_y"); - - std::vector dims_x = phi::vectorize(GetDimForInput(*ctx, "X")); - std::vector dims_y = phi::vectorize(GetDimForInput(*ctx, "Y")); - auto ndims_x = dims_x.size(); - auto ndims_y = dims_y.size(); - PADDLE_ENFORCE_GT(ndims_x, - 0, - platform::errors::InvalidArgument( - "The Input(X) dims size must be greater than 0," - " but received dims size is 0. ")); - PADDLE_ENFORCE_GT(ndims_y, - 0, - platform::errors::InvalidArgument( - "The Input(Y) dims size must be greater than 0," - " but received dims size is 0. ")); - - bool x_broadcasted = false, y_broadcasted = false; - if (ndims_x == 1) { - dims_x.insert(dims_x.begin(), 1); - ndims_x = 2; - x_broadcasted = true; - } - - if (ndims_y == 1) { - dims_y.push_back(1); - ndims_y = 2; - y_broadcasted = true; - } + phi::errors::InvalidArgument( + "The Input(X) dims size must be greater than 0," + " but received dims size is 0. ")); + PADDLE_ENFORCE_GT(ndims_y, + 0, + phi::errors::InvalidArgument( + "The Input(Y) dims size must be greater than 0," + " but received dims size is 0. 
")); - size_t M, N; - if (trans_x) { - M = dims_x[ndims_x - 1]; - } else { - M = dims_x[ndims_x - 2]; - } - if (trans_y) { - N = dims_y[ndims_y - 2]; - } else { - N = dims_y[ndims_y - 1]; - } + bool x_broadcasted = false; + bool y_broadcasted = false; - std::vector new_dims; - if (ndims_x > ndims_y) { - new_dims.assign(dims_x.begin(), dims_x.end() - 2); - } else if (ndims_x < ndims_y) { - new_dims.assign(dims_y.begin(), dims_y.end() - 2); - } else { - new_dims.reserve(ndims_x); - for (size_t i = 0; i < ndims_x - 2; ++i) { - new_dims.push_back(std::max(dims_x[i], dims_y[i])); - } - } - if (!x_broadcasted) { - new_dims.push_back(M); - } - if (!y_broadcasted) { - new_dims.push_back(N); - } - if (x_broadcasted && y_broadcasted) { - new_dims.push_back(1); - } + if (ndims_x == 1) { + dims_x.insert(dims_x.begin(), 1); + ndims_x = 2; + x_broadcasted = true; + } - auto ddim_out = phi::make_ddim(new_dims); + if (ndims_y == 1) { + dims_y.push_back(1); + ndims_y = 2; + y_broadcasted = true; + } -#ifdef PADDLE_WITH_MKLDNN - auto shape = ctx->Attrs().Get>("fused_reshape_Out"); - auto axis = ctx->Attrs().Get>("fused_transpose_Out"); + size_t M, N; + if (trans_x) { + M = dims_x[ndims_x - 1]; + } else { + M = dims_x[ndims_x - 2]; + } + if (trans_y) { + N = dims_y[ndims_y - 2]; + } else { + N = dims_y[ndims_y - 1]; + } - if (!shape.empty() && !axis.empty()) { - ddim_out = ddim_out.transpose(axis).reshape(shape); + std::vector new_dims; + if (ndims_x > ndims_y) { + new_dims.assign(dims_x.begin(), dims_x.end() - 2); + } else if (ndims_x < ndims_y) { + new_dims.assign(dims_y.begin(), dims_y.end() - 2); + } else { + new_dims.reserve(ndims_x); + for (size_t i = 0; i < ndims_x - 2; ++i) { + new_dims.push_back(std::max(dims_x[i], dims_y[i])); } -#endif - - ctx->SetOutputDim("Out", ddim_out); - ctx->ShareLoD("X", "Out"); } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto input_data_type = - OperatorWithKernel::IndicateOrPromoteVarDataTypes(ctx, "X", "Y"); - return phi::KernelKey(input_data_type, ctx.GetPlace()); + if (!x_broadcasted) { + new_dims.push_back(M); + } + if (!y_broadcasted) { + new_dims.push_back(N); + } + if (x_broadcasted && y_broadcasted) { + new_dims.push_back(1); } - phi::KernelKey GetKernelTypeForVar( - const std::string& var_name, - const phi::DenseTensor& tensor, - const phi::KernelKey& expected_kernel_type) const override { - if (framework::IsComplexType(expected_kernel_type.dtype())) { - // only promote inputs’s types when contains complex input - return phi::KernelKey(tensor.place(), tensor.layout(), tensor.dtype()); - } else { + ctx->SetOutputDim("Out", phi::make_ddim(new_dims)); + ctx->ShareLoD("X", "Out"); +} + +phi::KernelKey MatMulV2Op::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + auto input_data_type = + OperatorWithKernel::IndicateOrPromoteVarDataTypes(ctx, "X", "Y"); + return phi::KernelKey(input_data_type, ctx.GetPlace()); +} + +phi::KernelKey MatMulV2Op::GetKernelTypeForVar( + const std::string& var_name, + const phi::DenseTensor& tensor, + const phi::KernelKey& expected_kernel_type) const { + if (framework::IsComplexType(expected_kernel_type.dtype())) { + // only promote inputs’s types when contains complex input + return phi::KernelKey(tensor.place(), tensor.layout(), tensor.dtype()); + } else { #ifdef PADDLE_WITH_MKLDNN - // When matmul_v2 is first oneDNN op in a chain (there was some non oneDNN - // op previously) then we also need to rotate shape NHWC -> NCWH - if 
((expected_kernel_type.layout() == phi::DataLayout::ONEDNN) && - (tensor.layout() != phi::DataLayout::ONEDNN) && - phi::OneDNNContext::tls().get_cur_paddle_data_layout() == - phi::DataLayout::kNHWC) { - return phi::KernelKey(tensor.place(), - phi::DataLayout::kNHWC, - expected_kernel_type.dtype()); - } -#endif + // When matmul_v2 is first oneDNN op in a chain (there was some non oneDNN + // op previously) then we also need to rotate shape NHWC -> NCWH + if ((expected_kernel_type.layout() == phi::DataLayout::ONEDNN) && + (tensor.layout() != phi::DataLayout::ONEDNN) && + phi::OneDNNContext::tls().get_cur_paddle_data_layout() == + phi::DataLayout::kNHWC) { return phi::KernelKey( - tensor.place(), tensor.layout(), expected_kernel_type.dtype()); + tensor.place(), phi::DataLayout::kNHWC, expected_kernel_type.dtype()); } +#endif + return phi::KernelKey( + tensor.place(), tensor.layout(), expected_kernel_type.dtype()); } -}; +} -class MatMulV2OpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "tensor of shape (d0, d1 ... M, K)"); - AddInput("Y", "tensor of shape (d0, d1 ... K, N)"); - AddOutput("Out", "tensor of shape (d0, d1 ... M, N)"); - AddAttr("trans_x", - "Set true to transpose the last two dimensions of X before " - "doing multiplication") - .SetDefault(false); - AddAttr("trans_y", - "Set true to transpose the last two dimensions of Y before " - "doing multiplication") - .SetDefault(false); - AddComment( - R"DOC(Matrix multiplication Out = X * Y. A has shape (d0, d1 ... M, K), +void MatMulV2OpMaker::Make() { + AddInput("X", "tensor of shape (d0, d1 ... M, K)"); + AddInput("Y", "tensor of shape (d0, d1 ... K, N)"); + AddOutput("Out", "tensor of shape (d0, d1 ... M, N)"); + AddAttr("trans_x", + "Set true to transpose the last two dimensions of X before " + "doing multiplication") + .SetDefault(false); + AddAttr("trans_y", + "Set true to transpose the last two dimensions of Y before " + "doing multiplication") + .SetDefault(false); + AddComment( + R"DOC(Matrix multiplication Out = X * Y. A has shape (d0, d1 ... M, K), B has shape (d0, d1 ... K, N), Out has shape ((d0, d1 ... M, N)). In addition, it also follows the broadcast rule which is similar as numpy.matmul. )DOC"); - } -}; + Apply(); +} class MatMulV2OpGrad : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/matmul_v2_op.h b/paddle/fluid/operators/matmul_v2_op.h index 70bdd0736bf4ecc2322d0a6e5c8d34c320d8a8f5..a27bf5a33e2f8fc302c033875397667c6ab727ff 100644 --- a/paddle/fluid/operators/matmul_v2_op.h +++ b/paddle/fluid/operators/matmul_v2_op.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -37,6 +37,29 @@ limitations under the License. 
*/ namespace paddle { namespace operators { +class MatMulV2Op : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + phi::KernelKey GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; + + phi::KernelKey GetKernelTypeForVar( + const std::string& var_name, + const phi::DenseTensor& tensor, + const phi::KernelKey& expected_kernel_type) const override; +}; + +class MatMulV2OpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() final; + + protected: + virtual void Apply() {} +}; + // Reshape a rank-3 tensor from P x M x N to (P * M) x N. // Identity op if the tensor is not of rank 3. static phi::DenseTensor FoldInitDims(const phi::DenseTensor& input) { diff --git a/paddle/fluid/operators/ops_extra_info.h b/paddle/fluid/operators/ops_extra_info.h index 94f0fa2a606c3642e835d8184e98186b14bed3e5..02624b9a49fbaaee4c4eb9ca733774823f663493 100644 --- a/paddle/fluid/operators/ops_extra_info.h +++ b/paddle/fluid/operators/ops_extra_info.h @@ -1,4 +1,4 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -102,12 +102,6 @@ const std::unordered_map {"fused_output_scale", ExtraAttrProperty::ONEDNN}, {"fuse_residual_connection", ExtraAttrProperty::ONEDNN}, {"fuse_with_relu", ExtraAttrProperty::ONEDNN}, - {"fused_reshape_Out", ExtraAttrProperty::ONEDNN}, - {"fused_transpose_Out", ExtraAttrProperty::ONEDNN}, - {"fused_reshape_X", ExtraAttrProperty::ONEDNN}, - {"fused_reshape_Y", ExtraAttrProperty::ONEDNN}, - {"fused_transpose_X", ExtraAttrProperty::ONEDNN}, - {"fused_transpose_Y", ExtraAttrProperty::ONEDNN}, {"mkldnn_data_type", ExtraAttrProperty::ONEDNN}, {"scale_x", ExtraAttrProperty::ONEDNN}, {"scale_y", ExtraAttrProperty::ONEDNN}, @@ -226,8 +220,7 @@ class ExtraInfoUtils { std::unordered_map> g_extra_input_names_map_ = {{"conv2d", {"Bias", "ResidualData"}}, {"conv2d_transpose", {"Bias"}}, - {"conv2d_grad", {"Bias"}}, - {"matmul_v2", {"ResidualData"}}}; + {"conv2d_grad", {"Bias"}}}; std::vector empty_extra_input_names_; }; diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc index 74b98069bf647fa153978790cf8934fc5968074c..79262db30fafbaf5fd3accc098b0938bb7ec14e4 100644 --- a/paddle/fluid/operators/pool_with_index_op.cc +++ b/paddle/fluid/operators/pool_with_index_op.cc @@ -27,6 +27,11 @@ inline int MaxPoolOutputSize(int input_size, int filter_size, int padding, int stride) { + PADDLE_ENFORCE_NE( + stride, + 0, + phi::errors::InvalidArgument( + "The stride of MaxPool shall not be 0, but received %d.", stride)); int output_size = (input_size - filter_size + 2 * padding) / stride + 1; return output_size; } diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index ecf8119ed2a1917e0a291359f781cf4a67378d26..1f1ea5c005db7f90de5d3473f9ed43def3243b1d 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -141,8 +141,22 @@ void HandleLargeDim(const framework::ExecutionContext& context, // transpose to 2D tensor whose shape is {unreduced, reduced}. 
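// Worked example (illustrative values, not taken from this patch): reducing a
// [4, 5, 6] input over dims {1, 2} gives output->numel() == 4 and
// shuffled_input.numel() == 120, so the shuffled input is viewed as a 4 x 30
// matrix before ReduceFunctor runs; the "0 / 0 == 0" convention introduced in
// the lines below keeps that division well defined when output->numel() is 0.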
const int64_t unreduced = output->numel(); - const int64_t reduced = shuffled_input.numel() / unreduced; + const int64_t input_numel = shuffled_input.numel(); + // assume: 0 / 0 == 0, which allow process 0 dim tensor + const int64_t reduced = (unreduced != 0) ? (input_numel / unreduced) : 0; + + PADDLE_ENFORCE_EQ( + unreduced * reduced, + input_numel, + phi::errors::InvalidArgument( + "Reducing failed in HandleLargeDim, when try to transpose (%d) " + "operands into 2D tensor with shape (%d, %d).", + input_numel, + unreduced, + reduced)); + shuffled_input.Resize({unreduced, reduced}); + DDim output_dim = output->dims(); output->Resize({unreduced}); paddle::operators::ReduceFunctor( @@ -163,7 +177,20 @@ void HandleLargeDimGrad(const framework::ExecutionContext& context, Functor functor, const std::vector& dims) { const int64_t unreduced = out->numel(); - const int64_t reduced = x->numel() / unreduced; + const int64_t x_numel = x->numel(); + // assume: 0 / 0 == 0, which allow process 0 dim tensor + const int64_t reduced = (unreduced != 0) ? (x_numel / unreduced) : 0; + + PADDLE_ENFORCE_EQ( + unreduced * reduced, + x_numel, + phi::errors::InvalidArgument( + "Reducing failed in HandleLargeDimGrad, when try to transpose (%d) " + "operands into 2D tensor with shape (%d, %d).", + x_numel, + unreduced, + reduced)); + DDim out_dim(out->dims()); DDim x_dim(x->dims()); // transpose and reshape X diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc index 25e6ad9b65cc0662fd3ee5f1811cc1d20f2473f3..5d14a0911fb2da0ddf209ad0bae792081832a801 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc @@ -17,7 +17,7 @@ #include #include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/prim/api/manual/backward/composite_backward_api.h" +#include "paddle/fluid/prim/api/composite_backward/composite_backward_api.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/unary.h" @@ -84,7 +84,7 @@ class ReduceSumCompositeGradOpMaker : public prim::CompositeGradOpMakerBase { // get output orginal name std::string x_grad_name = this->GetOutputName(x_grad_t); - VLOG(3) << "Runing sum_grad composite func"; + VLOG(6) << "Runing sum_grad composite func"; // call composite backward func prim::sum_grad( x, out_grad, axis, keep_dim, reduce_all, x_grad); diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 579549a4c3ec476b979f7dd7919a1b8a38850a7c..2f5da3c44b97fd41025e90e296be121027a9a379 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -438,11 +438,32 @@ class TensorRTEngineOp : public framework::OperatorBase { calib_res->calib_.reset(new TRTInt8Calibrator( calib_buffers, runtime_batch, calibration_engine_key_, dev_place)); calib_res->thr_.reset(new std::thread([&]() { + std::map> min_input_shape; + std::map> max_input_shape; + std::map> opt_input_shape; + std::map> min_shape_tensor; + std::map> max_shape_tensor; + std::map> opt_shape_tensor; + if (shape_range_info_path_.size()) + inference::DeserializeShapeRangeInfo(shape_range_info_path_, + &min_input_shape, + &max_input_shape, + &opt_input_shape, + &min_shape_tensor, + &max_shape_tensor, + &opt_shape_tensor); + calib_res->engine_.reset(new TensorRTEngine(max_batch_size_, workspace_size_, precision_mode_, calib_res->calib_.get(), - dev_place.device)); + 
dev_place.device, + min_input_shape, + max_input_shape, + opt_input_shape, + min_shape_tensor, + max_shape_tensor, + opt_shape_tensor)); VLOG(3) << "start the calib trt engine thread"; PrepareTRTEngine(scope, calib_res->engine_.get()); })); diff --git a/paddle/fluid/prim/api/.gitignore b/paddle/fluid/prim/api/.gitignore index 377e800f00a0e08893961c1910c9b479e3143181..2dad6249be3ef8332b3df8788569ef77d8fc385d 100644 --- a/paddle/fluid/prim/api/.gitignore +++ b/paddle/fluid/prim/api/.gitignore @@ -1,3 +1,2 @@ -generated/prim_api/eager_prim_api.cc -generated/prim_api/tmp_eager_prim_api.cc -generated/prim_api/*.h +generated_prim/*.cc +generated_prim/*.h diff --git a/paddle/fluid/prim/api/CMakeLists.txt b/paddle/fluid/prim/api/CMakeLists.txt index 436cecc32582b39cfe08b2a06f9d4dba55387f50..6cf3dacef9f4ec8fc07a28530335b8fc8833be39 100644 --- a/paddle/fluid/prim/api/CMakeLists.txt +++ b/paddle/fluid/prim/api/CMakeLists.txt @@ -1,6 +1,6 @@ add_subdirectory(auto_code_generated) -add_subdirectory(manual) -add_subdirectory(generated) +add_subdirectory(manual_prim) +add_subdirectory(generated_prim) if(NOT (NOT WITH_PYTHON AND ON_INFER)) cc_library( diff --git a/paddle/fluid/prim/api/all.h b/paddle/fluid/prim/api/all.h index 2996d2aa2657c8b8c09cfabd30daa7c2adf707b6..b275e163cbc88d21e128d1ed71991193eeb8e76e 100644 --- a/paddle/fluid/prim/api/all.h +++ b/paddle/fluid/prim/api/all.h @@ -13,6 +13,6 @@ // limitations under the License. #pragma once -#include "paddle/fluid/prim/api/generated/prim_api/prim_api.h" -#include "paddle/fluid/prim/api/manual/backward/composite_backward_api.h" -#include "paddle/fluid/prim/api/manual/utils/utils.h" +#include "paddle/fluid/prim/api/generated_prim/prim_generated_api.h" +#include "paddle/fluid/prim/api/manual_prim/prim_manual_api.h" +#include "paddle/fluid/prim/api/manual_prim/utils/utils.h" diff --git a/paddle/fluid/prim/api/auto_code_generated/CMakeLists.txt b/paddle/fluid/prim/api/auto_code_generated/CMakeLists.txt index e36af681bbd89589d58e5a7003beacb83ff08c24..ebff0ec688a7e55a3da8f6a98586fd0d36eae71e 100644 --- a/paddle/fluid/prim/api/auto_code_generated/CMakeLists.txt +++ b/paddle/fluid/prim/api/auto_code_generated/CMakeLists.txt @@ -5,16 +5,17 @@ set(legacy_api_yaml_path "${PADDLE_SOURCE_DIR}/paddle/fluid/operators/generator/parsed_ops/legacy_ops.parsed.yaml" ) set(tmp_eager_prim_api_cc_path - "${PADDLE_SOURCE_DIR}/paddle/fluid/prim/api/generated/prim_api/tmp_eager_prim_api.cc" + "${PADDLE_SOURCE_DIR}/paddle/fluid/prim/api/generated_prim/tmp_eager_prim_api.cc" ) set(tmp_prim_api_h_path - "${PADDLE_SOURCE_DIR}/paddle/fluid/prim/api/generated/prim_api/tmp_prim_api.h" + "${PADDLE_SOURCE_DIR}/paddle/fluid/prim/api/generated_prim/tmp_prim_generated_api.h" ) set(eager_prim_api_cc_path - "${PADDLE_SOURCE_DIR}/paddle/fluid/prim/api/generated/prim_api/eager_prim_api.cc" + "${PADDLE_SOURCE_DIR}/paddle/fluid/prim/api/generated_prim/eager_prim_api.cc" ) set(prim_api_h_path - "${PADDLE_SOURCE_DIR}/paddle/fluid/prim/api/generated/prim_api/prim_api.h") + "${PADDLE_SOURCE_DIR}/paddle/fluid/prim/api/generated_prim/prim_generated_api.h" +) set(prim_api_gen_file ${PADDLE_SOURCE_DIR}/paddle/fluid/prim/api/auto_code_generated/prim_gen.py) diff --git a/paddle/fluid/prim/api/auto_code_generated/prim_gen.py b/paddle/fluid/prim/api/auto_code_generated/prim_gen.py index 7bc59df4f33d2de7bdbf76737461f0b848865c36..787eeb3e4409f1461e1cd41a351fd0f48e0aa739 100644 --- a/paddle/fluid/prim/api/auto_code_generated/prim_gen.py +++ b/paddle/fluid/prim/api/auto_code_generated/prim_gen.py @@ -28,11 
+28,11 @@ def header_include(): """ -def eager_source_include(header_file_path): +def eager_source_include(): return """ #include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" -#include "paddle/fluid/prim/api/generated/prim_api/prim_api.h" +#include "paddle/fluid/prim/api/generated_prim/prim_generated_api.h" """ @@ -73,10 +73,7 @@ def generate_api(api_yaml_path, header_file_path, eager_prim_source_file_path): header_file.write(header_include()) header_file.write(namespace[0]) header_file.write(namespace[1]) - include_header_file = ( - "#include paddle/fluid/prim/api/generated/prim_api/prim_api.h" - ) - eager_prim_source_file.write(eager_source_include(include_header_file)) + eager_prim_source_file.write(eager_source_include()) eager_prim_source_file.write(namespace[0]) for api in apis: @@ -106,13 +103,13 @@ def main(): parser.add_argument( '--prim_api_header_path', help='output of generated prim_api header code file', - default='paddle/fluid/prim/api/generated/prim_api/prim_api.h', + default='paddle/fluid/prim/api/generated_prim/prim_generated_api.h', ) parser.add_argument( '--eager_prim_api_source_path', help='output of generated eager_prim_api source code file', - default='paddle/fluid/prim/api/generated/prim_api/eager_prim_api.cc', + default='paddle/fluid/prim/api/generated_prim/eager_prim_api.cc', ) options = parser.parse_args() diff --git a/paddle/fluid/prim/api/manual/backward/composite_backward_api.h b/paddle/fluid/prim/api/composite_backward/composite_backward_api.h similarity index 96% rename from paddle/fluid/prim/api/manual/backward/composite_backward_api.h rename to paddle/fluid/prim/api/composite_backward/composite_backward_api.h index 99ef82d08881c28f13cc88a18371c23d447c88d8..e782d6b65bba62ac47615e5f4de4b339575781ee 100644 --- a/paddle/fluid/prim/api/manual/backward/composite_backward_api.h +++ b/paddle/fluid/prim/api/composite_backward/composite_backward_api.h @@ -13,9 +13,7 @@ // limitations under the License. 
#pragma once -#include "paddle/fluid/prim/api/generated/prim_api/prim_api.h" -#include "paddle/fluid/prim/api/manual/prim_api/prim_api.h" -#include "paddle/fluid/prim/api/manual/utils/utils.h" +#include "paddle/fluid/prim/api/all.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/core/ddim.h" @@ -232,8 +230,8 @@ void multiply_grad(const Tensor& x, Tensor* y_grad) { if (x_grad) { auto x_grad_unreduce = multiply(out_grad, y); - if (x.dims() != y.dims()) { - auto axes = get_reduce_dims(x.dims(), y.dims()); + if (x_grad_unreduce.dims() != x.dims()) { + auto axes = get_reduce_dims_from_out(x_grad_unreduce.dims(), x.dims()); if (!axes.size()) { set_output(x_grad_unreduce, x_grad); } else { @@ -252,8 +250,8 @@ void multiply_grad(const Tensor& x, } if (y_grad) { auto y_grad_unreduce = multiply(out_grad, x); - if (y.dims() != x.dims()) { - auto axes = get_reduce_dims(y.dims(), x.dims()); + if (y_grad_unreduce.dims() != y.dims()) { + auto axes = get_reduce_dims_from_out(y_grad_unreduce.dims(), y.dims()); if (!axes.size()) { set_output(y_grad_unreduce, y_grad); } else { diff --git a/paddle/fluid/prim/api/generated/CMakeLists.txt b/paddle/fluid/prim/api/generated/CMakeLists.txt deleted file mode 100644 index a1b75527c20b49d688bde9ea120a74046a411123..0000000000000000000000000000000000000000 --- a/paddle/fluid/prim/api/generated/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -add_subdirectory(prim_api) diff --git a/paddle/fluid/prim/api/generated/prim_api/CMakeLists.txt b/paddle/fluid/prim/api/generated_prim/CMakeLists.txt similarity index 63% rename from paddle/fluid/prim/api/generated/prim_api/CMakeLists.txt rename to paddle/fluid/prim/api/generated_prim/CMakeLists.txt index ee39c73f99f2f935664959f292884c7c95103452..6e030052d77a0fe1d73c06b94401c55650567b95 100644 --- a/paddle/fluid/prim/api/generated/prim_api/CMakeLists.txt +++ b/paddle/fluid/prim/api/generated_prim/CMakeLists.txt @@ -1,8 +1,3 @@ -cc_library( - static_prim_api - SRCS static_prim_api.cc - DEPS proto_desc static_utils) - if(NOT (NOT WITH_PYTHON AND ON_INFER)) cc_library( eager_prim_api diff --git a/paddle/fluid/prim/api/manual/CMakeLists.txt b/paddle/fluid/prim/api/manual/CMakeLists.txt deleted file mode 100644 index 512d2b1553c8c94a06445f3c59c4b77d10d74032..0000000000000000000000000000000000000000 --- a/paddle/fluid/prim/api/manual/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -add_subdirectory(utils) diff --git a/paddle/fluid/prim/api/manual_prim/CMakeLists.txt b/paddle/fluid/prim/api/manual_prim/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..7437c737a7b7f6bc8a3fec463b0d916dc680b87d --- /dev/null +++ b/paddle/fluid/prim/api/manual_prim/CMakeLists.txt @@ -0,0 +1,5 @@ +add_subdirectory(utils) +cc_library( + static_prim_api + SRCS static_prim_api.cc + DEPS proto_desc static_utils) diff --git a/paddle/fluid/prim/api/manual/prim_api/prim_api.h b/paddle/fluid/prim/api/manual_prim/prim_manual_api.h similarity index 78% rename from paddle/fluid/prim/api/manual/prim_api/prim_api.h rename to paddle/fluid/prim/api/manual_prim/prim_manual_api.h index 65d411d86307ded238a4bc07e6336659663ca406..80d11aed3489e6d781673fd3ef5c3a6f36e9e49b 100644 --- a/paddle/fluid/prim/api/manual/prim_api/prim_api.h +++ b/paddle/fluid/prim/api/manual_prim/prim_manual_api.h @@ -1,4 +1,4 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,15 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -// prim api which can't be generated #pragma once +#include "paddle/fluid/prim/api/generated_prim/prim_generated_api.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/place.h" #include "paddle/phi/common/scalar.h" #include "paddle/utils/optional.h" - +// TODO(jiabin): Make this Header only for handwritten api, instead of include +// prim_generated_api.h namespace paddle { namespace prim {} // namespace prim } // namespace paddle diff --git a/paddle/fluid/prim/api/generated/prim_api/static_prim_api.cc b/paddle/fluid/prim/api/manual_prim/static_prim_api.cc similarity index 96% rename from paddle/fluid/prim/api/generated/prim_api/static_prim_api.cc rename to paddle/fluid/prim/api/manual_prim/static_prim_api.cc index 30a82b4989972b4a0dd6f24b077b0c662306115e..71d547c139a1fff7881afdcf85204f1dbaf3ba2b 100644 --- a/paddle/fluid/prim/api/generated/prim_api/static_prim_api.cc +++ b/paddle/fluid/prim/api/manual_prim/static_prim_api.cc @@ -26,9 +26,8 @@ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/prim/api/generated/prim_api/prim_api.h" -#include "paddle/fluid/prim/api/manual/prim_api/prim_api.h" -#include "paddle/fluid/prim/api/manual/utils/utils.h" +#include "paddle/fluid/prim/api/manual_prim/prim_manual_api.h" +#include "paddle/fluid/prim/api/manual_prim/utils/utils.h" #include "paddle/fluid/prim/utils/static/composite_grad_desc_maker.h" #include "paddle/fluid/prim/utils/static/desc_tensor.h" #include "paddle/phi/api/include/tensor.h" @@ -110,6 +109,7 @@ Tensor unsqueeze(const Tensor& x, const IntArray& axis) { op->SetAttr("axes", new_shape); op->CheckAttrs(); op->InferVarType(block); + op->InferShape(*block); return out; } @@ -209,7 +209,7 @@ Tensor sum(const Tensor& x, "Out", {std::static_pointer_cast(out.impl())->Name()}); op->CheckAttrs(); op->InferVarType(block); - // TODO(jiabin, cxxly): This may have runtime shape skip infershape for now. + op->InferShape(*block); return out; } @@ -232,7 +232,7 @@ Tensor reshape(const Tensor& x, const IntArray& shape) { "Out", {std::static_pointer_cast(out.impl())->Name()}); op->CheckAttrs(); op->InferVarType(block); - // TODO(jiabin, cxxly): This may have runtime shape skip infershape for now. + op->InferShape(*block); return out; } diff --git a/paddle/fluid/prim/api/manual/utils/CMakeLists.txt b/paddle/fluid/prim/api/manual_prim/utils/CMakeLists.txt similarity index 100% rename from paddle/fluid/prim/api/manual/utils/CMakeLists.txt rename to paddle/fluid/prim/api/manual_prim/utils/CMakeLists.txt diff --git a/paddle/fluid/prim/api/manual/utils/eager_utils.cc b/paddle/fluid/prim/api/manual_prim/utils/eager_utils.cc similarity index 97% rename from paddle/fluid/prim/api/manual/utils/eager_utils.cc rename to paddle/fluid/prim/api/manual_prim/utils/eager_utils.cc index 353945557f1d02386645a79c6b2d871fe90fb588..04854428d8e2b9ce0247ce296b09bfb1b515e895 100644 --- a/paddle/fluid/prim/api/manual/utils/eager_utils.cc +++ b/paddle/fluid/prim/api/manual_prim/utils/eager_utils.cc @@ -13,7 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" #include "paddle/fluid/eager/api/utils/global_utils.h" -#include "paddle/fluid/prim/api/manual/utils/utils.h" +#include "paddle/fluid/prim/api/manual_prim/utils/utils.h" #include "paddle/phi/api/include/tensor.h" namespace paddle { diff --git a/paddle/fluid/prim/api/manual/utils/static_utils.cc b/paddle/fluid/prim/api/manual_prim/utils/static_utils.cc similarity index 98% rename from paddle/fluid/prim/api/manual/utils/static_utils.cc rename to paddle/fluid/prim/api/manual_prim/utils/static_utils.cc index 74656cfe7d48d17fe0c3fc2122896ef10f8535b7..8cfcffd92c2ea18a3f0723df282493e0052b01b6 100644 --- a/paddle/fluid/prim/api/manual/utils/static_utils.cc +++ b/paddle/fluid/prim/api/manual_prim/utils/static_utils.cc @@ -17,7 +17,7 @@ #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/prim/api/manual/utils/utils.h" +#include "paddle/fluid/prim/api/manual_prim/utils/utils.h" #include "paddle/fluid/prim/utils/static/desc_tensor.h" #include "paddle/fluid/prim/utils/static/static_global_utils.h" #include "paddle/phi/api/include/tensor.h" diff --git a/paddle/fluid/prim/api/manual/utils/utils.h b/paddle/fluid/prim/api/manual_prim/utils/utils.h similarity index 100% rename from paddle/fluid/prim/api/manual/utils/utils.h rename to paddle/fluid/prim/api/manual_prim/utils/utils.h diff --git a/paddle/fluid/prim/tests/test_static_prim.cc b/paddle/fluid/prim/tests/test_static_prim.cc index 313a3ccc99b74de65305d8d8d1b07f06760e4593..5a53101ab13bb01c8336cf462873517a48596dcc 100644 --- a/paddle/fluid/prim/tests/test_static_prim.cc +++ b/paddle/fluid/prim/tests/test_static_prim.cc @@ -19,7 +19,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/prim/api/manual/utils/utils.h" +#include "paddle/fluid/prim/api/manual_prim/utils/utils.h" #include "paddle/fluid/prim/utils/static/desc_tensor.h" #include "paddle/fluid/prim/utils/utils.h" #include "paddle/phi/core/enforce.h" diff --git a/paddle/fluid/prim/utils/static/composite_grad_desc_maker.h b/paddle/fluid/prim/utils/static/composite_grad_desc_maker.h index e391d8ac5300b184c8d46e9cdc26983bec037fb8..efb2479e4051444fb51a3ecf10f9a9b83598498a 100644 --- a/paddle/fluid/prim/utils/static/composite_grad_desc_maker.h +++ b/paddle/fluid/prim/utils/static/composite_grad_desc_maker.h @@ -57,6 +57,8 @@ class CompositeGradOpMakerBase { acting_program_(framework::ProgramDesc()), grad_block_(grad_block) { // TODO(jiabin): This should always execute by one thread... + VLOG(6) << "Constructing Composite Grad func for " << fwd_op_.Type() + << "_grad "; StaticCompositeContext::Instance().SetBlock( acting_program_.MutableBlock(0)); } @@ -64,6 +66,7 @@ class CompositeGradOpMakerBase { virtual ~CompositeGradOpMakerBase() = default; virtual std::vector> operator()() { + VLOG(3) << "Runing Composite Grad func for " << fwd_op_.Type() << "_grad "; this->Apply(); std::vector> ops; // TODO(jiabin): Support multiple blocks later @@ -318,6 +321,7 @@ class CompositeGradOpMakerBase { grad_var_name = framework::kEmptyVarName; if (drop_empty_grad) return nullptr; } + if (original_block_->HasVar(grad_var_name)) { // Copy Var from original block to active block, or create a new one. 
CopyVarFromOrig(grad_var_name); @@ -333,6 +337,12 @@ class CompositeGradOpMakerBase { auto grad_var_name = framework::GradVarName(var_name); (*this->grad_to_var_)[grad_var_name] = var_name; VLOG(8) << "Valid gradients: " << grad_var_name; + + auto target_grad = StaticCompositeContext::Instance().GetTargetGradName(); + if (target_grad.find(grad_var_name) != target_grad.end()) { + grad_var_name = target_grad.at(grad_var_name); + } + if (original_block_->HasVar(grad_var_name)) { // Copy Var from original block to active block, or create a new one. CopyVarFromOrig(grad_var_name); @@ -421,7 +431,11 @@ class CompositeGradOpMakerBase { return g_name; }); std::vector grad_out; - for (const auto& name : ret_val) { + for (auto name : ret_val) { + auto target_grad = StaticCompositeContext::Instance().GetTargetGradName(); + if (target_grad.find(name) != target_grad.end()) { + name = target_grad.at(name); + } // TODO(jiabin): Will this cause fill zeros error? if (original_block_->HasVar(name)) { // Copy Var from original block to active block, or create a new one. diff --git a/paddle/fluid/prim/utils/static/static_global_utils.h b/paddle/fluid/prim/utils/static/static_global_utils.h index 08407013673621a364c177aa1c453e8904fcac63..e6a8054f1a74784fed248b68034bf659e2d3b9d5 100644 --- a/paddle/fluid/prim/utils/static/static_global_utils.h +++ b/paddle/fluid/prim/utils/static/static_global_utils.h @@ -69,12 +69,36 @@ class StaticCompositeContext { enable_bwd_prim_ = enable_prim; } + size_t CheckSkipCompOps(const std::string& op_type) const { + return skip_comp_ops_.count(op_type); + } + + void AddSkipCompOps(const std::string& op_type) { + skip_comp_ops_.insert(op_type); + } + + void RemoveSkipCompOps(const std::string& op_type) { + skip_comp_ops_.erase(op_type); + } + + void SetTargetGradName(const std::map& m) { + target_grad_name_ = m; + } + + std::map GetTargetGradName() { + return target_grad_name_; + } + private: StaticCompositeContext() - : current_block_desc_(nullptr), generator_(new UniqueNameGenerator()) {} + : current_block_desc_(nullptr), + generator_(new UniqueNameGenerator()), + skip_comp_ops_({"matmul_v2"}) {} framework::BlockDesc* current_block_desc_; std::unique_ptr generator_; + std::unordered_set skip_comp_ops_; + std::map target_grad_name_; static thread_local bool enable_bwd_prim_; static thread_local bool enable_fwd_prim_; static StaticCompositeContext* static_composite_context_; diff --git a/paddle/fluid/prim/utils/utils.cc b/paddle/fluid/prim/utils/utils.cc index fb415262c8d13e2e0ca297f98eda8288c5ceb53c..e76531616807a63d5880fb2f4ad04cdbe033ac8f 100644 --- a/paddle/fluid/prim/utils/utils.cc +++ b/paddle/fluid/prim/utils/utils.cc @@ -24,7 +24,7 @@ bool PrimCommonUtils::IsBwdPrimEnabled() { } void PrimCommonUtils::SetBwdPrimEnabled(bool enable_prim) { - return StaticCompositeContext::Instance().SetBwdPrimEnabled(enable_prim); + StaticCompositeContext::Instance().SetBwdPrimEnabled(enable_prim); } bool PrimCommonUtils::IsFwdPrimEnabled() { @@ -32,11 +32,29 @@ bool PrimCommonUtils::IsFwdPrimEnabled() { } void PrimCommonUtils::SetFwdPrimEnabled(bool enable_prim) { - return StaticCompositeContext::Instance().SetFwdPrimEnabled(enable_prim); + StaticCompositeContext::Instance().SetFwdPrimEnabled(enable_prim); } void PrimCommonUtils::SetAllPrimEnabled(bool enable_prim) { - return StaticCompositeContext::Instance().SetAllPrimEnabled(enable_prim); + StaticCompositeContext::Instance().SetAllPrimEnabled(enable_prim); } + +size_t PrimCommonUtils::CheckSkipCompOps(const std::string& op_type) { + 
return StaticCompositeContext::Instance().CheckSkipCompOps(op_type); +} + +void PrimCommonUtils::AddSkipCompOps(const std::string& op_type) { + StaticCompositeContext::Instance().AddSkipCompOps(op_type); +} + +void PrimCommonUtils::RemoveSkipCompOps(const std::string& op_type) { + StaticCompositeContext::Instance().RemoveSkipCompOps(op_type); +} + +void PrimCommonUtils::SetTargetGradName( + const std::map& m) { + StaticCompositeContext::Instance().SetTargetGradName(m); +} + } // namespace prim } // namespace paddle diff --git a/paddle/fluid/prim/utils/utils.h b/paddle/fluid/prim/utils/utils.h index 38973dc87b8adf9408e0fc62dd85d11cad754551..8718496b3f1884a25f7605847250eb2a59a45e58 100644 --- a/paddle/fluid/prim/utils/utils.h +++ b/paddle/fluid/prim/utils/utils.h @@ -13,6 +13,9 @@ // limitations under the License. #pragma once +#include +#include +#include namespace paddle { namespace prim { @@ -23,6 +26,10 @@ class PrimCommonUtils { static bool IsFwdPrimEnabled(); static void SetFwdPrimEnabled(bool enabled); static void SetAllPrimEnabled(bool enabled); + static size_t CheckSkipCompOps(const std::string& op_type); + static void AddSkipCompOps(const std::string& op_type); + static void RemoveSkipCompOps(const std::string& op_type); + static void SetTargetGradName(const std::map& m); }; } // namespace prim } // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 020a926b4739e5e5a63b53056b6200c4de360b88..29bf54823dba404915356e7fe838509b327646c3 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -673,6 +673,8 @@ PYBIND11_MODULE(libpaddle, m) { &paddle::prim::PrimCommonUtils::IsFwdPrimEnabled); m.def("__set_all_prim_enabled", &paddle::prim::PrimCommonUtils::SetAllPrimEnabled); + m.def("_set_prim_target_grad_name", + &paddle::prim::PrimCommonUtils::SetTargetGradName); m.def("set_num_threads", &platform::SetNumThreads); m.def("disable_signal_handler", &DisableSignalHandler); @@ -1244,6 +1246,9 @@ All parameter, weight, gradient are variables in Paddle. return static_cast( defalut_val.index() - 1); }); + m.def("_add_skip_comp_ops", &paddle::prim::PrimCommonUtils::AddSkipCompOps); + m.def("_remove_skip_comp_ops", + &paddle::prim::PrimCommonUtils::RemoveSkipCompOps); m.def("get_grad_op_desc", [](const OpDesc &op_desc, const std::unordered_set &no_grad_set, @@ -1275,8 +1280,11 @@ All parameter, weight, gradient are variables in Paddle. // priority of CompGradOpMaker is less than GradCompMaker for better // performance. 
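// Rough usage sketch of the skip list consulted below (hypothetical call
// sites, not part of this patch). CheckSkipCompOps returns the count of the
// op type in the skip set, so it is non-zero when the op should bypass
// composite lowering:
//
//   paddle::prim::PrimCommonUtils::AddSkipCompOps("tanh");
//   if (paddle::prim::PrimCommonUtils::CheckSkipCompOps("tanh")) {
//     // fall back to the regular GradOpMaker instead of the composite rule
//   }
//   paddle::prim::PrimCommonUtils::RemoveSkipCompOps("tanh");
//
// Note that "matmul_v2" is skipped by default: StaticCompositeContext seeds
// skip_comp_ops_ with it in its constructor.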
std::vector> grad_op_descs; + auto need_skip = + paddle::prim::PrimCommonUtils::CheckSkipCompOps(op_desc.Type()); + VLOG(3) << "need skip: " << need_skip << std::endl; if (paddle::prim::PrimCommonUtils::IsBwdPrimEnabled()) { - if (grad_comp_op_maker != nullptr) { + if ((grad_comp_op_maker != nullptr) && (!need_skip)) { VLOG(3) << "Runing composite fun for " << op_desc.Type(); grad_op_descs = grad_comp_op_maker(op_desc, no_grad_set, diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index 23158d794019f6313762e1fcebb9aa09af204104..5b900da998c8cf572cece19bd0cbb114e86ef4a6 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -128,6 +128,19 @@ func : bmm_grad data_type : out_grad +- backward_op : broadcast_tensors_grad + forward : broadcast_tensors (Tensor[] input) -> Tensor[](out) + args : (Tensor[] input, Tensor[] out_grad) + output : Tensor[](input_grad) + infer_meta : + func : UnchangedMultiInferMeta + param : [input] + kernel : + func : broadcast_tensors_grad + param : [input, out_grad] + data_type : out_grad + no_need_buffer : input + - backward_op : ceil_grad forward : ceil(Tensor x) -> Tensor(out) args : (Tensor out_grad) @@ -933,6 +946,43 @@ kernel : func : poisson_grad +- backward_op : pow_double_grad + forward : pow_grad(Tensor x, Tensor grad_out, Scalar y) -> Tensor(grad_x) + args : (Tensor x, Tensor grad_out, Tensor grad_x_grad, Scalar y) + output : Tensor(x_grad), Tensor(grad_out_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param: [x, grad_out] + kernel : + func : pow_double_grad + data_type : x + backward : pow_triple_grad + inplace : (grad_x_grad -> x_grad) + +- backward_op : pow_grad + forward : pow(Tensor x, Scalar y=1.0f) -> Tensor(out) + args : (Tensor x, Tensor out_grad, Scalar y=-1) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param: [x] + kernel : + func : pow_grad + data_type : out_grad + backward: pow_double_grad + inplace : (out_grad -> x_grad) + +- backward_op : pow_triple_grad + forward : pow_double_grad(Tensor x, Tensor grad_out, Tensor grad_grad_x, Scalar y) -> Tensor(grad_x), Tensor(grad_grad_out) + args : (Tensor x, Tensor grad_out, Tensor grad_grad_x, Tensor grad_x_grad, Tensor grad_grad_out_grad, Scalar y) + output : Tensor(x_grad), Tensor(grad_out_grad), Tensor(grad_grad_x_grad) + infer_meta : + func : GeneralTernaryGradInferMeta + param: [x, grad_out, grad_grad_x] + kernel : + func : pow_triple_grad + data_type : x + - backward_op : put_along_axis_grad forward : put_along_axis (Tensor arr, Tensor indices, Tensor value, int axis, str reduce = "assign") -> Tensor(out) args : (Tensor arr, Tensor indices, Tensor out_grad, int axis, str reduce) diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 615008a8291c5599132386119ffbf985559a2f7b..1bbfe4fc9635289c670b5b48d046fa2a38e7580f 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -175,18 +175,6 @@ kernel : func : bilinear_tensor_product_grad -- backward_op : broadcast_tensors_grad - forward : broadcast_tensors (Tensor[] input) -> Tensor[](out) - args : (Tensor[] input, Tensor[] out_grad) - output : Tensor[](input_grad) - infer_meta : - func : UnchangedMultiInferMeta - param : [input] - kernel : - func : broadcast_tensors_grad - param : [out_grad] - no_need_buffer : input - - backward_op : cast_grad forward : cast (Tensor x, DataType dtype) -> Tensor(out) args : (Tensor x, Tensor out_grad) @@ -1010,40 +998,6 
@@ func : pool3d_grad param : [x, out, out_grad, kernel_size, strides, paddings, ceil_mode, exclusive, data_format, pooling_type, global_pooling, adaptive, padding_algorithm] -- backward_op : pow_double_grad - forward : pow_grad(Tensor x, Tensor grad_out, Scalar y) -> Tensor(grad_x) - args : (Tensor x, Tensor grad_out, Tensor grad_x_grad, Scalar y) - output : Tensor(x_grad), Tensor(grad_out_grad) - infer_meta : - func : GeneralBinaryGradInferMeta - param: [x, grad_out] - kernel : - func : pow_double_grad - backward : pow_triple_grad - inplace : (grad_x_grad -> x_grad) - -- backward_op : pow_grad - forward : pow(Tensor x, Scalar y) -> Tensor(out) - args : (Tensor x, Tensor out_grad, Scalar y=-1) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param: [x] - kernel : - func : pow_grad - backward: pow_double_grad - inplace : (out_grad -> x_grad) - -- backward_op : pow_triple_grad - forward : pow_double_grad(Tensor x, Tensor grad_out, Tensor grad_grad_x, Scalar y) -> Tensor(grad_x), Tensor(grad_grad_out) - args : (Tensor x, Tensor grad_out, Tensor grad_grad_x, Tensor grad_x_grad, Tensor grad_grad_out_grad, Scalar y) - output : Tensor(x_grad), Tensor(grad_out_grad), Tensor(grad_grad_x_grad) - infer_meta : - func : GeneralTernaryGradInferMeta - param: [x, grad_out, grad_grad_x] - kernel : - func : pow_triple_grad - - backward_op : prelu_grad forward : prelu(Tensor x, Tensor alpha, str data_format, str mode) -> Tensor(out) args : (Tensor x, Tensor alpha, Tensor out_grad, str data_format, str mode) diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index 049d86473cfc5b5c2c4aea411d783914ced2d9c5..286e9841effb609349f3fb6eb3313bfa83821faf 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -318,15 +318,6 @@ func : box_coder optional : prior_box_var -- op : broadcast_tensors - args: (Tensor[] input) - output: Tensor[]{input.size()} - infer_meta: - func: BroadcastTensorsInferMeta - kernel: - func: broadcast_tensors - backward: broadcast_tensors_grad - - op : cast args : (Tensor x, DataType dtype) output : Tensor @@ -1365,17 +1356,6 @@ param : [x, kernel_size, strides, paddings, ceil_mode, exclusive, data_format, pooling_type, global_pooling, adaptive, padding_algorithm] backward : pool3d_grad -- op : pow - args : (Tensor x, Scalar y) - output : Tensor(out) - infer_meta : - func : UnchangedInferMeta - param: [x] - kernel : - func : pow - data_type : x - backward : pow_grad - - op : prelu args : (Tensor x, Tensor alpha, str data_format, str mode) output : Tensor(out) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index 1fc4144849b2ac048488f9cfb284ef3970d6a774..80db4ae909bba5840b7c3796ea1413a4756be5c7 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -164,6 +164,14 @@ outputs : out : Out +- op : broadcast_tensors + backward : broadcast_tensors_grad + inputs : + input : X + outputs : + out : Out + drop_empty_grad : [input_grad] + - op : ceil backward : ceil_grad inputs : @@ -1045,6 +1053,19 @@ extra : attrs : [bool use_mkldnn = false] +- op : pow + backward : pow_grad, pow_double_grad, pow_triple_grad + inputs : + x : X + outputs : + out : Out + attrs : + y : factor + scalar : + y : + data_type : float + tensor_name : FactorTensor + - op : prelu backward : prelu_grad extra : diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 
df606ebec0734ea1220bb53ba501624bba94aba0..90ccf5549760026f6604f485c49598ef01ade4f3 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -134,6 +134,16 @@ func : bmm backward : bmm_grad +- op : broadcast_tensors + args: (Tensor[] input) + output: Tensor[]{input.size()} + infer_meta: + func: BroadcastTensorsInferMeta + kernel: + func: broadcast_tensors + data_type : input + backward: broadcast_tensors_grad + - op : ceil args : (Tensor x) output : Tensor(out) @@ -911,6 +921,17 @@ func : poisson backward : poisson_grad +- op : pow + args : (Tensor x, Scalar y=1.0f) + output : Tensor(out) + infer_meta : + func : UnchangedInferMeta + param: [x] + kernel : + func : pow + data_type : x + backward : pow_grad + - op : put_along_axis args : (Tensor arr, Tensor indices, Tensor values, int axis, str reduce = "assign") output : Tensor(out) diff --git a/paddle/phi/backends/custom/custom_device.cc b/paddle/phi/backends/custom/custom_device.cc index c0e28a90e9ff32f5b9df613042fc4b5b4a988fed..164b425e393d66ab03007c5a139d7ba4df5e67d4 100644 --- a/paddle/phi/backends/custom/custom_device.cc +++ b/paddle/phi/backends/custom/custom_device.cc @@ -148,38 +148,41 @@ class CustomDevice : public DeviceInterface { stream::Stream::Flag::kDefaultFlag) override { const auto device = &devices_pool[dev_id]; C_Stream c_stream; - PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( - pimpl_->create_stream(device, &c_stream)); + if (pimpl_->create_stream) { + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->create_stream(device, &c_stream)); + } else { + c_stream = nullptr; + } stream->set_stream(c_stream); } void DestroyStream(size_t dev_id, stream::Stream* stream) override { - const auto device = &devices_pool[dev_id]; - - PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->destroy_stream( - device, reinterpret_cast(stream->raw_stream()))); + if (pimpl_->destroy_stream) { + const auto device = &devices_pool[dev_id]; + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->destroy_stream( + device, reinterpret_cast(stream->raw_stream()))); + } } void SynchronizeStream(size_t dev_id, const stream::Stream* stream) override { - const auto device = &devices_pool[dev_id]; - - PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->synchronize_stream( - device, reinterpret_cast(stream->raw_stream()))); + if (pimpl_->synchronize_stream) { + const auto device = &devices_pool[dev_id]; + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->synchronize_stream( + device, reinterpret_cast(stream->raw_stream()))); + } } bool QueryStream(size_t dev_id, const stream::Stream* stream) override { - const auto device = &devices_pool[dev_id]; - if (!pimpl_->query_stream) { SynchronizeStream(dev_id, stream); return true; + } else { + const auto device = &devices_pool[dev_id]; + return pimpl_->query_stream( + device, reinterpret_cast(stream->raw_stream())) == + C_SUCCESS; } - if (pimpl_->query_stream( - device, reinterpret_cast(stream->raw_stream())) == - C_SUCCESS) { - return true; - } - return false; } void AddCallback(size_t dev_id, @@ -259,12 +262,14 @@ class CustomDevice : public DeviceInterface { void StreamWaitEvent(size_t dev_id, const stream::Stream* stream, const event::Event* event) override { - const auto device = &devices_pool[dev_id]; + if (pimpl_->stream_wait_event) { + const auto device = &devices_pool[dev_id]; - PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->stream_wait_event( - device, - reinterpret_cast(stream->raw_stream()), - reinterpret_cast(event->raw_event()))); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->stream_wait_event( + device, + 
reinterpret_cast(stream->raw_stream()), + reinterpret_cast(event->raw_event()))); + } } void MemoryCopyH2D(size_t dev_id, @@ -279,7 +284,7 @@ class CustomDevice : public DeviceInterface { C_Stream c_stream = reinterpret_cast(stream->raw_stream()); PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( pimpl_->async_memory_copy_h2d(device, c_stream, dst, src, size)); - } else { + } else if (pimpl_->memory_copy_h2d) { paddle::platform::DeviceContextPool& pool = paddle::platform::DeviceContextPool::Instance(); pool.Get(place)->Wait(); @@ -300,7 +305,7 @@ class CustomDevice : public DeviceInterface { C_Stream c_stream = reinterpret_cast(stream->raw_stream()); PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( pimpl_->async_memory_copy_d2h(device, c_stream, dst, src, size)); - } else { + } else if (pimpl_->memory_copy_d2h) { paddle::platform::DeviceContextPool& pool = paddle::platform::DeviceContextPool::Instance(); pool.Get(place)->Wait(); @@ -321,7 +326,7 @@ class CustomDevice : public DeviceInterface { C_Stream c_stream = reinterpret_cast(stream->raw_stream()); PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( pimpl_->async_memory_copy_d2d(device, c_stream, dst, src, size)); - } else { + } else if (pimpl_->memory_copy_d2d) { paddle::platform::DeviceContextPool& pool = paddle::platform::DeviceContextPool::Instance(); pool.Get(place)->Wait(); @@ -455,24 +460,33 @@ class CustomDevice : public DeviceInterface { } void MemoryStats(size_t dev_id, size_t* total, size_t* free) override { - const auto device = &devices_pool[dev_id]; + if (pimpl_->device_memory_stats) { + const auto device = &devices_pool[dev_id]; - PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( - pimpl_->device_memory_stats(device, total, free)); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->device_memory_stats(device, total, free)); - size_t used = *total - *free; - VLOG(10) << Type() << " memory usage " << (used >> 20) << "M/" - << (*total >> 20) << "M, " << (*free >> 20) - << "M available to allocate"; + size_t used = *total - *free; + VLOG(10) << Type() << " memory usage " << (used >> 20) << "M/" + << (*total >> 20) << "M, " << (*free >> 20) + << "M available to allocate"; + } else { + *total = 0; + *free = 0; + } } size_t GetMinChunkSize(size_t dev_id) override { - const auto device = &devices_pool[dev_id]; + if (pimpl_->device_min_chunk_size) { + const auto device = &devices_pool[dev_id]; - size_t size = 0; - pimpl_->device_min_chunk_size(device, &size); - VLOG(10) << Type() << " min chunk size " << size << "B"; - return size; + size_t size = 0; + pimpl_->device_min_chunk_size(device, &size); + VLOG(10) << Type() << " min chunk size " << size << "B"; + return size; + } else { + return 1; + } } size_t GetMaxChunkSize(size_t dev_id) override { @@ -911,8 +925,8 @@ bool ValidCustomCustomRuntimeParams(const CustomRuntimeParams* params) { CHECK_INTERFACE(get_device, true); CHECK_INTERFACE(deinit_device, false); - CHECK_INTERFACE(create_stream, true); - CHECK_INTERFACE(destroy_stream, true); + CHECK_INTERFACE(create_stream, false); + CHECK_INTERFACE(destroy_stream, false); CHECK_INTERFACE(query_stream, false); CHECK_INTERFACE(stream_add_callback, false); @@ -922,9 +936,9 @@ bool ValidCustomCustomRuntimeParams(const CustomRuntimeParams* params) { CHECK_INTERFACE(query_event, false); CHECK_INTERFACE(synchronize_device, false); - CHECK_INTERFACE(synchronize_stream, true); + CHECK_INTERFACE(synchronize_stream, false); CHECK_INTERFACE(synchronize_event, true); - CHECK_INTERFACE(stream_wait_event, true); + CHECK_INTERFACE(stream_wait_event, false); 
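// Illustrative only: with the interface checks above relaxed, a minimal
// custom-runtime plugin may leave these callbacks unset (the access path
// through params->interface is an assumption; the member names match the
// pimpl_ callbacks used earlier in this file):
//
//   params->interface->create_stream = nullptr;        // CreateStream yields a null C_Stream
//   params->interface->synchronize_stream = nullptr;   // SynchronizeStream becomes a no-op
//   params->interface->device_memory_stats = nullptr;  // MemoryStats reports 0 total / 0 free
//   params->interface->device_min_chunk_size = nullptr;  // GetMinChunkSize falls back to 1
//
// Each fallback corresponds to a null check added to CustomDevice above.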
CHECK_INTERFACE(device_memory_allocate, true); CHECK_INTERFACE(device_memory_deallocate, true); @@ -932,9 +946,9 @@ bool ValidCustomCustomRuntimeParams(const CustomRuntimeParams* params) { CHECK_INTERFACE(host_memory_deallocate, false); CHECK_INTERFACE(unified_memory_allocate, false); CHECK_INTERFACE(unified_memory_deallocate, false); - CHECK_INTERFACE(memory_copy_h2d, true); - CHECK_INTERFACE(memory_copy_d2h, true); - CHECK_INTERFACE(memory_copy_d2d, true); + CHECK_INTERFACE(memory_copy_h2d, false); + CHECK_INTERFACE(memory_copy_d2h, false); + CHECK_INTERFACE(memory_copy_d2d, false); CHECK_INTERFACE(memory_copy_p2p, false); CHECK_INTERFACE(async_memory_copy_h2d, false); CHECK_INTERFACE(async_memory_copy_d2h, false); @@ -943,9 +957,9 @@ bool ValidCustomCustomRuntimeParams(const CustomRuntimeParams* params) { CHECK_INTERFACE(get_device_count, true); CHECK_INTERFACE(get_device_list, true); - CHECK_INTERFACE(device_memory_stats, true); + CHECK_INTERFACE(device_memory_stats, false); - CHECK_INTERFACE(device_min_chunk_size, true); + CHECK_INTERFACE(device_min_chunk_size, false); CHECK_INTERFACE(device_max_chunk_size, false); CHECK_INTERFACE(device_max_alloc_size, false); CHECK_INTERFACE(device_extra_padding_size, false); diff --git a/paddle/phi/backends/onednn/onednn_reuse.h b/paddle/phi/backends/onednn/onednn_reuse.h index c398138e2d5fa06ae6c35ca7901bb925689dcbb9..cb3f59036ee2ac364b271a839574ca9cfba656cc 100644 --- a/paddle/phi/backends/onednn/onednn_reuse.h +++ b/paddle/phi/backends/onednn/onednn_reuse.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -53,26 +53,31 @@ constexpr bool is_bfloat16() { static void AppendActivation(const OneDNNContext& dev_ctx, dnnl::post_ops& post_ops, // NOLINT - float activation_scale = 1.0f) { - const auto invalid_attribute = - dev_ctx.HasDnnAttr("fuse_activation") - ? PADDLE_GET_CONST(std::string, dev_ctx.GetDnnAttr("fuse_activation")) - .empty() - : true; - if (invalid_attribute) return; - - const auto fuse_activation = - dev_ctx.HasDnnAttr("fuse_activation") - ? PADDLE_GET_CONST(std::string, dev_ctx.GetDnnAttr("fuse_activation")) - : ""; - const auto fuse_alpha = - dev_ctx.HasDnnAttr("fuse_alpha") - ? PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("fuse_alpha")) - : 0.0f; - const auto fuse_beta = - dev_ctx.HasDnnAttr("fuse_beta") - ? PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("fuse_beta")) - : 0.0f; + float activation_scale = 1.0f, + std::string fuse_activation = "", + float fuse_alpha = 0.0f, + float fuse_beta = 0.0f) { + if (fuse_activation == "") { + const auto invalid_attribute = + dev_ctx.HasDnnAttr("fuse_activation") + ? PADDLE_GET_CONST(std::string, + dev_ctx.GetDnnAttr("fuse_activation")) + .empty() + : true; + if (invalid_attribute) return; + + fuse_activation = + dev_ctx.HasDnnAttr("fuse_activation") + ? PADDLE_GET_CONST(std::string, + dev_ctx.GetDnnAttr("fuse_activation")) + : ""; + fuse_alpha = dev_ctx.HasDnnAttr("fuse_alpha") + ? PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("fuse_alpha")) + : 0.0f; + fuse_beta = dev_ctx.HasDnnAttr("fuse_beta") + ? 
PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("fuse_beta")) + : 0.0f; + } if (fuse_activation == "hard_sigmoid") { post_ops.append_eltwise(activation_scale, diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc index 67ac2b17a709493b6bcfbab420a00f89bfd48495..b3635652ffd1014b911772e2abf3daa9c8b9b785 100644 --- a/paddle/phi/backends/xpu/xpu2_op_list.cc +++ b/paddle/phi/backends/xpu/xpu2_op_list.cc @@ -313,6 +313,7 @@ XPUOpMap& get_kl2_ops() { XPUKernelSet({phi::DataType::INT64, phi::DataType::INT32, phi::DataType::FLOAT32})}, + {"grid_sampler_grad", XPUKernelSet({phi::DataType::FLOAT32})}, {"grid_sampler", XPUKernelSet({phi::DataType::FLOAT32})}, {"hard_sigmoid_grad", XPUKernelSet({phi::DataType::FLOAT32})}, {"hard_sigmoid", XPUKernelSet({phi::DataType::FLOAT32})}, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 3b3202c2917251da1f368f8d8b60da78b9e83b6c..f2fcb3162081fb871117c39c2ed02d2334404bcc 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -160,22 +160,34 @@ void ArgMinMaxInferMeta(const MetaTensor& x, auto int_axis = axis.to(); const auto& x_dims = x.dims(); - PADDLE_ENFORCE_GE( - int_axis, - -x_dims.size(), - phi::errors::InvalidArgument("'axis'(%d) must be greater than or equal to" - " -Rank(X)(%d).", - int_axis, - -x_dims.size())); - PADDLE_ENFORCE_LT(int_axis, - x_dims.size(), - phi::errors::InvalidArgument( - "'axis'(%d) must be less than Rank(X)(%d) of Input(X).", - int_axis, - x_dims.size())); + auto x_rank = x.dims().size(); + if (x_rank > 0) { + PADDLE_ENFORCE_GE(int_axis, + -x_rank, + phi::errors::InvalidArgument( + "'axis'(%d) must be greater than or equal to" + " -Rank(X)(%d).", + int_axis, + -x_rank)); + PADDLE_ENFORCE_LT( + int_axis, + x_rank, + phi::errors::InvalidArgument( + "'axis'(%d) must be less than Rank(X)(%d) of Input(X).", + int_axis, + x_rank)); + } else { + // 0-dim tensor + PADDLE_ENFORCE_EQ((int_axis == 0 || int_axis == -1) && flatten, + true, + phi::errors::InvalidArgument( + "'axis'(%d) must be 0 or -1 if input tensor is " + "0-dim. and flatten should be true.", + int_axis)); + } - auto x_rank = x_dims.size(); if (int_axis < 0) int_axis += x_rank; + if (config.is_runtime) { if (dtype == phi::TransToProtoVarType(DataType::INT32)) { int64_t all_element_num = 0; @@ -195,8 +207,12 @@ void ArgMinMaxInferMeta(const MetaTensor& x, INT_MAX)); } } + std::vector vec; - if (flatten) { + + if (x_rank == 0) { + // vec is set to empty + } else if (flatten) { vec.emplace_back(static_cast(1)); } else { for (int64_t i = 0; i < int_axis; i++) vec.emplace_back(x_dims[i]); @@ -205,6 +221,7 @@ void ArgMinMaxInferMeta(const MetaTensor& x, } for (int64_t i = int_axis + 1; i < x_rank; i++) vec.emplace_back(x_dims[i]); } + out->set_dims(phi::make_ddim(vec)); if (dtype == 2) { out->set_dtype(DataType::INT32); @@ -3378,6 +3395,21 @@ void SliceRawInferMeta(const MetaTensor& input, } } + PADDLE_ENFORCE_EQ( + axes.size(), + starts_arr.size(), + phi::errors::InvalidArgument( + "The length of axes (%d) and length of starts (%d) should be same.", + axes.size(), + starts_arr.size())); + PADDLE_ENFORCE_EQ( + axes.size(), + ends_arr.size(), + phi::errors::InvalidArgument( + "The length of axes (%d) and length of ends (%d) should be same.", + axes.size(), + ends_arr.size())); + // 2.1 Check attrs. 
std::vector starts = starts_arr.GetData(); std::vector ends = ends_arr.GetData(); @@ -4253,7 +4285,20 @@ void UnbindInferMeta(const MetaTensor& x, std::vector outs) { auto in_dims = x.dims(); std::vector out_dim; + + PADDLE_ENFORCE_GE( + axis, + -in_dims.size(), + phi::errors::InvalidArgument( + "axis must be in range(%d, %d).", -in_dims.size(), in_dims.size())); + PADDLE_ENFORCE_LT( + axis, + in_dims.size(), + phi::errors::InvalidArgument( + "axis must be in range(%d, %d).", -in_dims.size(), in_dims.size())); + axis = axis < 0 ? in_dims.size() + axis : axis; + for (int i = 0; i < in_dims.size(); ++i) { if (i != axis) out_dim.push_back(in_dims[i]); } diff --git a/paddle/phi/kernels/broadcast_tensors_grad_kernel.h b/paddle/phi/kernels/broadcast_tensors_grad_kernel.h index 79d5b8a445b48eb320f80341b9c8863186f97de6..087e29c23f0e19d12d0e237cb2d5b6a724a5080c 100644 --- a/paddle/phi/kernels/broadcast_tensors_grad_kernel.h +++ b/paddle/phi/kernels/broadcast_tensors_grad_kernel.h @@ -22,6 +22,8 @@ namespace phi { template void BroadcastTensorsGradKernel(const Context& ctx, + const std::vector& + inputs, // just for aligning to infershape const std::vector& dout, std::vector dx); diff --git a/paddle/phi/kernels/cpu/accuracy_kernel.cc b/paddle/phi/kernels/cpu/accuracy_kernel.cc index 17246de35db22c079a0bcad3598b172abe6ea808..2c9312e63ac89994ed19d1dc77dd36bc20c3e7be 100644 --- a/paddle/phi/kernels/cpu/accuracy_kernel.cc +++ b/paddle/phi/kernels/cpu/accuracy_kernel.cc @@ -35,10 +35,33 @@ void AccuracyRawKernel(const Context& dev_ctx, const int64_t* indices_data = indices.data(); const int64_t* label_data = label.data(); + PADDLE_ENFORCE_EQ( + inference.dims().size(), + 2, + phi::errors::InvalidArgument( + "Rank(Input) of AccuracyOp must be 2, with shape " + "[sample_number, class_dim], But received rank(Input) is %d", + inference.dims().size())); + size_t num_samples = inference.dims()[0]; size_t class_dim = inference.dims()[1]; *accuracy_data = 0.0f; + PADDLE_ENFORCE_GT(label.dims().size(), + 0, + phi::errors::InvalidArgument( + "Rank(Label) of AccuracyOp must greater than 0, " + "But received rank(Label) is %d", + label.dims().size())); + + PADDLE_ENFORCE_GE( + label.dims()[0], + inference.dims()[0], + phi::errors::InvalidArgument("num_samples(%d) of Label should less than " + "or equal to num_samples(%d) of Input", + label.dims()[0], + num_samples)); + if (num_samples == 0) { return; } diff --git a/paddle/phi/kernels/cpu/arg_min_max_kernel.cc b/paddle/phi/kernels/cpu/arg_min_max_kernel.cc index 61d20ac32f15af65dd97ce80b491b1ce5e7888ae..694698050a0c06cf1fbd4452ead9c25e542ecaa5 100644 --- a/paddle/phi/kernels/cpu/arg_min_max_kernel.cc +++ b/paddle/phi/kernels/cpu/arg_min_max_kernel.cc @@ -96,6 +96,12 @@ struct VisitDataArgMinMaxFunctor { if (axis < 0) new_axis = axis + x_dims.size(); } + // For 0D Tensor + if (x.dims().size() == 0) { + phi::funcs::set_constant(dev_ctx, out, 0); + return; + } + #define CALL_ARG_MINMAX_FUNCTOR(rank) \ ArgMinMaxFunctor functor##rank; \ functor##rank(dev_ctx, x, out, x_dims, new_axis, new_keepdims) diff --git a/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc b/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc index 413638e17722251762360a0e9919c7c52d3d0df7..0d549ae46e2170ef202a323417c073a9b631e2e9 100644 --- a/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc @@ -60,8 +60,10 @@ namespace phi { template void BroadcastTensorsGradKernel(const Context& ctx, + const std::vector& inputs, const 
std::vector& dout, std::vector dx) { + (void)inputs; // Find reduce dimensions const auto& in_tensors = dout; auto& out_tensors = dx; diff --git a/paddle/phi/kernels/cpu/eig_kernel.cc b/paddle/phi/kernels/cpu/eig_kernel.cc index 42a843391872ff1acf83d5bcdab3e6296939b38c..c9bdf8af1168270b41ea4c8f642252c68acf9812 100644 --- a/paddle/phi/kernels/cpu/eig_kernel.cc +++ b/paddle/phi/kernels/cpu/eig_kernel.cc @@ -31,6 +31,11 @@ void EigKernel(const Context& dev_ctx, int batch_count = BatchCount(x); int order = x.dims()[x.dims().size() - 1]; + PADDLE_ENFORCE_LT(0, + order, + errors::InvalidArgument( + "The order of Input(X) should be greater than 0.")); + DenseTensor real_w; DenseTensor real_v; diff --git a/paddle/phi/kernels/cpu/p_norm_kernel.cc b/paddle/phi/kernels/cpu/p_norm_kernel.cc index 597939953b27713cf2c9a6caf0dc1577a0421d8d..bb33b8a397e02d1bf9e22f161a6165840d2e8155 100644 --- a/paddle/phi/kernels/cpu/p_norm_kernel.cc +++ b/paddle/phi/kernels/cpu/p_norm_kernel.cc @@ -61,6 +61,13 @@ void PNormKernel(const Context& dev_ctx, int pre, n, post; GetDims(xdim, axis, &pre, &n, &post, asvector); + for (int i = 0; i < xdim.size(); i++) { + PADDLE_ENFORCE_LT(0, + xdim[i], + errors::InvalidArgument( + "The dims of Input(X) should be greater than 0.")); + } + auto* place = dev_ctx.eigen_device(); Eigen::DSizes shape(pre, n, post); diff --git a/paddle/phi/kernels/cpu/unique_consecutive_functor.h b/paddle/phi/kernels/cpu/unique_consecutive_functor.h index 314c371bf7a64268281d2d6511cf832440f31221..73d196bbb98d96f90b4b0627bdf7306f4633cbd0 100644 --- a/paddle/phi/kernels/cpu/unique_consecutive_functor.h +++ b/paddle/phi/kernels/cpu/unique_consecutive_functor.h @@ -51,9 +51,11 @@ static void UniqueConsecutiveFlattenedTensor(const Context& context, } } - int64_t output_size = p - out_vec.data() + 1; + bool is_empty = in.numel() == 0; + int64_t output_size = is_empty ? 
0 : (p - out_vec.data() + 1); + if (return_counts) { - *q = in.numel() - last; + if (!is_empty) *q = in.numel() - last; counts_vec.resize(output_size); } out_vec.resize(output_size); diff --git a/paddle/phi/kernels/cpu/unique_consecutive_kernel.cc b/paddle/phi/kernels/cpu/unique_consecutive_kernel.cc index 2e1dd3e4ecbf17ca620f5e8d0476b78aa263cd87..560578ed22843e35c3404969326ef744901bace0 100644 --- a/paddle/phi/kernels/cpu/unique_consecutive_kernel.cc +++ b/paddle/phi/kernels/cpu/unique_consecutive_kernel.cc @@ -51,6 +51,7 @@ void UniqueConsecutiveKernel(const Context& dev_ctx, dev_ctx, x, out, return_inverse, return_counts, index, counts)); } else { int valid_axis = axis[0]; + if (valid_axis < 0) valid_axis += x.dims().size(); phi::VisitDataTypeTiny( data_type, UniqueConsecutiveDimFunctor(dev_ctx, diff --git a/paddle/phi/kernels/funcs/CMakeLists.txt b/paddle/phi/kernels/funcs/CMakeLists.txt index e4f779c807570ea107496e1fce235c30625a910c..da8f47c7bffd469f9632f4444b713b62dd35cede 100644 --- a/paddle/phi/kernels/funcs/CMakeLists.txt +++ b/paddle/phi/kernels/funcs/CMakeLists.txt @@ -27,7 +27,7 @@ math_library(sequence_scale) cc_library( phi_data_layout_transform SRCS data_layout_transform.cc - DEPS tensor) + DEPS tensor blas) if(WITH_GPU OR WITH_ROCM) if(MKL_FOUND AND WITH_ONEMKL) diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.cu b/paddle/phi/kernels/funcs/concat_and_split_functor.cu index fa663528eb0158fa75f8d3e86f83115621491816..dc9150e4f2c565b8d6b03e4120bd9ba19d874f3c 100644 --- a/paddle/phi/kernels/funcs/concat_and_split_functor.cu +++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cu @@ -14,8 +14,8 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/concat_and_split_functor.h" #include "paddle/fluid/memory/malloc.h" -#include "paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/kernels/funcs/segmented_array.h" namespace phi { namespace funcs { @@ -45,6 +45,12 @@ static inline void GetBlockDims(const phi::GPUContext& context, *grid_dims = dim3(grid_cols, grid_rows, 1); } +#if !defined(_WIN32) +#define PADDLE_ALIGN(x) __attribute__((aligned(x))) +#else +#define PADDLE_ALIGN(x) +#endif + template struct PointerWrapper { public: @@ -55,12 +61,29 @@ struct PointerWrapper { PointerWrapper(const phi::GPUContext& ctx, const std::vector& ins, const T** pre_alloced_host_ptr) { + SetInputAddr(ins); + } + + protected: + void SetInputAddr(const std::vector& ins) { for (auto i = 0; i < ins.size(); ++i) { ins_addr[i] = ins[i].data(); } } }; +template +struct PADDLE_ALIGN(256) AlignedPointerWrapper + : public PointerWrapper { + public: + AlignedPointerWrapper() {} + AlignedPointerWrapper(const phi::GPUContext& ctx, + const std::vector& ins, + const T** pre_alloced_host_ptr) { + this->SetInputAddr(ins); + } +}; + template struct PointerToPointer { public: @@ -93,7 +116,7 @@ struct PointerToPointer { }; template -struct PointerAndColWrapper { +struct PADDLE_ALIGN(256) PointerAndColWrapper { public: IndexT col_length[Size]; PointerAndColWrapper(const phi::GPUContext& ctx, @@ -151,6 +174,8 @@ struct PointerToPointerAndCol { PointerToPointer ins_ptr_wrapper; }; +#undef PADDLE_ALIGN + template struct alignas(MovSize) Packed { __device__ Packed() { @@ -358,10 +383,10 @@ void DispatchConcatWithSameShapeKernelLimitNum( dim3 grid_dims; GetBlockDims(ctx, out_row, out_col, &block_dims, &grid_dims); -#define IMPL_CONCAT_CUDA_KERNEL_CASE(size_, ...) 
\ - case size_: { \ - PointerWrapper ptr_array(ctx, ins, inputs_data); \ - __VA_ARGS__; \ +#define IMPL_CONCAT_CUDA_KERNEL_CASE(size_, ...) \ + case size_: { \ + AlignedPointerWrapper ptr_array(ctx, ins, inputs_data); \ + __VA_ARGS__; \ } break; switch (phi::backends::gpu::RoundToNextHighPowOfTwo(limit_num, 4)) { @@ -519,108 +544,6 @@ void DispatchConcatKernel(const phi::GPUContext& ctx, } } -template -__global__ void SplitKernel_(const T* input_data, - const int64_t in_row, - const int64_t in_col, - const int64_t* out_cols, - int out_cols_size, - T** outputs_data) { - int64_t curr_segment = 0; - int64_t curr_offset = out_cols[0]; - CUDA_KERNEL_LOOP_TYPE(tid_x, in_col, int64_t) { - int64_t curr_col_offset = out_cols[curr_segment + 1]; - while (curr_col_offset <= tid_x) { - curr_offset = curr_col_offset; - ++curr_segment; - curr_col_offset = out_cols[curr_segment + 1]; - } - - int64_t local_col = tid_x - curr_offset; - int64_t segment_width = curr_col_offset - curr_offset; - T* output_ptr = outputs_data[curr_segment]; - if (output_ptr != nullptr) { - int64_t tid_y = blockIdx.y * blockDim.y + threadIdx.y; - for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) - output_ptr[tid_y * segment_width + local_col] = - input_data[tid_y * in_col + tid_x]; - } - } -} - -template -__device__ void SplitKernelDetail(const T* input_data, - const int64_t in_row, - const int64_t in_col, - const int64_t fixed_out_col, - T** outputs_data) { - CUDA_KERNEL_LOOP_TYPE(tid_x, in_col, int64_t) { - int64_t split = tid_x / fixed_out_col; - int64_t in_offset = tid_x - split * fixed_out_col; - T* output_ptr = outputs_data[split]; - if (output_ptr != nullptr) { - int64_t tid_y = blockIdx.y * blockDim.y + threadIdx.y; - for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) - output_ptr[tid_y * fixed_out_col + in_offset] = - input_data[tid_y * in_col + tid_x]; - } - } -} - -template -__global__ void SplitKernel_(const T* input_data, - const int64_t in_row, - const int64_t in_col, - const int64_t fixed_out_col, - T** outputs_data) { - SplitKernelDetail(input_data, in_row, in_col, fixed_out_col, outputs_data); -} - -template -__global__ void SplitKernel_(const T* input_data, - const int64_t in_row, - const int64_t in_col, - const int64_t fixed_out_col, - T* outputs_addr0, - T* outputs_addr1) { - T* outputs_data[2]; - outputs_data[0] = outputs_addr0; - outputs_data[1] = outputs_addr1; - SplitKernelDetail(input_data, in_row, in_col, fixed_out_col, outputs_data); -} - -template -__global__ void SplitKernel_(const T* input_data, - const int64_t in_row, - const int64_t in_col, - const int64_t fixed_out_col, - T* outputs_addr0, - T* outputs_addr1, - T* outputs_addr2) { - T* outputs_data[3]; - outputs_data[0] = outputs_addr0; - outputs_data[1] = outputs_addr1; - outputs_data[2] = outputs_addr2; - SplitKernelDetail(input_data, in_row, in_col, fixed_out_col, outputs_data); -} - -template -__global__ void SplitKernel_(const T* input_data, - const int64_t in_row, - const int64_t in_col, - const int64_t fixed_out_col, - T* outputs_addr0, - T* outputs_addr1, - T* outputs_addr2, - T* outputs_addr3) { - T* outputs_data[4]; - outputs_data[0] = outputs_addr0; - outputs_data[1] = outputs_addr1; - outputs_data[2] = outputs_addr2; - outputs_data[3] = outputs_addr3; - SplitKernelDetail(input_data, in_row, in_col, fixed_out_col, outputs_data); -} - /* * All tensors' dimension should be the same and the values of * each dimension must be the same, except the axis dimension. 
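The dispatch above rounds the number of input tensors up to the next power of two and instantiates a pointer wrapper of exactly that capacity, so the whole pointer table travels to the kernel by value in one over-aligned struct. Below is a minimal host-only sketch of that idea; PtrPack, RoundUpPow2 and ConcatRows are illustrative stand-ins, not names from this patch, and a plain loop stands in for the CUDA launch.

#include <cstddef>
#include <cstdio>
#include <vector>

template <typename T, int N>
struct alignas(256) PtrPack {  // over-aligned so the pack ships as one argument block
  const T* data[N] = {nullptr};
};

inline int RoundUpPow2(int n, int min_val) {
  int p = min_val;
  while (p < n) p <<= 1;
  return p;
}

template <typename T, int N>
void ConcatRows(const PtrPack<T, N>& pack, int used, std::size_t cols,
                std::vector<T>* out) {
  // Concatenate `used` equally sized segments referenced by the pack.
  for (int i = 0; i < used; ++i)
    out->insert(out->end(), pack.data[i], pack.data[i] + cols);
}

int main() {
  std::vector<float> a = {1, 2}, b = {3, 4}, c = {5, 6};
  const float* inputs[] = {a.data(), b.data(), c.data()};
  std::vector<float> out;
  switch (RoundUpPow2(3, 4)) {  // 3 inputs -> dispatch to the pack sized for 4
    case 4: {
      PtrPack<float, 4> pack;
      for (int i = 0; i < 3; ++i) pack.data[i] = inputs[i];
      ConcatRows(pack, 3, 2, &out);
    } break;
    default:
      break;
  }
  for (float v : out) std::printf("%.0f ", v);  // prints: 1 2 3 4 5 6
  std::printf("\n");
  return 0;
}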
@@ -708,37 +631,152 @@ struct ConcatFunctor { } }; -template -class SplitFunctor { +template +struct PointerAndColArray + : public funcs::PointerArraySetter { public: - void operator()(const phi::GPUContext& context, - const phi::DenseTensor& input, - const std::vector& ref_inputs, - int axis, - std::vector* outputs) { - // NOTE(zhiqiu): split a tensor of shape [0,3,4] at axis=1, result in 3 - // tensors of shape [0,1,4] - if (input.numel() == 0) { - return; + funcs::ValueArray val_array; + + PointerAndColArray() {} + PointerAndColArray(const phi::GPUContext& ctx, + const int out_col_num, + IndexT* out_cols, + std::vector* t, + T** pre_alloc_host_buf = nullptr) + : funcs::PointerArraySetter( + ctx, + t, + /*need_alloc=*/false, + /*use_cuda_graph=*/true, + pre_alloc_host_buf) { + IndexT* dev_ptr = nullptr; + if (Size == SegmentedArraySize::kVariableLength) { + size_t num_bytes = out_col_num * sizeof(IndexT); + dev_ptr = reinterpret_cast(this->AllocAndCopy( + ctx, reinterpret_cast(out_cols), num_bytes, true)); + val_array.Set(dev_ptr, out_col_num); + } else { + val_array.Set(out_cols, out_col_num); + } + } +}; + +template +__global__ void SplitTensorWithSameShape(const T* input_data, + const IndexT out_row, + const IndexT cumulative_col, + const IndexT fixed_out_col, + DataArrayT data_array) { + CUDA_KERNEL_LOOP_TYPE(tid_x, cumulative_col, IndexT) { + IndexT split = tid_x / fixed_out_col; + IndexT in_offset = tid_x - split * fixed_out_col; + T* output_ptr = data_array.data[split]; + if (output_ptr != nullptr) { + IndexT tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < out_row; tid_y += blockDim.y * gridDim.y) + output_ptr[tid_y * fixed_out_col + in_offset] = + input_data[tid_y * cumulative_col + tid_x]; + } + } +} + +template +__global__ void SplitTensorWithDifferentShape(const T* input_data, + const IndexT out_row, + const IndexT cumulative_col, + DataArrayT data_array, + ValArrayT col_array) { + IndexT curr_segment = 0; + IndexT curr_offset = col_array.data[0]; + CUDA_KERNEL_LOOP_TYPE(tid_x, cumulative_col, IndexT) { + IndexT curr_col_offset = col_array.data[curr_segment + 1]; + while (curr_col_offset <= tid_x) { + curr_offset = curr_col_offset; + ++curr_segment; + curr_col_offset = col_array.data[curr_segment + 1]; } - // TODO(zcd): Add input data validity checking - int o_num = outputs->size(); - int64_t out_row = 1; - auto dim_0 = ref_inputs[0]->dims(); - for (int i = 0; i < axis; ++i) { - out_row *= dim_0[i]; + IndexT local_col = tid_x - curr_offset; + IndexT segment_width = curr_col_offset - curr_offset; + T* output_ptr = data_array.data[curr_segment]; + if (output_ptr != nullptr) { + IndexT tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < out_row; tid_y += blockDim.y * gridDim.y) + output_ptr[tid_y * segment_width + local_col] = + input_data[tid_y * cumulative_col + tid_x]; } + } +} + +template +void SplitFunctionDispatchWithSameShape(const phi::GPUContext& ctx, + const IndexT out_col, + const IndexT out_row, + const IndexT cumulative_col, + const T* input_data, + std::vector* outs, + T** pre_alloc_host_buf) { + dim3 grid_dims; + dim3 block_dims; + GetBlockDims(ctx, out_row, cumulative_col, &block_dims, &grid_dims); + + funcs::PointerArraySetter setter( + ctx, + outs, + /*need_alloc=*/false, + /*use_cuda_graph=*/true, + pre_alloc_host_buf); + SplitTensorWithSameShape + <<>>( + input_data, out_row, cumulative_col, out_col, setter.array); +} + +template +void SplitFunctionDispatchWithDifferentShape( + const phi::GPUContext& ctx, + const int out_col_num, + 
const IndexT out_row, + const IndexT cumulative_col, + const T* input_data, + std::vector* outs, + IndexT* output_cols, + T** pre_alloc_host_buf) { + dim3 grid_dims; + dim3 block_dims; + GetBlockDims(ctx, out_row, cumulative_col, &block_dims, &grid_dims); + PointerAndColArray setter( + ctx, out_col_num, output_cols, outs, pre_alloc_host_buf); + + SplitTensorWithDifferentShape + <<>>( + input_data, out_row, cumulative_col, setter.array, setter.val_array); +} - int64_t out0_col = ref_inputs[0]->numel() / out_row; - int64_t in_col = 0, in_row = out_row; - bool has_same_shape = true; +template +void SplitFunctorDispatchWithIndexType( + const phi::GPUContext& ctx, + int axis, + const phi::DenseTensor& input, + const std::vector& ref_ins, + std::vector* outs) { + // TODO(zcd): Add input data validity checking + int out_num = outs->size(); + IndexT out_row = 1; + auto ref_dim = ref_ins[0]->dims(); + for (int i = 0; i < axis; ++i) { + out_row *= ref_dim[i]; + } + IndexT out_col = ref_ins[0]->numel() / out_row; + IndexT cumulative_col = 0; + bool has_same_shape = true; - int outputs_cols_num = o_num + 1; - std::vector outputs_data_vec(o_num); - std::vector outputs_cols_vec(outputs_cols_num); - T** outputs_data = outputs_data_vec.data(); - int64_t* outputs_cols = outputs_cols_vec.data(); + int out_cols_num = out_num + 1; + std::vector outputs_cols_vec(out_cols_num, 0); + IndexT* outs_cols = outputs_cols_vec.data(); + T** outs_data = nullptr; // There are some differences between hip runtime and NV runtime. // In NV, when the pageable memory data less than 64K is transferred from @@ -748,128 +786,90 @@ class SplitFunctor { // 3.2.6.1. Concurrent Execution between Host and Device // Memory copies from host to device of a memory block of 64 KB or less #ifdef PADDLE_WITH_HIP - paddle::memory::AllocationPtr data_alloc, cols_alloc; - // TODO(chentianyu03): try to find a method to remove the Alloc function - data_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(), - o_num * sizeof(T*)); - outputs_data = reinterpret_cast(data_alloc->ptr()); - // TODO(chentianyu03): try to find a method to remove the Alloc function - cols_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(), - (outputs_cols_num) * sizeof(int64_t)); - outputs_cols = reinterpret_cast(cols_alloc->ptr()); + paddle::memory::AllocationPtr data_alloc, cols_alloc; + // TODO(chentianyu03): try to find a method to remove the Alloc function + data_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(), + out_num * sizeof(T*)); + outs_data = reinterpret_cast(data_alloc->ptr()); + // TODO(chentianyu03): try to find a method to remove the Alloc function + cols_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(), + (out_cols_num) * sizeof(IndexT)); + outs_cols = reinterpret_cast(cols_alloc->ptr()); #endif - outputs_cols[0] = 0; - for (int i = 0; i < o_num; ++i) { - int64_t t_col = ref_inputs.at(i)->numel() / out_row; - if (has_same_shape) { - if (t_col != out0_col) has_same_shape = false; - } - in_col += t_col; - outputs_cols[i + 1] = in_col; - if (outputs->at(i) != nullptr) { - outputs_data[i] = outputs->at(i)->data(); - } else { - outputs_data[i] = nullptr; - } + outs_cols[0] = 0; + for (int i = 0; i < out_num; ++i) { + IndexT t_col = ref_ins.at(i)->numel() / out_row; + if (has_same_shape) { + has_same_shape &= (t_col == cumulative_col); } - - dim3 block_dims; - dim3 grid_dims; - GetBlockDims(context, out_row, in_col, &block_dims, &grid_dims); - - paddle::memory::allocation::AllocationPtr 
tmp_dev_outs_data; - T** dev_out_gpu_data = nullptr; - if (!has_same_shape || o_num < 2 || o_num > 4) { - // TODO(chentianyu03): try to find a method to remove the Alloc function - tmp_dev_outs_data = paddle::memory::Alloc( - context.GetPlace(), - o_num * sizeof(T*), - phi::Stream(reinterpret_cast(context.stream()))); - auto* restored = phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( - outputs_data, o_num); - paddle::memory::Copy(context.GetPlace(), - tmp_dev_outs_data->ptr(), - phi::CPUPlace(), - restored, - o_num * sizeof(T*), - context.stream()); - dev_out_gpu_data = reinterpret_cast(tmp_dev_outs_data->ptr()); + cumulative_col += t_col; + outs_cols[i + 1] = cumulative_col; + } + int limit_num = has_same_shape ? out_num : out_cols_num; + if (has_same_shape) { + switch (funcs::CalcArraySize(limit_num)) { + SEGMENTED_ARRAY_KERNEL_HELPER( + SplitFunctionDispatchWithSameShape( + ctx, + out_col, + out_row, + cumulative_col, + input.data(), + outs, + outs_data)); } - - if (has_same_shape) { - if (o_num == 2) { - SplitKernel_<<>>( - input.data(), - in_row, - in_col, - out0_col, - outputs_data[0], - outputs_data[1]); - } else if (o_num == 3) { - SplitKernel_<<>>( - input.data(), - in_row, - in_col, - out0_col, - outputs_data[0], - outputs_data[1], - outputs_data[2]); - } else if (o_num == 4) { - SplitKernel_<<>>( - input.data(), - in_row, - in_col, - out0_col, - outputs_data[0], - outputs_data[1], - outputs_data[2], - outputs_data[3]); - } else { - SplitKernel_<<>>( - input.data(), in_row, in_col, out0_col, dev_out_gpu_data); - } - } else { - auto tmp_dev_ins_col_data = - // TODO(chentianyu03): try to find a method to remove the Alloc - // function - paddle::memory::Alloc( - context.GetPlace(), - outputs_cols_num * sizeof(int64_t), - phi::Stream(reinterpret_cast(context.stream()))); - auto* restored = phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( - outputs_cols, outputs_cols_num); - paddle::memory::Copy(context.GetPlace(), - tmp_dev_ins_col_data->ptr(), - phi::CPUPlace(), - restored, - outputs_cols_num * sizeof(int64_t), - context.stream()); - int64_t* dev_outs_col_data = - reinterpret_cast(tmp_dev_ins_col_data->ptr()); - - SplitKernel_<<>>( - input.data(), - in_row, - in_col, - dev_outs_col_data, - static_cast(outputs_cols_num), - dev_out_gpu_data); + } else { + switch (funcs::CalcArraySize(limit_num)) { + SEGMENTED_ARRAY_KERNEL_HELPER( + SplitFunctionDispatchWithDifferentShape( + ctx, + out_cols_num, + out_row, + cumulative_col, + input.data(), + outs, + outs_cols, + outs_data)); } + } #ifdef PADDLE_WITH_HIP - // Prevent the pinned memory value from being covered and release the memory - // after the launch kernel of the stream is executed (reapply pinned memory - // next time) - auto* data_alloc_released = data_alloc.release(); - auto* cols_alloc_released = cols_alloc.release(); - context.AddStreamCallback([data_alloc_released, cols_alloc_released] { - paddle::memory::allocation::Allocator::AllocationDeleter( - data_alloc_released); - paddle::memory::allocation::Allocator::AllocationDeleter( - cols_alloc_released); - }); + // Prevent pinned memory from being covered and release the memory after + // kernel launch of the stream is executed (reapply pinned memory next time) + auto* data_alloc_released = data_alloc.release(); + auto* cols_alloc_released = cols_alloc.release(); + ctx.AddStreamCallback([data_alloc_released, cols_alloc_released] { + paddle::memory::allocation::Allocator::AllocationDeleter( + data_alloc_released); + 
paddle::memory::allocation::Allocator::AllocationDeleter( + cols_alloc_released); + }); #endif +} + +template +class SplitFunctor { + public: + void operator()(const phi::GPUContext& context, + const phi::DenseTensor& input, + const std::vector& ref_inputs, + int axis, + std::vector* outputs) { + int64_t numel = input.numel(); + // NOTE(zhiqiu): split a tensor of shape [0,3,4] at axis=1, result in + // 3 tensors of shape [0,1,4] + if (input.numel() == 0) { + return; + } + + if (numel < std::numeric_limits::max()) { + SplitFunctorDispatchWithIndexType( + context, axis, input, ref_inputs, outputs); + } else { + SplitFunctorDispatchWithIndexType( + context, axis, input, ref_inputs, outputs); + } } }; diff --git a/paddle/phi/kernels/funcs/elementwise_grad_base.h b/paddle/phi/kernels/funcs/elementwise_grad_base.h index b9ffb4e3f123782d334bdd258811dfaa7b52e8cd..f577f1781ff09f2c869192b2262e0f3f248d083a 100644 --- a/paddle/phi/kernels/funcs/elementwise_grad_base.h +++ b/paddle/phi/kernels/funcs/elementwise_grad_base.h @@ -1530,37 +1530,31 @@ void CommonGradBroadcastCUDA(const DenseTensor &x, ComputeBroadcastKernelSize( y_dims_array, out_dims_array, &y_blocks, &y_threads, max_dim); - auto x_strides_array_tmp = paddle::memory::Alloc( + // One part buffer for x_strides_array, rest for y_strides_array and + // out_dims_array. + size_t tmp_total_bytes = bytes * 3; + auto tmp_buffer = paddle::memory::Alloc( ctx.GetPlace(), - bytes, + tmp_total_bytes, phi::Stream(reinterpret_cast(ctx.stream()))); - int *x_strides_array_gpu = - reinterpret_cast(x_strides_array_tmp->ptr()); + int *x_strides_array_gpu = reinterpret_cast(tmp_buffer->ptr()); + int *y_strides_array_gpu = + reinterpret_cast(x_strides_array_gpu + max_dim); + int *out_dims_array_gpu = + reinterpret_cast(y_strides_array_gpu + max_dim); + paddle::memory::Copy(gplace, x_strides_array_gpu, cplace, x_strides_array.data(), bytes, ctx.stream()); - - auto y_strides_array_tmp = paddle::memory::Alloc( - ctx.GetPlace(), - bytes, - phi::Stream(reinterpret_cast(ctx.stream()))); - int *y_strides_array_gpu = - reinterpret_cast(y_strides_array_tmp->ptr()); paddle::memory::Copy(gplace, y_strides_array_gpu, cplace, y_strides_array.data(), bytes, ctx.stream()); - - auto out_dims_array_tmp = paddle::memory::Alloc( - ctx.GetPlace(), - bytes, - phi::Stream(reinterpret_cast(ctx.stream()))); - int *out_dims_array_gpu = reinterpret_cast(out_dims_array_tmp->ptr()); paddle::memory::Copy( gplace, out_dims_array_gpu, cplace, out_dims_array, bytes, ctx.stream()); @@ -1569,24 +1563,21 @@ void CommonGradBroadcastCUDA(const DenseTensor &x, int x_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, x_threads); int y_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, y_threads); if (dx) { - auto x_strides_order_tmp = paddle::memory::Alloc( + size_t dx_total_bytes = bytes * 2; + auto dx_tmp_buffer = paddle::memory::Alloc( ctx.GetPlace(), - bytes, + dx_total_bytes, phi::Stream(reinterpret_cast(ctx.stream()))); - int *x_strides_order_gpu = - reinterpret_cast(x_strides_order_tmp->ptr()); + int *x_strides_order_gpu = reinterpret_cast(dx_tmp_buffer->ptr()); + int *x_dims_order_gpu = + reinterpret_cast(x_strides_order_gpu + max_dim); + paddle::memory::Copy(gplace, x_strides_order_gpu, cplace, x_strides_order.data(), bytes, ctx.stream()); - - auto x_dims_order_tmp = paddle::memory::Alloc( - ctx.GetPlace(), - bytes, - phi::Stream(reinterpret_cast(ctx.stream()))); - int *x_dims_order_gpu = reinterpret_cast(x_dims_order_tmp->ptr()); paddle::memory::Copy(gplace, x_dims_order_gpu, cplace, @@ -1610,24 
+1601,22 @@ void CommonGradBroadcastCUDA(const DenseTensor &x, dx_op); } if (dy) { - auto y_strides_order_tmp = paddle::memory::Alloc( + // One part buffer for y_strides_order_gpu, the other for y_dims_order_gpu + size_t dy_total_bytes = bytes * 2; + auto dy_tmp_buffer = paddle::memory::Alloc( ctx.GetPlace(), - bytes, + dy_total_bytes, phi::Stream(reinterpret_cast(ctx.stream()))); - int *y_strides_order_gpu = - reinterpret_cast(y_strides_order_tmp->ptr()); + int *y_strides_order_gpu = reinterpret_cast(dy_tmp_buffer->ptr()); + int *y_dims_order_gpu = + reinterpret_cast(y_strides_order_gpu + max_dim); + paddle::memory::Copy(gplace, y_strides_order_gpu, cplace, y_strides_order.data(), bytes, ctx.stream()); - - auto y_dims_order_tmp = paddle::memory::Alloc( - ctx.GetPlace(), - bytes, - phi::Stream(reinterpret_cast(ctx.stream()))); - int *y_dims_order_gpu = reinterpret_cast(y_dims_order_tmp->ptr()); paddle::memory::Copy(gplace, y_dims_order_gpu, cplace, diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu.cc b/paddle/phi/kernels/funcs/matrix_inverse.cu.cc index c43c3c04755f3cd98d1b8419d296eaec9022c62f..3961f82c8fd0ff707b00404f374eb576cc6aa4aa 100644 --- a/paddle/phi/kernels/funcs/matrix_inverse.cu.cc +++ b/paddle/phi/kernels/funcs/matrix_inverse.cu.cc @@ -55,11 +55,14 @@ void MatrixInverseFunctor::operator()(const Context& dev_ctx, cpu_ptrs[i + batch_size] = a_inv->data() + i * n * n; } - // Copy the addresses of A and A_inv from host to device. + // Copy the addresses of A and A_inv from host to device, + // and allocate device memory for info and pivots. + int num_ints = n < 32 ? batch_size : batch_size * (n + 1); + size_t total_bytes = cpu_ptrs.size() * sizeof(T*) + num_ints * sizeof(int); paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data = paddle::memory::Alloc( dev_ctx.GetPlace(), - cpu_ptrs.size() * sizeof(T*), + total_bytes, phi::Stream(reinterpret_cast(dev_ctx.stream()))); paddle::memory::Copy(dev_ctx.GetPlace(), tmp_gpu_ptrs_data->ptr(), @@ -67,20 +70,12 @@ void MatrixInverseFunctor::operator()(const Context& dev_ctx, static_cast(cpu_ptrs.data()), cpu_ptrs.size() * sizeof(T*), dev_ctx.stream()); - T** gpu_inv_ptrs = - reinterpret_cast(tmp_gpu_ptrs_data->ptr()) + batch_size; - - // Allocate device memory for info and pivots. - int num_ints = n < 32 ? batch_size : batch_size * (n + 1); - paddle::memory::allocation::AllocationPtr tmp_gpu_info_data = - paddle::memory::Alloc( - dev_ctx.GetPlace(), - num_ints * sizeof(int), - phi::Stream(reinterpret_cast(dev_ctx.stream()))); - int* gpu_info_ptr = reinterpret_cast(tmp_gpu_info_data->ptr()); + T** gpu_inv_pivot_info = reinterpret_cast(tmp_gpu_ptrs_data->ptr()); + T** gpu_inv_ptrs = gpu_inv_pivot_info + batch_size; + int* gpu_info_ptr = + reinterpret_cast(gpu_inv_pivot_info + cpu_ptrs.size()); auto blas = phi::funcs::GetBlas(dev_ctx); - std::vector info; // only for singular checking info.resize(batch_size); // This functions in cuBLAS is intended to be used for matrices of small @@ -100,8 +95,7 @@ void MatrixInverseFunctor::operator()(const Context& dev_ctx, // This function performs the LU factorization of each matrix A by the // equation P * A = L * U. L and U are written back to original matrix A, // and diagonal elements of L are discarded. 
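The matrix-inverse change above folds what used to be three device allocations (pointer table, info array, pivot array) into a single buffer and carves the pieces out by offset. Here is a host-only sketch of the same byte bookkeeping, with std::malloc standing in for the device allocator and the batch/matrix sizes chosen only for illustration.

#include <cstdio>
#include <cstdlib>

int main() {
  const int batch = 4, n = 64;
  const int num_ints = (n < 32) ? batch : batch * (n + 1);  // info (+ pivots when n >= 32)
  const size_t ptr_bytes = 2 * batch * sizeof(float*);      // A and A_inv pointer tables
  const size_t total_bytes = ptr_bytes + num_ints * sizeof(int);

  void* buffer = std::malloc(total_bytes);                  // one allocation instead of three
  float** gpu_ptrs = static_cast<float**>(buffer);
  float** inv_ptrs = gpu_ptrs + batch;                      // second half of the pointer table
  int* info_ptr = reinterpret_cast<int*>(gpu_ptrs + 2 * batch);
  int* pivot_ptr = info_ptr + batch;                        // pivots follow the info array

  std::printf("total=%zu bytes, info at offset %td, pivots at offset %td\n",
              total_bytes,
              reinterpret_cast<char*>(info_ptr) - static_cast<char*>(buffer),
              reinterpret_cast<char*>(pivot_ptr) - static_cast<char*>(buffer));
  (void)inv_ptrs;
  std::free(buffer);
  return 0;
}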
- int* gpu_pivot_ptr = - reinterpret_cast(tmp_gpu_info_data->ptr()) + batch_size; + int* gpu_pivot_ptr = gpu_info_ptr + batch_size; blas.BatchedGETRF(n, reinterpret_cast(tmp_gpu_ptrs_data->ptr()), gpu_pivot_ptr, diff --git a/paddle/phi/kernels/funcs/pooling.h b/paddle/phi/kernels/funcs/pooling.h index 17b87a0e17d5127bb8d125cf489f3efdfbfb00cc..f1656c8a177a73eb72028b75282e620fbae4ef42 100644 --- a/paddle/phi/kernels/funcs/pooling.h +++ b/paddle/phi/kernels/funcs/pooling.h @@ -371,6 +371,13 @@ inline int PoolOutputSize(int input_size, int padding_2, int stride, bool ceil_mode) { + PADDLE_ENFORCE_NE( + stride, + 0, + phi::errors::InvalidArgument( + "The stride of PoolOutputSize shall not be 0, but received %d.", + stride)); + int output_size; if (!ceil_mode) { output_size = @@ -402,6 +409,11 @@ inline int MaxPoolOutputSize(int input_size, int filter_size, int padding, int stride) { + PADDLE_ENFORCE_NE( + stride, + 0, + phi::errors::InvalidArgument( + "The stride of MaxPool shall not be 0, but received %d.", stride)); int output_size = (input_size - filter_size + 2 * padding) / stride + 1; return output_size; } diff --git a/paddle/phi/kernels/funcs/reduce_function.h b/paddle/phi/kernels/funcs/reduce_function.h index b48f2eb4cdf2b2f4b3545a42c9497394b1d755ef..0b9b852a7585d02308179c1d00b889ac508969ee 100644 --- a/paddle/phi/kernels/funcs/reduce_function.h +++ b/paddle/phi/kernels/funcs/reduce_function.h @@ -1228,8 +1228,22 @@ void HandleLargeDim(const DeviceContext& dev_ctx, // transpose to 2D tensor whose shape is {unreduced, reduced}. const int64_t unreduced = output->numel(); - const int64_t reduced = shuffled_input.numel() / unreduced; + const int64_t input_numel = shuffled_input.numel(); + // assume: 0 / 0 == 0, which allow process 0 dim tensor + const int64_t reduced = (unreduced != 0) ? (input_numel / unreduced) : 0; + + PADDLE_ENFORCE_EQ( + unreduced * reduced, + input_numel, + phi::errors::InvalidArgument( + "Reducing failed in HandleLargeDim, when try to transpose (%d) " + "operands into 2D tensor with shape (%d, %d).", + input_numel, + unreduced, + reduced)); + shuffled_input.ResizeAndAllocate({unreduced, reduced}); + DDim output_dim = output->dims(); output->ResizeAndAllocate({unreduced}); ReduceFunctor( diff --git a/paddle/phi/kernels/funcs/reduce_grad_functions.h b/paddle/phi/kernels/funcs/reduce_grad_functions.h index 3ab7618adec48b1910648967fbae34327376c4d8..1b0f34b943d5afcdc2ed708dfa50af860129e9a5 100644 --- a/paddle/phi/kernels/funcs/reduce_grad_functions.h +++ b/paddle/phi/kernels/funcs/reduce_grad_functions.h @@ -87,7 +87,20 @@ void HandleLargeDimGrad(const Context& dev_ctx, Functor functor, const std::vector& dims) { const int64_t unreduced = out->numel(); - const int64_t reduced = x->numel() / unreduced; + const int64_t x_numel = x->numel(); + // assume: 0 / 0 == 0, which allow process 0 dim tensor + const int64_t reduced = (unreduced != 0) ? 
(x_numel / unreduced) : 0; + + PADDLE_ENFORCE_EQ( + unreduced * reduced, + x_numel, + phi::errors::InvalidArgument( + "Reducing failed in HandleLargeDimGrad, when try to transpose (%d) " + "operands into 2D tensor with shape (%d, %d).", + x_numel, + unreduced, + reduced)); + DDim out_dim(out->dims()); DDim x_dim(x->dims()); // transpose and reshape X diff --git a/paddle/phi/kernels/funcs/segmented_array.h b/paddle/phi/kernels/funcs/segmented_array.h index aa03eb4e9fcd21e1c5ac6901bb7a30d7fd1f895a..cacaa8f81fe862aefdcf126cac4cceacdb0ec384 100644 --- a/paddle/phi/kernels/funcs/segmented_array.h +++ b/paddle/phi/kernels/funcs/segmented_array.h @@ -14,6 +14,7 @@ #pragma once +#include "paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h" #include "paddle/phi/core/dense_tensor.h" namespace phi { @@ -34,6 +35,26 @@ enum class SegmentedArraySize { kFixed64 = 64, }; +template (Size)> +struct PADDLE_ALIGN(256) ValueArray { + public: + T data[Num]; + + void Set(T* ptr, const int num) { + for (auto i = 0; i < num; ++i) { + data[i] = ptr[i]; + } + } +}; + +template +struct PADDLE_ALIGN(256) ValueArray { + public: + T* data{nullptr}; + + void Set(T* ptr, const int num) { data = ptr; } +}; + template struct PADDLE_ALIGN(256) ConstPointerArray { public: @@ -62,8 +83,8 @@ struct PADDLE_ALIGN(256) PointerArray { public: T* data[static_cast(Size)]; - void Set(const std::vector& ptrs, T** dev_ptr = nullptr) { - for (auto i = 0; i < ptrs.size(); ++i) { + void Set(T** ptrs, const int num, T** dev_ptr = nullptr) { + for (auto i = 0; i < num; ++i) { data[i] = ptrs[i]; } } @@ -74,9 +95,7 @@ struct PADDLE_ALIGN(256) PointerArray { public: T** data{nullptr}; - void Set(const std::vector& ptrs, T** dev_ptr = nullptr) { - data = dev_ptr; - } + void Set(T** ptrs, const int num, T** dev_ptr = nullptr) { data = dev_ptr; } }; #undef PADDLE_ALIGN @@ -84,13 +103,24 @@ struct PADDLE_ALIGN(256) PointerArray { template struct ArraySetterBase { protected: - void* AllocAndCopy(const Context& ctx, void* src, size_t num_bytes) { + void* AllocAndCopy(const Context& ctx, + void* src, + size_t num_bytes, + bool use_cuda_graph = false) { allocation = paddle::memory::Alloc( ctx.GetPlace(), num_bytes, phi::Stream(reinterpret_cast(ctx.stream()))); + + int8_t* restored = reinterpret_cast(src); +#ifdef PADDLE_WITH_CUDA + if (use_cuda_graph) { + restored = phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + restored, num_bytes); + } +#endif phi::backends::gpu::GpuMemcpyAsync(allocation->ptr(), - src, + restored, num_bytes, phi::gpuMemcpyHostToDevice, ctx.stream()); @@ -131,13 +161,28 @@ struct PointerArraySetter : public ArraySetterBase { public: PointerArray array; - PointerArraySetter(const Context& ctx, std::vector* t) { + // need_alloc : tensor data needs extra buffer or not. + // use_cuda_graph: tensor data shall be captured by cuda_graph or not. + // pre_alloc_host_buf: tensor data is temporaily stored by pinned memory or + // not. + PointerArraySetter(const Context& ctx, + std::vector* t, + bool need_alloc = false, + bool use_cuda_graph = false, + T** pre_alloc_host_buf = nullptr) { ptrs.resize(t->size()); + T** data_ptr = ptrs.data(); +#ifdef PADDLE_WITH_HIP + if (pre_alloc_host_buf) { + data_ptr = pre_alloc_host_buf; + } +#endif for (int i = 0; i < t->size(); ++i) { if (t->at(i) && (t->at(i)->numel() > 0)) { - ptrs[i] = ctx.template Alloc(t->at(i)); + data_ptr[i] = + need_alloc ? 
ctx.template Alloc(t->at(i)) : t->at(i)->data(); } else { - ptrs[i] = nullptr; + data_ptr[i] = nullptr; } } @@ -145,10 +190,9 @@ struct PointerArraySetter : public ArraySetterBase { if (Size == SegmentedArraySize::kVariableLength) { size_t num_bytes = t->size() * sizeof(T*); dev_ptr = reinterpret_cast(this->AllocAndCopy( - ctx, reinterpret_cast(ptrs.data()), num_bytes)); + ctx, reinterpret_cast(data_ptr), num_bytes, use_cuda_graph)); } - - array.Set(ptrs, dev_ptr); + array.Set(data_ptr, t->size(), dev_ptr); } private: diff --git a/paddle/phi/kernels/funcs/stack_and_unstack.h b/paddle/phi/kernels/funcs/stack_and_unstack.h index c516d4892bf629226bed9bb8f93cd3436792d639..0b2b5443383a94d506f7cc2a2749f0594b82bebc 100644 --- a/paddle/phi/kernels/funcs/stack_and_unstack.h +++ b/paddle/phi/kernels/funcs/stack_and_unstack.h @@ -192,7 +192,7 @@ void LaunchUnStackKernel(const Context& ctx, << ", out_col=" << out_col << ", num_splits=" << num_splits; auto x_ptr = x.data(); - PointerArraySetter setter(ctx, outs); + PointerArraySetter setter(ctx, outs, /*need_alloc=*/true); if (out_col == 1) { // For the case axis == (x.dims().size() - 1) diff --git a/paddle/phi/kernels/funcs/values_vectors_functor.h b/paddle/phi/kernels/funcs/values_vectors_functor.h index 63202ca4a484d134cf5ce0f75b3612379be59a12..d4314307873f406c0f04b9030c8007d635275a31 100644 --- a/paddle/phi/kernels/funcs/values_vectors_functor.h +++ b/paddle/phi/kernels/funcs/values_vectors_functor.h @@ -354,12 +354,6 @@ struct MatrixEighFunctor { has_vectors ? CUSOLVER_EIG_MODE_VECTOR : CUSOLVER_EIG_MODE_NOVECTOR; ValueType *out_value = dev_ctx.template Alloc(eigen_values); - auto info = paddle::memory::Alloc( - dev_ctx.GetPlace(), - sizeof(int) * batch_size, - phi::Stream(reinterpret_cast(dev_ctx.stream()))); - auto *info_ptr = reinterpret_cast(info->ptr()); - DenseTensor input_trans = phi::TransposeLast2Dim(dev_ctx, input); T *input_vector = input_trans.data(); @@ -410,11 +404,13 @@ struct MatrixEighFunctor { out_value, &workspace_size); } + size_t total_bytes = sizeof(T) * workspace_size + sizeof(int) * batch_size; auto work = paddle::memory::Alloc( dev_ctx.GetPlace(), - sizeof(T) * workspace_size, + total_bytes, phi::Stream(reinterpret_cast(dev_ctx.stream()))); auto *work_ptr = reinterpret_cast(work->ptr()); + auto *info_ptr = reinterpret_cast(work_ptr + workspace_size); for (auto i = 0; i < batch_size; ++i) { auto *input_data = input_vector + i * vector_stride; diff --git a/paddle/phi/kernels/fusion/onednn/fused_matmul_kernel.cc b/paddle/phi/kernels/fusion/onednn/fused_matmul_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..f54db963b09deb799f41690fc654213f5a0ab05c --- /dev/null +++ b/paddle/phi/kernels/fusion/onednn/fused_matmul_kernel.cc @@ -0,0 +1,616 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
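The setter reworked a few hunks above now distinguishes between outputs that still need device storage and outputs whose buffers already exist, and simply skips empty or missing entries. Below is a stripped-down host sketch of that contract, assuming a toy Tensor type in place of phi::DenseTensor; the pinned-buffer and CUDA-graph switches are deliberately omitted.

#include <cstddef>
#include <cstdio>
#include <vector>

struct Tensor {                  // stand-in for phi::DenseTensor, illustration only
  std::vector<float> storage;
  std::size_t numel;
  float* data() { return storage.data(); }
  float* Alloc() { storage.resize(numel); return storage.data(); }
};

// Gather the data pointers of possibly-missing outputs into one flat array,
// allocating backing storage only when `need_alloc` is set.
std::vector<float*> CollectPointers(std::vector<Tensor*>* outs, bool need_alloc) {
  std::vector<float*> ptrs(outs->size(), nullptr);
  for (std::size_t i = 0; i < outs->size(); ++i) {
    Tensor* t = (*outs)[i];
    if (t && t->numel > 0) {
      ptrs[i] = need_alloc ? t->Alloc() : t->data();  // skip empty or missing outputs
    }
  }
  return ptrs;
}

int main() {
  Tensor a{{}, 3}, b{{}, 0};                    // b is empty, the third output is missing
  std::vector<Tensor*> outs = {&a, &b, nullptr};
  auto ptrs = CollectPointers(&outs, /*need_alloc=*/true);
  std::printf("a=%p b=%p c=%p\n", (void*)ptrs[0], (void*)ptrs[1], (void*)ptrs[2]);
  return 0;
}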
+ +#include + +#include "paddle/phi/backends/onednn/onednn_reuse.h" +#include "paddle/phi/core/kernel_registry.h" + +using dnnl::engine; +using dnnl::inner_product_forward; +using dnnl::memory; +using dnnl::prop_kind; +using dnnl::stream; +using paddle::framework::ReshapeToMatrix; + +namespace phi { + +template +class FusedMatmulOneDNNHandler + : public funcs::OneDNNHandlerNoCachingT { + public: + FusedMatmulOneDNNHandler(const OneDNNContext &dev_ctx, + const DenseTensor *residual_data, + const std::vector &x_org_dims, + const std::vector &y_org_dims, + bool trans_x, + bool trans_y, + const float matmul_alpha, + const std::vector &x_strides_override, + const std::vector &y_strides_override, + bool is_output_fused, + const std::string &fuse_activation, + const float fuse_alpha, + const float fuse_beta, + const float fused_output_scale, + const float scale_x, + const float scale_y, + const float scale_in_eltwise, + const float scale_out, + const bool force_fp32_output) + : funcs::OneDNNHandlerNoCachingT(dev_ctx.GetEngine(), + dev_ctx.GetPlace()) { + // M X K * K X N + std::vector x_dims(x_org_dims); + std::vector y_dims(y_org_dims); + + const int MB_idx = x_dims.size() - 3; + const int H_idx = x_dims.size() - 2; + const int W_idx = x_dims.size() - 1; + + if (trans_x) std::swap(x_dims[H_idx], x_dims[W_idx]); + if (trans_y) std::swap(y_dims[H_idx], y_dims[W_idx]); + + const memory::dim M = x_dims[H_idx]; + const memory::dim K = x_dims[W_idx]; + const memory::dim N = y_dims[W_idx]; + + std::vector x_strides(x_dims.size() - 3, 1); + std::vector y_strides(x_dims.size() - 3, 1); + std::vector out_strides(x_dims.size() - 3, 1); + std::vector out_ddims(x_dims.size() - 3, 1); + + x_strides.reserve(x_dims.size()); + y_strides.reserve(x_dims.size()); + out_strides.reserve(x_dims.size()); + + if (x_strides_override.empty()) { + if (trans_x) { + x_strides.insert(x_strides.end(), {M * K, 1, M}); + } else { + x_strides.insert(x_strides.end(), {M * K, K, 1}); + } + } else { + x_strides = x_strides_override; + } + + if (y_strides_override.empty()) { + if (trans_y) { + y_strides.insert(y_strides.end(), {N * K, 1, K}); + } else { + y_strides.insert(y_strides.end(), {N * K, N, 1}); + } + } else { + y_strides = y_strides_override; + } + + out_strides.insert(out_strides.end(), {M * N, N, 1}); + out_ddims.insert(out_ddims.end(), + {std::max(x_dims[MB_idx], y_dims[MB_idx]), M, N}); + + for (int i = x_dims.size() - 4; i >= 0; --i) { + out_ddims[i] = std::max(x_dims[i], y_dims[i]); + if (x_strides_override.empty()) { + x_strides[i] = x_dims[i + 1] * x_strides[i + 1]; + } + if (y_strides_override.empty()) { + y_strides[i] = y_dims[i + 1] * y_strides[i + 1]; + } + out_strides[i] = out_ddims[i + 1] * out_strides[i + 1]; + } + + // TODO(jczaja): Why not for int8?? 
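The handler above derives the oneDNN strides for each operand from its logical [M, K] shape, swapping the two innermost strides when the operand is consumed transposed. Below is a self-contained sketch of that stride rule plus a check that it addresses a transposed row-major buffer correctly; InnerStrides and the dimension values are illustrative, not taken from the patch.

#include <cstdint>
#include <cstdio>
#include <vector>

// M and K are the logical (post-transpose) dims of the operand's trailing matrix;
// the batch stride is M*K either way, only the inner two strides swap roles.
std::vector<int64_t> InnerStrides(int64_t M, int64_t K, bool transposed) {
  return transposed ? std::vector<int64_t>{M * K, 1, M}
                    : std::vector<int64_t>{M * K, K, 1};
}

int main() {
  const int64_t M = 2, K = 3;
  // The operand is stored row-major as a [K, M] block, i.e. it arrives transposed.
  std::vector<int> mem(K * M);
  for (int64_t k = 0; k < K; ++k)
    for (int64_t m = 0; m < M; ++m) mem[k * M + m] = static_cast<int>(10 * k + m);

  auto s = InnerStrides(M, K, /*transposed=*/true);  // {M*K, 1, M}
  bool ok = true;
  for (int64_t m = 0; m < M; ++m)
    for (int64_t k = 0; k < K; ++k)
      ok = ok && (mem[m * s[1] + k * s[2]] == 10 * k + m);  // logical (m, k) lookup
  std::printf("transposed-stride lookup %s the storage\n",
              ok ? "matches" : "does not match");
  return 0;
}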
+ if (!funcs::is_int8() && is_output_fused) { + out_strides = FakeTransposeStrides(out_ddims); + } + + auto x_md = memory::desc(x_dims, funcs::OneDNNGetDataType(), x_strides); + auto y_md = memory::desc(y_dims, funcs::OneDNNGetDataType(), y_strides); + auto out_md = + memory::desc(out_ddims, funcs::OneDNNGetDataType(), out_strides); + + const auto matmul_attrs = CreateMatmulAttrs(dev_ctx, + residual_data, + matmul_alpha, + fuse_activation, + fuse_alpha, + fuse_beta, + fused_output_scale, + scale_x, + scale_y, + scale_in_eltwise, + scale_out, + force_fp32_output); + + this->AcquireForwardPrimitiveDescriptor(matmul_attrs, x_md, y_md, out_md); + } + + float ComputeOutputScale(float matmul_alpha, + const float scale_x, + const float scale_y, + const float scale_in_eltwise, + const float scale_out, + const bool force_fp32_output) { + float f_scale_out = force_fp32_output ? 1.0f : scale_out; + matmul_alpha *= f_scale_out / (scale_x * scale_y); + return matmul_alpha; + } + + dnnl::primitive_attr CreateMatmulAttrs(const OneDNNContext &dev_ctx, + const DenseTensor *residual_data, + const float matmul_alpha, + const std::string &fuse_activation, + const float fuse_alpha, + const float fuse_beta, + const float fused_output_scale, + const float scale_x, + const float scale_y, + const float scale_in_eltwise, + const float scale_out, + const bool force_fp32_output) { + dnnl::primitive_attr matmul_attrs; + dnnl::post_ops post_operations; + + float computed_scale_out = ComputeOutputScale(matmul_alpha, + scale_x, + scale_y, + scale_in_eltwise, + scale_out, + force_fp32_output); + if (computed_scale_out != 1.0f) { + matmul_attrs.set_output_scales(0, {computed_scale_out}); + } + + if (residual_data) { + auto residual_data_tz = vectorize(residual_data->dims()); + auto residual_data_md = memory::desc(residual_data_tz, + funcs::OneDNNGetDataType(), + dnnl::memory::format_tag::any); + post_operations.append_binary(dnnl::algorithm::binary_add, + residual_data_md); + if (scale_in_eltwise != 0.0f) { + float sum_scale = scale_out / scale_in_eltwise; + post_operations.append_sum(sum_scale); + } + } + + funcs::AppendActivation( + dev_ctx, post_operations, 1.0f, fuse_activation, fuse_alpha, fuse_beta); + + if (fused_output_scale != 1.0f) { + post_operations.append_eltwise( + 1.0, dnnl::algorithm::eltwise_linear, fused_output_scale, 0.0f); + } + + matmul_attrs.set_post_ops(post_operations); + return matmul_attrs; + } + + std::vector FakeTransposeStrides( + const std::vector &matmul_out_dims) const { + // fuse matmul_v2 + transpose + reshape guarantees that output is 4D and + // transpose axis are: {0, 2, 1, 3} + std::vector transpose_axis = {0, 2, 1, 3}; + std::vector fake_strides(transpose_axis.size()); + int ndims = static_cast(transpose_axis.size()); + + int total_stride = 1; + + for (int i = ndims - 1; i >= 0; --i) { + fake_strides[transpose_axis[i]] = total_stride; + total_stride *= matmul_out_dims[transpose_axis[i]]; + } + + return fake_strides; + } + + std::shared_ptr AcquireWeightsMemory(const DenseTensor *input) { + const YT *input_data = input->data(); + return this->AcquireMemoryFromPrimitive( + this->fwd_pd_->weights_desc(), funcs::to_void_cast(input_data)); + } + + std::shared_ptr AcquireDstMemory(const OneDNNContext &dev_ctx, + DenseTensor *output) { + // We cannot use base AcquireDstMemory as it makes an allocation request + // base on DST memory primitive size. This is fine in general, but in MatMul + // we have primitive that covers only one batch of Data and then shift + // pointer for every new batch. 
Hence DenseTensor size is bigger that + // dst memory primitive size. So would we request less memory that is there + // and it triggers an assertion. So as there is no 'any' format here we can + // leave default size of DenseTensor as computed in ComputeInferShape + OT *ptr = dev_ctx.template Alloc(output); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->dst_desc(), ptr); + } +}; + +static DDim RowMatrixDimsFromVector(const DDim &x_dim) { + return x_dim.size() > 1 ? x_dim : make_ddim({1, x_dim[0]}); +} + +static DDim ColumnMatrixDimsFromVector(const DDim &y_dim) { + return y_dim.size() > 1 ? y_dim : make_ddim({y_dim[0], 1}); +} + +static std::vector TransposeAxis(const std::vector &x, + const std::vector &axis) { + size_t in_rank = x.size(); + size_t axis_size = axis.size(); + + auto axis_set = std::set(axis.begin(), axis.end()); + PADDLE_ENFORCE_EQ(axis_set.size(), + axis_size, + phi::errors::InvalidArgument( + "In an axis array, elements must be unique.")); + + PADDLE_ENFORCE_EQ( + in_rank, + axis_size, + phi::errors::InvalidArgument("The input dimension's size " + "should be equal to the axis's size. " + "But received dimension is %d, " + "axis's size is %d", + in_rank, + axis_size)); + + PADDLE_ENFORCE_LT(*std::max_element(axis.begin(), axis.end()), + axis_size, + phi::errors::InvalidArgument( + "Axis values must be ranging from 0 to (dims - 1).")); + + std::vector new_x(x.size()); + for (size_t i = 0; i < x.size(); i++) { + new_x[i] = x[axis[i]]; + } + return new_x; +} + +static std::vector GetInputStrides(const std::string input_name, + const DDim &input_dims, + std::vector shape, + std::vector axis, + const bool transpose_input) { + auto new_dims = input_dims; + if (!shape.empty() && !axis.empty()) { + new_dims = input_dims.reshape(shape).transpose(axis); + } + + auto &MatrixDimsFromVector = + input_name == "X" ? 
RowMatrixDimsFromVector : ColumnMatrixDimsFromVector; + funcs::MatDescriptor mat_dim = funcs::CreateMatrixDescriptor( + MatrixDimsFromVector(new_dims), 0, transpose_input); + + std::vector strides; + if (!shape.empty()) { + auto shape2 = input_dims.reshape(shape); + strides.push_back(1); + for (auto i = shape2.size() - 1; i > 0; --i) { + strides.insert(strides.begin(), + strides.front() * static_cast(shape2[i])); + } + strides = TransposeAxis(strides, axis); + if (shape.size() == 2) + strides.insert(strides.begin(), + static_cast(shape[0] * shape[1])); + mat_dim.stride_ = strides[0]; + if (mat_dim.trans_) std::swap(*strides.rbegin(), *(++strides.rbegin())); + } + return strides; +} + +template +void ExecuteFusedMatmul(const OneDNNContext &dev_ctx, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor *residual_data, + const std::vector &x_dims, + const std::vector &y_dims, + bool trans_x, + bool trans_y, + const float matmul_alpha, + const std::vector &x_strides_override, + const std::vector &y_strides_override, + const bool is_output_fused, + const std::vector &fused_transpose_Out, + const std::string &fuse_activation, + const float fuse_alpha, + const float fuse_beta, + const float fused_output_scale, + const float scale_x, + const float scale_y, + const float scale_in_eltwise, + const float scale_out, + const bool force_fp32_output, + DenseTensor *out) { + FusedMatmulOneDNNHandler handler(dev_ctx, + residual_data, + x_dims, + y_dims, + trans_x, + trans_y, + matmul_alpha, + x_strides_override, + y_strides_override, + is_output_fused, + fuse_activation, + fuse_alpha, + fuse_beta, + fused_output_scale, + scale_x, + scale_y, + scale_in_eltwise, + scale_out, + force_fp32_output); + + const auto src_memory_p = handler.AcquireSrcMemory(&x); + const auto weights_memory_p = handler.AcquireWeightsMemory(&y); + const auto dst_memory_p = handler.AcquireDstMemory(dev_ctx, out); + + auto matmul_p = handler.AcquireForwardPrimitive(); + + std::unordered_map matmul_args = { + {DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_WEIGHTS, *weights_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}; + + if (residual_data) { + const auto residual_data_memory_p = handler.AcquireSrcMemory(residual_data); + matmul_args.insert({DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1, + *residual_data_memory_p}); + } + + auto &astream = OneDNNContext::tls().get_stream(); + matmul_p->execute(astream, matmul_args); + astream.wait(); + + if (is_output_fused && !funcs::is_int8()) { + auto permuted_md = + dst_memory_p->get_desc().permute_axes(fused_transpose_Out); + out->set_mem_desc(permuted_md.reshape(vectorize(out->dims()))); + } else { + out->set_mem_desc( + dst_memory_p->get_desc().reshape(vectorize(out->dims()))); + } +} + +std::vector GetInputShape(DDim input_dims, + std::vector shape, + std::vector axis) { + if (!shape.empty() && !axis.empty()) { + return vectorize(input_dims.reshape(shape).transpose(axis)); + } + return vectorize(input_dims); +} + +void CalculateMatrixDims(const std::vector &x_dims, + const std::vector &y_dims, + std::vector *x_bd_dims, + std::vector *y_bd_dims, + DenseTensor *out, + const bool is_output_fused) { + if (x_dims.size() == 1) { + (*x_bd_dims)[(*x_bd_dims).size() - 1] = x_dims[0]; + } else if (x_dims.size() == 2) { + (*x_bd_dims)[(*x_bd_dims).size() - 1] = x_dims[1]; + (*x_bd_dims)[(*x_bd_dims).size() - 2] = x_dims[0]; + } else { + for (size_t i = 0; i < x_dims.size(); ++i) { + (*x_bd_dims)[(*x_bd_dims).size() - x_dims.size() + i] = x_dims[i]; + } + } + if (y_dims.size() == 1) { + 
(*y_bd_dims)[(*x_bd_dims).size() - 2] = y_dims[0]; + } else if (y_dims.size() == 2) { + (*y_bd_dims)[(*y_bd_dims).size() - 1] = y_dims[1]; + (*y_bd_dims)[(*y_bd_dims).size() - 2] = y_dims[0]; + } else { + for (size_t i = 0; i < y_dims.size(); ++i) { + (*y_bd_dims)[(*y_bd_dims).size() - y_dims.size() + i] = y_dims[i]; + } + } + + if (!is_output_fused && x_dims.size() > 2 && y_dims.size() > 2) { + auto out_dims = vectorize(out->dims()); + for (size_t i = 0; i < (*x_bd_dims).size() - 2; ++i) { + PADDLE_ENFORCE_EQ( + (*x_bd_dims)[i] == (*y_bd_dims)[i] || (*x_bd_dims)[i] == 1 || + (*y_bd_dims)[i] == 1, + true, + errors::InvalidArgument( + "Tensor dimensions are incorrect for broadcasting." + "Dimensions in X and Y must be same or equal to 1, but " + "received x_dim[%d]=%d and y_dims[%d]= %d", + i, + (*x_bd_dims)[i], + i, + (*y_bd_dims)[i])); + (out_dims)[i] = std::max((*x_bd_dims)[i], (*y_bd_dims)[i]); + } + out->Resize(make_ddim((out_dims))); + } +} + +template +void FusedMatmulKernel(const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &y, + const paddle::optional &residual_data, + bool transpose_x, + bool transpose_y, + const float matmul_alpha, + const std::string &fuse_activation, + const float fuse_alpha, + const float fuse_beta, + const float fused_output_scale, + const std::vector &fused_reshape_X, + const std::vector &fused_transpose_X, + const std::vector &fused_reshape_Y, + const std::vector &fused_transpose_Y, + const std::vector &fused_reshape_Out, + const std::vector &fused_transpose_Out, + const std::string &mkldnn_data_type, + const float scale_x, + const float scale_y, + const float scale_in_eltwise, + const float scale_out, + const bool force_fp32_output, + DenseTensor *out) { + if (dev_ctx.HasDnnAttr("head_number")) { + const auto head_number = + PADDLE_GET_CONST(int, dev_ctx.GetDnnAttr("head_number")); + PADDLE_ENFORCE_EQ( + head_number, + 1, + errors::Unimplemented( + "oneDNN matmul doesn't support multiple heads. Expected " + "head_number=1. 
But received `head_number` is %d", + head_number)); + } + + constexpr bool is_int8 = funcs::is_int8(); + constexpr bool is_bfloat16 = funcs::is_bfloat16(); + + bool fuse_relu = false; + if (fuse_activation == "relu" || fuse_activation == "relu6") { + fuse_relu = true; + } + + auto x_dims = GetInputShape(x.dims(), fused_reshape_X, fused_transpose_X); + auto y_dims = GetInputShape(y.dims(), fused_reshape_Y, fused_transpose_Y); + auto is_output_fused = + !fused_reshape_Out.empty() && !fused_transpose_Out.empty(); + + auto x_strides_override = GetInputStrides( + "X", x.dims(), fused_reshape_X, fused_transpose_X, transpose_x); + auto y_strides_override = GetInputStrides( + "Y", y.dims(), fused_reshape_Y, fused_transpose_Y, transpose_y); + + int ndims = std::max(x_dims.size(), y_dims.size()); + ndims = std::max(ndims, 3); + + std::vector x_bd_dims(ndims, 1); + std::vector y_bd_dims(ndims, 1); + + CalculateMatrixDims( + x_dims, y_dims, &x_bd_dims, &y_bd_dims, out, is_output_fused); + + if (force_fp32_output || ((!is_int8) && (!is_bfloat16))) { + ExecuteFusedMatmul(dev_ctx, + x, + y, + residual_data.get_ptr(), + x_bd_dims, + y_bd_dims, + transpose_x, + transpose_y, + matmul_alpha, + x_strides_override, + y_strides_override, + is_output_fused, + fused_transpose_Out, + fuse_activation, + fuse_alpha, + fuse_beta, + fused_output_scale, + scale_x, + scale_y, + scale_in_eltwise, + scale_out, + force_fp32_output, + out); + } else if (is_bfloat16) { + ExecuteFusedMatmul(dev_ctx, + x, + y, + residual_data.get_ptr(), + x_bd_dims, + y_bd_dims, + transpose_x, + transpose_y, + matmul_alpha, + x_strides_override, + y_strides_override, + is_output_fused, + fused_transpose_Out, + fuse_activation, + fuse_alpha, + fuse_beta, + fused_output_scale, + scale_x, + scale_y, + scale_in_eltwise, + scale_out, + force_fp32_output, + out); + } else if (fuse_relu) { + ExecuteFusedMatmul(dev_ctx, + x, + y, + residual_data.get_ptr(), + x_bd_dims, + y_bd_dims, + transpose_x, + transpose_y, + matmul_alpha, + x_strides_override, + y_strides_override, + is_output_fused, + fused_transpose_Out, + fuse_activation, + fuse_alpha, + fuse_beta, + fused_output_scale, + scale_x, + scale_y, + scale_in_eltwise, + scale_out, + force_fp32_output, + out); + } else { + ExecuteFusedMatmul(dev_ctx, + x, + y, + residual_data.get_ptr(), + x_bd_dims, + y_bd_dims, + transpose_x, + transpose_y, + matmul_alpha, + x_strides_override, + y_strides_override, + is_output_fused, + fused_transpose_Out, + fuse_activation, + fuse_alpha, + fuse_beta, + fused_output_scale, + scale_x, + scale_y, + scale_in_eltwise, + scale_out, + force_fp32_output, + out); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(fused_matmul, + OneDNN, + ONEDNN, + phi::FusedMatmulKernel, + float, + phi::dtype::bfloat16, + int8_t, + uint8_t) {} diff --git a/paddle/phi/kernels/gpu/accuracy_kernel.cu b/paddle/phi/kernels/gpu/accuracy_kernel.cu index 8a4aa2a6397c91dd4258032f358288e11be8450b..6cdad23bfd5e180ecd943e1462de111c2bf318c9 100644 --- a/paddle/phi/kernels/gpu/accuracy_kernel.cu +++ b/paddle/phi/kernels/gpu/accuracy_kernel.cu @@ -82,6 +82,14 @@ void AccuracyRawKernel(const Context& dev_ctx, const int64_t* indices_data = indices.data(); const int64_t* label_data = label.data(); + PADDLE_ENFORCE_EQ( + inference.dims().size(), + 2, + phi::errors::InvalidArgument( + "Rank(Input) of AccuracyOp must be 2, with shape " + "[sample_number, class_dim], But received rank(Input) is %d", + inference.dims().size())); + int* correct_data = dev_ctx.template Alloc(correct); int* total_data = 
dev_ctx.template Alloc(total); T* accuracy_data = dev_ctx.template Alloc(accuracy); @@ -91,6 +99,21 @@ void AccuracyRawKernel(const Context& dev_ctx, auto stream = dev_ctx.stream(); phi::backends::gpu::GpuMemsetAsync(accuracy_data, 0, sizeof(T), stream); + PADDLE_ENFORCE_GT(label.dims().size(), + 0, + phi::errors::InvalidArgument( + "Rank(Label) of AccuracyOp must greater than 0, " + "But received rank(Label) is %d", + label.dims().size())); + + PADDLE_ENFORCE_GE( + label.dims()[0], + inference.dims()[0], + phi::errors::InvalidArgument("num_samples(%d) of Label should less than " + "or equal to num_samples(%d) of Input", + label.dims()[0], + num_samples)); + if (num_samples == 0) { return; } diff --git a/paddle/phi/kernels/gpu/add_n_kernel.cu b/paddle/phi/kernels/gpu/add_n_kernel.cu index f32ba597f5b68bce31f90795431c01ea78819cb2..69bc248a7e2f2c87b9e90d14e22e6f6496bc3223 100644 --- a/paddle/phi/kernels/gpu/add_n_kernel.cu +++ b/paddle/phi/kernels/gpu/add_n_kernel.cu @@ -14,11 +14,10 @@ #include "paddle/phi/kernels/add_n_kernel.h" -#include "paddle/phi/kernels/impl/add_n_kernel_impl.h" - #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/memory/memcpy.h" - +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/kernels/impl/add_n_kernel_impl.h" namespace phi { #define CEIL_DIV(x, y) (((x) + (y)-1) / (y)) @@ -38,16 +37,18 @@ __global__ void Sum2CUDAKernel(const T *in_0, template __global__ void SumArrayCUDAKernel( T **in, T *out, int64_t N, size_t in_size, bool read_dst) { + using MPType = typename phi::dtype::MPTypeTrait::Type; int id = blockIdx.x * blockDim.x + threadIdx.x; while (id < N) { - T total(read_dst ? out[id] : static_cast(0)); + MPType total(read_dst ? static_cast(out[id]) + : static_cast(0)); for (int i = 0; i < in_size; ++i) { const T *tmp = in[i]; if (tmp) { - total += tmp[id]; + total += static_cast(tmp[id]); } } - out[id] = total; + out[id] = static_cast(total); id += blockDim.x * gridDim.x; } } @@ -116,11 +117,12 @@ void AddNKernel(const Context &dev_ctx, int64_t length_0 = in_0.numel(); int64_t length_1 = in_1.numel(); if (length_0 && length_1 && in_0.IsInitialized() && in_1.IsInitialized()) { + using MPType = typename phi::dtype::MPTypeTrait::Type; auto result = EigenVector::Flatten(*out); auto &place = *dev_ctx.eigen_device(); - auto in_0_e = EigenVector::Flatten(in_0); - auto in_1_e = EigenVector::Flatten(in_1); - result.device(place) = in_0_e + in_1_e; + auto in_0_e = EigenVector::Flatten(in_0).template cast(); + auto in_1_e = EigenVector::Flatten(in_1).template cast(); + result.device(place) = (in_0_e + in_1_e).template cast(); } else if (length_0 && in_0.IsInitialized()) { auto result = EigenVector::Flatten(*out); auto &place = *dev_ctx.eigen_device(); diff --git a/paddle/phi/kernels/gpu/arg_min_max_kernel.cu b/paddle/phi/kernels/gpu/arg_min_max_kernel.cu index affd36a95ef8bc56c4ecd3225b8f20ffdf3f2485..199ecc8e5b9890b4cb6a7096a1452bd4f45dad5a 100644 --- a/paddle/phi/kernels/gpu/arg_min_max_kernel.cu +++ b/paddle/phi/kernels/gpu/arg_min_max_kernel.cu @@ -30,6 +30,7 @@ namespace cub = hipcub; #include "paddle/phi/core/ddim.h" #include "paddle/phi/core/utils/data_type.h" +#include "paddle/phi/kernels/funcs/math_function.h" namespace phi { namespace { // NOLINT @@ -180,6 +181,12 @@ struct VisitDataCudaArgMinMaxFunctor { x_dims = x.dims(); if (axis < 0) new_axis = axis + x.dims().size(); } + // For 0D Tensor + if (x.dims().size() == 0) { + dev_ctx.template Alloc(out); + phi::funcs::set_constant(dev_ctx, out, 0); + return; + } int64_t numel = 
x.numel(); int64_t groups = numel / x_dims[new_axis]; diff --git a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu index 02f06edd04f7da55d412e422ef8438edd7760a63..a9cc8f591be7c3cb2e9122280fc99c684283f7fd 100644 --- a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu @@ -28,8 +28,10 @@ namespace phi { template void BroadcastTensorsGradKernel(const Context& ctx, + const std::vector& inputs, const std::vector& dout, std::vector dx) { + (void)inputs; // Find reduce dimensions const auto& in_tensors = dout; auto& out_tensors = dx; diff --git a/paddle/phi/kernels/gpu/p_norm_kernel.cu b/paddle/phi/kernels/gpu/p_norm_kernel.cu index c7a6261ce381e924aff2902176623bbcc3b87029..fb869a00d9c505d1d183b1d07a4d3f16d6d908c1 100644 --- a/paddle/phi/kernels/gpu/p_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/p_norm_kernel.cu @@ -105,6 +105,13 @@ void PNormKernel(const Context& dev_ctx, std::vector reduce_axis = funcs::details::GetReduceDim(axis_dims, xdim.size(), asvector); + for (int i = 0; i < xdim.size(); i++) { + PADDLE_ENFORCE_LT(0, + xdim[i], + errors::InvalidArgument( + "The dims of Input(X) should be greater than 0.")); + } + using MT = typename dtype::MPTypeTrait::Type; if (porder == 0) { phi::funcs::ReduceKernel>( diff --git a/paddle/phi/kernels/gpu/prelu_grad_kernel.cu b/paddle/phi/kernels/gpu/prelu_grad_kernel.cu index c280873d8f768214807e9dec4e39541a9a529e73..2a9512e495cc920db8dc71643921ff60e3648480 100644 --- a/paddle/phi/kernels/gpu/prelu_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/prelu_grad_kernel.cu @@ -77,10 +77,20 @@ class PreluOpGradFunctor { for (size_t i = 0; i < input_dims.size(); ++i) { numel *= input_dims[i]; } - size_t plane_size = numel / input_dims[0] / input_dims[1]; - size_t spatial_size = numel / input_dims[0]; - size_t channel = - mode == ChannelLast ? input_dims[input_dims.size() - 1] : input_dims[1]; + + size_t plane_size; + size_t spatial_size; + size_t channel; + if (mode == PRELU_Scalar) { + plane_size = 1; + spatial_size = 1; + channel = 1; + } else { + plane_size = numel / input_dims[0] / input_dims[1]; + spatial_size = numel / input_dims[0]; + channel = mode == ChannelLast ? input_dims[input_dims.size() - 1] + : input_dims[1]; + } PReluOpGradKernel <<>>( @@ -120,7 +130,6 @@ void PReluGradKernel(const Context& dev_ctx, int numel = x.numel(); auto dim = x.dims(); auto x_rank = dim.size(); - std::vector input_shape = phi::vectorize(dim); auto stream = dev_ctx.stream(); T* alpha_grad_tmp_ptr; diff --git a/paddle/phi/kernels/gpu/unique_consecutive_kernel.cu b/paddle/phi/kernels/gpu/unique_consecutive_kernel.cu index 9c04ab511d04fd4e7147a5e5f6a6fb8917e63717..c5c7af17e5de862527230fa6a89a5ce313dd6b84 100644 --- a/paddle/phi/kernels/gpu/unique_consecutive_kernel.cu +++ b/paddle/phi/kernels/gpu/unique_consecutive_kernel.cu @@ -54,6 +54,7 @@ void UniqueConsecutiveKernel(const Context& dev_ctx, } else { // 'axis' is required. 
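Several of the fixes in this patch (UnbindInferMeta, unique_consecutive on CPU and GPU) follow the same recipe for a signed axis argument: reject values outside [-rank, rank) and wrap negative ones exactly once. A tiny standalone sketch of that recipe follows; the helper name is made up and a plain exception stands in for PADDLE_ENFORCE.

#include <cstdio>
#include <stdexcept>

int NormalizeAxis(int axis, int rank) {
  if (axis < -rank || axis >= rank) {
    throw std::invalid_argument("axis must be in range [-rank, rank)");
  }
  return axis < 0 ? axis + rank : axis;  // wrap a negative axis exactly once
}

int main() {
  std::printf("%d\n", NormalizeAxis(-1, 4));  // prints 3
  std::printf("%d\n", NormalizeAxis(2, 4));   // prints 2
  try {
    NormalizeAxis(4, 4);                      // out of range -> rejected
  } catch (const std::invalid_argument& e) {
    std::printf("rejected: %s\n", e.what());
  }
  return 0;
}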
int valid_axis = axis[0]; + if (valid_axis < 0) valid_axis += x.dims().size(); phi::VisitDataTypeTiny( data_type, UniqueConsecutiveDimsCUDAFunctor(dev_ctx, diff --git a/paddle/phi/kernels/impl/crop_kernel_impl.h b/paddle/phi/kernels/impl/crop_kernel_impl.h index d3cb672104d6792a6dedffb4e4da293d16fa4f1f..5aa951d4da09d69a949a932aa2af1499c148c445 100644 --- a/paddle/phi/kernels/impl/crop_kernel_impl.h +++ b/paddle/phi/kernels/impl/crop_kernel_impl.h @@ -100,6 +100,16 @@ void CropTensorFunction(const Context& dev_ctx, out->Resize(out_dims); dev_ctx.template Alloc(out); for (size_t i = 0; i < offsets_vec.size(); ++i) { + PADDLE_ENFORCE_GE( + offsets_vec[i], + 0, + errors::InvalidArgument("The offsets (%d) of the %uth elements of" + " Op(crop_tensor) " + "should be greater than or " + "equal to 0.", + offsets_vec[i], + i)); + PADDLE_ENFORCE_LE(offsets_vec[i] + shape_vec[i], x_dims[i], errors::InvalidArgument( diff --git a/paddle/phi/kernels/impl/lu_kernel_impl.h b/paddle/phi/kernels/impl/lu_kernel_impl.h index 31a83ea540176284e0539188e517af5e3a50af20..5315e36b47172aa322c93ed8dddbfbd53071475c 100644 --- a/paddle/phi/kernels/impl/lu_kernel_impl.h +++ b/paddle/phi/kernels/impl/lu_kernel_impl.h @@ -520,6 +520,14 @@ DenseTensor Transpose2DTo6D(const Context& dev_ctx, const DenseTensor& x) { auto x_dim = x.dims(); auto x_vec = phi::vectorize(x_dim); int rank = x_vec.size(); + + for (int i = 0; i < x_dim.size(); i++) { + PADDLE_ENFORCE_LT(0, + x_dim[i], + errors::InvalidArgument( + "The dims of Input(X) should be greater than 0.")); + } + std::swap(x_vec[rank - 1], x_vec[rank - 2]); std::vector out_shape = x_vec; std::vector axis(rank); diff --git a/paddle/phi/kernels/onednn/matmul_kernel.cc b/paddle/phi/kernels/onednn/matmul_kernel.cc index c820e738f09348b7d207dbd81e33fcb40b615d98..8f9baec36686ed68245d0a34d837821322526e9f 100644 --- a/paddle/phi/kernels/onednn/matmul_kernel.cc +++ b/paddle/phi/kernels/onednn/matmul_kernel.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -51,8 +51,7 @@ void CalculateMatrixDims(const std::vector &x_dims, const std::vector &y_dims, std::vector *x_bd_dims, std::vector *y_bd_dims, - DenseTensor *out, - const bool is_output_fused) { + DenseTensor *out) { if (x_dims.size() == 1) { (*x_bd_dims)[(*x_bd_dims).size() - 1] = x_dims[0]; } else if (x_dims.size() == 2) { @@ -74,7 +73,7 @@ void CalculateMatrixDims(const std::vector &x_dims, } } - if (!is_output_fused && x_dims.size() > 2 && y_dims.size() > 2) { + if (x_dims.size() > 2 && y_dims.size() > 2) { auto out_dims = vectorize(out->dims()); for (size_t i = 0; i < (*x_bd_dims).size() - 2; ++i) { PADDLE_ENFORCE_EQ( @@ -121,15 +120,6 @@ void MatmulKernel(const Context &dev_ctx, ? 
PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("force_fp32_output")) : false; - bool fuse_relu = false; - if (dev_ctx.HasDnnAttr("fuse_activation")) { - auto act_type = - PADDLE_GET_CONST(std::string, dev_ctx.GetDnnAttr("fuse_activation")); - if (act_type == "relu" || act_type == "relu6") { - fuse_relu = true; - } - } - auto x_dims = vectorize(GetDimsForInput(dev_ctx, x.dims(), "X")); auto y_dims = vectorize(GetDimsForInput(dev_ctx, y.dims(), "Y")); @@ -139,12 +129,7 @@ void MatmulKernel(const Context &dev_ctx, std::vector x_bd_dims(ndims, 1); std::vector y_bd_dims(ndims, 1); - CalculateMatrixDims(x_dims, - y_dims, - &x_bd_dims, - &y_bd_dims, - out, - funcs::IsOutputFused(dev_ctx)); + CalculateMatrixDims(x_dims, y_dims, &x_bd_dims, &y_bd_dims, out); if (force_fp32_output || ((!is_int8) && (!is_bfloat16))) { funcs::ExecuteMatmul( @@ -152,9 +137,6 @@ void MatmulKernel(const Context &dev_ctx, } else if (is_bfloat16) { funcs::ExecuteMatmul( dev_ctx, x, y, x_bd_dims, y_bd_dims, transpose_x, transpose_y, out); - } else if (fuse_relu) { - funcs::ExecuteMatmul( - dev_ctx, x, y, x_bd_dims, y_bd_dims, transpose_x, transpose_y, out); } else { funcs::ExecuteMatmul( dev_ctx, x, y, x_bd_dims, y_bd_dims, transpose_x, transpose_y, out); diff --git a/paddle/phi/kernels/reduce_min_kernel.cc b/paddle/phi/kernels/reduce_min_kernel.cc index 660d3b753e97ea449c0ba8d8d2e07d4829c1f5cb..c4c58c8342e600546e8b568ef9532ac4094bafc3 100644 --- a/paddle/phi/kernels/reduce_min_kernel.cc +++ b/paddle/phi/kernels/reduce_min_kernel.cc @@ -26,6 +26,11 @@ void MinKernel(const Context& dev_ctx, bool keep_dim, DenseTensor* out) { bool reduce_all = recompute_reduce_all(x, dims); + PADDLE_ENFORCE_GT( + x.numel(), + 0, + errors::InvalidArgument("Zero-size tensor to reduction operation minimum " + "which has no identity.")); MinRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); } diff --git a/paddle/phi/kernels/xpu/arg_min_max_kernel.cc b/paddle/phi/kernels/xpu/arg_min_max_kernel.cc index 3513b64bc600ebfe565dbb1bc69a3b25ac72c21c..ebf13142345cee3fce41f1208acfcb09b0827e18 100644 --- a/paddle/phi/kernels/xpu/arg_min_max_kernel.cc +++ b/paddle/phi/kernels/xpu/arg_min_max_kernel.cc @@ -18,6 +18,7 @@ #include "paddle/phi/core/ddim.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/utils/data_type.h" +#include "paddle/phi/kernels/funcs/math_function.h" namespace phi { @@ -39,7 +40,15 @@ void ArgMaxKernel(const Context& dev_ctx, DataType::INT64, DataType::INT32, dtype)); + // TODO(ZHUI): fix dtype of out dev_ctx.template Alloc(out); + if (x.dims().size() == 0) { + xpu::constant(dev_ctx.x_context(), + out->data(), + x.numel(), + static_cast(0)); + return; + } DDim x_dims; int axis_val = axis.to(); diff --git a/paddle/phi/kernels/xpu/grid_sample_grad_kernel.cc b/paddle/phi/kernels/xpu/grid_sample_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..86e78b4b15cf9e5c84d3d1d7ba23b662b4ab0bad --- /dev/null +++ b/paddle/phi/kernels/xpu/grid_sample_grad_kernel.cc @@ -0,0 +1,87 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/grid_sample_grad_kernel.h" + +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void GridSampleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& grid, + const DenseTensor& out_grid, + const std::string& mode, + const std::string& padding_mode, + bool align_corners, + DenseTensor* x_grad, + DenseTensor* grid_grad) { + PADDLE_ENFORCE_EQ( + x.dims().size(), + 4, + phi::errors::InvalidArgument( + ("XPU is only support input_dims == 4 in grid_sample_grad op."))); + + const int64_t n = grid.dims()[0]; + const int64_t out_h = grid.dims()[1]; + const int64_t out_w = grid.dims()[2]; + const int64_t c = x.dims()[1]; + const int64_t in_h = x.dims()[2]; + const int64_t in_w = x.dims()[3]; + + x_grad->Resize({n, c, in_h, in_w}); + T* x_grad_ptr = dev_ctx.template Alloc(x_grad); + + T* grid_grad_ptr = nullptr; + if (grid_grad != nullptr) { + grid_grad->Resize({n, out_h, out_w, 2}); + grid_grad_ptr = dev_ctx.template Alloc(grid_grad); + } + + bool is_nearest = false; + if (mode == "nearest") { + is_nearest = true; + } + int64_t padding_mode_type = 0; + if (padding_mode == "border") { + padding_mode_type = 1; + } else if (padding_mode == "reflection") { + padding_mode_type = 2; + } + + int r = xpu::grid_sample_grad(dev_ctx.x_context(), + x.data(), + grid.data(), + out_grid.data(), + x_grad_ptr, + grid_grad_ptr, + n, + c, + in_h, + in_w, + out_h, + out_w, + is_nearest, + align_corners, + padding_mode_type, + true); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "grid_sample_grad"); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + grid_sample_grad, XPU, ALL_LAYOUT, phi::GridSampleGradKernel, float) {} diff --git a/paddle/phi/kernels/xpu/p_norm_kernel.cc b/paddle/phi/kernels/xpu/p_norm_kernel.cc index 7ef72c61ad3aa11ee279e2bc7fcd1839068d5b09..60abc59517b786d757945c421cc0feb73243caa2 100644 --- a/paddle/phi/kernels/xpu/p_norm_kernel.cc +++ b/paddle/phi/kernels/xpu/p_norm_kernel.cc @@ -55,6 +55,14 @@ void PNormKernel(const Context& dev_ctx, int n = 1; int t = 1; GetDims(xdim, axis, &m, &t, &n, asvector); + + for (int i = 0; i < xdim.size(); i++) { + PADDLE_ENFORCE_LT(0, + xdim[i], + errors::InvalidArgument( + "The dims of Input(X) should be greater than 0.")); + } + x_dim.push_back(m); x_dim.push_back(t); x_dim.push_back(n); diff --git a/paddle/phi/ops/compat/activation_sig.cc b/paddle/phi/ops/compat/activation_sig.cc index acc036d393ff0dc19f66d545d325c5d2d699f723..75bf5b11f71237e46d8bd76a2c3b7b619a3aa7e0 100644 --- a/paddle/phi/ops/compat/activation_sig.cc +++ b/paddle/phi/ops/compat/activation_sig.cc @@ -66,51 +66,6 @@ KernelSignature Relu6OpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("relu6_raw", {"X"}, {"threshold"}, {"Out"}); } -KernelSignature PowOpArgumentMapping(const ArgumentMappingContext& ctx) { - if (ctx.HasInput("FactorTensor")) { - return KernelSignature("pow", {"X"}, {"FactorTensor"}, {"Out"}); - } else { - return KernelSignature("pow", {"X"}, {"factor"}, {"Out"}); - } -} - -KernelSignature PowGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - if (ctx.HasInput("FactorTensor")) { - return KernelSignature( - "pow_grad", {"X", "Out@GRAD"}, {"FactorTensor"}, {"X@GRAD"}); - } else { - return KernelSignature( - "pow_grad", {"X", "Out@GRAD"}, {"factor"}, {"X@GRAD"}); - } -} - -KernelSignature PowDoubleGradOpArgumentMapping( - const 
ArgumentMappingContext& ctx) { - if (ctx.HasInput("FactorTensor")) { - return KernelSignature("pow_double_grad", - {"X", "DOut", "DDX"}, - {"FactorTensor"}, - {"DX", "DDOut"}); - } else { - return KernelSignature( - "pow_double_grad", {"X", "DOut", "DDX"}, {"factor"}, {"DX", "DDOut"}); - } -} - -KernelSignature PowTripleGradOpArgumentMapping( - const ArgumentMappingContext& ctx) { - if (ctx.HasInput("FactorTensor")) { - return KernelSignature("pow_triple_grad", - {"X", "DOut", "DDX", "D_DX", "D_DDOut"}, - {"FactorTensor"}, - {"D_X", "D_DOut", "D_DDX"}); - } else { - return KernelSignature("pow_triple_grad", - {"X", "DOut", "DDX", "D_DX", "D_DDOut"}, - {"factor"}, - {"D_X", "D_DOut", "D_DDX"}); - } -} } // namespace phi PD_REGISTER_BASE_KERNEL_NAME(hard_swish, hardswish); @@ -126,9 +81,3 @@ PD_REGISTER_ARG_MAPPING_FN(hard_swish_grad, PD_REGISTER_ARG_MAPPING_FN(hard_swish, phi::HardSwishOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(swish_grad, phi::SwishGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(swish, phi::SwishOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(pow_grad, phi::PowGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(pow_double_grad, - phi::PowDoubleGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(pow_triple_grad, - phi::PowTripleGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(pow, phi::PowOpArgumentMapping); diff --git a/paddle/phi/ops/compat/broadcast_tensors_sig.cc b/paddle/phi/ops/compat/broadcast_tensors_sig.cc deleted file mode 100644 index d0fcbb33be2a752f16a1de99928935180cd9d3da..0000000000000000000000000000000000000000 --- a/paddle/phi/ops/compat/broadcast_tensors_sig.cc +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature BroadcastTensorsGradOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature( - "broadcast_tensors_grad", {"Out@GRAD"}, {}, {"X@GRAD"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(broadcast_tensors_grad, - phi::BroadcastTensorsGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/fused_matmul_sig.cc b/paddle/phi/ops/compat/fused_matmul_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..18e3eb52b803c342366cda57a79d85dc8303a1c4 --- /dev/null +++ b/paddle/phi/ops/compat/fused_matmul_sig.cc @@ -0,0 +1,47 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature FusedMatmulOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("fused_matmul", + {"X", "Y", "ResidualData"}, + {"trans_x", + "trans_y", + "matmul_alpha", + "fuse_activation", + "fuse_alpha", + "fuse_beta", + "fused_output_scale", + "fused_reshape_X", + "fused_transpose_X", + "fused_reshape_Y", + "fused_transpose_Y", + "fused_reshape_Out", + "fused_transpose_Out", + "mkldnn_data_type", + "Scale_x", + "Scale_y", + "Scale_in_eltwise", + "Scale_out", + "force_fp32_output"}, + {"Out"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(fused_matmul, phi::FusedMatmulOpArgumentMapping); diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 4c48154b80a4b21a063af640b122e5b1b296284e..d1e5afed972d9547fd9802e8e44d1e105798b957 100644 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -459,7 +459,7 @@ EOF if ls ${PADDLE_ROOT}/build/python/dist/*whl >/dev/null 2>&1; then PR_whlSize=$($com ${PADDLE_ROOT}/build/python/dist |awk '{print $1}') elif ls ${PADDLE_ROOT}/dist/*whl >/dev/null 2>&1; then - PR_whlSize=$($com ${PADDLE_ROOT}/build/python/dist |awk '{print $1}') + PR_whlSize=$($com ${PADDLE_ROOT}/dist |awk '{print $1}') fi echo "PR whl Size: $PR_whlSize" echo "ipipe_log_param_PR_whl_Size: $PR_whlSize" >> ${PADDLE_ROOT}/build/build_summary.txt @@ -1326,6 +1326,17 @@ function card_test() { cardnumber=$2 parallel_level_base=${CTEST_PARALLEL_LEVEL:-1} + # run ut based on the label + if [[ "${UT_RUN_TYPE_SETTING}" == "INFER" ]];then + run_label_mode="-L (RUN_TYPE=INFER)" + elif [[ "${UT_RUN_TYPE_SETTING}" == "DIST" ]];then + run_label_mode="-L (RUN_TYPE=DIST|RUN_TYPE=EXCLUSIVE)" + elif [[ "${UT_RUN_TYPE_SETTING}" == "WITHOUT_INFER" ]];then + run_label_mode="-LE (RUN_TYPE=INFER)" + elif [[ "${UT_RUN_TYPE_SETTING}" == "OTHER" ]];then + run_label_mode="-LE (RUN_TYPE=INFER|RUN_TYPE=DIST|RUN_TYPE=EXCLUSIVE)" + fi + # get the CUDA device count, XPU device count is one if [ "${WITH_XPU}" == "ON" ];then CUDA_DEVICE_COUNT=1 @@ -1375,15 +1386,15 @@ function card_test() { tmpfile=$tmp_dir/$tmpfile_rand"_"$i if [ ${TESTING_DEBUG_MODE:-OFF} == "ON" ] ; then if [[ $cardnumber == $CUDA_DEVICE_COUNT ]]; then - (ctest -I $i,,$NUM_PROC -R "($testcases)" -E "($disable_ut_quickly)" -V --timeout 120 -j $parallel_job | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) & + (ctest -I $i,,$NUM_PROC -R "($testcases)" -E "($disable_ut_quickly)" ${run_label_mode} -V --timeout 120 -j $parallel_job | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) & else - (env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC -R "($testcases)" -E "($disable_ut_quickly)" --timeout 120 -V -j $parallel_job | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) & + (env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC -R "($testcases)" -E "($disable_ut_quickly)" ${run_label_mode} --timeout 120 -V -j $parallel_job | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) & fi else if [[ $cardnumber == $CUDA_DEVICE_COUNT ]]; then - (ctest -I $i,,$NUM_PROC -R "($testcases)" -E "($disable_ut_quickly)" --timeout 120 --output-on-failure -j $parallel_job | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) & + (ctest -I $i,,$NUM_PROC -R "($testcases)" -E "($disable_ut_quickly)" ${run_label_mode} --timeout 120 --output-on-failure -j $parallel_job | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) & else - (env 
CUDA_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC -R "($testcases)" -E "($disable_ut_quickly)" --timeout 120 --output-on-failure -j $parallel_job | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) & + (env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC -R "($testcases)" -E "($disable_ut_quickly)" ${run_label_mode} --timeout 120 --output-on-failure -j $parallel_job | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) & fi fi done @@ -2364,7 +2375,7 @@ set +x if [[ "${failed_test_lists}" == "" ]];then break else - retry_unittests=$(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' ) + retry_unittests=$( echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' ) fi fi echo "=========================================" @@ -2687,10 +2698,10 @@ set +x if [ ${TIMEOUT_DEBUG_HELP:-OFF} == "ON" ];then bash $PADDLE_ROOT/tools/timeout_debug_help.sh "$failed_test_lists" # cat logs for tiemout uts which killed by ctest fi - read need_retry_ut_str <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' ) + need_retry_ut_str=$(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' ) need_retry_ut_arr=(${need_retry_ut_str}) need_retry_ut_count=${#need_retry_ut_arr[@]} - read retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' ) + retry_unittests=$(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' ) while ( [ $exec_times -lt $retry_time ] ) do if [[ "${exec_times}" == "0" ]] ;then @@ -2700,7 +2711,7 @@ set +x is_retry_execuate=1 fi elif [[ "${exec_times}" == "1" ]] ;then - read need_retry_ut_str <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' ) + need_retry_ut_str=$(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' ) need_retry_ut_arr=(${need_retry_ut_str}) need_retry_ut_count=${#need_retry_ut_arr[@]} if [ $need_retry_ut_count -lt $exec_retry_threshold ];then @@ -2718,7 +2729,7 @@ set +x if [[ "${failed_test_lists}" == "" ]];then break else - read retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' ) + retry_unittests=$(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' ) fi fi echo "=========================================" @@ -3470,10 +3481,7 @@ function build_pr_and_develop() { generate_api_spec "$1" "PR" mkdir ${PADDLE_ROOT}/build/pr_whl && cp ${PADDLE_ROOT}/build/python/dist/*.whl ${PADDLE_ROOT}/build/pr_whl rm -f ${PADDLE_ROOT}/build/python/dist/*.whl && rm -f ${PADDLE_ROOT}/build/python/build/.timestamp - if [[ ${cmake_change} ]];then - rm -rf ${PADDLE_ROOT}/build/Makefile ${PADDLE_ROOT}/build/CMakeCache.txt - rm -rf ${PADDLE_ROOT}/build/third_party - fi + git fetch upstream develop git checkout develop @@ -3484,6 +3492,10 @@ function build_pr_and_develop() { mkdir ${PADDLE_ROOT}/build/dev_whl && wget -q -P ${PADDLE_ROOT}/build/dev_whl ${dev_url} cp ${PADDLE_ROOT}/build/dev_whl/paddlepaddle_gpu-0.0.0-cp37-cp37m-linux_x86_64.whl ${PADDLE_ROOT}/build/python/dist else + if [[ ${cmake_change} ]];then + rm -rf ${PADDLE_ROOT}/build/Makefile ${PADDLE_ROOT}/build/CMakeCache.txt + rm -rf ${PADDLE_ROOT}/build/third_party + fi git checkout -b develop_base_pr upstream/$BRANCH run_setup ${PYTHON_ABI:-""} bdist_wheel ${parallel_number} if [ ! 
-d "${PADDLE_ROOT}/build/python/dist/" ]; then @@ -3752,6 +3764,8 @@ function run_setup(){ exit 7; fi + build_size + endTime_s=`date +%s` [ -n "$startTime_firstBuild" ] && startTime_s=$startTime_firstBuild echo "Build Time: $[ $endTime_s - $startTime_s ]s" @@ -3947,12 +3961,20 @@ function run_setup_mac(){ ccache -z cd .. + if [ "${PYTHON_EXECUTABLE}" != "" ];then - ${PYTHON_EXECUTABLE} setup.py $2;build_error=$? + if [ "$SYSTEM" == "Darwin" ]; then + ${PYTHON_EXECUTABLE} setup.py $2 --plat-name=macosx_10_9_x86_64;build_error=$? + else + ${PYTHON_EXECUTABLE} setup.py $2;build_error=$? + fi else - python setup.py $2;build_error=$? + if [ "$SYSTEM" == "Darwin" ]; then + python setup.py $2 --plat-name=macosx_10_9_x86_64;build_error=$? + else + python setup.py $2;build_error=$? + fi fi - # ci will collect ccache hit rate collect_ccache_hits diff --git a/python/paddle/amp/grad_scaler.py b/python/paddle/amp/grad_scaler.py index b6a38e0e28589b072814cf0c7f494cb684d26e6f..db1ad4d5774aad83c8513346873898fd1f3aa883 100644 --- a/python/paddle/amp/grad_scaler.py +++ b/python/paddle/amp/grad_scaler.py @@ -228,9 +228,16 @@ class AmpScaler: optimize_ops, params_grads = (None, None) - optimizer._set_auxiliary_var('found_inf', self._found_inf) - optimize_ops, params_grads = optimizer.minimize(*args, **kwargs) - self._cache_founf_inf = optimizer._get_auxiliary_var('found_inf') + if hasattr(optimizer, "_set_auxiliary_var"): + optimizer._set_auxiliary_var('found_inf', self._found_inf) + optimize_ops, params_grads = optimizer.minimize(*args, **kwargs) + self._cache_founf_inf = optimizer._get_auxiliary_var('found_inf') + else: + if self._found_inf: + self._cache_founf_inf = True + else: + optimize_ops, params_grads = optimizer.minimize(*args, **kwargs) + self._cache_founf_inf = False if self._use_dynamic_loss_scaling: # uopdate the scale @@ -771,9 +778,16 @@ class GradScaler(AmpScaler): if optimizer_state["state"] is OptimizerState.INIT: self._unscale(optimizer) - optimizer._set_auxiliary_var('found_inf', self._found_inf) - optimizer.step() - self._cache_founf_inf = optimizer._get_auxiliary_var('found_inf') + if hasattr(optimizer, "_set_auxiliary_var"): + optimizer._set_auxiliary_var('found_inf', self._found_inf) + optimizer.step() + self._cache_founf_inf = optimizer._get_auxiliary_var('found_inf') + else: + if self._found_inf: + self._cache_founf_inf = True + else: + optimizer.step() + self._cache_founf_inf = False optimizer_state["state"] = OptimizerState.STEPPED diff --git a/python/paddle/common_ops_import.py b/python/paddle/common_ops_import.py index 91a3f49cdbba2a4d231ace286e81d7df5bf84d61..1ec54064eb64e3dd4b1f352a84ce3a5ba742e6b7 100644 --- a/python/paddle/common_ops_import.py +++ b/python/paddle/common_ops_import.py @@ -32,7 +32,6 @@ from paddle.fluid.framework import ( # noqa: F401 dygraph_only, in_dygraph_mode, ) -from paddle.fluid.initializer import Constant # noqa: F401 from paddle.fluid.layer_helper import LayerHelper # noqa: F401 from paddle.fluid.layers import fill_constant, utils # noqa: F401 from paddle.fluid.layers.layer_function_generator import ( # noqa: F401 diff --git a/python/paddle/device/__init__.py b/python/paddle/device/__init__.py index defb6321847c249467dfb0ae1dae93e3348103c2..d3bcd56db7415ffbdade9b4815c89e7bd2a8f1bd 100644 --- a/python/paddle/device/__init__.py +++ b/python/paddle/device/__init__.py @@ -540,6 +540,8 @@ class Event(object): .. 
code-block:: python # required: custom_device import paddle + + paddle.set_device('custom_cpu') e1 = paddle.device.Event() e2 = paddle.device.Event('custom_cpu') e3 = paddle.device.Event('custom_cpu:0') @@ -593,6 +595,8 @@ class Event(object): .. code-block:: python # required: custom_device import paddle + + paddle.set_device('custom_cpu') e = paddle.device.Event() e.record() @@ -613,7 +617,10 @@ class Event(object): .. code-block:: python # required: custom_device import paddle + + paddle.set_device('custom_cpu') e = paddle.device.Event() + e.record() e.query() ''' return self.event_base.query() @@ -628,8 +635,13 @@ class Event(object): .. code-block:: python # required: custom_device import paddle + + paddle.set_device('custom_cpu') e1 = paddle.device.Event() + e1.record() + e2 = paddle.device.Event() + e2.record() e1.elapsed_time(e2) ''' return 0 @@ -645,7 +657,10 @@ class Event(object): .. code-block:: python # required: custom_device import paddle + + paddle.set_device('custom_cpu') e = paddle.device.Event() + e.record() e.synchronize() ''' self.event_base.synchronize() @@ -670,6 +685,8 @@ class Stream(object): .. code-block:: python # required: custom_device import paddle + + paddle.set_device('custom_cpu') s1 = paddle.device.Stream() s2 = paddle.device.Stream('custom_cpu') s3 = paddle.device.Stream('custom_cpu:0') @@ -727,9 +744,13 @@ class Stream(object): .. code-block:: python # required: custom_device import paddle - s = paddle.device.Stream() + + paddle.set_device('custom_cpu') + s1 = paddle.device.Stream() + s2 = paddle.device.Stream() e = paddle.device.Event() - s.wait_event(e) + e.record(s1) + s2.wait_event(e) ''' self.stream_base.wait_event(event.event_base) @@ -746,6 +767,8 @@ class Stream(object): .. code-block:: python # required: custom_device import paddle + + paddle.set_device('custom_cpu') s1 = paddle.device.Stream() s2 = paddle.device.Stream() s1.wait_stream(s2) @@ -764,6 +787,8 @@ class Stream(object): .. code-block:: python # required: custom_device import paddle + + paddle.set_device('custom_cpu') s = paddle.device.Stream() e1 = s.record_event() @@ -784,6 +809,8 @@ class Stream(object): .. code-block:: python # required: custom_device import paddle + + paddle.set_device('custom_cpu') s = paddle.device.Stream() s.query() ''' @@ -798,6 +825,8 @@ class Stream(object): .. code-block:: python # required: custom_device import paddle + + paddle.set_device('custom_cpu') s = paddle.device.Stream() s.synchronize() ''' @@ -837,8 +866,10 @@ def current_stream(device=None): .. code-block:: python # required: custom_device import paddle + + paddle.set_device('custom_cpu') s1 = paddle.device.current_stream() - s2 = paddle.device.current_stream("gpu:0") + s2 = paddle.device.current_stream("custom_cpu:0") place = paddle.CustomPlace('custom_cpu', 0) s3 = paddle.device.current_stream(place) ''' @@ -878,6 +909,8 @@ def set_stream(stream): .. code-block:: python # required: custom_device import paddle + + paddle.set_device('custom_cpu') s = paddle.device.Stream() paddle.device.set_stream(s) ''' @@ -917,6 +950,8 @@ class stream_guard(object): .. code-block:: python # required: custom_device import paddle + + paddle.set_device('custom_cpu') s = paddle.device.Stream() data1 = paddle.ones(shape=[20]) data2 = paddle.ones(shape=[20]) @@ -967,8 +1002,10 @@ def synchronize(device=None): .. 
code-block:: python # required: custom_device import paddle + + paddle.set_device('custom_cpu') paddle.device.synchronize() - paddle.device.synchronize("gpu:0") + paddle.device.synchronize("custom_cpu:0") place = paddle.CustomPlace('custom_cpu', 0) paddle.device.synchronize(place) ''' diff --git a/python/paddle/distributed/auto_parallel/completion.py b/python/paddle/distributed/auto_parallel/completion.py index 8960c47c1f5bf23e1bbe3d601e2a839350a7b416..8979239df5f11c7344e1127fba5e3c53fa709830 100644 --- a/python/paddle/distributed/auto_parallel/completion.py +++ b/python/paddle/distributed/auto_parallel/completion.py @@ -1850,11 +1850,11 @@ class Completer: op_dist_attr.set_output_dims_mapping( input_var.name, ref_dims_mapping ) - - input_var_attr.process_mesh = ref_process_mesh - self._dist_context.set_tensor_dist_attr_for_program( - input_var, input_var_attr - ) + if "SkipUpdate" not in input_name: + input_var_attr.process_mesh = ref_process_mesh + self._dist_context.set_tensor_dist_attr_for_program( + input_var, input_var_attr + ) self._dist_context.set_op_dist_attr_for_program( op, op_dist_attr diff --git a/python/paddle/distributed/auto_parallel/tuner/rule_based_tuner.py b/python/paddle/distributed/auto_parallel/tuner/rule_based_tuner.py index f1508f793a92e66d36c1aa92f3836c18d2303ed3..cdfc87868c546189c28cdedbb2ae2661b2839b49 100644 --- a/python/paddle/distributed/auto_parallel/tuner/rule_based_tuner.py +++ b/python/paddle/distributed/auto_parallel/tuner/rule_based_tuner.py @@ -516,6 +516,11 @@ class ClusterPartitionUtil: @staticmethod def complete_meshes(partitions: list, num: int): + if num == 2: + return [[1, 2], [2, 1]] + if num == 3: + return [[1, 2], [2, 1], [1]] + # special cases if len(partitions) == 1: partitions = ClusterPartitionUtil.factorization(num - 1) partitions.append([1]) diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index efa94862b5246876f682f93e3ea1b417123c677c..fbe391b45f0559bf477d6e688c55e5accc6b807c 100755 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -104,7 +104,6 @@ class DistributedJobInfo: self.job_info.strategy = dist_strategy -ReduceStrategyFluid = paddle.static.BuildStrategy.ReduceStrategy ReduceStrategyFleet = int @@ -261,7 +260,7 @@ class DistributedStrategy: for f in fields: value = getattr(self.strategy.build_strategy, f.name) if f.name == 'reduce_strategy': - value = ReduceStrategyFluid(value) + value = paddle.static.BuildStrategy.ReduceStrategy(value) setattr(build_strategy, f.name, value) return build_strategy diff --git a/python/paddle/distributed/fleet/layers/mpu/random.py b/python/paddle/distributed/fleet/layers/mpu/random.py index 7b89330d951c87dcce7276a0fa0f6e672d64a38d..718c85e855734e7814191b1db3c198cc46058a45 100644 --- a/python/paddle/distributed/fleet/layers/mpu/random.py +++ b/python/paddle/distributed/fleet/layers/mpu/random.py @@ -18,11 +18,11 @@ import numpy as np import paddle from paddle import _legacy_C_ops +from paddle.common_ops_import import Variable from paddle.fluid import core from paddle.fluid.data_feeder import check_variable_and_dtype from paddle.fluid.framework import in_dygraph_mode from paddle.framework import LayerHelper -from paddle.static import Variable __all__ = [] diff --git a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py index 
9dce0d540a16f5aebcc00c730c87fa9fc0ccf28a..98d131822fe3639f48acbff53907f937a49d8605 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py @@ -171,7 +171,7 @@ class DGCMomentumOptimizer(Optimizer): if is_new_var: helper.set_variable_initializer( counter, - initializer=paddle.fluid.initializer.Constant( + initializer=paddle.nn.initializer.ConstantInitializer( value=float(begin - 1), force_cpu=True ), ) @@ -194,7 +194,7 @@ class DGCMomentumOptimizer(Optimizer): if is_new_var: helper.set_variable_initializer( counter, - initializer=paddle.fluid.initializer.Constant( + initializer=paddle.nn.initializer.ConstantInitializer( value=float(value), force_cpu=True ), ) diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py index c12843f106562c7167978cbccdda8101d198a61f..4924d523ded05a45e8e9e25e980b7606ff45a048 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py @@ -41,9 +41,16 @@ class HybridParallelGradScaler: optimize_ops, params_grads = (None, None) - optimizer._set_auxiliary_var('found_inf', self._found_inf) - optimize_ops, params_grads = optimizer.minimize(*args, **kwargs) - self._cache_founf_inf = optimizer._get_auxiliary_var('found_inf') + if hasattr(optimizer, "_set_auxiliary_var"): + optimizer._set_auxiliary_var('found_inf', self._found_inf) + optimize_ops, params_grads = optimizer.minimize(*args, **kwargs) + self._cache_founf_inf = optimizer._get_auxiliary_var('found_inf') + else: + if self._found_inf: + self._cache_founf_inf = True + else: + optimize_ops, params_grads = optimizer.minimize(*args, **kwargs) + self._cache_founf_inf = False if self._use_dynamic_loss_scaling: self._update() diff --git a/python/paddle/distributed/fleet/metrics/metric.py b/python/paddle/distributed/fleet/metrics/metric.py index 999ab6f0af126f489a57aba806ec00763d7e062a..d2f72b0c7d0479a62171208668a2851fb091ceba 100644 --- a/python/paddle/distributed/fleet/metrics/metric.py +++ b/python/paddle/distributed/fleet/metrics/metric.py @@ -18,7 +18,7 @@ import math import numpy as np import paddle -from paddle.static import Variable +from paddle.common_ops_import import Variable __all__ = [] diff --git a/python/paddle/distributed/fleet/recompute/recompute.py b/python/paddle/distributed/fleet/recompute/recompute.py index 70a38ff3b9aa663f8b6b21c85ba7e1677310f109..54c486a4d2be50246db55d204603574269a37bf7 100755 --- a/python/paddle/distributed/fleet/recompute/recompute.py +++ b/python/paddle/distributed/fleet/recompute/recompute.py @@ -453,13 +453,12 @@ def recompute(function, *args, **kwargs): def recompute_sequential(ctx, functions, *args, **kwargs): """ - recompute intermediate activations to save then memory for 'Sequential' models. + recompute intermediate activations to save the memory for 'Sequential' models. use 'ctx' to transmit some context params, it is similar to 'recompute_hybrid' API. Parameters: ctx(dict): include 'segments' and 'preserve_rng_state' keys, the key 'segments' (int, default 1), represents the number of chunks to create in the model, the key 'preserve_rng_state' (bool, optional, default=True) indicate whether to save the forward rng. 
If it is True, then the last forward rng value will be - restored when the forward recalculation of backpropagation is performed. and some keys such as 'mp_group', 'offload' and 'partition' are invalid here, - they are useful in 'recompute_hybrid' API. + restored when the forward recalculation of backpropagation is performed. functions(paddle.nn.Sequential): layer of sequence of layers that describes part of forward pass of the model whose intermediate activations will be released to save memory in forward stage and will be recomputed in backward stage for gradient calculation. diff --git a/python/paddle/distributed/fleet/recompute/recompute_hybrid.py b/python/paddle/distributed/fleet/recompute/recompute_hybrid.py index 781f44e4061affd78b0b6e7fe8005a66bcca6a9d..44faccf9dd42e751377692d9a315570ab80305b9 100644 --- a/python/paddle/distributed/fleet/recompute/recompute_hybrid.py +++ b/python/paddle/distributed/fleet/recompute/recompute_hybrid.py @@ -246,6 +246,7 @@ class _HPRecomputeFunction(PyLayer): def recompute_hybrid(ctx, function, *args, **kwargs): """ + recompute intermediate activations to save the memory in hybrid parallel scene. # NODTE(shenliang03)The current hybrid parallel recompute has limitations. # It cannot handle the following situations: # 1. The calculation output of recompute, there are tensors that do not require gradients. @@ -255,8 +256,7 @@ def recompute_hybrid(ctx, function, *args, **kwargs): Parameters: ctx(dict): include 'mp_group', 'offload', and 'partition' keys. the key 'mp_group' (Group), represents the avtivations are splitted in which group. the key 'offload' (bool, optional, default=False), represents whether to offload to cpu. the key 'partition' (bool, optional, default=False), - represents whether to split activations in the mp_group. and some keys such as 'segments' and 'preserve_rng_state' are invalid here, they are useful in - 'recompute_sequential' API. + represents whether to split activations in the mp_group. function(paddle.nn.Layer): layer of sequence of layers that describes part of forward pass of the model whose intermediate activations will be released to save memory in forward stage and will be recomputed in backward stage for gradient calculation. diff --git a/python/paddle/distributed/fleet/utils/__init__.py b/python/paddle/distributed/fleet/utils/__init__.py index a9d2a0454257a200497054ca4d3869a5bc5f41c0..c13908ba62d2e920f0614d9d537f5455357c5ce8 100644 --- a/python/paddle/distributed/fleet/utils/__init__.py +++ b/python/paddle/distributed/fleet/utils/__init__.py @@ -27,18 +27,20 @@ __all__ = ["LocalFS", "recompute", "DistributedInfer", "HDFSClient"] # noqa def recompute(function, *args, **kwargs): """ - recompute intermediate activations to save then memory. + recompute intermediate activations to save the memory. Parameters: function(paddle.nn.Layer): layer of sequence of layers that describes part of forward pass of the model whose intermediate activations will be released to save memory in forward stage and will be recomputed in backward stage for gradient calculation. *args(Tensor): inputs to the function. - **kwargs(Dict): Kwargs should only contain the key-value pair of preserve_rng_state, which is used to - indicate whether to save the forward rng. If it is True, then the last forward rng value will be - restored when the forward recalculation of backpropagation is performed. The default - preserve_rng_state is True. 
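The recompute docstrings updated around here document two keyword arguments, ``preserve_rng_state`` and ``use_reentrant``. A hedged usage sketch of the API as documented; the toy layer and shapes are invented for illustration:

    import paddle
    from paddle.distributed.fleet.utils import recompute

    class Block(paddle.nn.Layer):
        def __init__(self):
            super().__init__()
            self.fc1 = paddle.nn.Linear(64, 64)
            self.fc2 = paddle.nn.Linear(64, 64)

        def forward(self, x):
            return self.fc2(paddle.nn.functional.relu(self.fc1(x)))

    block = Block()
    x = paddle.randn([8, 64])
    x.stop_gradient = False

    # Activations inside `block` are dropped after the forward pass and
    # recomputed during backward. use_reentrant=True selects the PyLayer
    # implementation; use_reentrant=False would select the hook-based one.
    y = recompute(block, x, preserve_rng_state=True, use_reentrant=True)
    y.sum().backward()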
- + **kwargs(Dict): Kwargs should only contain two kinds of key-value params: one kind is part of the function's own key-value params, + and the other contains ``preserve_rng_state`` and ``use_reentrant``. The key-value pair of ``preserve_rng_state`` + is used to indicate whether to save the forward rng. If it is True, then the last forward rng value + will be restored when the forward recalculation of backpropagation is performed; its default value is True. + The key-value pair of ``use_reentrant`` is used to indicate which implementation of recompute will be used. + ``use_reentrant=True`` means to use the PyLayer implementation of recompute, ``use_reentrant=False`` means to + use the Hook implementation of recompute; its default value is True. Returns: Output of function on args. diff --git a/python/paddle/distributed/passes/ps_trainer_pass.py b/python/paddle/distributed/passes/ps_trainer_pass.py index 2a60b0df5f5eb2bf1c6a81636a3693036e1bb6e6..f25ede7f05ee57b634eae505eb9c71d98f39dbaa 100755 --- a/python/paddle/distributed/passes/ps_trainer_pass.py +++ b/python/paddle/distributed/passes/ps_trainer_pass.py @@ -464,7 +464,7 @@ class DistributedOpsPass(PassBase): "is_sparse": True, }, ) - PSGPU = paddle.fluid.core.PSGPU() + PSGPU = core.PSGPU() try: gpu_slot = [int(var.name) for var in gpups_inputs] except (ValueError): @@ -1052,7 +1052,7 @@ class SplitHeterWorkerOpsPass(PassBase): block_vars_detail = find_block_joints( program, program_block_ops, heter_ops ) - heter_program = framework.Program() + heter_program = paddle.framework.Program() self._create_heter_program( program, attrs, @@ -1628,13 +1628,13 @@ class SplitFlOpsPass(PassBase): debug_program(_main_file, prog_b) if not self.is_part_b: - self.partA_program = framework.Program() + self.partA_program = paddle.framework.Program() self._get_partA_program(prog_a.global_block()) pass_ctx._attrs['part_a_main_program'] = self.partA_program self._clear_op_device_flag(self.partA_program) check_program(self.partA_program) else: - self.partB_program = framework.Program() + self.partB_program = paddle.framework.Program() self._get_partB_program(prog_b.global_block()) pass_ctx._attrs['part_b_main_program'] = self.partB_program self._clear_op_device_flag(self.partB_program) diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 5169f9f085fe24d7611fd5181bc00366782f72df..1ba11e1fba4c3fecfa9d9e20594bf039ffe50d3a 100755 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -1337,6 +1337,18 @@ def _append_backward_ops_( rename_var_map = {} assert isinstance(rename_var_map, dict) + if core._is_bwd_prim_enabled(): + composite_block = program.clone().current_block() + # Infer shape for operators whose outputs haven't been created. + for op in composite_block.ops: + if not all( + tuple( + composite_block._find_var_recursive(arg) + for arg in op.output_arg_names + ) + ): + infershape_for_composite(composite_block, op.desc) + # add grad_op_desc by reversed ops for op in reversed(ops): grad_sub_block_list = [] @@ -1365,11 +1377,42 @@ def _append_backward_ops_( program._rollback() grad_sub_block_list.append(grad_sub_block.desc) + # In primitive mode, a raw phi GradOp will be split into multiple small + # primitive operators, and the split rules are defined at the c++ level, + # see details: paddle/fluid/prim/api/manual/backward/composite_backward_api.h + # It means that the output's shape and dtype of previous operators which + # may be used as the input of the next operators must be known.
Therefore, + # we infer shape and dtype in a sandbox block (named composite_block) for + # use at the c++ level. + # For example: + # forward: + # z = multiply(x, y) // may broadcast in the kernel + # backward: + # x_grad_unreduce = z_grad * y // may be unreduced + # reduced_axes = get_reduced_axes(x_grad.shape, x.shape) // needs known shapes + # x_grad = reduce_sum(x_grad_unreduce) + grad_op_desc = [] + op_grad_to_var = {} + if core._is_bwd_prim_enabled(): + + def find_op_index(block_desc, cur_op_desc): + for idx in range(block_desc.op_size()): + if cur_op_desc == block_desc.op(idx): + return idx + return -1 - # Getting op's corresponding grad_op - grad_op_desc, op_grad_to_var = core.get_grad_op_desc( - op.desc, no_grad_dict[block.idx], grad_sub_block_list - ) + grad_op_desc, op_grad_to_var = core.get_grad_op_desc( + composite_block.desc.op(find_op_index(block.desc, op.desc)), + no_grad_dict[composite_block.idx], + grad_sub_block_list, + ) + for desc in grad_op_desc: + infershape_for_composite(composite_block, desc) + else: + # Getting op's corresponding grad_op + grad_op_desc, op_grad_to_var = core.get_grad_op_desc( + op.desc, no_grad_dict[block.idx], grad_sub_block_list + ) # record the mapping between fwd and bwd if grad_op_id_to_fwd_op is not None: @@ -1655,7 +1698,43 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map): block.desc._remove_op(op_idx, op_idx + 1) -def _rename_grad_(block, start_op_idx, grad_to_var, target_grad_map): +def infershape_for_composite(block, grad_op_desc): + # prune empty outputs + if len(grad_op_desc.output_arg_names()) == 0: + return + + # append the op to the block + op_desc = block.desc.append_op() + op_desc.copy_from(grad_op_desc) + op_desc._set_attr( + core.op_proto_and_checker_maker.kOpRoleAttrName(), + core.op_proto_and_checker_maker.OpRole.Backward, + ) + + # create output vars + new_vars = set() + # create new gradient variables + for grad_var_name in op_desc.output_arg_names(): + if not ( + block.desc.has_var_recursive(grad_var_name.encode()) + or grad_var_name == core.empty_var_name() + ): + block.desc.var(grad_var_name.encode()) + new_vars.add(grad_var_name) + + # infer shape and dtype + op_desc.check_attrs() + op_desc.infer_var_type(block.desc) + op_desc.infer_shape(block.desc) + + for arg in op_desc.output_arg_names(): + if arg in new_vars: + _infer_var_data_type_shape_(arg, block) + + +def _rename_grad_( + block, start_op_idx, grad_to_var, target_grad_map, skip_rename_var_list +): var_map = copy.copy(target_grad_map) for op_idx in range(start_op_idx, block.desc.op_size()): op_desc = block.desc.op(op_idx) @@ -1667,6 +1746,8 @@ def _rename_grad_(block, start_op_idx, grad_to_var, target_grad_map): if "@GRAD" not in name: continue if block.desc.find_var(name.encode("ascii")): + if name in skip_rename_var_list: + continue new_name = unique_name.generate(name) op_desc._rename_output(name, new_name) var_map[name] = new_name @@ -1993,7 +2074,7 @@ def append_backward( # Because append_backward may be called multiple times, # we need rename the internal gradient variables so that they have # different names.
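The comment added above sketches why shapes must be known when a grad op is split into primitives: the unreduced gradient of a broadcast input has to be summed back to the input's shape. A small NumPy sketch of that situation, with shapes invented for illustration:

    import numpy as np

    x = np.random.rand(1, 3)            # broadcast against y
    y = np.random.rand(4, 3)
    z_grad = np.ones((4, 3))            # incoming gradient of z = x * y

    x_grad_unreduce = z_grad * y        # still shaped like z: (4, 3)
    reduced_axes = tuple(
        i for i, (dx, dz) in enumerate(zip(x.shape, x_grad_unreduce.shape))
        if dx == 1 and dz > 1           # axes that were broadcast
    )
    x_grad = x_grad_unreduce.sum(axis=reduced_axes, keepdims=True)
    assert x_grad.shape == x.shape      # reduced back to (1, 3)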
- _rename_grad_(target_grad_block, fwd_op_num, grad_to_var, {}) + _rename_grad_(target_grad_block, fwd_op_num, grad_to_var, {}, []) _append_backward_vars_( target_grad_block, fwd_op_num, grad_to_var, grad_info_map @@ -2297,33 +2378,24 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None): target_grad_map = {} rename_var_map = {} + skip_rename_var_list = [] for i, grad in enumerate(target_gradients): target = targets[i] grad_name = _append_grad_suffix_(target.name) if grad is None: - target_shape = target.name + '_shape' - block.desc.append_op().copy_from( - _create_op_desc_( - "shape", - {'Input': [target.name]}, - {"Out": [target_shape]}, - {}, - ) - ) - input_grad_names_set.add(target_shape) op_desc = _create_op_desc_( - "fill_constant", - {"ShapeTensor": [target_shape]}, + "fill_any_like", + {"X": [target.name]}, {"Out": [grad_name]}, { - "shape": target.shape, "value": 1.0, "dtype": target.dtype, }, ) - block.desc.append_op().copy_from(op_desc) + block.program._sync_with_cpp() input_grad_names_set.add(grad_name) + skip_rename_var_list.append(grad_name) else: if target.block.idx != block_idx or target.block.program != prog: raise ValueError("all targets must be in the same block") @@ -2336,6 +2408,9 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None): input_grad_names_set.add(grad.name) rename_var_map[grad_name] = grad.name + if core._is_bwd_prim_enabled(): + core._set_prim_target_grad_name(target_grad_map) + # For double backward, input_grad_names is used for filter # some non-used gradients op. rename_var_map is used to # associate target_grad var name with first grad_op input name. @@ -2378,7 +2453,9 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None): # Because calc_gradient may be called multiple times, # we need rename the internal gradient variables so that they have # different names. - _rename_grad_(block, fwd_op_num, grad_to_var, target_grad_map) + _rename_grad_( + block, fwd_op_num, grad_to_var, target_grad_map, skip_rename_var_list + ) _append_backward_vars_(block, fwd_op_num, grad_to_var, grad_info_map) prog._sync_with_cpp() diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py index e8393c63b1053601957c7d53f1b483afcc33f656..609bfa3d93e53dc31ef576c9d13f0dc191e26d23 100644 --- a/python/paddle/fluid/compiler.py +++ b/python/paddle/fluid/compiler.py @@ -586,7 +586,6 @@ class IpuDynamicPatcher: """ from ..fluid.dygraph.base import switch_to_static_graph from ..fluid import backward - from ..fluid.initializer import Constant from ..fluid.framework import device_guard import paddle @@ -645,7 +644,10 @@ class IpuDynamicPatcher: device = optimizer._get_device_for_param(param_name) with device_guard(device): optimizer.helper.set_variable_initializer( - var, initializer=Constant(value=0.0) + var, + initializer=paddle.nn.initializer.Constant( + value=0.0 + ), ) param_or_lr_tensor = scope.find_var( var_tmp.name diff --git a/python/paddle/fluid/contrib/layers/metric_op.py b/python/paddle/fluid/contrib/layers/metric_op.py index 414fcf5b6cd51c757edaf7213faf0c38008ca107..07d6b464ddb114e15773943eab0610a5101fb025 100755 --- a/python/paddle/fluid/contrib/layers/metric_op.py +++ b/python/paddle/fluid/contrib/layers/metric_op.py @@ -17,7 +17,6 @@ Contrib layers just related to metric. 
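The calc_gradient change above seeds the default target gradient with a single fill_any_like op instead of a shape op followed by fill_constant. In user-level terms the difference is roughly the following sketch, with an invented target tensor; it is not the patch itself:

    import paddle

    target = paddle.randn([4, 8], dtype='float32')

    # Old style: materialize the shape first, then fill a tensor from it.
    grad_old = paddle.full(paddle.shape(target), 1.0, dtype=target.dtype)

    # New style: one op that copies both shape and dtype from its input.
    grad_new = paddle.full_like(target, 1.0)

    assert grad_old.shape == grad_new.shape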
import warnings from paddle.fluid.layer_helper import LayerHelper -from paddle.fluid.initializer import Normal, Constant from paddle.fluid.framework import Variable from paddle.fluid.param_attr import ParamAttr from paddle.fluid.layers import tensor @@ -147,7 +146,10 @@ def ctr_metric_bundle(input, label, ins_tag_weight=None): local_ins_num, ]: helper.set_variable_initializer( - var, Constant(value=0.0, force_cpu=True) + var, + paddle.nn.initializer.ConstantInitializer( + value=0.0, force_cpu=True + ), ) helper.append_op( diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py index b836dfa451c33b57c873f5fcc019b40985d005d4..9064e4f9f09dd62f044d03daef7664a14e0fac7e 100644 --- a/python/paddle/fluid/contrib/layers/nn.py +++ b/python/paddle/fluid/contrib/layers/nn.py @@ -24,7 +24,6 @@ import paddle from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layers import utils from ... import unique_name -from paddle.fluid.initializer import Normal, Constant, NumpyArrayInitializer from paddle.fluid.data_feeder import ( check_variable_and_dtype, check_type, @@ -793,6 +792,9 @@ def sparse_embedding( 'paddle.static.nn.sparse_embedding', ) + if input.size == 0: + raise ValueError("input size should not be 0") + w = helper.create_parameter( attr=helper.param_attr, shape=size, @@ -893,8 +895,10 @@ def tdm_child(x, node_nums, child_nums, param_attr=None, dtype='int32'): Examples: .. code-block:: python + import paddle import paddle.fluid as fluid import numpy as np + paddle.enable_static() x = fluid.data(name="x", shape=[None, 1], dtype="int32", lod_level=1) tree_info = [[0,0,0,1,2], [0,1,0,3,4],[0,1,0,5,6], @@ -905,7 +909,7 @@ def tdm_child(x, node_nums, child_nums, param_attr=None, dtype='int32'): child_nums = 2 child, leaf_mask = fluid.contrib.layers.tdm_child(x, node_nums, child_nums, param_attr=fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer( + initializer=paddle.nn.initializer.Assign( tree_info_np))) place = fluid.CPUPlace() exe = fluid.Executor(place) @@ -922,7 +926,7 @@ def tdm_child(x, node_nums, child_nums, param_attr=None, dtype='int32'): attr=helper.param_attr, shape=[node_nums, 3 + child_nums], dtype=dtype, - default_initializer=Constant(0), + default_initializer=paddle.nn.initializer.Constant(0), ) tree_info.stop_gradient = True @@ -1000,8 +1004,10 @@ def tdm_sampler( Examples: .. 
code-block:: python + import paddle import paddle.fluid as fluid import numpy as np + paddle.enable_static() x = fluid.data(name="x", shape=[None, 1], dtype="int32", lod_level=1) travel_list = [[1, 3], [1, 4], [2, 5], [2, 6]] # leaf node's travel path, shape(leaf_node_num, layer_num) layer_list_flat = [[1], [2], [3], [4], [5], [6]] # shape(node_nums, 1) @@ -1019,10 +1025,10 @@ def tdm_sampler( layer_node_num_list, leaf_node_num, tree_travel_attr=fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer( + initializer=paddle.nn.initializer.Assign( travel_array)), tree_layer_attr=fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer( + initializer=paddle.nn.initializer.Assign( layer_array)), output_positive=True, output_list=True, @@ -1086,7 +1092,7 @@ def tdm_sampler( attr=tree_travel_attr, shape=travel_shape, dtype=tree_dtype, - default_initializer=Constant(0), + default_initializer=paddle.nn.initializer.Constant(0), ) layer_shape = [node_nums, 1] @@ -1094,7 +1100,7 @@ def tdm_sampler( attr=tree_layer_attr, shape=layer_shape, dtype=tree_dtype, - default_initializer=Constant(0), + default_initializer=paddle.nn.initializer.Constant(0), ) out = helper.create_variable_for_type_inference(dtype=dtype) @@ -1637,7 +1643,7 @@ def fused_bn_add_act( attr=helper.param_attr, shape=param_shape, dtype=bn_param_dtype, - default_initializer=Constant(1.0), + default_initializer=paddle.nn.initializer.Constant(1.0), ) bias = helper.create_parameter( attr=helper.bias_attr, @@ -1647,7 +1653,9 @@ def fused_bn_add_act( ) mean = helper.create_parameter( attr=ParamAttr( - name=moving_mean_name, initializer=Constant(0.0), trainable=False + name=moving_mean_name, + initializer=paddle.nn.initializer.Constant(0.0), + trainable=False, ), shape=param_shape, dtype=bn_param_dtype, @@ -1656,7 +1664,7 @@ def fused_bn_add_act( variance = helper.create_parameter( attr=ParamAttr( name=moving_variance_name, - initializer=Constant(1.0), + initializer=paddle.nn.initializer.Constant(1.0), trainable=False, ), shape=param_shape, @@ -1720,13 +1728,16 @@ def pow2_decay_with_linear_warmup( helper = LayerHelper("pow2_decay_with_linear_warmup", **locals()) lr = helper.create_global_variable(persistable=True, dtype=dtype, shape=[1]) helper.set_variable_initializer( - lr, Constant(value=float(base_lr) / warmup_steps) + lr, + paddle.nn.initializer.Constant(value=float(base_lr) / warmup_steps), ) step = helper.create_global_variable( persistable=True, dtype='int64', shape=[1] ) - helper.set_variable_initializer(step, Constant(value=0)) + helper.set_variable_initializer( + step, paddle.nn.initializer.Constant(value=0) + ) assert ( warmup_steps <= total_steps ), "warmup_steps cannot be larger than total_steps" diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py index 9aaf0f684f1e73a0d46db109376c91a0edde0751..dcdd8847d842426f5d2ca04dc7e4d2a95b7876c1 100644 --- a/python/paddle/fluid/core.py +++ b/python/paddle/fluid/core.py @@ -306,6 +306,8 @@ try: from .libpaddle import _Profiler, _ProfilerResult, _RecordEvent from .libpaddle import _set_current_stream from .libpaddle import _get_phi_kernel_name + from .libpaddle import _add_skip_comp_ops + from .libpaddle import _remove_skip_comp_ops # prim controller flags from .libpaddle import __set_bwd_prim_enabled @@ -313,6 +315,7 @@ try: from .libpaddle import __set_fwd_prim_enabled from .libpaddle import _is_fwd_prim_enabled from .libpaddle import __set_all_prim_enabled + from .libpaddle import _set_prim_target_grad_name # custom devivce from .libpaddle 
import _get_current_custom_device_stream @@ -408,7 +411,7 @@ def __sync_stat_with_flag(flag): __set_fwd_prim_enabled(True) else: raise TypeError(f"flag {flag} should be true or false.") - logging.debug("forward prim enabled: ", bool(_is_fwd_prim_enabled())) + print("forward prim enabled: ", bool(_is_fwd_prim_enabled())) elif flag is "FLAGS_prim_backward": flag_value = os.getenv("FLAGS_prim_backward") assert flag_value is not None @@ -419,7 +422,7 @@ def __sync_stat_with_flag(flag): __set_bwd_prim_enabled(True) else: raise TypeError(f"flag {flag} should be true or false.") - logging.debug("backward prim enabled: ", bool(_is_bwd_prim_enabled())) + print("backward prim enabled: ", bool(_is_bwd_prim_enabled())) elif flag is "FLAGS_prim_all": flag_value = os.getenv("FLAGS_prim_all") assert flag_value is not None @@ -430,7 +433,7 @@ def __sync_stat_with_flag(flag): __set_all_prim_enabled(True) else: raise TypeError(f"flag {flag} should be true or false.") - logging.debug( + print( "all prim enabled: ", bool(_is_fwd_prim_enabled() and _is_bwd_prim_enabled()), ) @@ -440,19 +443,24 @@ def __sync_stat_with_flag(flag): ) +# Alert!!! This method is only for test coveraget, user should never use it directly, this may cause serious system errors. +def _test_use_sync(value): + __sync_stat_with_flag(value) + + def _set_prim_backward_enabled(value): __set_bwd_prim_enabled(bool(value)) - logging.debug("backward prim enabled: ", bool(_is_bwd_prim_enabled())) + print("backward prim enabled: ", bool(_is_bwd_prim_enabled())) def _set_prim_forward_enabled(value): __set_fwd_prim_enabled(bool(value)) - logging.debug("forward prim enabled: ", bool(_is_fwd_prim_enabled())) + print("forward prim enabled: ", bool(_is_fwd_prim_enabled())) def _set_prim_all_enabled(value): __set_all_prim_enabled(bool(value)) - logging.debug( + print( "all prim enabled: ", bool(_is_fwd_prim_enabled() and _is_bwd_prim_enabled()), ) @@ -461,7 +469,7 @@ def _set_prim_all_enabled(value): def __sync_prim_backward_status(): flag_value = os.getenv("FLAGS_prim_backward") if flag_value is None: - logging.debug("backward prim enabled: ", bool(_is_bwd_prim_enabled())) + print("backward prim enabled: ", bool(_is_bwd_prim_enabled())) else: __sync_stat_with_flag("FLAGS_prim_backward") @@ -469,7 +477,7 @@ def __sync_prim_backward_status(): def __sync_prim_forward_status(): flag_value = os.getenv("FLAGS_prim_forward") if flag_value is None: - logging.debug("forward prim enabled: ", bool(_is_fwd_prim_enabled())) + print("forward prim enabled: ", bool(_is_fwd_prim_enabled())) else: __sync_stat_with_flag("FLAGS_prim_forward") diff --git a/python/paddle/fluid/evaluator.py b/python/paddle/fluid/evaluator.py index 472bcbd3cac4be617f1c6b187bb411d1977b44eb..a4d80ecbfed25f8e4fcbc24f76b87a6a4ebe0b2d 100644 --- a/python/paddle/fluid/evaluator.py +++ b/python/paddle/fluid/evaluator.py @@ -20,7 +20,6 @@ from . import layers from .framework import Program, Variable, program_guard from . import unique_name from .layer_helper import LayerHelper -from .initializer import Constant def _clone_var_(block, var): diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index da9d12802434f39326a7f276f320caecb9a05c86..6e094588e686a54d6b6ba6d4137035c2d2cb8b91 100755 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -2534,8 +2534,9 @@ class Executor: place = core.Place() place.set_place(self.place) - # NOTE: the last argument is used to force create some vars in root scope, - # won't be used during train. 
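The core.py changes above expose switches for the composite (prim) operators and sync them with the FLAGS_prim_* environment variables. A hedged sketch of toggling them from Python, using only the helpers visible in this diff; these are internal APIs and may change:

    from paddle.fluid import core

    # Enable splitting backward ops into primitive ops (see backward.py above).
    core._set_prim_backward_enabled(True)
    print("backward prim enabled:", core._is_bwd_prim_enabled())

    # The same switch can be driven by the environment, mirroring
    # __sync_stat_with_flag, e.g.:  FLAGS_prim_backward=true python train.py
    core._set_prim_backward_enabled(False)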
+ inference_root_scope_vars = ( + fleet_opt["fetch_var"] if "fetch_var" in fleet_opt else [] + ) self._fleet_executor.init( carrier_id, program.desc, @@ -2544,7 +2545,7 @@ class Executor: num_micro_batches, tasks, task_id_to_rank, - [], + inference_root_scope_vars, micro_scope_list, ) diff --git a/python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py b/python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py index 23f5a44fe139e743d13352aea00029c355c50853..9fc9182017ec44a1a56b9b4df4a84a679dafd757 100644 --- a/python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py +++ b/python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py @@ -109,7 +109,7 @@ def model(): size=[dnn_input_dim, dnn_layer_dims[0]], param_attr=fluid.ParamAttr( name="deep_embedding", - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), ), is_sparse=True, ) @@ -121,7 +121,7 @@ def model(): size=dim, activation="relu", weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), name='dnn-fc-%d' % i, ) @@ -134,7 +134,7 @@ def model(): size=[lr_input_dim, 1], param_attr=fluid.ParamAttr( name="wide_embedding", - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), ), is_sparse=True, ) diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 38650856b07201c712b8c8969ce3165b25faaf03..6eb88d8f8ef3dae50ad6ec25fcc77d55210b65a1 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -30,1139 +30,24 @@ from .data_feeder import check_variable_and_dtype, check_type, check_dtype from paddle import _C_ops, _legacy_C_ops import paddle -__all__ = [ - 'Constant', - 'Uniform', - 'Normal', - 'TruncatedNormal', - 'Xavier', - 'Bilinear', - 'MSRA', - 'ConstantInitializer', - 'UniformInitializer', - 'NormalInitializer', - 'TruncatedNormalInitializer', - 'XavierInitializer', - 'BilinearInitializer', - 'MSRAInitializer', - 'NumpyArrayInitializer', - 'set_global_initializer', -] +__all__ = ['set_global_initializer'] _global_weight_initializer_ = None _global_bias_initializer_ = None -class Initializer: - """Base class for variable initializers - - Defines the common interface of variable initializers. - They add operations to the init program that are used - to initialize variables. Users should not use this class - directly, but need to use one of its implementations. 
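# A rough migration map, sketched here for reference (not part of this patch): with
# __all__ reduced to set_global_initializer above, the removed fluid aliases
# correspond to the maintained paddle.nn.initializer classes.
#   fluid.initializer.Constant              -> paddle.nn.initializer.Constant
#   fluid.initializer.Uniform               -> paddle.nn.initializer.Uniform
#   fluid.initializer.Normal                -> paddle.nn.initializer.Normal
#   fluid.initializer.TruncatedNormal       -> paddle.nn.initializer.TruncatedNormal
#   fluid.initializer.Xavier                -> paddle.nn.initializer.XavierUniform / XavierNormal
#   fluid.initializer.MSRA                  -> paddle.nn.initializer.KaimingUniform / KaimingNormal
#   fluid.initializer.Bilinear              -> paddle.nn.initializer.Bilinear
#   fluid.initializer.NumpyArrayInitializer -> paddle.nn.initializer.Assign
import paddle
attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(value=0.01))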
- """ - - def __init__(self): - pass - - def __call__(self, param, block=None): - if not lazy_init_helper().state: - return self.forward(param, block) - - return self._lazy_init(param, block) - - def forward(self, param, block=None): - """Add corresponding initialization operations to the network""" - raise NotImplementedError() - - def _lazy_init(self, param, block=None): - """ - Apply lazy initialization - """ - assert in_dygraph_mode() - - def init_op_creator(forward, param, block): - new_var = param._to_static_var(True, block=block) - # Record initializer operator - with lazy_init_helper(): - forward(new_var, block) - - # Add hook function for initializing param in dygraph mode - param.set_init_func(functools.partial(self.forward, param, block)) - param._init_op_creator = functools.partial( - init_op_creator, self.forward, param - ) - - return param - - def _check_block(self, block): - if block is None: - block = default_main_program().global_block() - - return block - - def _compute_fans(self, var): - """Compute the fan_in and the fan_out for layers - - This method computes the fan_in and the fan_out - for neural network layers, if not specified. It is - not possible to perfectly estimate fan_in and fan_out. - This method will estimate it correctly for matrix multiply and - convolutions. - - Args: - var: variable for which fan_in and fan_out have to be computed - - Returns: - tuple of two integers (fan_in, fan_out) - """ - shape = var.shape - if not shape or len(shape) == 0: - fan_in = fan_out = 1 - elif len(shape) == 1: - fan_in = fan_out = shape[0] - elif len(shape) == 2: - # This is the case for simple matrix multiply - fan_in = shape[0] - fan_out = shape[1] - else: - # Assume this to be a convolutional kernel - # In PaddlePaddle, the shape of the kernel is like: - # [num_filters, num_filter_channels, ...] where the remaining - # dimensions are the filter_size - receptive_field_size = np.prod(shape[2:]) - fan_in = shape[1] * receptive_field_size - fan_out = shape[0] * receptive_field_size - - return (fan_in, fan_out) - - -class ConstantInitializer(Initializer): - """Implements the constant initializer - - Args: - value (float32): constant value to initialize the variable - - Examples: - .. code-block:: python - - import paddle - import paddle.fluid as fluid - paddle.enable_static() - x = fluid.data(name="data", shape=[8, 32, 32], dtype="float32") - fc = paddle.static.nn.fc( - x, - size=10, - weight_attr=fluid.initializer.Constant(value=2.0)) - - """ - - def __init__(self, value=0.0, force_cpu=False): - assert value is not None - super().__init__() - self._value = value - self._force_cpu = force_cpu - - def forward(self, var, block=None): - """Initialize the input tensor with constant. - - Args: - var(Tensor): Tensor that needs to be initialized. - block(Block, optional): The block in which initialization ops - should be added. Used in static graph only, default None. 
- - Returns: - The initialization op - """ - block = self._check_block(block) - - assert isinstance(var, framework.Variable) or isinstance( - var, framework.EagerParamBase - ) - assert isinstance(block, framework.Block) - - if in_dygraph_mode(): - place = _current_expected_place() - if self._force_cpu: - place = core.CPUPlace() - _C_ops.full_( - var, var.shape, str(float(self._value)), var.dtype, place - ) - return None - else: - op = block.append_op( - type="fill_constant", - outputs={"Out": var}, - attrs={ - "shape": var.shape, - "dtype": int(var.dtype), - "value": float(self._value), - 'str_value': str(float(self._value)), - 'force_cpu': self._force_cpu, - }, - stop_gradient=True, - ) - - var.op = op - return op - - -class UniformInitializer(Initializer): - """Implements the random uniform distribution initializer - - Args: - low (float): lower boundary of the uniform distribution - high (float): upper boundary of the uniform distribution - seed (int): random seed - diag_num (int): the number of diagonal elements to initialize. - If set to 0, diagonal initialization will be not performed. - diag_step (int): Step size between two diagonal elements, - which is generally the width of the square matrix. - diag_val (float): the value of the diagonal element to be initialized, - default 1.0. It takes effect only if the diag_num is greater than 0. - - Examples: - .. code-block:: python - - import paddle - import paddle.fluid as fluid - paddle.enable_static() - x = fluid.data(name='x', shape=[None, 1], dtype='float32') - fc = paddle.static.nn.fc(x, size=10, - weight_attr=fluid.initializer.Uniform(low=-0.5, high=0.5)) - """ - - def __init__( - self, low=-1.0, high=1.0, seed=0, diag_num=0, diag_step=0, diag_val=1.0 - ): - assert low is not None - assert high is not None - assert high >= low - assert seed is not None - assert diag_num is not None - assert diag_step is not None - assert diag_val is not None - if diag_num > 0 or diag_step > 0: - assert diag_num > 0 and diag_step > 0 - super().__init__() - self._low = low - self._high = high - self._seed = seed - self._diag_num = diag_num - self._diag_step = diag_step - self._diag_val = diag_val - - def forward(self, var, block=None): - """Initialize the input tensor with Uniform distribution. - - Args: - var(Tensor): Tensor that needs to be initialized. - block(Block, optional): The block in which initialization ops - should be added. Used in static graph only, default None. 
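# A minimal sketch (not part of this patch): UniformInitializer, being removed in
# this hunk, corresponds to paddle.nn.initializer.Uniform, which keeps low/high but
# has no diag_* arguments.
import paddle
linear = paddle.nn.Linear(
    4, 10,
    weight_attr=paddle.ParamAttr(
        initializer=paddle.nn.initializer.Uniform(low=-0.5, high=0.5)))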
- - Returns: - The initialization op - """ - block = self._check_block(block) - - assert isinstance(block, framework.Block) - if not in_dygraph_mode(): - check_variable_and_dtype( - var, - "Out", - ["uint16", "float16", "float32", "float64"], - "uniform_random", - ) - - if self._seed == 0: - self._seed = block.program.random_seed - - # to be compatible of fp16 initializers - if var.dtype == VarDesc.VarType.FP16: - out_dtype = VarDesc.VarType.FP32 - out_var = block.create_var( - name=unique_name.generate( - ".".join(['uniform_random', var.name, 'tmp']) - ), - shape=var.shape, - dtype=out_dtype, - type=VarDesc.VarType.LOD_TENSOR, - persistable=False, - ) - else: - out_dtype = var.dtype - out_var = var - - if in_dygraph_mode(): - out_var = _C_ops.uniform( - var.shape, - out_dtype, - self._low, - self._high, - self._seed, - _current_expected_place(), - ) - if var.dtype == VarDesc.VarType.FP16: - var_tmp = _C_ops.cast(out_var, var.dtype) - var_tmp._share_underline_tensor_to(var) - else: - out_var._share_underline_tensor_to(var) - return None - else: - op = block.append_op( - type="uniform_random", - inputs={}, - outputs={"Out": out_var}, - attrs={ - "shape": var.shape, - "dtype": out_dtype, - "min": self._low, - "max": self._high, - "seed": self._seed, - "diag_num": self._diag_num, - "diag_step": self._diag_step, - "diag_val": self._diag_val, - }, - stop_gradient=True, - ) - - if var.dtype == VarDesc.VarType.FP16: - block.append_op( - type="cast", - inputs={"X": out_var}, - outputs={"Out": var}, - attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype}, - ) - - var.op = op - return op - - -class NormalInitializer(Initializer): - """Implements the Random Normal(Gaussian) distribution initializer - - Args: - loc (float): mean of the normal distribution - scale (float): standard deviation of the normal distribution - seed (int): random seed - - Examples: - .. code-block:: python - - import paddle - import paddle.fluid as fluid - paddle.enable_static() - x = fluid.data(name="data", shape=[None, 32, 32], dtype="float32") - fc = paddle.static.nn.fc(x, size=10, - weight_attr=fluid.initializer.Normal(loc=0.0, scale=2.0)) - - """ - - def __init__(self, loc=0.0, scale=1.0, seed=0): - assert loc is not None - assert scale is not None - assert seed is not None - super().__init__() - self._mean = loc - self._std_dev = scale - self._seed = seed - - def forward(self, var, block=None): - """Initialize the input tensor with Normal distribution. - - Args: - var(Tensor): Tensor that needs to be initialized. - block(Block, optional): The block in which initialization ops - should be added. Used in static graph only, default None. 
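# A minimal sketch (not part of this patch): NormalInitializer, being removed in
# this hunk, corresponds to paddle.nn.initializer.Normal, with mean/std in place of
# loc/scale.
import paddle
w_init = paddle.nn.initializer.Normal(mean=0.0, std=2.0)
linear = paddle.nn.Linear(32, 10, weight_attr=paddle.ParamAttr(initializer=w_init))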
- - Returns: - The initialization op - """ - block = self._check_block(block) - - assert isinstance(block, framework.Block) - - if self._seed == 0: - self._seed = block.program.random_seed - - if in_dygraph_mode(): - place = _current_expected_place() - out_var = _C_ops.gaussian( - var.shape, - self._mean, - self._std_dev, - self._seed, - var.dtype, - place, - ) - out_var._share_underline_tensor_to(var) - return None - - else: - check_variable_and_dtype( - var, - "Out", - ["uint16", "float16", "float32", "float64"], - "guassian_random", - ) - op = block.append_op( - type="gaussian_random", - outputs={"Out": var}, - attrs={ - "shape": var.shape, - "dtype": var.dtype, - "mean": self._mean, - "std": self._std_dev, - "seed": self._seed, - "use_mkldnn": False, - }, - stop_gradient=True, - ) - var.op = op - return op - - -class TruncatedNormalInitializer(Initializer): - """Implements the Random TruncatedNormal(Gaussian) distribution initializer - - Args: - loc (float): mean of the normal distribution - scale (float): standard deviation of the normal distribution - seed (int): random seed - - Examples: - .. code-block:: python - - import paddle - import paddle.fluid as fluid - paddle.enable_static() - x = fluid.data(name='x', shape=[None, 1], dtype='float32') - fc = paddle.static.nn.fc(x, size=10, - weight_attr=fluid.initializer.TruncatedNormal(loc=0.0, scale=2.0)) - """ - - def __init__(self, loc=0.0, scale=1.0, seed=0): - assert loc is not None - assert scale is not None - assert seed is not None - super().__init__() - self._mean = loc - self._std_dev = scale - self._seed = seed - - def forward(self, var, block=None): - """Initialize the input tensor with TruncatedNormal distribution. - - Args: - var(Tensor): Tensor that needs to be initialized. - block(Block, optional): The block in which initialization ops - should be added. Used in static graph only, default None. 
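# A minimal sketch (not part of this patch): TruncatedNormalInitializer, being
# removed in this hunk, corresponds to paddle.nn.initializer.TruncatedNormal.
import paddle
w_init = paddle.nn.initializer.TruncatedNormal(mean=0.0, std=2.0)
linear = paddle.nn.Linear(1, 10, weight_attr=paddle.ParamAttr(initializer=w_init))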
- - Returns: - The initialization op - """ - block = self._check_block(block) - - assert isinstance(var, framework.Variable) - assert isinstance(block, framework.Block) - - if self._seed == 0: - self._seed = block.program.random_seed - - # to be compatible of fp16 initalizers - if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: - out_dtype = VarDesc.VarType.FP32 - out_var = block.create_var( - name=unique_name.generate( - ".".join(['truncated_gaussian_random', var.name, 'tmp']) - ), - shape=var.shape, - dtype=out_dtype, - type=VarDesc.VarType.LOD_TENSOR, - persistable=False, - ) - else: - out_dtype = var.dtype - out_var = var - - if in_dygraph_mode(): - out_var = _C_ops.truncated_gaussian_random( - var.shape, - self._mean, - self._std_dev, - self._seed, - out_dtype, - _current_expected_place(), - ) - if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: - var_tmp = _C_ops.cast(out_var, var.dtype) - var_tmp._share_underline_tensor_to(var) - else: - out_var._share_underline_tensor_to(var) - return None - - else: - op = block.append_op( - type="truncated_gaussian_random", - outputs={"Out": out_var}, - attrs={ - "shape": var.shape, - "dtype": out_dtype, - "mean": self._mean, - "std": self._std_dev, - "seed": self._seed, - }, - stop_gradient=True, - ) - - if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: - block.append_op( - type="cast", - inputs={"X": out_var}, - outputs={"Out": var}, - attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype}, - ) - var.op = op - return op - - -class XavierInitializer(Initializer): - r""" - This class implements the Xavier weight initializer from the paper - `Understanding the difficulty of training deep feedforward neural - networks `_ - by Xavier Glorot and Yoshua Bengio. - - This initializer is designed to keep the scale of the gradients - approximately same in all the layers. In case of Uniform distribution, - the range is [-x, x], where - - .. math:: - - x = \sqrt{\\frac{6.0}{fan\_in + fan\_out}} - - In case of Normal distribution, the mean is 0 and the standard deviation - is - - .. math:: - - \sqrt{\\frac{2.0}{fan\_in + fan\_out}} - - - Args: - uniform (bool,default True): whether to use uniform ,if False use normal distribution - fan_in (float,default None): fan_in for Xavier initialization. If None, it is - inferred from the variable. - fan_out (float,default None): fan_out for Xavier initialization. If None, it is - inferred from the variable. - seed (int): random seed - - Note: - It is recommended to set fan_in and fan_out to None for most cases. - - Examples: - .. code-block:: python - - import paddle - import paddle.fluid as fluid - paddle.enable_static() - queries = fluid.data(name='x', shape=[None,1], dtype='float32') - fc = paddle.static.nn.fc( - x=queries, size=10, - weight_attr=fluid.initializer.Xavier(uniform=False)) - - """ - - def __init__(self, uniform=True, fan_in=None, fan_out=None, seed=0): - assert uniform is not None - assert seed is not None - super().__init__() - self._uniform = uniform - self._fan_in = fan_in - self._fan_out = fan_out - self._seed = seed - - def forward(self, var, block=None): - """Initialize the input tensor with Xavier initialization. - - Args: - var(Tensor): Tensor that needs to be initialized. - block(Block, optional): The block in which initialization ops - should be added. Used in static graph only, default None. 
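# A minimal sketch (not part of this patch): XavierInitializer, being removed in
# this hunk, splits into paddle.nn.initializer.XavierUniform and XavierNormal.
import paddle
w_uniform = paddle.nn.initializer.XavierUniform()  # Xavier(uniform=True)
w_normal = paddle.nn.initializer.XavierNormal()     # Xavier(uniform=False)
linear = paddle.nn.Linear(1, 10, weight_attr=paddle.ParamAttr(initializer=w_normal))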
- - Returns: - The initialization op - """ - block = self._check_block(block) - - assert isinstance(block, framework.Block) - if not in_dygraph_mode(): - check_variable_and_dtype( - var, - "Out", - ["uint16", "float16", "float32", "float64"], - "xavier_init", - ) - - f_in, f_out = self._compute_fans(var) - - # If fan_in and fan_out are passed, use them - fan_in = f_in if self._fan_in is None else self._fan_in - fan_out = f_out if self._fan_out is None else self._fan_out - - if self._seed == 0: - self._seed = block.program.random_seed - - # to be compatible of fp16 initalizers - if var.dtype == VarDesc.VarType.FP16 or ( - var.dtype == VarDesc.VarType.BF16 and not self._uniform - ): - out_dtype = VarDesc.VarType.FP32 - out_var = block.create_var( - name=unique_name.generate( - ".".join(['xavier_init', var.name, 'tmp']) - ), - shape=var.shape, - dtype=out_dtype, - type=VarDesc.VarType.LOD_TENSOR, - persistable=False, - ) - else: - out_dtype = var.dtype - out_var = var - - if in_dygraph_mode(): - if self._uniform: - limit = math.sqrt(6.0 / float(fan_in + fan_out)) - out_var = _C_ops.uniform( - out_var.shape, - out_dtype, - -limit, - limit, - self._seed, - _current_expected_place(), - ) - else: - std = math.sqrt(2.0 / float(fan_in + fan_out)) - - place = _current_expected_place() - out_var = _C_ops.gaussian( - out_var.shape, 0.0, std, self._seed, out_dtype, place - ) - - if var.dtype == VarDesc.VarType.FP16 or ( - var.dtype == VarDesc.VarType.BF16 and not self._uniform - ): - var_tmp = _C_ops.cast(out_var, var.dtype) - var_tmp._share_underline_tensor_to(var) - else: - out_var._share_underline_tensor_to(var) - return None - else: - if self._uniform: - limit = math.sqrt(6.0 / float(fan_in + fan_out)) - op = block.append_op( - type="uniform_random", - inputs={}, - outputs={"Out": out_var}, - attrs={ - "shape": out_var.shape, - "dtype": out_dtype, - "min": -limit, - "max": limit, - "seed": self._seed, - }, - stop_gradient=True, - ) - else: - std = math.sqrt(2.0 / float(fan_in + fan_out)) - op = block.append_op( - type="gaussian_random", - outputs={"Out": out_var}, - attrs={ - "shape": out_var.shape, - "dtype": out_var.dtype, - "mean": 0.0, - "std": std, - "seed": self._seed, - }, - stop_gradient=True, - ) - - if var.dtype == VarDesc.VarType.FP16 or ( - var.dtype == VarDesc.VarType.BF16 and not self._uniform - ): - block.append_op( - type="cast", - inputs={"X": out_var}, - outputs={"Out": var}, - attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype}, - ) - - var.op = op - return op - - -class MSRAInitializer(Initializer): - r"""Implements the MSRA initializer a.k.a. Kaiming Initializer - - This class implements the weight initialization from the paper - `Delving Deep into Rectifiers: Surpassing Human-Level Performance on - ImageNet Classification `_ - by Kaiming He, Xiangyu Zhang, Shaoqing Ren and Jian Sun. This is a - robust initialization method that particularly considers the rectifier - nonlinearities. In case of Uniform distribution, the range is [-x, x], where - - .. math:: - - x = gain \times \sqrt{\frac{3}{fan\_in}} - - In case of Normal distribution, the mean is 0 and the standard deviation - is - - .. math:: - - \frac{gain}{\sqrt{{fan\_in}}} - - Args: - uniform (bool, optional): whether to use uniform or normal distribution - fan_in (float32|None, optional): fan_in (in_features) of trainable Tensor, If None, it will be infered automaticly. If you don't want to use in_features of the Tensor, you can set the value of 'fan_in' smartly by yourself. default is None. 
- seed (int32, optional): random seed. - negative_slope (float, optional): negative_slope (only used with leaky_relu). default is 0.0. - nonlinearity(str, optional): the non-linear function. default is relu. - - Note: - It is recommended to set fan_in to None for most cases. - - Examples: - .. code-block:: python - - import paddle - import paddle.fluid as fluid - paddle.enable_static() - x = fluid.data(name="data", shape=[8, 32, 32], dtype="float32") - fc = paddle.static.nn.fc(x, size=10, - weight_attr=fluid.initializer.MSRA(uniform=False)) - +def _global_weight_initializer(): """ - - def __init__( - self, - uniform=True, - fan_in=None, - seed=0, - negative_slope=0, - nonlinearity='relu', - ): - """Constructor for MSRAInitializer""" - assert uniform is not None - assert seed is not None - super().__init__() - self._uniform = uniform - self._fan_in = fan_in - self._seed = seed - self._negative_slope = negative_slope - self._nonlinearity = nonlinearity - - def forward(self, var, block=None): - """Initialize the input tensor with MSRA initialization. - - Args: - var(Tensor): Tensor that needs to be initialized. - block(Block, optional): The block in which initialization ops - should be added. Used in static graph only, default None. - - Returns: - The initialization op - """ - block = self._check_block(block) - - assert isinstance(var, framework.Variable) - assert isinstance(block, framework.Block) - f_in, f_out = self._compute_fans(var) - - # If fan_in is passed, use it - fan_in = f_in if self._fan_in is None else self._fan_in - - if fan_in == 0: - if self._fan_in is None: - raise ValueError( - "The in_features of the Tensor contain zero, can not initialize the Tensor." - ) - else: - raise ValueError( - "fan_in should not be zero, can not initialize the Tensor." 
- ) - - if self._seed == 0: - self._seed = block.program.random_seed - - # to be compatible of fp16 initalizers - if var.dtype == VarDesc.VarType.FP16 or ( - var.dtype == VarDesc.VarType.BF16 and not self._uniform - ): - out_dtype = VarDesc.VarType.FP32 - out_var = block.create_var( - name=unique_name.generate( - ".".join(['masra_init', var.name, 'tmp']) - ), - shape=var.shape, - dtype=out_dtype, - type=VarDesc.VarType.LOD_TENSOR, - persistable=False, - ) - else: - out_dtype = var.dtype - out_var = var - - if in_dygraph_mode(): - if self._uniform: - gain = calculate_gain(self._nonlinearity, self._negative_slope) - limit = gain * math.sqrt(3.0 / float(fan_in)) - out_var = _C_ops.uniform( - var.shape, - out_dtype, - -limit, - limit, - self._seed, - _current_expected_place(), - ) - else: - gain = calculate_gain(self._nonlinearity, self._negative_slope) - std = gain / math.sqrt(float(fan_in)) - place = _current_expected_place() - out_var = _C_ops.gaussian( - out_var.shape, 0.0, std, self._seed, out_dtype, place - ) - - if var.dtype == VarDesc.VarType.FP16 or ( - var.dtype == VarDesc.VarType.BF16 and not self._uniform - ): - var_tmp = _C_ops.cast(out_var, var.dtype) - var_tmp._share_underline_tensor_to(var) - else: - out_var._share_underline_tensor_to(var) - return None - else: - if self._uniform: - gain = calculate_gain(self._nonlinearity, self._negative_slope) - limit = gain * math.sqrt(3.0 / float(fan_in)) - op = block.append_op( - type="uniform_random", - inputs={}, - outputs={"Out": out_var}, - attrs={ - "shape": out_var.shape, - "dtype": int(out_dtype), - "min": -limit, - "max": limit, - "seed": self._seed, - }, - stop_gradient=True, - ) - - else: - gain = calculate_gain(self._nonlinearity, self._negative_slope) - std = gain / math.sqrt(float(fan_in)) - op = block.append_op( - type="gaussian_random", - outputs={"Out": out_var}, - attrs={ - "shape": out_var.shape, - "dtype": int(out_dtype), - "mean": 0.0, - "std": std, - "seed": self._seed, - }, - stop_gradient=True, - ) - - if var.dtype == VarDesc.VarType.FP16 or ( - var.dtype == VarDesc.VarType.BF16 and not self._uniform - ): - block.append_op( - type="cast", - inputs={"X": out_var}, - outputs={"Out": var}, - attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype}, - ) - - var.op = op - return op - - -class BilinearInitializer(Initializer): + Return the global weight initializer, The user doesn't need to use it. """ - This initializer can be used in transposed convolution operator to - act as upsampling. Users can upsample a feature map with shape of - (B, C, H, W) by any integer factor. The usage is: - - Examples: - - .. code-block:: python - - import math - - import paddle - import paddle.nn as nn - from paddle.regularizer import L2Decay - - factor = 2 - C = 2 - B = 8 - H = W = 32 - w_attr = paddle.ParamAttr(learning_rate=0., - regularizer=L2Decay(0.), - initializer=nn.initializer.Bilinear()) - data = paddle.rand([B, 3, H, W], dtype='float32') - conv_up = nn.Conv2DTranspose(3, - out_channels=C, - kernel_size=2 * factor - factor % 2, - padding=int( - math.ceil((factor - 1) / 2.)), - stride=factor, - weight_attr=w_attr, - bias_attr=False) - x = conv_up(data) + return _global_weight_initializer_ - Where, `out_channels=C` and `groups=C` means this is channel-wise transposed - convolution. The filter shape will be (C, 1, K, K) where K is `kernel_size`, - This initializer will set a (K, K) interpolation kernel for every channel - of the filter identically. 
The resulting shape of the output feature map - will be (B, C, factor * H, factor * W). Note that the learning rate and the - weight decay are set to 0 in order to keep coefficient values of bilinear - interpolation unchanged during training. +def _global_bias_initializer(): """ - - def __init__(self): - """Constructor for BilinearInitializer.""" - super().__init__() - - def forward(self, var, block=None): - """Initialize the input tensor with Bilinear initialization. - - Args: - var(Tensor): Tensor that needs to be initialized. - block(Block, optional): The block in which initialization ops - should be added. Used in static graph only, default None. - - Returns: - The initialization op - """ - block = self._check_block(block) - - if not isinstance(var, framework.Variable): - raise ValueError("var must be framework.Variable.") - - if not isinstance(block, framework.Block): - raise ValueError("block must be framework.Block.") - - shape = var.shape - if len(shape) != 4: - raise ValueError("the length of shape must be 4.") - if shape[2] != shape[3]: - raise ValueError("shape[2] must be equal to shape[3].") - - weight = np.zeros(np.prod(var.shape), dtype='float32') - size = shape[3] - # factor - f = np.ceil(size / 2.0) - # center - c = (2 * f - 1 - f % 2) / (2.0 * f) - for i in range(np.prod(shape)): - x = i % size - y = (i / size) % size - weight[i] = (1 - abs(x / f - c)) * (1 - abs(y / f - c)) - weight = np.reshape(weight, shape) - - # to be compatible of fp16 initalizers - if var.dtype in [ - VarDesc.VarType.FP16, - VarDesc.VarType.BF16, - VarDesc.VarType.FP64, - ]: - out_dtype = VarDesc.VarType.FP32 - out_var = block.create_var( - name=unique_name.generate( - ".".join(['bilinear_init', var.name, 'tmp']) - ), - shape=var.shape, - dtype=out_dtype, - type=VarDesc.VarType.LOD_TENSOR, - persistable=False, - ) - else: - out_dtype = var.dtype - out_var = var - - if out_dtype == VarDesc.VarType.FP32: - value_name = "fp32_values" - values = [float(v) for v in weight.flat] - else: - raise TypeError("Unsupported dtype %s", var.dtype) - - if np.prod(shape) > 1024 * 1024: - raise ValueError("The size of input is too big. ") - - if in_dygraph_mode(): - _C_ops.assign_value_( - out_var, - list(shape), - out_dtype, - values, - _current_expected_place(), - ) - if var.dtype in [ - VarDesc.VarType.FP16, - VarDesc.VarType.BF16, - VarDesc.VarType.FP64, - ]: - var_tmp = _C_ops.cast(out_var, var.dtype) - var_tmp._share_underline_tensor_to(var) - else: - out_var._share_underline_tensor_to(var) - return None - else: - op = block.append_op( - type='assign_value', - outputs={'Out': [out_var]}, - attrs={ - 'dtype': out_dtype, - 'shape': list(shape), - value_name: values, - }, - ) - - if var.dtype in [ - VarDesc.VarType.FP16, - VarDesc.VarType.BF16, - VarDesc.VarType.FP64, - ]: - block.append_op( - type="cast", - inputs={"X": out_var}, - outputs={"Out": var}, - attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype}, - ) - - var.op = op - return op - - -class NumpyArrayInitializer(Initializer): - """Init an parameter with an numpy array - This op initialize the variable by numpy array. - - Args: - value (numpy): numpy array to initialize the variable - - Returns: - A Tensor variable initialized by numpy. - - Examples: - .. 
code-block:: python - - import paddle - import paddle.fluid as fluid - import numpy - paddle.enable_static() - x = fluid.data(name="x", shape=[2, 1], dtype='float32') - fc = paddle.static.nn.fc(x, size=10, - weight_attr=fluid.initializer.NumpyArrayInitializer(numpy.array([1,2]))) + Return the global weight initializer, The user doesn't need to use it. """ - - def __init__(self, value): - import numpy - - assert isinstance(value, numpy.ndarray) - super().__init__() - self._value = value - - def forward(self, var, block=None): - """Initialize the input tensor with Numpy array. - - Args: - var(Tensor): Tensor that needs to be initialized. - block(Block, optional): The block in which initialization ops - should be added. Used in static graph only, default None. - - Returns: - The initialization op - """ - block = self._check_block(block) - - assert isinstance(var, framework.Variable) - assert isinstance(block, framework.Block) - - # to be compatible of fp16 initalizers - if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: - out_dtype = VarDesc.VarType.FP32 - np_value = self._value.astype("float32") - out_var = block.create_var( - name=unique_name.generate( - ".".join(['numpy_array_init', var.name, 'tmp']) - ), - shape=var.shape, - dtype=out_dtype, - type=VarDesc.VarType.LOD_TENSOR, - persistable=False, - ) - else: - out_var = var - out_dtype = var.dtype - np_value = self._value - - if out_dtype == VarDesc.VarType.FP32: - value_name = "fp32_values" - values = [float(v) for v in np_value.flat] - elif out_dtype == VarDesc.VarType.INT32: - value_name = "int32_values" - values = [int(v) for v in np_value.flat] - else: - raise ValueError("Unsupported dtype %s", self._value.dtype) - if self._value.size > 1024 * 1024 * 1024: - raise ValueError( - "The size of input is too big. Please consider " - "saving it to file and 'load_op' to load it" - ) - - if in_dygraph_mode(): - _C_ops.assign_value_( - out_var, - list(self._value.shape), - out_dtype, - values, - _current_expected_place(), - ) - if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: - var_tmp = _C_ops.cast(out_var, var.dtype) - var_tmp._share_underline_tensor_to(var) - else: - out_var._share_underline_tensor_to(var) - return None - else: - op = block.append_op( - type='assign_value', - outputs={'Out': out_var}, - attrs={ - 'dtype': out_dtype, - 'shape': list(self._value.shape), - value_name: values, - }, - stop_gradient=True, - ) - - if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: - block.append_op( - type="cast", - inputs={"X": out_var}, - outputs={"Out": var}, - attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype}, - ) - - var.op = op - return op + return _global_bias_initializer_ def set_global_initializer(weight_init, bias_init=None): @@ -1218,7 +103,7 @@ def set_global_initializer(weight_init, bias_init=None): check_type( weight_init, 'weight_init', - (Initializer, type(None)), + (paddle.nn.initializer.Initializer, type(None)), 'set_global_initializer', ) global _global_weight_initializer_ @@ -1227,93 +112,8 @@ def set_global_initializer(weight_init, bias_init=None): check_type( bias_init, 'bias_init', - (Initializer, type(None)), + (paddle.nn.initializer.Initializer, type(None)), 'set_global_initializer', ) global _global_bias_initializer_ _global_bias_initializer_ = bias_init - - -def _global_weight_initializer(): - """ - Return the global weight initializer, The user doesn't need to use it. 
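# A minimal sketch (not part of this patch): NumpyArrayInitializer, being removed in
# this hunk, corresponds to paddle.nn.initializer.Assign, which accepts a numpy
# array, list, or Tensor — the same substitution the test files below use.
import numpy as np
import paddle
arr = np.random.normal(0, 0.02, size=(8, 10)).astype("float32")
w_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Assign(arr))
linear = paddle.nn.Linear(8, 10, weight_attr=w_attr)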
- """ - return _global_weight_initializer_ - - -def _global_bias_initializer(): - """ - Return the global weight initializer, The user doesn't need to use it. - """ - return _global_bias_initializer_ - - -def calculate_gain(nonlinearity, param=None): - """ - Get the recommended ``gain`` value of some nonlinearity function. ``gain`` value can be used in some - ``paddle.nn.initializer`` api to adjust the initialization value. - - Args: - nonlinearity(str): name of nonlinearity activation function. If it is a linear function, such as: - `linear/conv1d/conv2d/conv3d/conv1d_transpose/conv2d_transpose/conv3d_transpose` , 1.0 will be returned. - param(bool|int|float, optional): optional parameter for somme nonlinearity function. Now, it only applies to - 'leaky_relu'. Default: None, it will be calculated as 0.01 in the formula. - - Returns: - A float value, which is the recommended gain for this nonlinearity function. - - Examples: - .. code-block:: python - - import paddle - gain = paddle.nn.initializer.calculate_gain('tanh') # 5.0 / 3 - gain = paddle.nn.initializer.calculate_gain('leaky_relu', param=1.0) # 1.0 = math.sqrt(2.0 / (1+param^2)) - initializer = paddle.nn.initializer.Orthogonal(gain) - - """ - if param is None: - param = 0.01 - else: - assert isinstance(param, (bool, int, float)) - param = float(param) - recommended_gain = { - 'sigmoid': 1, - 'linear': 1, - 'conv1d': 1, - 'conv2d': 1, - 'conv3d': 1, - 'conv1d_transpose': 1, - 'conv2d_transpose': 1, - 'conv3d_transpose': 1, - 'tanh': 5.0 / 3, - 'relu': math.sqrt(2.0), - 'leaky_relu': math.sqrt(2.0 / (1 + param**2)), - 'selu': 3.0 / 4, - } - if nonlinearity in recommended_gain.keys(): - return recommended_gain[nonlinearity] - else: - raise ValueError( - "nonlinearity function {} is not suppported now.".format( - nonlinearity - ) - ) - - -# We short the class name, since users will use the initializer with the package -# name. The sample code: -# -# import paddle -# import paddle.fluid as fluid -# -# hidden = paddle.static.nn.fc(..., -# weight_attr=ParamAttr(fluid.initializer.Xavier())) -# -# It is no need to add an `Initializer` as the class suffix -Constant = ConstantInitializer -Uniform = UniformInitializer -Normal = NormalInitializer -TruncatedNormal = TruncatedNormalInitializer -Xavier = XavierInitializer -MSRA = MSRAInitializer -Bilinear = BilinearInitializer diff --git a/python/paddle/fluid/install_check.py b/python/paddle/fluid/install_check.py index 241dd71e200aba975b4660afbcdf078d61fad2dc..ce93a25ccef9a8dab69a9d3945ff4c2983fae1aa 100644 --- a/python/paddle/fluid/install_check.py +++ b/python/paddle/fluid/install_check.py @@ -22,7 +22,6 @@ from .framework import ( cpu_places, ) from .param_attr import ParamAttr -from .initializer import Constant from . import layers from . import backward from .dygraph import Layer @@ -42,7 +41,9 @@ class SimpleLayer(Layer): self._linear1 = paddle.nn.Linear( input_size, 3, - weight_attr=ParamAttr(initializer=Constant(value=0.1)), + weight_attr=ParamAttr( + initializer=paddle.nn.initializer.Constant(value=0.1) + ), ) def forward(self, inputs): diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index 9c3de1ba49862afe8f63a81df2a0e87f81ea8fed..0342017822cfdb52b24364b0fdc879cba9767880 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -13,7 +13,7 @@ # limitations under the License. 
import copy - +import paddle from .framework import ( Parameter, dtype_is_floating, @@ -22,7 +22,6 @@ from .framework import ( _global_flags, ) from . import unique_name -from paddle.fluid.initializer import Constant, Xavier from .param_attr import ParamAttr from . import core @@ -178,10 +177,10 @@ class LayerHelper(LayerHelperBase): # TODO (jiabin): should we remove this since it has never be used def _get_default_initializer(self, dtype): if dtype is None or dtype_is_floating(dtype) is True: - return Xavier() + return paddle.nn.initializer.XavierUniform() else: # For integer and boolean types, initialize with all zeros - return Constant() + return paddle.nn.initializer.Constant() # TODO (jiabin): reconstruct this in LayerObjHelper and avoid dependency of kwargs def is_instance(self, param_name, cls): diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 994fc98038086cbe581ed8e30ea183a75cc168f5..eb4d227f914ff8330876525ce9827e74d5e4ab27 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -92,7 +92,7 @@ class ListenAndServ: shape=[32, 32], dtype='float32', name="X") - fluid.initializer.Constant(value=1.0)(x, main.global_block()) + paddle.nn.initializer.Constant(value=1.0)(x, main.global_block()) paddle.scale(x=x, scale=10.0, out=out_var) exe = fluid.Executor(place) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index fa0f49d01b99796c47f33269775ad73cc4c186eb..1dd819df4116877060275f68596e8e32792af4d0 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -22,7 +22,6 @@ import numpy as np import paddle from ..layer_helper import LayerHelper -from ..initializer import Normal, Constant from ..framework import ( Variable, OpProtoHolder, @@ -240,7 +239,7 @@ def embedding( w_param_attrs = fluid.ParamAttr( name="emb_weight", learning_rate=0.5, - initializer=fluid.initializer.NumpyArrayInitializer(weight_data), + initializer=paddle.nn.initializer.Assign(weight_data), trainable=True) emb_2 = fluid.layers.embedding(input=data, size=(128, 100), param_attr=w_param_attrs, dtype='float32') """ @@ -673,7 +672,10 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1): ) if is_new_var: helper.set_variable_initializer( - counter, initializer=Constant(value=begin - 1, force_cpu=True) + counter, + initializer=paddle.nn.initializer.ConstantInitializer( + value=begin - 1, force_cpu=True + ), ) helper.main_program.global_block()._prepend_op( type='increment', diff --git a/python/paddle/fluid/layers/utils.py b/python/paddle/fluid/layers/utils.py index 7cf049fd05d51bb45560a5b03971f464705cbc24..5a776ad22cb4542526d3fcdc62f92dd41267c10a 100644 --- a/python/paddle/fluid/layers/utils.py +++ b/python/paddle/fluid/layers/utils.py @@ -26,6 +26,9 @@ from ..layer_helper import LayerHelper from sys import version_info from collections.abc import Sequence +from weakref import WeakKeyDictionary +from collections import defaultdict +from uuid import uuid4 def convert_to_list(value, n, name, dtype=int): @@ -107,13 +110,33 @@ def is_sequence(seq): return isinstance(seq, Sequence) and not isinstance(seq, str) +class UniqueIdMap(WeakKeyDictionary): + def __init__(self): + super().__init__(self) + self.data = defaultdict(uuid4) + + +uniqueidmap = UniqueIdMap() + + +def uniqueid(obj): + if isinstance(obj, str): + return (hash(obj),) + elif isinstance(obj, list): + return (id(obj),) + else: + return (uniqueidmap[obj].int,) + + def _hash_with_id(*args): """ Return int hash value calculated 
by id(arg) or tuple(id1,id2, ...). """ assert len(args) > 0 - info = tuple([id(v) for v in args]) - return hash(info) & 0xFFFFFFF + info = () + for v in args: + info = info + uniqueid(v) + return hash(info) def _sorted(dict_): diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py index 5d702b8e521bfb19897a553819499e01ffc82e1e..b04611db668660ea80eee2aab0674a597bc760a1 100644 --- a/python/paddle/fluid/metrics.py +++ b/python/paddle/fluid/metrics.py @@ -19,7 +19,6 @@ import numpy as np import copy from .layer_helper import LayerHelper -from .initializer import Constant from . import unique_name from .framework import Program, Variable, program_guard from . import layers diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index c5aa80c74902732c20956e3d8265138381e7b205..d7ab914f80ffcf55c5d935107f0fdf7bca3c3131 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -39,7 +39,6 @@ from .backward import ( _get_no_grad_set_name, ) from .framework import program_guard -from .initializer import Constant from .layer_helper import LayerHelper from .dygraph import base as imperative_base from .dygraph import no_grad @@ -397,7 +396,8 @@ class Optimizer: lr_value = float(self._learning_rate()) self.helper.set_variable_initializer( - lr_var, initializer=Constant(value=lr_value) + lr_var, + initializer=paddle.nn.initializer.Constant(value=lr_value), ) return @@ -713,7 +713,10 @@ class Optimizer: device = self._get_device_for_param(param.name) with device_guard(device): self.helper.set_variable_initializer( - var, initializer=Constant(value=float(fill_value)) + var, + initializer=paddle.nn.initializer.Constant( + value=float(fill_value) + ), ) if in_dygraph_mode(): @@ -774,7 +777,10 @@ class Optimizer: device = 'cpu' with device_guard(device): self.helper.set_variable_initializer( - var, initializer=Constant(value=float(fill_value)) + var, + initializer=paddle.nn.initializer.Constant( + value=float(fill_value) + ), ) if in_dygraph_mode(): @@ -1225,10 +1231,12 @@ class Optimizer: # NOTE(zhiqiu): the initializer should be set after coalesce_tensor op, # so the shape of flatten_param and flatten_grad will be inferred. self.helper.set_variable_initializer( - flatten_param, initializer=Constant(0.0) + flatten_param, + initializer=paddle.nn.initializer.Constant(0.0), ) self.helper.set_variable_initializer( - flatten_grad, initializer=Constant(0.0) + flatten_grad, + initializer=paddle.nn.initializer.Constant(0.0), ) return [(flatten_param, flatten_grad)] diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py index f251a654a992b58fab2beec6bb6ee0ba1817db37..6fdadd7904bd437f7eb71fdabf9fbf93b6ba74d6 100644 --- a/python/paddle/fluid/param_attr.py +++ b/python/paddle/fluid/param_attr.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .initializer import Initializer, Xavier, Constant +import paddle from .regularizer import WeightDecayRegularizer from paddle.fluid.data_feeder import check_type @@ -88,7 +88,10 @@ class ParamAttr: check_type(do_model_average, "do_model_average", (bool), "ParamAttr") check_type(need_clip, "need_clip", (bool), "ParamAttr") check_type( - initializer, "initializer", (Initializer, type(None)), "ParamAttr" + initializer, + "initializer", + (paddle.nn.initializer.Initializer, type(None)), + "ParamAttr", ) check_type( regularizer, @@ -139,7 +142,7 @@ class ParamAttr: Returns: None. 
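# A minimal sketch (not part of this patch): ParamAttr now type-checks its
# initializer against paddle.nn.initializer.Initializer, so any of the
# paddle.nn.initializer classes can be passed in directly.
import paddle
attr = paddle.ParamAttr(
    name="fc_w",
    initializer=paddle.nn.initializer.XavierUniform(),
    learning_rate=0.5)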
""" - self._set_default_initializer(Xavier()) + self._set_default_initializer(paddle.nn.initializer.XavierUniform()) def _set_default_bias_initializer(self): """ @@ -151,7 +154,7 @@ class ParamAttr: Returns: None. """ - self._set_default_initializer(Constant(0.0)) + self._set_default_initializer(paddle.nn.initializer.Constant(0.0)) @staticmethod def _to_attr(arg): @@ -177,7 +180,7 @@ class ParamAttr: return arg elif isinstance(arg, str): return ParamAttr(name=arg) - elif isinstance(arg, Initializer): + elif isinstance(arg, paddle.nn.initializer.Initializer): return ParamAttr(initializer=arg) elif isinstance(arg, WeightDecayRegularizer): return ParamAttr(regularizer=arg) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_cluster_partition.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_cluster_partition.py index 2223724c2953976d66088de85a7ef7f6d760ef13..9071b481eb5c473b5ffd8ff4d11c7d2e5ba96930 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_cluster_partition.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_cluster_partition.py @@ -17,7 +17,7 @@ import unittest class TestClusterPartition(unittest.TestCase): def test_cluster_partition(self): - clusters = [(5, 8), (1, 8), (4, 8), (16, 8)] + clusters = [(5, 8), (1, 8), (4, 8), (16, 8), (2, 8), (3, 8)] from paddle.distributed.auto_parallel.tuner.rule_based_tuner import ( ClusterPartitionUtil, ) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel_autoconvert.py b/python/paddle/fluid/tests/unittests/auto_parallel_autoconvert.py index 593d79998a2d15d8acbae316a58a4828ad3ba5ce..80ebe78963287ae7aa2ee9a080f67661582d847f 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel_autoconvert.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel_autoconvert.py @@ -36,7 +36,6 @@ from paddle.distributed.auto_parallel.utils import ( save_distributed_checkpoint, ) from paddle.distributed.fleet import auto -from paddle.fluid.initializer import NumpyArrayInitializer paddle.enable_static() _global_parallel_strategy = None @@ -55,8 +54,12 @@ class MLPLayer(nn.Layer): np.random.seed(2021) arr0 = np.random.normal(0, 0.02, size=(d_model, dim_feedforward)) arr1 = np.random.normal(0, 0.02, size=(d_model, dim_feedforward)) - weight_attr0 = paddle.ParamAttr(initializer=NumpyArrayInitializer(arr0)) - weight_attr1 = paddle.ParamAttr(initializer=NumpyArrayInitializer(arr1)) + weight_attr0 = paddle.ParamAttr( + initializer=paddle.nn.initializer.Assign(arr0) + ) + weight_attr1 = paddle.ParamAttr( + initializer=paddle.nn.initializer.Assign(arr1) + ) bias_attr = None self.linear0 = nn.Linear( d_model, dim_feedforward, weight_attr0, bias_attr=bias_attr diff --git a/python/paddle/fluid/tests/unittests/auto_parallel_save_load.py b/python/paddle/fluid/tests/unittests/auto_parallel_save_load.py index 875536af57a3552e398568f8c6ac0f7dda8ed4ce..1cb2a3e9bf1fe5c68fe41c2d5cb54fd26b63e9d3 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel_save_load.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel_save_load.py @@ -30,7 +30,6 @@ from paddle.distributed.auto_parallel.utils import ( save_distributed_checkpoint, ) from paddle.distributed.fleet import auto -from paddle.fluid.initializer import NumpyArrayInitializer paddle.enable_static() _global_parallel_strategy = None @@ -48,7 +47,9 @@ class MLPLayer(nn.Layer): dim_feedforward = intermediate_size np.random.seed(2021) arr = np.random.normal(0, 0.02, size=(d_model, dim_feedforward)) - weight_attr = 
paddle.ParamAttr(initializer=NumpyArrayInitializer(arr)) + weight_attr = paddle.ParamAttr( + initializer=paddle.nn.initializer.Assign(arr) + ) bias_attr = None self.linear0 = nn.Linear( diff --git a/python/paddle/fluid/tests/unittests/collective/column_parallel_linear_api.py b/python/paddle/fluid/tests/unittests/collective/column_parallel_linear_api.py index 59eee4cfeee2f8dc5900e8e55151f9cdbd00f676..c1ed3175e100e515da20c94dc13e900bf2298d82 100644 --- a/python/paddle/fluid/tests/unittests/collective/column_parallel_linear_api.py +++ b/python/paddle/fluid/tests/unittests/collective/column_parallel_linear_api.py @@ -38,15 +38,11 @@ class TestColumnParallelLinearAPI(TestCollectiveAPIRunnerBase): paddle.distributed.broadcast(data, src=0) if rank == 0: param_attr = paddle.fluid.ParamAttr( - initializer=paddle.fluid.initializer.NumpyArrayInitializer( - np_array[:, 0:8] - ), + initializer=paddle.nn.initializer.Assign(np_array[:, 0:8]), ) else: param_attr = paddle.fluid.ParamAttr( - initializer=paddle.fluid.initializer.NumpyArrayInitializer( - np_array[:, 8:16] - ), + initializer=paddle.nn.initializer.Assign(np_array[:, 8:16]), ) linear_out = paddle.distributed.split( diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py index a4d20264e73016c3a6dd34d8f240f14f243370c6..ca4ad63066ee820aeff68e10600f523b311a4cd6 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py @@ -242,10 +242,10 @@ class PrePostProcessLayer(Layer): self._layer_norm = paddle.nn.LayerNorm( normalized_shape=d_model, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(1.0) + initializer=paddle.nn.initializer.Constant(1.0) ), bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(0.0) + initializer=paddle.nn.initializer.Constant(0.0) ), ) @@ -513,7 +513,9 @@ class PrepareEncoderDecoderLayer(Layer): sparse=is_sparse, weight_attr=fluid.ParamAttr( name=word_emb_param_name, - initializer=fluid.initializer.Normal(0.0, src_emb_dim**-0.5), + initializer=paddle.nn.initializer.Normal( + 0.0, src_emb_dim**-0.5 + ), ), ) @@ -527,7 +529,7 @@ class PrepareEncoderDecoderLayer(Layer): sparse=is_sparse, weight_attr=fluid.ParamAttr( name=pos_enc_param_name, - initializer=fluid.initializer.NumpyArrayInitializer(pos_inp), + initializer=paddle.nn.initializer.Assign(pos_inp), trainable=False, ), ) diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist.py b/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist.py index 2eb0951756a59eaa601549e3974872110e1ad4a4..1fff26b20b191cd938e1a6032a86b3768387c0dc 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist.py @@ -39,7 +39,7 @@ def cnn_model(data): pool_stride=2, act="relu", param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) conv_pool_2 = fluid.nets.simple_img_conv_pool( @@ -50,7 +50,7 @@ def cnn_model(data): pool_stride=2, act="relu", param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) @@ -65,7 +65,7 @@ def cnn_model(data): size=SIZE, activation="softmax", weight_attr=fluid.param_attr.ParamAttr( - 
initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) # To cover @RENAMED@GRADIENT @@ -74,7 +74,7 @@ def cnn_model(data): size=SIZE, activation="softmax", weight_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) predict += predict2 diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_multi_device.py b/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_multi_device.py index e094d932d33e48c2a7d51a6c30859cebad6aadf6..a1d8688fd41c3c3635ed50eaa07ca2f884a453cc 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_multi_device.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_multi_device.py @@ -39,7 +39,7 @@ def cnn_model(data): pool_stride=2, act="relu", param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) conv_pool_2 = fluid.nets.simple_img_conv_pool( @@ -50,7 +50,7 @@ def cnn_model(data): pool_stride=2, act="relu", param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) @@ -65,7 +65,7 @@ def cnn_model(data): size=SIZE, activation="softmax", weight_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) # To cover @RENAMED@GRADIENT @@ -74,7 +74,7 @@ def cnn_model(data): size=SIZE, activation="softmax", weight_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) predict += predict2 diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_one_device.py b/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_one_device.py index 7e442f1914b2cf05d6f4f8f71856c02add563991..74c3c1a7269e4298709111ada3b78a6ca65b2da5 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_one_device.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_one_device.py @@ -39,7 +39,7 @@ def cnn_model(data): pool_stride=2, act="relu", param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) conv_pool_2 = fluid.nets.simple_img_conv_pool( @@ -50,7 +50,7 @@ def cnn_model(data): pool_stride=2, act="relu", param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) @@ -64,7 +64,7 @@ def cnn_model(data): size=SIZE, activation="softmax", weight_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) return predict diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/static_model_parallel_by_col.py b/python/paddle/fluid/tests/unittests/collective/fleet/static_model_parallel_by_col.py index 7f247abc6d9cd54d90ae419398eac745a5f72b83..035a174775bd533737a284fb35139899275377ec 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/static_model_parallel_by_col.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/static_model_parallel_by_col.py @@ -33,11 +33,9 @@ OUT_SIZE = 2 * MODEL_PARALLEL_SIZE def get_param_attr(weight, bias): weight_attr = 
paddle.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer(weight) - ) - bias_attr = paddle.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer(bias) + initializer=paddle.nn.initializer.Assign(weight) ) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Assign(bias)) return weight_attr, bias_attr diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/static_model_parallel_by_row.py b/python/paddle/fluid/tests/unittests/collective/fleet/static_model_parallel_by_row.py index b63e2065f431b7c4609a474657040388481ab3cd..a480993e8ec50236ace10696ca56026314da83c5 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/static_model_parallel_by_row.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/static_model_parallel_by_row.py @@ -33,11 +33,9 @@ OUT_SIZE = 2 * MODEL_PARALLEL_SIZE def get_param_attr(weight, bias): weight_attr = paddle.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer(weight) - ) - bias_attr = paddle.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer(bias) + initializer=paddle.nn.initializer.Assign(weight) ) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Assign(bias)) return weight_attr, bias_attr @@ -65,7 +63,7 @@ def create_model(data, rank): data, size=OUT_SIZE, weight_attr=paddle.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer(np_weight) + initializer=paddle.nn.initializer.Assign(np_weight) ), bias_attr=bias_attr, ) diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/static_model_parallel_embedding.py b/python/paddle/fluid/tests/unittests/collective/fleet/static_model_parallel_embedding.py index 914ee0852a04349323f7969117112526b4fa922b..689b068f025f27a59dfbb8920cc0d1dd5f9e2c43 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/static_model_parallel_embedding.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/static_model_parallel_embedding.py @@ -44,9 +44,7 @@ def create_model(data, rank): axis=0, num_partitions=MODEL_PARALLEL_SIZE, weight_attr=paddle.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer( - np_weight_part - ) + initializer=paddle.nn.initializer.Assign(np_weight_part) ), bias_attr=False, ) @@ -55,7 +53,7 @@ def create_model(data, rank): data, size=OUT_SIZE, weight_attr=paddle.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer(np_weight) + initializer=paddle.nn.initializer.Assign(np_weight) ), bias_attr=False, ) diff --git a/python/paddle/fluid/tests/unittests/collective/multinode/dygraph_hybrid_dpppmp.py b/python/paddle/fluid/tests/unittests/collective/multinode/dygraph_hybrid_dpppmp.py index de839e2c5eea44da1bc038b98c121744de1ebaff..ec864a1e40f9e6b0d16944c54cf33190005aa240 100644 --- a/python/paddle/fluid/tests/unittests/collective/multinode/dygraph_hybrid_dpppmp.py +++ b/python/paddle/fluid/tests/unittests/collective/multinode/dygraph_hybrid_dpppmp.py @@ -35,7 +35,7 @@ def weight_init(mp, shape, col=True, seed=1024): else: step = shape[0] // mp.nranks _w = w[mp.rank * step : mp.rank * step + step, :] - return paddle.fluid.initializer.NumpyArrayInitializer(_w) + return paddle.nn.initializer.Assign(_w) class Criterion(nn.Layer): diff --git a/python/paddle/fluid/tests/unittests/collective/multinode/dygraph_hybrid_fp16.py b/python/paddle/fluid/tests/unittests/collective/multinode/dygraph_hybrid_fp16.py index 160ed85cc9424e6de6b4b56d69fc7ce6a333cdd4..3f9527209134faa47f966a937c4e635968cc67a2 100644 --- 
a/python/paddle/fluid/tests/unittests/collective/multinode/dygraph_hybrid_fp16.py +++ b/python/paddle/fluid/tests/unittests/collective/multinode/dygraph_hybrid_fp16.py @@ -35,7 +35,7 @@ def weight_init(mp, shape, col=True, seed=1024): else: step = shape[0] // mp.nranks _w = w[mp.rank * step : mp.rank * step + step, :] - return paddle.fluid.initializer.NumpyArrayInitializer(_w) + return paddle.nn.initializer.Assign(_w) class Criterion(nn.Layer): diff --git a/python/paddle/fluid/tests/unittests/collective/multinode/dygraph_hybrid_recompute.py b/python/paddle/fluid/tests/unittests/collective/multinode/dygraph_hybrid_recompute.py index 31daee3262291e29d8b3a61fd6ebd72aa9fb59ae..af2b1b616d132d6c108b6c7d349abba07be6e18c 100644 --- a/python/paddle/fluid/tests/unittests/collective/multinode/dygraph_hybrid_recompute.py +++ b/python/paddle/fluid/tests/unittests/collective/multinode/dygraph_hybrid_recompute.py @@ -36,7 +36,7 @@ def weight_init(mp, shape, col=True, seed=1024): else: step = shape[0] // mp.nranks _w = w[mp.rank * step : mp.rank * step + step, :] - return paddle.fluid.initializer.NumpyArrayInitializer(_w) + return paddle.nn.initializer.Assign(_w) class Criterion(nn.Layer): diff --git a/python/paddle/fluid/tests/unittests/collective/parallel_embedding_api.py b/python/paddle/fluid/tests/unittests/collective/parallel_embedding_api.py index 9dd3bade93aee6ef9c4dac77c98d805fbc351055..f89643e7bff5e0213ed5ef86c5f4b4b4ed563ee5 100644 --- a/python/paddle/fluid/tests/unittests/collective/parallel_embedding_api.py +++ b/python/paddle/fluid/tests/unittests/collective/parallel_embedding_api.py @@ -42,13 +42,13 @@ class TestParallelEmbeddingAPI(TestCollectiveAPIRunnerBase): per_part_size = size[0] // 2 if rank == 0: param_attr = paddle.fluid.ParamAttr( - initializer=paddle.fluid.initializer.NumpyArrayInitializer( + initializer=paddle.nn.initializer.Assign( np_array[0:per_part_size, :] ), ) else: param_attr = paddle.fluid.ParamAttr( - initializer=paddle.fluid.initializer.NumpyArrayInitializer( + initializer=paddle.nn.initializer.Assign( np_array[per_part_size : size[0], :] ), ) diff --git a/python/paddle/fluid/tests/unittests/collective/row_parallel_linear_api.py b/python/paddle/fluid/tests/unittests/collective/row_parallel_linear_api.py index afbb3f33343649f57bb8209a489f70b3442bf00b..6c3817da5ae90c5e3bd727979515c25b2fe6b960 100644 --- a/python/paddle/fluid/tests/unittests/collective/row_parallel_linear_api.py +++ b/python/paddle/fluid/tests/unittests/collective/row_parallel_linear_api.py @@ -39,13 +39,13 @@ class TestRowParallelLinearAPI(TestCollectiveAPIRunnerBase): data = paddle.split(data, 2, axis=1)[rank] if rank == 0: param_attr = paddle.fluid.ParamAttr( - initializer=paddle.fluid.initializer.NumpyArrayInitializer( + initializer=paddle.nn.initializer.Assign( np_array[0:500, :] ), ) else: param_attr = paddle.fluid.ParamAttr( - initializer=paddle.fluid.initializer.NumpyArrayInitializer( + initializer=paddle.nn.initializer.Assign( np_array[500:1000, :] ), ) diff --git a/python/paddle/fluid/tests/unittests/dist_allreduce_op.py b/python/paddle/fluid/tests/unittests/dist_allreduce_op.py index 30bcea4cb5cb2344f3d50e3d10d1951dbedabc71..044c6d78cac10fc15271cf3c7b34dec338ccbf50 100644 --- a/python/paddle/fluid/tests/unittests/dist_allreduce_op.py +++ b/python/paddle/fluid/tests/unittests/dist_allreduce_op.py @@ -38,7 +38,7 @@ def cnn_model(data): pool_stride=2, act="relu", param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) 
), ) conv_pool_2 = fluid.nets.simple_img_conv_pool( @@ -49,7 +49,7 @@ def cnn_model(data): pool_stride=2, act="relu", param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) @@ -63,7 +63,7 @@ def cnn_model(data): size=SIZE, activation="softmax", weight_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) return predict diff --git a/python/paddle/fluid/tests/unittests/dist_ctr.py b/python/paddle/fluid/tests/unittests/dist_ctr.py index deb4cb921c1f3a4809e3d97dd1d7a0b0a78d5251..dc9bd59df52fd1bbc4a68643cf54b45501e9f67f 100644 --- a/python/paddle/fluid/tests/unittests/dist_ctr.py +++ b/python/paddle/fluid/tests/unittests/dist_ctr.py @@ -60,7 +60,7 @@ class TestDistCTR2x2(TestDistRunnerBase): size=[dnn_input_dim, dnn_layer_dims[0]], param_attr=fluid.ParamAttr( name="deep_embedding", - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), ), is_sparse=IS_SPARSE, ) @@ -74,7 +74,7 @@ class TestDistCTR2x2(TestDistRunnerBase): size=dim, activation="relu", weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), name='dnn-fc-%d' % i, ) @@ -87,7 +87,7 @@ class TestDistCTR2x2(TestDistRunnerBase): size=[lr_input_dim, 1], param_attr=fluid.ParamAttr( name="wide_embedding", - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), ), is_sparse=IS_SPARSE, ) diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py index 8e9341f9c5b1a6b27d9e7c447e35cddcca71e13a..527ba34bae614133ab13d588010bfa9b8972d595 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py @@ -107,7 +107,7 @@ class TestDistCTR2x2(FleetDistRunnerBase): size=[dnn_input_dim, dnn_layer_dims[0]], param_attr=fluid.ParamAttr( name="deep_embedding", - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), ), is_sparse=True, padding_idx=0, @@ -122,7 +122,7 @@ class TestDistCTR2x2(FleetDistRunnerBase): size=dim, activation="relu", weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), name='dnn-fc-%d' % i, ) @@ -135,7 +135,7 @@ class TestDistCTR2x2(FleetDistRunnerBase): size=[lr_input_dim, 1], param_attr=fluid.ParamAttr( name="wide_embedding", - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), ), is_sparse=True, padding_idx=0, diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_heter_pipeline_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_heter_pipeline_ctr.py index 3e71a1cb6054d6a9a70f6e0c26e7ca4da67a4711..de0f32e3110a55626c2e3cb30277bbf1948385f1 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_heter_pipeline_ctr.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_heter_pipeline_ctr.py @@ -78,7 +78,7 @@ class TestHeterPipelinePsCTR2x2(FleetDistHeterRunnerBase): size=[dnn_input_dim, dnn_layer_dims[0]], param_attr=fluid.ParamAttr( name="deep_embedding", - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), ), is_sparse=True, ) @@ -94,7 +94,7 @@ class 
TestHeterPipelinePsCTR2x2(FleetDistHeterRunnerBase): size=[lr_input_dim, 1], param_attr=fluid.ParamAttr( name="wide_embedding", - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), ), is_sparse=True, ) @@ -109,7 +109,7 @@ class TestHeterPipelinePsCTR2x2(FleetDistHeterRunnerBase): size=dim, activation="relu", weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), name='dnn-fc-%d' % i, ) diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer.py b/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer.py index dc0a7022b34348d7ee6aa40e2c89598cdc7ea0a3..453b715b50394c5fa744eddcd238efad828b2e79 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer.py @@ -40,7 +40,7 @@ def cnn_model(data): pool_stride=2, act="relu", param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) conv_pool_2 = fluid.nets.simple_img_conv_pool( @@ -51,7 +51,7 @@ def cnn_model(data): pool_stride=2, act="relu", param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) @@ -65,7 +65,7 @@ def cnn_model(data): size=SIZE, activation="softmax", weight_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) return predict diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer_fuse_allreduce.py b/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer_fuse_allreduce.py index ac1a4c632fd495d40e905d7ac295f30d8d8fd7e5..25f8663c7406af3d733c42b4a119614daaf24c19 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer_fuse_allreduce.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer_fuse_allreduce.py @@ -40,7 +40,7 @@ def cnn_model(data): pool_stride=2, act="relu", param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) conv_pool_2 = fluid.nets.simple_img_conv_pool( @@ -51,7 +51,7 @@ def cnn_model(data): pool_stride=2, act="relu", param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) @@ -65,7 +65,7 @@ def cnn_model(data): size=SIZE, activation="softmax", weight_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) return predict diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py b/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py index b673bfeae16e2563333ac9e77d9f5d087b2fb734..bd4fc90fd244f3bc92afac80c64a417dd0371720 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py @@ -124,7 +124,8 @@ def train_network( is_distributed=is_distributed, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), name="__emb__" + initializer=paddle.nn.initializer.Constant(value=0.01), + name="__emb__", ), is_sparse=is_sparse, ) @@ -137,7 +138,7 @@ def train_network( x=q_ss, size=hid_dim, 
weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__q_fc__", learning_rate=base_lr, ), @@ -149,7 +150,7 @@ def train_network( is_distributed=is_distributed, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -164,7 +165,8 @@ def train_network( x=pt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), name="__fc__" + initializer=paddle.nn.initializer.Constant(value=0.01), + name="__fc__", ), bias_attr=fluid.ParamAttr(name="__fc_b__"), ) @@ -175,7 +177,8 @@ def train_network( is_distributed=is_distributed, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), name="__emb__" + initializer=paddle.nn.initializer.Constant(value=0.01), + name="__emb__", ), is_sparse=is_sparse, ) @@ -188,7 +191,8 @@ def train_network( x=nt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), name="__fc__" + initializer=paddle.nn.initializer.Constant(value=0.01), + name="__fc__", ), bias_attr=fluid.ParamAttr(name="__fc_b__"), ) diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py index a9a2d7be0ba413c337b6e2cd545f5e8e36764102..1780e7dfe2ddeb626a468436aa75141f8828cf55 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py @@ -86,11 +86,11 @@ class TestDistCTR2x2(FleetDistRunnerBase): inference = bool(int(os.getenv("INFERENCE", "0"))) if initializer == 0: - init = fluid.initializer.Constant(value=0.01) + init = paddle.nn.initializer.Constant(value=0.01) elif initializer == 1: - init = fluid.initializer.Uniform() + init = paddle.nn.initializer.Uniform() elif initializer == 2: - init = fluid.initializer.Normal() + init = paddle.nn.initializer.Normal() else: raise ValueError("error initializer code: {}".format(initializer)) @@ -113,7 +113,7 @@ class TestDistCTR2x2(FleetDistRunnerBase): size=dim, activation="relu", weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), name='dnn-fc-%d' % i, ) @@ -127,7 +127,7 @@ class TestDistCTR2x2(FleetDistRunnerBase): entry=entry, param_attr=fluid.ParamAttr( name="wide_embedding", - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), ), ) diff --git a/python/paddle/fluid/tests/unittests/dist_mnist.py b/python/paddle/fluid/tests/unittests/dist_mnist.py index 87eb22dceac1ccca81765b83f43abb29060d0229..30c1130e33c850365d2e374645f15da5dd8be5c5 100644 --- a/python/paddle/fluid/tests/unittests/dist_mnist.py +++ b/python/paddle/fluid/tests/unittests/dist_mnist.py @@ -39,7 +39,7 @@ def cnn_model(data): pool_stride=2, act="relu", param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) conv_pool_2 = fluid.nets.simple_img_conv_pool( @@ -50,7 +50,7 @@ def cnn_model(data): pool_stride=2, act="relu", param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) @@ -64,7 +64,7 @@ def 
cnn_model(data): size=SIZE, activation="softmax", weight_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) return predict diff --git a/python/paddle/fluid/tests/unittests/dist_se_resnext.py b/python/paddle/fluid/tests/unittests/dist_se_resnext.py index db3318d67d88ab8f629f12e42c3734f54fa0f19b..6482ac53b09d8ad352797d6a2179c4a7a83934b6 100644 --- a/python/paddle/fluid/tests/unittests/dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py @@ -121,7 +121,7 @@ class SE_ResNeXt: size=class_dim, activation='softmax', weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.05) + initializer=paddle.nn.initializer.Constant(value=0.05) ), ) return out @@ -174,7 +174,7 @@ class SE_ResNeXt: act=None, # avoid pserver CPU init differs from GPU param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.05) + initializer=paddle.nn.initializer.Constant(value=0.05) ), bias_attr=False, ) @@ -187,7 +187,7 @@ class SE_ResNeXt: x=pool, size=num_channels // reduction_ratio, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.05) + initializer=paddle.nn.initializer.Constant(value=0.05) ), activation='relu', ) @@ -196,7 +196,7 @@ class SE_ResNeXt: x=squeeze, size=num_channels, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.05) + initializer=paddle.nn.initializer.Constant(value=0.05) ), activation='sigmoid', ) diff --git a/python/paddle/fluid/tests/unittests/dist_text_classification.py b/python/paddle/fluid/tests/unittests/dist_text_classification.py index a287bd8a6c878400ac29e9b15f278450fa5132d3..d29997ef8a08efb8c8e4fe032463c195eb053bab 100644 --- a/python/paddle/fluid/tests/unittests/dist_text_classification.py +++ b/python/paddle/fluid/tests/unittests/dist_text_classification.py @@ -59,7 +59,7 @@ def conv_net( size=[dict_dim, emb_dim], is_sparse=False, param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) @@ -70,7 +70,7 @@ def conv_net( act="tanh", pool_type="max", param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) @@ -78,7 +78,7 @@ def conv_net( x=[conv_3], size=fc0_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) @@ -87,7 +87,7 @@ def conv_net( size=class_dim, activation="softmax", weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py index 5b0343bd81c245d3780138146882973ec259ad10..e9ce91c197c1aab82efa03664ed09810f2b36573 100644 --- a/python/paddle/fluid/tests/unittests/dist_transformer.py +++ b/python/paddle/fluid/tests/unittests/dist_transformer.py @@ -29,7 +29,9 @@ import paddle.fluid as fluid import paddle.fluid.layers as layers import paddle.nn.functional as F -const_para_attr = fluid.ParamAttr(initializer=fluid.initializer.Constant(0.001)) +const_para_attr = fluid.ParamAttr( + initializer=paddle.nn.initializer.Constant(0.001) +) const_bias_attr = const_para_attr # Fix seed for test @@ -1253,8 +1255,8 @@ def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.0): out = layers.layer_norm( 
out, begin_norm_axis=len(out.shape) - 1, - param_attr=fluid.initializer.Constant(1.0), - bias_attr=fluid.initializer.Constant(0.0), + param_attr=paddle.nn.initializer.Constant(1.0), + bias_attr=paddle.nn.initializer.Constant(0.0), ) elif cmd == "d": # add dropout if dropout_rate: @@ -1292,7 +1294,7 @@ def prepare_encoder( size=[src_vocab_size, src_emb_dim], param_attr=fluid.ParamAttr( name=word_emb_param_name, - initializer=fluid.initializer.ConstantInitializer(0.001), + initializer=paddle.nn.initializer.Constant(0.001), ), ) else: @@ -1301,7 +1303,9 @@ def prepare_encoder( size=[src_vocab_size, src_emb_dim], param_attr=fluid.ParamAttr( name=word_emb_param_name, - initializer=fluid.initializer.Normal(0.0, src_emb_dim**-0.5), + initializer=paddle.nn.initializer.Normal( + 0.0, src_emb_dim**-0.5 + ), ), ) @@ -1312,7 +1316,7 @@ def prepare_encoder( param_attr=fluid.ParamAttr( name=pos_enc_param_name, trainable=False, - initializer=fluid.initializer.ConstantInitializer(0.001), + initializer=paddle.nn.initializer.Constant(0.001), ), ) src_pos_enc.stop_gradient = True diff --git a/python/paddle/fluid/tests/unittests/dist_word2vec.py b/python/paddle/fluid/tests/unittests/dist_word2vec.py index e10131667c745fd6da97f3fb1a742d60b7fcf3df..f5de20385f26baeef4038f2553350417dcbe0bb1 100644 --- a/python/paddle/fluid/tests/unittests/dist_word2vec.py +++ b/python/paddle/fluid/tests/unittests/dist_word2vec.py @@ -41,7 +41,7 @@ class TestDistWord2vec2x2(TestDistRunnerBase): is_sparse=IS_SPARSE, param_attr=fluid.ParamAttr( name='shared_w', - initializer=fluid.initializer.Constant(value=0.1), + initializer=paddle.nn.initializer.Constant(value=0.1), ), ) embed_second = fluid.layers.embedding( @@ -51,7 +51,7 @@ class TestDistWord2vec2x2(TestDistRunnerBase): is_sparse=IS_SPARSE, param_attr=fluid.ParamAttr( name='shared_w', - initializer=fluid.initializer.Constant(value=0.1), + initializer=paddle.nn.initializer.Constant(value=0.1), ), ) embed_third = fluid.layers.embedding( @@ -61,7 +61,7 @@ class TestDistWord2vec2x2(TestDistRunnerBase): is_sparse=IS_SPARSE, param_attr=fluid.ParamAttr( name='shared_w', - initializer=fluid.initializer.Constant(value=0.1), + initializer=paddle.nn.initializer.Constant(value=0.1), ), ) embed_forth = fluid.layers.embedding( @@ -71,7 +71,7 @@ class TestDistWord2vec2x2(TestDistRunnerBase): is_sparse=IS_SPARSE, param_attr=fluid.ParamAttr( name='shared_w', - initializer=fluid.initializer.Constant(value=0.1), + initializer=paddle.nn.initializer.Constant(value=0.1), ), ) @@ -84,7 +84,7 @@ class TestDistWord2vec2x2(TestDistRunnerBase): size=HIDDEN_SIZE, activation='sigmoid', weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.1) + initializer=paddle.nn.initializer.Constant(value=0.1) ), ) predict_word = paddle.static.nn.fc( @@ -92,7 +92,7 @@ class TestDistWord2vec2x2(TestDistRunnerBase): size=dict_size, activation='softmax', weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.1) + initializer=paddle.nn.initializer.Constant(value=0.1) ), ) cost = paddle.nn.functional.cross_entropy( diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_gradient_merge_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_gradient_merge_pass.py index a99b56974a8ae38ae3c269f0f4622cd5b5565983..9b9d45db082c0126765a5878409b93a94b1d34a3 100644 --- a/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_gradient_merge_pass.py +++ 
b/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_gradient_merge_pass.py @@ -26,7 +26,6 @@ import paddle.nn.functional as F import paddle.static as static import paddle.utils as utils from paddle.distributed.fleet import auto -from paddle.fluid.initializer import NumpyArrayInitializer logging.getLogger().setLevel(logging.INFO) paddle.enable_static() @@ -42,8 +41,12 @@ class MLPLayer(nn.Layer): np.random.seed(2021) arr0 = np.random.normal(0, 0.02, size=(d_model, dim_feedforward)) arr1 = np.random.normal(0, 0.02, size=(dim_feedforward, d_model)) - weight_attr0 = paddle.ParamAttr(initializer=NumpyArrayInitializer(arr0)) - weight_attr1 = paddle.ParamAttr(initializer=NumpyArrayInitializer(arr1)) + weight_attr0 = paddle.ParamAttr( + initializer=paddle.nn.initializer.Assign(arr0) + ) + weight_attr1 = paddle.ParamAttr( + initializer=paddle.nn.initializer.Assign(arr1) + ) bias_attr = None self.linear0 = nn.Linear( d_model, dim_feedforward, weight_attr0, bias_attr=bias_attr diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py index 1ea69dfbb15699fff2a7cd142e6eadf2cb0f604a..8629a3e185297640ddb0f2b892276b0fe600f3dd 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py @@ -203,8 +203,8 @@ class BertModelLayer(Layer): self._sent_emb_name = "sent_embedding" self._dtype = "float16" if use_fp16 else "float32" - self._param_initializer = fluid.initializer.TruncatedNormal( - scale=config['initializer_range'] + self._param_initializer = paddle.nn.initializer.TruncatedNormal( + std=config['initializer_range'] ) paddle.set_default_dtype(self._dtype) self._src_emb = paddle.nn.Embedding( @@ -317,8 +317,8 @@ class PretrainModelLayer(Layer): self._prepostprocess_dropout = config['hidden_dropout_prob'] self._word_emb_name = "word_embedding" - self._param_initializer = fluid.initializer.TruncatedNormal( - scale=config['initializer_range'] + self._param_initializer = paddle.nn.initializer.TruncatedNormal( + std=config['initializer_range'] ) self._weight_sharing = weight_sharing self.use_fp16 = use_fp16 @@ -343,7 +343,7 @@ class PretrainModelLayer(Layer): self.mask_lm_out_bias_attr = fluid.ParamAttr( name="mask_lm_out_fc.b_0", - initializer=fluid.initializer.Constant(value=0.0), + initializer=paddle.nn.initializer.Constant(value=0.0), ) if not self._weight_sharing: diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/darknet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/darknet.py index 783dfff262e8f30ac5c5efabd1b72154c8ef1709..1e7950c29e222484645004c734b6783e1c427c65 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/darknet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/darknet.py @@ -41,7 +41,7 @@ class ConvBNLayer(fluid.dygraph.Layer): padding=padding, groups=groups, weight_attr=ParamAttr( - initializer=fluid.initializer.Normal(0.0, 0.02) + initializer=paddle.nn.initializer.Normal(0.0, 0.02) ), bias_attr=False, ) @@ -49,11 +49,11 @@ class ConvBNLayer(fluid.dygraph.Layer): num_channels=ch_out, is_test=is_test, param_attr=ParamAttr( - initializer=fluid.initializer.Normal(0.0, 0.02), + initializer=paddle.nn.initializer.Normal(0.0, 0.02), regularizer=L2Decay(0.0), ), bias_attr=ParamAttr( - initializer=fluid.initializer.Constant(0.0), + initializer=paddle.nn.initializer.Constant(0.0), regularizer=L2Decay(0.0), 
), ) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py index 99d90a7f1eaa50eff871f9c50d3d30bd03f6f1f6..88581c023f3d96e67a7baf4c2c00790db78f4e5f 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py @@ -26,10 +26,8 @@ from paddle.nn import Embedding INF = 1.0 * 1e5 alpha = 0.6 -uniform_initializer = lambda x: fluid.initializer.UniformInitializer( - low=-x, high=x -) -zero_constant = fluid.initializer.Constant(0.0) +uniform_initializer = lambda x: paddle.nn.initializer.Uniform(low=-x, high=x) +zero_constant = paddle.nn.initializer.Constant(0.0) class BasicLSTMUnit(Layer): diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py index c76b4dba9cb8e5e701e338a233c52497da62e981..7f93c83b91433089d9b765ccd075ef48e753a07d 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py @@ -17,9 +17,9 @@ from functools import reduce import paddle import paddle.fluid as fluid import paddle.fluid.param_attr as attr +from paddle.common_ops_import import Variable from paddle.fluid.dygraph import Layer from paddle.jit.api import to_static -from paddle.static import Variable class EmbeddingLayer: @@ -48,7 +48,8 @@ class EmbeddingLayer: sparse=True, padding_idx=self.padding_idx, weight_attr=attr.ParamAttr( - name=self.name, initializer=fluid.initializer.Xavier() + name=self.name, + initializer=paddle.nn.initializer.XavierUniform(), ), ) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py index 99fe330c692410d8a9db25750866a158b169ffd9..d8c5956357827c767c7323baf3d5b26255203d8b 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py @@ -15,7 +15,7 @@ from functools import reduce import paddle -from paddle.static import Variable +from paddle.common_ops_import import Variable class EmbeddingLayer: diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_basic_api_transformation.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_basic_api_transformation.py index 1f589b8d6fc8b4b9fa97597f2efd536c0c461e34..e1aaeabd48b8fdec4da8a63d98bac3e216ff49e5 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_basic_api_transformation.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_basic_api_transformation.py @@ -118,10 +118,10 @@ def dyfunc_BilinearTensorProduct(layer1, layer2): 4, 1000, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.99) + initializer=paddle.nn.initializer.Constant(value=0.99) ), bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.5) + initializer=paddle.nn.initializer.Constant(value=0.5) ), ) @@ -138,10 +138,10 @@ def dyfunc_Conv2D(input): out_channels=2, kernel_size=3, weight_attr=paddle.ParamAttr( - initializer=fluid.initializer.Constant(value=0.99) + initializer=paddle.nn.initializer.Constant(value=0.99) ), bias_attr=paddle.ParamAttr( - 
initializer=fluid.initializer.Constant(value=0.5) + initializer=paddle.nn.initializer.Constant(value=0.5) ), ) res = conv2d(input) @@ -170,10 +170,10 @@ def dyfunc_Conv2DTranspose(input): 12, 12, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.99) + initializer=paddle.nn.initializer.Constant(value=0.99) ), bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.5) + initializer=paddle.nn.initializer.Constant(value=0.5) ), ) ret = conv2dTranspose(input) @@ -222,7 +222,7 @@ def dyfunc_Pool2D(input): def dyfunc_Prelu(input): prelu0 = paddle.nn.PReLU( weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(1.0) + initializer=paddle.nn.initializer.Constant(1.0) ), ) res = prelu0(input) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py index a6a9d7281208dca73a081983c3bced0bc452227b..55a93f769e25c2e617c51ff1cdec5ba44bb82e29 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py @@ -115,11 +115,11 @@ class Conv1D(fluid.dygraph.Layer): k = 1.0 / math.sqrt(fan_in) param_attr = ParamAttr( name=prefix + "_w", - initializer=fluid.initializer.Uniform(low=-k, high=k), + initializer=paddle.nn.initializer.Uniform(low=-k, high=k), ) bias_attr = ParamAttr( name=prefix + "_b", - initializer=fluid.initializer.Uniform(low=-k, high=k), + initializer=paddle.nn.initializer.Uniform(low=-k, high=k), ) self._conv2d = paddle.nn.Conv2D( diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call.py index c14631c35b6b41bc57f2062afa011d9282c2fc18..59df33e5aa9e7a56c76a2f33e61ac4bfe28d9b65 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call.py @@ -125,10 +125,10 @@ class MyConvLayer(fluid.dygraph.Layer): out_channels=2, kernel_size=3, weight_attr=paddle.ParamAttr( - initializer=fluid.initializer.Constant(value=0.99) + initializer=paddle.nn.initializer.Constant(value=0.99) ), bias_attr=paddle.ParamAttr( - initializer=fluid.initializer.Constant(value=0.5) + initializer=paddle.nn.initializer.Constant(value=0.5) ), ) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py index 0701750e3011a7ac5fbf96fbe251a70ac67bca36..b3556f08101973023a243d3702a29996adf5d0b5 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py @@ -352,7 +352,7 @@ class conv2d(fluid.dygraph.Layer): con_bias_attr = False else: con_bias_attr = fluid.ParamAttr( - initializer=fluid.initializer.Constant(0.0) + initializer=paddle.nn.initializer.Constant(0.0) ) self.conv = paddle.nn.Conv2D( @@ -362,9 +362,7 @@ class conv2d(fluid.dygraph.Layer): stride=stride, padding=padding, weight_attr=paddle.ParamAttr( - initializer=fluid.initializer.NormalInitializer( - loc=0.0, scale=stddev - ) + initializer=paddle.nn.initializer.Normal(mean=0.0, std=stddev) ), bias_attr=con_bias_attr, ) @@ -378,10 +376,10 @@ class conv2d(fluid.dygraph.Layer): use_global_stats=True, # set True to use deterministic algorithm num_channels=num_filters, param_attr=fluid.ParamAttr( - 
initializer=fluid.initializer.NormalInitializer(1.0, 0.02) + initializer=paddle.nn.initializer.Normal(1.0, 0.02) ), bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(0.0) + initializer=paddle.nn.initializer.Constant(0.0) ), trainable_statistics=True, ) @@ -421,7 +419,7 @@ class DeConv2D(fluid.dygraph.Layer): de_bias_attr = False else: de_bias_attr = fluid.ParamAttr( - initializer=fluid.initializer.Constant(0.0) + initializer=paddle.nn.initializer.Constant(0.0) ) self._deconv = paddle.nn.Conv2DTranspose( @@ -431,9 +429,7 @@ class DeConv2D(fluid.dygraph.Layer): stride=stride, padding=padding, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.NormalInitializer( - loc=0.0, scale=stddev - ) + initializer=paddle.nn.initializer.Normal(mean=0.0, std=stddev) ), bias_attr=de_bias_attr, ) @@ -444,10 +440,10 @@ class DeConv2D(fluid.dygraph.Layer): use_global_stats=True, # set True to use deterministic algorithm num_channels=num_filters, param_attr=fluid.ParamAttr( - initializer=fluid.initializer.NormalInitializer(1.0, 0.02) + initializer=paddle.nn.initializer.Normal(1.0, 0.02) ), bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(0.0) + initializer=paddle.nn.initializer.Constant(0.0) ), trainable_statistics=True, ) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_fallback.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_fallback.py new file mode 100644 index 0000000000000000000000000000000000000000..a2638f3a42d5cb209f276a7c32f235d45ebbedf8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_fallback.py @@ -0,0 +1,145 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
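+# These unit tests cover the dygraph-to-static fallback path: functions that
+# leave the Paddle API (e.g. via Tensor.numpy() plus numpy ops) cannot be
+# transcribed by paddle.jit.to_static, so the cases below check that such
+# code either falls back to eager execution or raises a clear error, and
+# that saving (or inspecting the static program of) such a layer fails.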
+ +from __future__ import print_function + +import unittest + +import numpy as np + +import paddle + + +def support_func(x): + return 2 * x + + +def unsupport_func(x): + x = 2 * x + t = x.numpy() + t = np.ones(t) + return paddle.to_tensor(t) + + +class SuppportNet(paddle.nn.Layer): + def __init__(self): + super().__init__() + + def forward(self, x): + return support_func(x) + + +class UnsuppportNet(paddle.nn.Layer): + def __init__(self): + super().__init__() + + def forward(self, x): + if self.training: + return unsupport_func(x) + else: + return unsupport_func(x - 1) + + +class TestFallback(unittest.TestCase): + def setUp(self): + self.x = paddle.to_tensor(2).astype('int') + + def tearDown(self): + pass + + def test_case_support(self): + output = paddle.jit.to_static(support_func)(self.x) + np.testing.assert_allclose(output.numpy(), 4) + + def test_case_func_fallback(self): + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = True + output = paddle.jit.to_static( + unsupport_func, build_strategy=build_strategy + )(self.x) + np.testing.assert_allclose(output.numpy(), unsupport_func(self.x)) + + def test_case_net_fallback(self): + s_net = SuppportNet() + u_net = UnsuppportNet() + np.testing.assert_allclose( + paddle.jit.to_static(s_net)(self.x).numpy(), 4 + ) + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = True + np.testing.assert_allclose( + paddle.jit.to_static(u_net, build_strategy=build_strategy)( + self.x + ).numpy(), + u_net(self.x).numpy(), + ) + + def test_case_net_error(self): + s_net = SuppportNet() + u_net = UnsuppportNet() + np.testing.assert_allclose( + paddle.jit.to_static(s_net)(self.x).numpy(), 4 + ) + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = False + with self.assertRaises(TypeError): + np.testing.assert_allclose( + paddle.jit.to_static(u_net, build_strategy=build_strategy)( + self.x + ).numpy(), + u_net(self.x).numpy(), + ) + + def test_case_training(self): + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = True + u_net = paddle.jit.to_static( + UnsuppportNet(), build_strategy=build_strategy + ) + u_net.eval() + np.testing.assert_allclose(u_net(self.x).numpy(), [1, 1]) + assert u_net.training is False, "Training must be false." + + def test_case_save_error(self): + """ + test the save will raise error. + """ + u_net = UnsuppportNet() + u_net = paddle.jit.to_static( + u_net, input_spec=[paddle.static.InputSpec(name='x', shape=[1])] + ) + with self.assertRaises(TypeError): + paddle.jit.save(u_net, path="model") + + def test_case_save_error_2(self): + """ + test the save will raise error. + """ + u_net = UnsuppportNet() + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = True + u_net = paddle.jit.to_static(u_net, build_strategy=build_strategy) + u_net(self.x) + with self.assertRaises(RuntimeError): + print(u_net.forward.main_program) + + def test_case_flag(self): + """ + test the flags is working. TODO: add a global flags. 
+ """ + pass + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py index 94e1dba49313a0dd2092ba2199a845e66cc106be..0d108b40406bafb4025c9e9c6ba019b10ee71efa 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py @@ -98,7 +98,7 @@ class BiGRU(fluid.dygraph.Layer): in_features=input_dim, out_features=grnn_hidden_dim * 3, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( + initializer=paddle.nn.initializer.Uniform( low=-init_bound, high=init_bound ), regularizer=fluid.regularizer.L2DecayRegularizer( @@ -111,7 +111,7 @@ class BiGRU(fluid.dygraph.Layer): size=grnn_hidden_dim, h_0=h_0, param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( + initializer=paddle.nn.initializer.Uniform( low=-init_bound, high=init_bound ), regularizer=fluid.regularizer.L2DecayRegularizer( @@ -124,7 +124,7 @@ class BiGRU(fluid.dygraph.Layer): in_features=input_dim, out_features=grnn_hidden_dim * 3, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( + initializer=paddle.nn.initializer.Uniform( low=-init_bound, high=init_bound ), regularizer=fluid.regularizer.L2DecayRegularizer( @@ -138,7 +138,7 @@ class BiGRU(fluid.dygraph.Layer): is_reverse=True, h_0=h_0, param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( + initializer=paddle.nn.initializer.Uniform( low=-init_bound, high=init_bound ), regularizer=fluid.regularizer.L2DecayRegularizer( @@ -375,7 +375,7 @@ class LexNet(fluid.dygraph.Layer): weight_attr=fluid.ParamAttr( learning_rate=self.emb_lr, name="word_emb", - initializer=fluid.initializer.Uniform( + initializer=paddle.nn.initializer.Uniform( low=-self.init_bound, high=self.init_bound ), ), @@ -415,7 +415,7 @@ class LexNet(fluid.dygraph.Layer): in_features=self.grnn_hidden_dim * 2, out_features=self.num_labels, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( + initializer=paddle.nn.initializer.Uniform( low=-self.init_bound, high=self.init_bound ), regularizer=fluid.regularizer.L2DecayRegularizer( diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py index d708dc1eadfedd2a14f9cc08782325eb2c4ee560..72f3dd7c33190af4805e68e16a4f3f525d996349 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py @@ -22,7 +22,6 @@ from predictor_utils import PredictorTools import paddle import paddle.fluid as fluid -from paddle.fluid.initializer import MSRA from paddle.fluid.param_attr import ParamAttr from paddle.jit.api import to_static from paddle.jit.translated_layer import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX @@ -61,7 +60,8 @@ class ConvBNLayer(fluid.dygraph.Layer): padding=padding, groups=num_groups, weight_attr=ParamAttr( - initializer=MSRA(), name=self.full_name() + "_weights" + initializer=paddle.nn.initializer.KaimingUniform(), + name=self.full_name() + "_weights", ), bias_attr=False, ) @@ -259,7 +259,8 @@ class MobileNetV1(fluid.dygraph.Layer): int(1024 * scale), class_dim, weight_attr=ParamAttr( - initializer=MSRA(), name=self.full_name() + "fc7_weights" + initializer=paddle.nn.initializer.KaimingUniform(), + name=self.full_name() + "fc7_weights", ), 
bias_attr=ParamAttr(name="fc7_offset"), ) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py index 53687ca6c1ea5b1cb59a28fc2fe462b198414430..1099f2dad667a18c587f0562f5bd85095de4e38b 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py @@ -49,26 +49,26 @@ class SimpleLSTMRNN(fluid.Layer): for i in range(self._num_layers): weight_1 = self.create_parameter( attr=fluid.ParamAttr( - initializer=fluid.initializer.UniformInitializer( + initializer=paddle.nn.initializer.Uniform( low=-self._init_scale, high=self._init_scale ) ), shape=[self._hidden_size * 2, self._hidden_size * 4], dtype="float32", - default_initializer=fluid.initializer.UniformInitializer( + default_initializer=paddle.nn.initializer.Uniform( low=-self._init_scale, high=self._init_scale ), ) self.weight_1_arr.append(self.add_parameter('w_%d' % i, weight_1)) bias_1 = self.create_parameter( attr=fluid.ParamAttr( - initializer=fluid.initializer.UniformInitializer( + initializer=paddle.nn.initializer.Uniform( low=-self._init_scale, high=self._init_scale ) ), shape=[self._hidden_size * 4], dtype="float32", - default_initializer=fluid.initializer.Constant(0.0), + default_initializer=paddle.nn.initializer.Constant(0.0), ) self.bias_arr.append(self.add_parameter('b_%d' % i, bias_1)) @@ -158,7 +158,7 @@ class PtbModel(fluid.Layer): sparse=False, weight_attr=fluid.ParamAttr( name='embedding_para', - initializer=fluid.initializer.UniformInitializer( + initializer=paddle.nn.initializer.Uniform( low=-init_scale, high=init_scale ), ), @@ -167,7 +167,7 @@ class PtbModel(fluid.Layer): attr=fluid.ParamAttr(), shape=[self.hidden_size, self.vocab_size], dtype="float32", - default_initializer=fluid.initializer.UniformInitializer( + default_initializer=paddle.nn.initializer.Uniform( low=-self.init_scale, high=self.init_scale ), ) @@ -175,7 +175,7 @@ class PtbModel(fluid.Layer): attr=fluid.ParamAttr(), shape=[self.vocab_size], dtype="float32", - default_initializer=fluid.initializer.UniformInitializer( + default_initializer=paddle.nn.initializer.Uniform( low=-self.init_scale, high=self.init_scale ), ) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py index 911ca2ec9016f122b0ed16abd506fda22d4aaccd..407e11349c2de1b52269cc6fa4dc6480c829522b 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py @@ -194,7 +194,7 @@ class ResNet(fluid.dygraph.Layer): self.pool2d_avg_output, class_dim, weight_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Uniform(-stdv, stdv) + initializer=paddle.nn.initializer.Uniform(-stdv, stdv) ), ) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py index e01b77af7655b534818b3f5bfbf8a8b51a97411d..723a7c742c198921cd9bf68785a7a85af0d8ebcd 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py @@ -131,7 +131,7 @@ class SqueezeExcitation(fluid.dygraph.Layer): num_channels, num_channels // reduction_ratio, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform(-stdv, stdv) + 
initializer=paddle.nn.initializer.Uniform(-stdv, stdv) ), ) stdv = 1.0 / math.sqrt(num_channels / 16.0 * 1.0) @@ -139,7 +139,7 @@ class SqueezeExcitation(fluid.dygraph.Layer): num_channels // reduction_ratio, num_channels, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform(-stdv, stdv) + initializer=paddle.nn.initializer.Uniform(-stdv, stdv) ), ) @@ -316,7 +316,7 @@ class SeResNeXt(fluid.dygraph.Layer): self.pool2d_avg_output, class_dim, weight_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Uniform(-stdv, stdv) + initializer=paddle.nn.initializer.Uniform(-stdv, stdv) ), ) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_word2vec.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_word2vec.py index 560132565907e5b3cd3e53a109236ed05760a127..5eb7cfc1080c76518b1d7e82cd74f448d5776e6a 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_word2vec.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_word2vec.py @@ -230,7 +230,7 @@ class SkipGram(fluid.dygraph.Layer): self.embedding_size, weight_attr=fluid.ParamAttr( name='embedding_para', - initializer=fluid.initializer.UniformInitializer( + initializer=paddle.nn.initializer.Uniform( low=-0.5 / self.embedding_size, high=0.5 / self.embedding_size, ), @@ -242,7 +242,7 @@ class SkipGram(fluid.dygraph.Layer): self.embedding_size, weight_attr=fluid.ParamAttr( name='embedding_out_para', - initializer=fluid.initializer.UniformInitializer( + initializer=paddle.nn.initializer.Uniform( low=-0.5 / self.embedding_size, high=0.5 / self.embedding_size, ), diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py index 18afc4a4ab9d5fad7cf81a261175c6db44a73388..3928c715a62883fc0ae1539c31b0582c8fee5a0e 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py @@ -62,10 +62,10 @@ class PrePostProcessLayer(Layer): paddle.nn.LayerNorm( normalized_shape=d_model, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(1.0) + initializer=paddle.nn.initializer.Constant(1.0) ), bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(0.0) + initializer=paddle.nn.initializer.Constant(0.0) ), ), ) @@ -295,7 +295,7 @@ class Embedder(Layer): vocab_size, emb_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Normal(0.0, emb_dim**-0.5) + initializer=paddle.nn.initializer.Normal(0.0, emb_dim**-0.5) ), ) @@ -330,7 +330,7 @@ class WrapEncoder(Layer): max_length, self.emb_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer( + initializer=paddle.nn.initializer.Assign( position_encoding_init(max_length, self.emb_dim) ), trainable=False, @@ -522,7 +522,7 @@ class WrapDecoder(Layer): max_length, self.emb_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer( + initializer=paddle.nn.initializer.Assign( position_encoding_init(max_length, self.emb_dim) ), trainable=False, diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/yolov3.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/yolov3.py index 10df42faa2373c1bc59d93d7d78fb4074f4fcc24..dbfc43cfc243223c47ec39234e803be0683af83f 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/yolov3.py +++ 
b/python/paddle/fluid/tests/unittests/dygraph_to_static/yolov3.py @@ -253,10 +253,10 @@ class YOLOv3(fluid.dygraph.Layer): stride=1, padding=0, weight_attr=ParamAttr( - initializer=fluid.initializer.Normal(0.0, 0.02) + initializer=paddle.nn.initializer.Normal(0.0, 0.02) ), bias_attr=ParamAttr( - initializer=fluid.initializer.Constant(0.0), + initializer=paddle.nn.initializer.Constant(0.0), regularizer=L2Decay(0.0), ), ), diff --git a/python/paddle/fluid/tests/unittests/fleet_heter_ps_training.py b/python/paddle/fluid/tests/unittests/fleet_heter_ps_training.py index 917beec752d2aa080614d04d8aeadd010a90308b..895f71c4858e999fb451bcafd21885fabb05131d 100644 --- a/python/paddle/fluid/tests/unittests/fleet_heter_ps_training.py +++ b/python/paddle/fluid/tests/unittests/fleet_heter_ps_training.py @@ -70,7 +70,7 @@ def net(batch_size=4, lr=0.01): size=[dnn_input_dim, dnn_layer_dims[0]], param_attr=fluid.ParamAttr( name="deep_embedding", - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), ), is_sparse=True, ) @@ -86,7 +86,7 @@ def net(batch_size=4, lr=0.01): size=[lr_input_dim, 1], param_attr=fluid.ParamAttr( name="wide_embedding", - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), ), is_sparse=True, ) @@ -99,7 +99,7 @@ def net(batch_size=4, lr=0.01): size=dim, activation="relu", weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), name='dnn-fc-%d' % i, ) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt index d456a86aa9d28cdbded3da5b40be6036cf5f9777..521e24779d0f05c718c01be7d5a3d5b403db2c4a 100755 --- a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt @@ -18,15 +18,6 @@ string(REPLACE ".py" "" TEST_TRT_CONVERTER "${TEST_TRT_CONVERTER}") if(NOT WITH_DISTRIBUTE) list(REMOVE_ITEM TEST_INFERENCE_IR_PASSES "test_delete_c_identity_op_pass") - list(REMOVE_ITEM TEST_INFERENCE_IR_PASSES - "test_trt_convert_preln_residual_bias") - list(REMOVE_ITEM TEST_TRT_IR_PASSES "test_trt_convert_preln_residual_bias") - list(REMOVE_ITEM TEST_TRT_CONVERTER "test_trt_convert_preln_residual_bias") - list(REMOVE_ITEM TEST_INFERENCE_IR_PASSES - "test_trt_convert_preln_residual_no_bias") - list(REMOVE_ITEM TEST_TRT_IR_PASSES "test_trt_convert_preln_residual_no_bias") - list(REMOVE_ITEM TEST_TRT_CONVERTER "test_trt_convert_preln_residual_no_bias") - list(REMOVE_ITEM TEST_INFERENCE_IR_PASSES "test_trt_convert_c_allreduce") list(REMOVE_ITEM TEST_TRT_IR_PASSES "test_trt_convert_c_allreduce") list(REMOVE_ITEM TEST_TRT_CONVERTER "test_trt_convert_c_allreduce") @@ -39,6 +30,7 @@ if(WIN32) "test_preln_groupnorm_act_fuse_pass") list(REMOVE_ITEM TEST_INFERENCE_IR_PASSES "test_element_groupnorm_act_fuse_pass") + list(REMOVE_ITEM TEST_INFERENCE_IR_PASSES "test_groupnorm_act_pass_fuse_pass") list(REMOVE_ITEM TEST_TRT_IR_PASSES "test_trt_convert_fused_token_prune") list(REMOVE_ITEM TEST_TRT_CONVERTER "test_trt_convert_fused_token_prune") endif() @@ -193,6 +185,9 @@ if(WITH_GPU AND TENSORRT_FOUND) PROPERTIES TIMEOUT 120) set_tests_properties(test_conv_elementwise_add_act_fuse_pass PROPERTIES TIMEOUT 120) + set_tests_properties(test_onednn_conv_bias_fuse_pass PROPERTIES TIMEOUT 300) + set_tests_properties(test_onednn_conv_concat_activation_fuse_pass + PROPERTIES 
TIMEOUT 300) set_tests_properties(test_flatten2_matmul_fuse_pass PROPERTIES TIMEOUT 240) set_tests_properties(test_squeeze2_matmul_fuse_pass PROPERTIES TIMEOUT 240) set_tests_properties(test_reshape2_matmul_fuse_pass PROPERTIES TIMEOUT 240) @@ -225,11 +220,13 @@ if(WITH_GPU AND TENSORRT_FOUND) PROPERTIES TIMEOUT 120) set_tests_properties(test_preln_groupnorm_act_fuse_pass PROPERTIES TIMEOUT 120) + set_tests_properties(test_groupnorm_act_pass_fuse_pass PROPERTIES TIMEOUT + 120) endif() endif() if(WITH_MKLDNN) - set_tests_properties(test_mkldnn_conv_elementwise_add_fuse_pass + set_tests_properties(test_onednn_conv_elementwise_add_fuse_pass PROPERTIES TIMEOUT 120) set_tests_properties(test_mkldnn_depthwise_conv_pass PROPERTIES TIMEOUT 120) set_tests_properties(test_onednn_reshape_transpose_matmul_fuse_pass diff --git a/python/paddle/fluid/tests/unittests/ir/inference/program_config.py b/python/paddle/fluid/tests/unittests/ir/inference/program_config.py index d48091f6c10c12b003ddff51e8b2d99c742d3697..1d2b442d2dd05b819c4e7beebba03a6c6febf2cb 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/program_config.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/program_config.py @@ -27,7 +27,6 @@ from paddle.fluid.framework import ( Operator, convert_np_dtype_to_dtype_, ) -from paddle.fluid.initializer import NumpyArrayInitializer from paddle.static.quantization import ( QuantizationFreezePass, QuantizationTransformPass, @@ -305,7 +304,7 @@ def create_fake_model(program_config): shape=tensor_config.shape, type=core.VarDesc.VarType.LOD_TENSOR, name=name, - initializer=NumpyArrayInitializer(tensor_config.data), + initializer=paddle.nn.initializer.Assign(tensor_config.data), ) in_vars = [] for name in sorted(save_var_map.keys()): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_groupnorm_act_pass_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_groupnorm_act_pass_fuse_pass.py new file mode 100644 index 0000000000000000000000000000000000000000..c9f821b21d4e93fb3e366f54c555278279cf2643 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_groupnorm_act_pass_fuse_pass.py @@ -0,0 +1,150 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
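+# Auto-scan test for the groupnorm_act_pass: a group_norm op followed by a
+# silu activation is expected to be fused into a single group_norm (with
+# silu) when run through the TensorRT half-precision config defined below.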
+ +import unittest +from functools import partial + +import hypothesis.strategies as st +import numpy as np +from auto_scan_test import PassAutoScanTest +from program_config import OpConfig, ProgramConfig, TensorConfig + +import paddle.inference as paddle_infer + + +class TestElementGNActPass(PassAutoScanTest): + # + # | fuse | + # groupnorm -> groupnorm(with_silu) + # | | + # silu + # | + # + # + + def sample_predictor_configs(self, program_config): + # trt dynamic_shape + config = self.create_trt_inference_config() + config.enable_tensorrt_engine( + max_batch_size=1, + workspace_size=102400, + min_subgraph_size=0, + precision_mode=paddle_infer.PrecisionType.Half, + use_static=False, + use_calib_mode=False, + ) + config.set_trt_dynamic_shape_info( + { + "input_data": [1, 160, 1, 1], + }, + { + "input_data": [4, 1280, 64, 64], + }, + { + "input_data": [1, 320, 32, 32], + }, + ) + yield config, ['group_norm'], (3e-3, 1e-3) + + def sample_program_config(self, draw): + axis = draw(st.sampled_from([0, -1])) + epsilon = draw(st.floats(min_value=0.0000001, max_value=0.001)) + batch_size = draw(st.integers(min_value=1, max_value=4)) + + groups = draw(st.sampled_from([4, 8, 16, 32])) + hw = draw(st.sampled_from([1, 8, 16, 32])) + channel = draw(st.sampled_from([320, 1280])) + + def generate_input(attrs): + return np.random.random( + [attrs[1]["batch_size"], *attrs[1]["input_dim"]] + ).astype(np.float32) + + def generate_weight(attrs): + return np.random.random(attrs[1]['input_dim'][0]).astype(np.float32) + + attrs = [ + { + 'epsilon': epsilon, + 'groups': groups, + }, + { + 'batch_size': batch_size, + 'input_dim': [channel, hw, hw], + }, + ] + + group_norm_op = OpConfig( + type="group_norm", + inputs={ + "X": ["input_data"], + "Bias": ["group_norm_bias"], + "Scale": ["group_norm_scale"], + }, + outputs={ + "Y": ["group_norm_output1"], + "Mean": ["group_norm_output2"], + "Variance": ["group_norm_output3"], + }, + attrs={ + "data_layout": "NCHW", + "groups": attrs[0]["groups"], + "epsilon": attrs[0]["epsilon"], + }, + ) + silu_op = OpConfig( + type="silu", + inputs={ + "X": ["group_norm_output1"], + }, + outputs={ + "Out": ["silu_output"], + }, + ) + + program_config = ProgramConfig( + ops=[ + group_norm_op, + silu_op, + ], + weights={ + "group_norm_bias": TensorConfig( + data_gen=partial(generate_weight, attrs) + ), + "group_norm_scale": TensorConfig( + data_gen=partial(generate_weight, attrs) + ), + }, + inputs={ + "input_data": TensorConfig( + data_gen=partial(generate_input, attrs) + ), + }, + outputs=["silu_output"], + ) + + return program_config + + def test(self): + self.run_and_statis( + quant=False, + max_examples=50, + passes=["groupnorm_act_pass"], + max_duration=250, + min_success_num=50, + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_v2_to_matmul_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_v2_to_matmul_pass.py index e4f38b45fa24adfd4614afc9658ca25b288dc37c..1ef1cb9d2af3792894e87d8b17a8cb8ad1c2caef 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_v2_to_matmul_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_v2_to_matmul_pass.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -103,12 +103,6 @@ class TestMapMatmulToMulPass(PassAutoScanTest): alpha=alpha, trans_x=transpose_X, trans_y=transpose_Y, - fused_reshape_Out=[], - fused_transpose_Out=[], - fused_reshape_X=[], - fused_reshape_Y=[], - fused_transpose_X=[], - fused_transpose_Y=[], ) ops = [ diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_v2_to_mul_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_v2_to_mul_pass.py index efd12c856592f1a5c65d0d506e6c114ed2f66cdb..129103d1bc6aa2ca479c3be264678fff4f20b7e9 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_v2_to_mul_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_v2_to_mul_pass.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -92,12 +92,6 @@ class TestMapMatmulToMulPass(PassAutoScanTest): alpha=alpha, trans_x=transpose_X, trans_y=transpose_Y, - fused_reshape_Out=[], - fused_transpose_Out=[], - fused_reshape_X=[], - fused_reshape_Y=[], - fused_transpose_X=[], - fused_transpose_Y=[], ) ops = [ diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_matmul_v2_scale_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_matmul_v2_scale_fuse_pass.py index 1bfb629cf965cccc64a9859c1033b0c4dbc8f100..dee099954626b5b0188045a3c933466529008290 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_matmul_v2_scale_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_matmul_v2_scale_fuse_pass.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -76,12 +76,6 @@ class TestMatmulV2ScaleFusePass(PassAutoScanTest): outputs={"Out": ["matmul_out"]}, trans_x=transpose_X, trans_y=transpose_Y, - fused_reshape_X=[], - fused_reshape_Y=[], - fused_transpose_X=[], - fused_transpose_Y=[], - fused_reshape_Out=[], - fused_transpose_Out=[], ) is_scale_tensor = draw(st.booleans()) if is_scale_tensor: diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_bias_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_bias_fuse_pass.py deleted file mode 100644 index 1f32de177e3ee4bb350e80da1d65d5276d448ffc..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_bias_fuse_pass.py +++ /dev/null @@ -1,204 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import numpy as np -from inference_pass_test import InferencePassTest - -import paddle -import paddle.fluid as fluid -from paddle.fluid.core import PassVersionChecker - - -# padding SAME -class ConvBiasMkldnnFusePassSamePadTest(InferencePassTest): - def setUp(self): - with fluid.program_guard(self.main_program, self.startup_program): - data = fluid.data( - name="data", shape=[-1, 3, 100, 100], dtype="float32" - ) - param_attr = fluid.ParamAttr( - initializer=fluid.initializer.Xavier(uniform=False), - learning_rate=0.001, - ) - conv_out = paddle.static.nn.conv2d( - input=data, - num_filters=3, - filter_size=3, - padding="SAME", - bias_attr=param_attr, - ) - - self.feeds = { - "data": np.random.random((1, 3, 100, 100)).astype("float32") - } - self.fetch_list = [conv_out] - self.enable_mkldnn = True - - def test_check_output(self): - use_gpu = False - self.check_output_with_option(use_gpu) - self.assertTrue( - PassVersionChecker.IsCompatible("conv_bias_mkldnn_fuse_pass") - ) - - -# padding VALID -class ConvBiasMkldnnFusePassValidPadTest(ConvBiasMkldnnFusePassSamePadTest): - def setUp(self): - with fluid.program_guard(self.main_program, self.startup_program): - data = fluid.data( - name="data", shape=[-1, 3, 100, 100], dtype="float32" - ) - param_attr = fluid.ParamAttr( - initializer=fluid.initializer.Xavier(uniform=False), - learning_rate=0.001, - ) - conv_out = paddle.static.nn.conv2d( - input=data, - num_filters=3, - filter_size=3, - padding="VALID", - bias_attr=param_attr, - ) - - self.feeds = { - "data": np.random.random((1, 3, 100, 100)).astype("float32") - } - self.fetch_list = [conv_out] - self.enable_mkldnn = True - - -# padding EXPLICT NUMBER -class ConvBiasMkldnnFusePassExplictPadTest(ConvBiasMkldnnFusePassSamePadTest): - def setUp(self): - with fluid.program_guard(self.main_program, self.startup_program): - data = fluid.data( - name="data", shape=[-1, 3, 100, 100], dtype="float32" - ) - param_attr = fluid.ParamAttr( - initializer=fluid.initializer.Xavier(uniform=False), - learning_rate=0.001, - ) - conv_out = paddle.static.nn.conv2d( - input=data, - num_filters=3, - filter_size=3, - padding=[2, 4, 6, 8], - bias_attr=param_attr, - ) - - self.feeds = { - "data": np.random.random((1, 3, 100, 100)).astype("float32") - } - self.fetch_list = [conv_out] - self.enable_mkldnn = True - - -class ConvBiasMkldnnFusePassGroupTest(ConvBiasMkldnnFusePassSamePadTest): - def setUp(self): - with fluid.program_guard(self.main_program, self.startup_program): - data = fluid.data( - name="data", shape=[-1, 3, 100, 100], dtype="float32" - ) - param_attr = fluid.ParamAttr( - initializer=fluid.initializer.Xavier(uniform=False), - learning_rate=0.001, - ) - conv_out = paddle.static.nn.conv2d( - input=data, - num_filters=3, - filter_size=3, - padding="VALID", - groups=3, - bias_attr=param_attr, - use_cudnn=False, - act="softmax", - data_format="NCHW", - ) - - self.feeds = { - "data": np.random.random((1, 3, 100, 100)).astype("float32") - } - self.fetch_list = [conv_out] - self.enable_mkldnn = True - - -class ConvBiasMkldnnFusePassDialtionsGroupsTest( - ConvBiasMkldnnFusePassSamePadTest -): - def setUp(self): - with fluid.program_guard(self.main_program, self.startup_program): - data = fluid.data( - name="data", shape=[-1, 3, 100, 100], dtype="float32" - ) - param_attr = fluid.ParamAttr( - initializer=fluid.initializer.Xavier(uniform=False), - learning_rate=0.001, - ) - conv_out = paddle.static.nn.conv2d( - input=data, - num_filters=3, - filter_size=3, - padding="VALID", - dilation=2, - 
groups=3, - bias_attr=param_attr, - use_cudnn=False, - act="softmax", - data_format="NCHW", - ) - - self.feeds = { - "data": np.random.random((1, 3, 100, 100)).astype("float32") - } - self.fetch_list = [conv_out] - self.enable_mkldnn = True - - -class ConvTransposeMkldnnFusePassDialtionsGroupsTest(InferencePassTest): - def setUp(self): - with fluid.program_guard(self.main_program, self.startup_program): - data = fluid.data(name="data", shape=[-1, 3, 5, 5], dtype="float32") - param_attr = fluid.ParamAttr( - initializer=fluid.initializer.Xavier(uniform=False), - learning_rate=0.001, - ) - conv_out = paddle.static.nn.conv2d_transpose( - input=data, - num_filters=3, - filter_size=3, - padding="SAME", - dilation=1, - bias_attr=param_attr, - use_cudnn=False, - ) - - self.feeds = {"data": np.random.random((1, 3, 5, 5)).astype("float32")} - self.fetch_list = [conv_out] - self.enable_mkldnn = True - - def test_check_output(self): - use_gpu = False - self.check_output_with_option(use_gpu) - self.assertTrue( - PassVersionChecker.IsCompatible( - "conv_transpose_bias_mkldnn_fuse_pass" - ) - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_activation_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_activation_fuse_pass.py index 964aad16b971107ca92a1d41355263c1b7030a60..2b64a6be86f74099eeaf76fa81f4ba7182634273 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_activation_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_activation_fuse_pass.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the 'License'); # you may not use this file except in compliance with the License. @@ -146,7 +146,7 @@ class TestMatmulActivationMkldnnFusePass(PassAutoScanTest): 'operator_scale_onednn_fuse_pass', ], ) - yield config, ['matmul'], (1e-5, 1e-5) + yield config, ['fused_matmul'], (1e-5, 1e-5) def test(self): self.run_and_statis( diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_activation_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_activation_fuse_pass.py index 0e0c542be632c3546640d885bb165579af7c756d..3d99e057d79217a59caa9ce92abeaa891d747324 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_activation_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_activation_fuse_pass.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the 'License'); # you may not use this file except in compliance with the License. 
@@ -137,7 +137,7 @@ class TestMatmulElementwiseAddActivationMkldnnFusePass(PassAutoScanTest): 'matmul_activation_mkldnn_fuse_pass', ], ) - yield config, ['matmul'], (1e-5, 1e-5) + yield config, ['fused_matmul'], (1e-5, 1e-5) def test(self): self.run_and_statis( diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_fuse_pass.py index b359d4a4c93c43056550e7cd0f0654502451c4ee..c84c9f02ce0378062540399902fa8656c5a9ac4b 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_fuse_pass.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the 'License'); # you may not use this file except in compliance with the License. @@ -76,7 +76,7 @@ class TestMatmulElementwiseAddMkldnnFusePass(PassAutoScanTest): config = self.create_inference_config( use_mkldnn=True, passes=['matmul_elementwise_add_mkldnn_fuse_pass'] ) - yield config, ['matmul'], (1e-5, 1e-5) + yield config, ['fused_matmul'], (1e-5, 1e-5) def test(self): self.run_and_statis( diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_activation_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_activation_fuse_pass.py index 3694041af10a85698c1de22dfe3af284308634a2..0b643b9061d04e1ab23ae4bcdec3222bb8d93051 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_activation_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_activation_fuse_pass.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -150,7 +150,7 @@ class TestMatmulv2ActivationMkldnnFusePass(PassAutoScanTest): 'operator_scale_onednn_fuse_pass', ], ) - yield config, ['matmul_v2'], (1e-5, 1e-5) + yield config, ['fused_matmul'], (1e-5, 1e-5) def test(self): self.run_and_statis( diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_elementwise_add_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_elementwise_add_fuse_pass.py index f81a0cce52d29f471613785dcdddf131462acd56..e667c10fe6a0359cde7b40379531f2ac3993a721 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_elementwise_add_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_elementwise_add_fuse_pass.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the 'License'); # you may not use this file except in compliance with the License. 
@@ -87,7 +87,7 @@ class TestMatmulV2ElementwiseAddMkldnnFusePass(PassAutoScanTest): def sample_predictor_configs(self, program_config): config = self.create_inference_config(use_mkldnn=True) - yield config, ['matmul_v2'], (1e-5, 1e-5) + yield config, ['fused_matmul'], (1e-5, 1e-5) def test(self): self.run_and_statis( diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py index 2d368433edc3cafa3a8b661f633493bb44ab0af5..45b17d59aeba5f676d216ad39610eae91a17bce0 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -90,12 +90,6 @@ class TestMatmulv2TransposeReshapeMkldnnFusePass(PassAutoScanTest): attrs={ "trans_x": transpose_X, "trans_y": transpose_Y, - "fused_reshape_X": [], - "fused_reshape_Y": [], - "fused_transpose_X": [], - "fused_transpose_Y": [], - "fused_reshape_Out": [], - "fused_transpose_Out": [], }, ) @@ -135,17 +129,8 @@ class TestMatmulv2TransposeReshapeMkldnnFusePass(PassAutoScanTest): return program_config def sample_predictor_configs(self, program_config): - # gpu_cpu_map_matmul_v2_to_matmul_pass will affect the type of final fused op - fused_op = "matmul_v2" - input1_dim1 = program_config.inputs["input_data1"].shape[0] - input2_dim1 = program_config.inputs["input_data2"].shape[0] - input1_dim2 = program_config.inputs["input_data1"].shape[1] - input2_dim2 = program_config.inputs["input_data2"].shape[1] - if input1_dim1 == input2_dim1 and input1_dim2 == input2_dim2: - fused_op = "matmul" - config = self.create_inference_config(use_mkldnn=True) - yield config, [fused_op], (1e-5, 1e-5) + yield config, ['fused_matmul'], (1e-5, 1e-5) def test(self): self.run_and_statis( diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmulv2_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmulv2_op.py index 2fd966710b488fba9c9bb7fd66e37dfb9eb47cbe..9a72e806b322682650eb353febdc7ca430e6a2e5 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmulv2_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmulv2_op.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -93,12 +93,6 @@ class TestMkldnnMatmulv2Op(MkldnnAutoScanTest): attrs={ "trans_x": kwargs["transpose_X"], "trans_y": kwargs["transpose_Y"], - "fused_reshape_X": [], - "fused_reshape_Y": [], - "fused_transpose_X": [], - "fused_transpose_Y": [], - "fused_reshape_Out": [], - "fused_transpose_Out": [], }, ) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_conv_bias_fuse_pass.py old mode 100755 new mode 100644 similarity index 58% rename from python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py rename to python/paddle/fluid/tests/unittests/ir/inference/test_onednn_conv_bias_fuse_pass.py index 7efea770bfa2a5c8a034777eab105d5028fb224f..7a94a12888286573c07524bd0a63416c6bd42b22 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_conv_bias_fuse_pass.py @@ -19,67 +19,43 @@ from auto_scan_test import PassAutoScanTest from program_config import OpConfig, ProgramConfig, TensorConfig -class TestConvBiasMkldnnFusePass(PassAutoScanTest): - r""" - x_var f_var(persistable) - \ / - conv2d - | - conv2d_var bias_var(persistable) - \ / - elementwise_add - | - elementwise_add_var - """ - +class TestConvBiasOneDNNFusePass(PassAutoScanTest): def sample_predictor_configs(self, program_config): - # MKLDNN config = self.create_inference_config(use_gpu=False) config.enable_mkldnn() - yield config, ["fused_conv2d"], (1e-4, 1e-5) + yield config, ['fused_conv2d'], (1e-4, 1e-5) def is_program_valid(self, prog_config): - paddings = prog_config.ops[0].attrs["paddings"] - strides = prog_config.ops[0].attrs["strides"] - groups = prog_config.ops[0].attrs["groups"] - padding_algorithm = prog_config.ops[0].attrs["padding_algorithm"] - dilations = prog_config.ops[0].attrs["dilations"] - data_format = prog_config.ops[0].attrs["data_format"] - filter_shape = prog_config.weights["filter"].shape - input_shape = prog_config.inputs["input_x"].shape - if padding_algorithm == "VALID": - if ( - (input_shape[2] - (dilations[0] * (filter_shape[2] - 1) + 1)) - / strides[0] - + 1 - ) <= 1 or ( - (input_shape[3] - (dilations[1] * (filter_shape[3] - 1) + 1)) - / strides[1] - + 1 - ) <= 1: + paddings = prog_config.ops[0].attrs['paddings'] + groups = prog_config.ops[0].attrs['groups'] + padding_algorithm = prog_config.ops[0].attrs['padding_algorithm'] + dilations = prog_config.ops[0].attrs['dilations'] + data_format = prog_config.ops[0].attrs['data_format'] + filter_shape = prog_config.weights['filter'].shape + input_shape = prog_config.inputs['input_x'].shape + height = input_shape[data_format.index('H')] + width = input_shape[data_format.index('W')] + if padding_algorithm == 'VALID': + if (height - (dilations[0] * (filter_shape[2] - 1) + 1) <= 0) or ( + width - (dilations[1] * (filter_shape[3] - 1) + 1) <= 0 + ): return False - if padding_algorithm == "EXPLICIT": + if padding_algorithm == 'EXPLICIT': if ( - ( - input_shape[2] - + paddings[0] - + paddings[1] - - (dilations[0] * (filter_shape[2] - 1) + 1) - ) - / strides[0] - + 1 - ) <= 1 or ( - ( - input_shape[3] - + paddings[2] - + paddings[3] - - (dilations[1] * (filter_shape[3] - 1) + 1) - ) - / strides[1] - + 1 - ) <= 1: + height + + paddings[0] + + paddings[1] + - (dilations[0] * (filter_shape[2] - 1) + 1) + <= 0 + ) or ( + width + + paddings[2] + + paddings[3] + - (dilations[1] * (filter_shape[3] - 1) + 1) + <= 0 + ): return False - if 
data_format == "NCHW": + if data_format == 'NCHW': if input_shape[1] != filter_shape[1] * groups: return False if filter_shape[0] % groups != 0: @@ -101,7 +77,7 @@ class TestConvBiasMkldnnFusePass(PassAutoScanTest): x_shape[1] = draw(st.integers(min_value=5, max_value=10)) # 2. Generate legal attr:data_format of conv2d - data_format = draw(st.sampled_from(["NCHW", "NHWC"])) + data_format = draw(st.sampled_from(['NCHW', 'NHWC'])) # 3. Generate legal shape of input:Y of conv2d f_shape = draw( @@ -109,7 +85,7 @@ class TestConvBiasMkldnnFusePass(PassAutoScanTest): st.integers(min_value=1, max_value=4), min_size=4, max_size=4 ) ) - if data_format == "NCHW": + if data_format == 'NCHW': f_shape[1] = x_shape[1] else: f_shape[1] = x_shape[3] @@ -122,7 +98,7 @@ class TestConvBiasMkldnnFusePass(PassAutoScanTest): ) # 5. Generate legal attr:padding_algorithm of conv2d - padding_algorithm = draw(st.sampled_from(["EXPLICIT", "SAME", "VALID"])) + padding_algorithm = draw(st.sampled_from(['EXPLICIT', 'SAME', 'VALID'])) # 6. Generate legal attr:padding of conv2d padding = draw( @@ -146,7 +122,7 @@ class TestConvBiasMkldnnFusePass(PassAutoScanTest): # 10. Generate legal shape of attr:axis of elementwise_add axis = 1 - if data_format == "NCHW": + if data_format == 'NCHW': axis = 1 else: axis = 3 @@ -156,36 +132,36 @@ class TestConvBiasMkldnnFusePass(PassAutoScanTest): inputs = dict() weights = dict() use_mkldnn = None - conv_type = "conv2d" + conv_type = 'conv2d' if draw(st.booleans()): conv_bias_shape = [f_shape[0]] - conv_type = "fused_conv2d" + conv_type = 'fused_conv2d' inputs = { - "Input": ["input_x"], - "Filter": ["filter"], - "Bias": ["conv_bias"], + 'Input': ['input_x'], + 'Filter': ['filter'], + 'Bias': ['conv_bias'], } weights = { - "filter": TensorConfig(shape=f_shape), - "bias": TensorConfig(shape=bias_shape), - "conv_bias": TensorConfig(shape=conv_bias_shape), + 'filter': TensorConfig(shape=f_shape), + 'bias': TensorConfig(shape=bias_shape), + 'conv_bias': TensorConfig(shape=conv_bias_shape), } use_mkldnn = True else: inputs = { - "Input": ["input_x"], - "Filter": ["filter"], + 'Input': ['input_x'], + 'Filter': ['filter'], } weights = { - "filter": TensorConfig(shape=f_shape), - "bias": TensorConfig(shape=bias_shape), + 'filter': TensorConfig(shape=f_shape), + 'bias': TensorConfig(shape=bias_shape), } use_mkldnn = False conv2d_op = OpConfig( conv_type, inputs=inputs, - outputs={"Output": ["conv2d_out"]}, + outputs={'Output': ['conv2d_out']}, strides=strides, padding_algorithm=padding_algorithm, paddings=padding, @@ -196,9 +172,9 @@ class TestConvBiasMkldnnFusePass(PassAutoScanTest): ) add_op = OpConfig( - "elementwise_add", - inputs={"X": ["conv2d_out"], "Y": ["bias"]}, - outputs={"Out": ["add_out"]}, + 'elementwise_add', + inputs={'X': ['conv2d_out'], 'Y': ['bias']}, + outputs={'Out': ['add_out']}, axis=axis, ) @@ -207,16 +183,16 @@ class TestConvBiasMkldnnFusePass(PassAutoScanTest): program_config = ProgramConfig( ops=ops, weights=weights, - inputs={"input_x": TensorConfig(shape=x_shape)}, - outputs=ops[-1].outputs["Out"], + inputs={'input_x': TensorConfig(shape=x_shape)}, + outputs=ops[-1].outputs['Out'], ) return program_config def test(self): self.run_and_statis( - quant=False, max_examples=350, passes=["conv_bias_mkldnn_fuse_pass"] + quant=False, passes=['conv_bias_mkldnn_fuse_pass'], max_examples=130 ) -if __name__ == "__main__": +if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_concat_relu_mkldnn_fuse_pass.py 
b/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_conv_concat_activation_fuse_pass.py similarity index 97% rename from python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_concat_relu_mkldnn_fuse_pass.py rename to python/paddle/fluid/tests/unittests/ir/inference/test_onednn_conv_concat_activation_fuse_pass.py index e49a9aee68005d06b7722d08877ea33a5bd6e978..d58993b4e6534ac6de1fc85724a854bb337d550c 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_concat_relu_mkldnn_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_conv_concat_activation_fuse_pass.py @@ -21,7 +21,7 @@ from auto_scan_test import PassAutoScanTest from program_config import OpConfig, ProgramConfig, TensorConfig -class TestConvConcatActivationMkldnnFusePass(PassAutoScanTest): +class TestOneDNNConvConcatActivationFusePass(PassAutoScanTest): def sample_program_config(self, draw): data_format = draw(st.sampled_from(['NCHW', 'NHWC'])) dilations = draw(st.sampled_from([[2, 2]])) @@ -162,7 +162,9 @@ class TestConvConcatActivationMkldnnFusePass(PassAutoScanTest): def test(self): self.run_and_statis( - quant=False, passes=['conv_activation_mkldnn_fuse_pass'] + quant=False, + passes=['conv_activation_mkldnn_fuse_pass'], + max_examples=50, ) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_elementwise_add_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_conv_elementwise_add_fuse_pass.py similarity index 59% rename from python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_elementwise_add_fuse_pass.py rename to python/paddle/fluid/tests/unittests/ir/inference/test_onednn_conv_elementwise_add_fuse_pass.py index 7a49d9b58dfcdf2eec211d96f805854c54326657..b8c847460e0399ed5f19dda9f600b72b9d52949b 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_elementwise_add_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_conv_elementwise_add_fuse_pass.py @@ -21,22 +21,21 @@ from auto_scan_test import PassAutoScanTest from program_config import OpConfig, ProgramConfig, TensorConfig -# the two inputs of elementwise_add are tensor -class TestConvElementwiseAddMkldnnFusePass(PassAutoScanTest): +class TestOneDNNConvElementwiseAddFusePass(PassAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: attrs = [ program_config.ops[i].attrs for i in range(len(program_config.ops)) ] - if attrs[1]['data_format'] == "NHWC" and attrs[3]['axis'] == 0: + if attrs[1]['data_format'] == 'NHWC' and attrs[3]['axis'] == 0: return False - if attrs[1]['data_format'] == "NCHW" and attrs[3]['axis'] == -1: + if attrs[1]['data_format'] == 'NCHW' and attrs[3]['axis'] == -1: return False return True def sample_program_config(self, draw): - data_format = draw(st.sampled_from(["NCHW", "NHWC"])) + data_format = draw(st.sampled_from(['NCHW', 'NHWC'])) dilations = draw(st.sampled_from([[1, 1], [2, 2], [1, 2]])) - padding_algorithm = draw(st.sampled_from(["EXPLICIT", "SAME", "VALID"])) + padding_algorithm = draw(st.sampled_from(['EXPLICIT', 'SAME', 'VALID'])) groups = draw(st.sampled_from([1, 2, 4])) paddings = draw(st.sampled_from([[0, 3], [1, 1], [1, 2, 3, 4]])) strides = draw(st.sampled_from([[1, 1], [2, 2], [1, 2]])) @@ -44,7 +43,7 @@ class TestConvElementwiseAddMkldnnFusePass(PassAutoScanTest): batch_size = draw(st.integers(min_value=1, max_value=4)) def generate_input(): - if data_format == "NCHW": + if data_format == 'NCHW': return 
np.random.random([batch_size, 48, 64, 64]).astype( np.float32 ) @@ -59,44 +58,44 @@ class TestConvElementwiseAddMkldnnFusePass(PassAutoScanTest): ) relu_op = OpConfig( - type="relu", - inputs={"X": ["input_data"]}, - outputs={"Out": ["relu_out"]}, + type='relu', + inputs={'X': ['input_data']}, + outputs={'Out': ['relu_out']}, attrs={}, ) conv2d_op1 = OpConfig( - type="conv2d", - inputs={"Input": ["relu_out"], "Filter": ["conv_weight1"]}, - outputs={"Output": ["conv_output1"]}, + type='conv2d', + inputs={'Input': ['relu_out'], 'Filter': ['conv_weight1']}, + outputs={'Output': ['conv_output1']}, attrs={ - "data_format": data_format, - "dilations": dilations, - "padding_algorithm": padding_algorithm, - "groups": groups, - "paddings": paddings, - "strides": strides, + 'data_format': data_format, + 'dilations': dilations, + 'padding_algorithm': padding_algorithm, + 'groups': groups, + 'paddings': paddings, + 'strides': strides, }, ) conv2d_op2 = OpConfig( - type="conv2d", - inputs={"Input": ["input_data"], "Filter": ["conv_weight2"]}, - outputs={"Output": ["conv_output2"]}, + type='conv2d', + inputs={'Input': ['input_data'], 'Filter': ['conv_weight2']}, + outputs={'Output': ['conv_output2']}, attrs={ - "data_format": data_format, - "dilations": dilations, - "padding_algorithm": padding_algorithm, - "groups": groups, - "paddings": paddings, - "strides": strides, + 'data_format': data_format, + 'dilations': dilations, + 'padding_algorithm': padding_algorithm, + 'groups': groups, + 'paddings': paddings, + 'strides': strides, }, ) elt_op = OpConfig( - type="elementwise_add", - inputs={"X": ["conv_output1"], "Y": ["conv_output2"]}, - outputs={"Out": ["elementwise_output"]}, + type='elementwise_add', + inputs={'X': ['conv_output1'], 'Y': ['conv_output2']}, + outputs={'Out': ['elementwise_output']}, attrs={'axis': axis}, ) @@ -105,26 +104,26 @@ class TestConvElementwiseAddMkldnnFusePass(PassAutoScanTest): program_config = ProgramConfig( ops=model_net, weights={ - "conv_weight1": TensorConfig(data_gen=partial(generate_weight)), - "conv_weight2": TensorConfig(data_gen=partial(generate_weight)), + 'conv_weight1': TensorConfig(data_gen=partial(generate_weight)), + 'conv_weight2': TensorConfig(data_gen=partial(generate_weight)), }, inputs={ - "input_data": TensorConfig(data_gen=partial(generate_input)) + 'input_data': TensorConfig(data_gen=partial(generate_input)) }, - outputs=["elementwise_output"], + outputs=['elementwise_output'], ) return program_config def sample_predictor_configs(self, program_config): config = self.create_inference_config(use_mkldnn=True) - yield config, ["relu", "conv2d", "fused_conv2d"], (1e-5, 1e-5) + yield config, ['relu', 'conv2d', 'fused_conv2d'], (1e-5, 1e-5) def test(self): self.run_and_statis( - quant=False, passes=["conv_elementwise_add_mkldnn_fuse_pass"] + quant=False, passes=['conv_elementwise_add_mkldnn_fuse_pass'] ) -if __name__ == "__main__": +if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_matmul_transpose_reshape_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_matmul_transpose_reshape_fuse_pass.py index 85cdfd314a7cf456fc938b8d602fa748490080fc..d7ad3f64162b30298081d3a1e8b98e3ed7d01546 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_matmul_transpose_reshape_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_matmul_transpose_reshape_fuse_pass.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -116,7 +116,7 @@ class TestOneDNNMatmulTransposeReshapeFusePass(PassAutoScanTest): def sample_predictor_configs(self, program_config): config = self.create_inference_config(use_mkldnn=True) - yield config, ['matmul'], (1e-5, 1e-5) + yield config, ['fused_matmul'], (1e-5, 1e-5) def test(self): self.run_and_statis( diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_reshape_transpose_matmul_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_reshape_transpose_matmul_fuse_pass.py index 2f9051fe16b5c34546a1eb35e9b85ab725918d8c..ef5098b00704e2fb059a29b6bb241f3af87265cf 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_reshape_transpose_matmul_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_reshape_transpose_matmul_fuse_pass.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -154,7 +154,7 @@ class TestOneDNNReshapeTransposeMatmulFusePass(PassAutoScanTest): def sample_predictor_configs(self, program_config): config = self.create_inference_config(use_mkldnn=True) - yield config, ['matmul'], (1e-5, 1e-5) + yield config, ['fused_matmul'], (1e-5, 1e-5) def test(self): self.run_and_statis( diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_preln_residual_bias.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_preln_residual_bias.py index 9e17b83ab9c1ef9b931056662b4d0d181e0fab0f..a45ddfcae189e2b7814ee363902c16cfb1535268 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_preln_residual_bias.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_preln_residual_bias.py @@ -158,11 +158,24 @@ class TrtConvertSkipLayernormTest(TrtLayerAutoScanTest): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): - return 1, 4 + if dynamic_shape: + return 1, 4 + else: + return 0, 5 attrs = [ program_config.ops[i].attrs for i in range(len(program_config.ops)) ] + # for static_shape, fall back to fluid fused op + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False + ), 1e-2 # atol=1e-2 while rtol is 1e-8 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False + ), 1e-2 # atol=1e-2 while rtol is 1e-8 # just support dynamic_shape generate_dynamic_shape(attrs) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_preln_residual_no_bias.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_preln_residual_no_bias.py index aef2142bf3e8ea08cd8369f7a22b75b26d130f31..fd3bdb64c7ede23db826ed60cb329bd9199f6668 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_preln_residual_no_bias.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_preln_residual_no_bias.py @@ -146,12 +146,26 @@ class TrtConvertSkipLayernormTest(TrtLayerAutoScanTest): 
self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): - return 1, 4 + if dynamic_shape: + return 1, 4 + else: + return 0, 5 attrs = [ program_config.ops[i].attrs for i in range(len(program_config.ops)) ] + # for static_shape, fall back to fluid fused op + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False + ), 1e-2 # atol=1e-2 while rtol is 1e-8 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False + ), 1e-2 # atol=1e-2 while rtol is 1e-8 + # just support dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py index da9a86725c008bef4673b7792a14d1bc688b9944..0e7eb56da9133d2ca5c4ff8fb194e53d00ec055f 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py @@ -180,11 +180,11 @@ class TensorRTSubgraphPassInstanceNormTest(InferencePassTest): ) param_attr = fluid.ParamAttr( name='instance_norm_w', - initializer=fluid.initializer.Constant(value=1.0), + initializer=paddle.nn.initializer.Constant(value=1.0), ) bias_attr = fluid.ParamAttr( name='instance_norm_b', - initializer=fluid.initializer.Constant(value=0.0), + initializer=paddle.nn.initializer.Constant(value=0.0), ) out = paddle.static.nn.instance_norm( input=data, param_attr=param_attr, bias_attr=bias_attr diff --git a/python/paddle/fluid/tests/unittests/ir/test_ir_preln_residual_bias_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/test_ir_preln_residual_bias_fuse_pass.py index 8f74ceebb658671d4db477990533190a73bc2a83..c66ee864532883f1b1fc48aae2941824da5347b1 100644 --- a/python/paddle/fluid/tests/unittests/ir/test_ir_preln_residual_bias_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/test_ir_preln_residual_bias_fuse_pass.py @@ -38,7 +38,7 @@ class PrelnResidualBiasFusePassTest(PassTest): self.fetch_list = [out, elementwise_out] self.pass_names = "preln_residual_bias_fuse_pass" - self.fused_op_type = "preln_residual_bias" + self.fused_op_type = "fused_bias_dropout_residual_layer_norm" self.num_fused_ops = 1 # self.graph_attrs = { # "embedding_eltwise_layernorm_fuse_pass_flag": True, @@ -72,7 +72,7 @@ class PrelnResidualBiasFusePassNoBiasTest(PassTest): self.fetch_list = [out, elementwise_out] self.pass_names = "preln_residual_bias_fuse_pass" - self.fused_op_type = "preln_residual_bias" + self.fused_op_type = "fused_bias_dropout_residual_layer_norm" self.num_fused_ops = 1 def test_check_program(self): diff --git a/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu_v2.py b/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu_v2.py index 17672d668d38a4aebebdec6c7400856b173798a1..590ebbf63efa5fd986c93d7ad0cdc0ebf636ab0e 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu_v2.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu_v2.py @@ -114,11 +114,11 @@ class TestBatchNorm(unittest.TestCase): shape[1], is_test=is_test, param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(1.0), + initializer=paddle.nn.initializer.Constant(1.0), trainable=False, ), bias_attr=fluid.ParamAttr( - 
initializer=fluid.initializer.Constant(0.0), + initializer=paddle.nn.initializer.Constant(0.0), trainable=False, ), trainable_statistics=trainable_statistics, @@ -262,7 +262,7 @@ class TestBatchNormUseGlobalStats(unittest.TestCase): net1 = paddle.nn.BatchNorm( 6, param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(1.0) + initializer=paddle.nn.initializer.Constant(1.0) ), use_global_stats=self.use_global_stats, trainable_statistics=self.trainable_statistics, diff --git a/python/paddle/fluid/tests/unittests/npu/test_adam_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_adam_op_npu.py index 5bf239b5bc77df819a8c2a4fc876b13f8e3ff38b..646466e9504d4291e59798d955171e51e28fad9f 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_adam_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_adam_op_npu.py @@ -322,12 +322,12 @@ class TestNetWithEpsilonTensor(unittest.TestCase): label_np = np.random.randint(2, size=(2, 1)).astype('int64') weight_attr1 = paddle.ParamAttr( name="weight1", - initializer=fluid.initializer.Constant(value=1.0), + initializer=paddle.nn.initializer.Constant(value=1.0), trainable=True, ) weight_attr2 = paddle.ParamAttr( name="weight2", - initializer=fluid.initializer.Constant(value=2.0), + initializer=paddle.nn.initializer.Constant(value=2.0), trainable=True, ) clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) diff --git a/python/paddle/fluid/tests/unittests/npu/test_run_program_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_run_program_op_npu.py index 69769bbdc1f08e36b0b82a931f15f277c667c209..5369f4d410bda2dc64b093b753452279ca7c8e70 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_run_program_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_run_program_op_npu.py @@ -298,7 +298,7 @@ class TestRunProgramOpWithFC(RunProgramNPUOpTest): weight_attr = fluid.ParamAttr( name=self.input_names['Params'][0], learning_rate=0.5, - initializer=fluid.initializer.NumpyArrayInitializer( + initializer=paddle.nn.initializer.Assign( self.inputs['Params'][self.input_names['Params'][0]] ), trainable=True, @@ -306,7 +306,7 @@ class TestRunProgramOpWithFC(RunProgramNPUOpTest): bias_attr = fluid.ParamAttr( name=self.input_names['Params'][1], learning_rate=0.5, - initializer=fluid.initializer.NumpyArrayInitializer( + initializer=paddle.nn.initializer.Assign( self.inputs['Params'][self.input_names['Params'][1]] ), trainable=True, diff --git a/python/paddle/fluid/tests/unittests/op_test_xpu.py b/python/paddle/fluid/tests/unittests/op_test_xpu.py index a7cb066db7a1d56278b2a0a13fdc0b367dbeb726..c6a76c55635dd93bf4bc5a56833fbc4faa2c4912 100644 --- a/python/paddle/fluid/tests/unittests/op_test_xpu.py +++ b/python/paddle/fluid/tests/unittests/op_test_xpu.py @@ -35,6 +35,7 @@ class XPUOpTest(OpTest): '''Fix random seeds to remove randomness from tests''' cls.use_xpu = True cls.use_mkldnn = False + cls.epsilon_xpu2xpu = 0.00000001 super().setUpClass() @classmethod @@ -212,7 +213,11 @@ class XPUOpTest(OpTest): user_defined_grad_outputs=user_defined_grad_outputs, ) self._assert_is_close( - a1, a2, inputs_to_check, 0.00000001, "Gradient Check On two xpu" + a1, + a2, + inputs_to_check, + self.epsilon_xpu2xpu, + "Gradient Check On two xpu", ) self._assert_is_close( a1, diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding.py index 11aacd02439e99fe132171a2012385eb08ba092a..96b0b734a174c208ce5b2760c7d90102cbec825f 100644 --- 
a/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding.py @@ -41,7 +41,7 @@ class SimpleNet(fluid.Layer): self.hidden_size, sparse=is_sparse, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.UniformInitializer( + initializer=paddle.nn.initializer.Uniform( low=-init_scale, high=init_scale ) ), @@ -50,7 +50,7 @@ class SimpleNet(fluid.Layer): attr=fluid.ParamAttr(), shape=[self.hidden_size, self.vocab_size], dtype=dtype, - default_initializer=fluid.initializer.UniformInitializer( + default_initializer=paddle.nn.initializer.Uniform( low=-self.init_scale, high=self.init_scale ), ) @@ -58,7 +58,7 @@ class SimpleNet(fluid.Layer): attr=fluid.ParamAttr(), shape=[self.vocab_size], dtype=dtype, - default_initializer=fluid.initializer.UniformInitializer( + default_initializer=paddle.nn.initializer.Uniform( low=-self.init_scale, high=self.init_scale ), ) diff --git a/python/paddle/fluid/tests/unittests/prim/CMakeLists.txt b/python/paddle/fluid/tests/unittests/prim/CMakeLists.txt index ab3ee7ba1a3ce577786bed4ea3911d3ba0de94eb..7fd5f5ecebfe2360139960e8f917cec8f1121943 100644 --- a/python/paddle/fluid/tests/unittests/prim/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/prim/CMakeLists.txt @@ -9,3 +9,4 @@ foreach(TEST_OP ${TEST_OPS}) endforeach() add_subdirectory(prim) +add_subdirectory(model) diff --git a/python/paddle/fluid/tests/unittests/prim/model/CMakeLists.txt b/python/paddle/fluid/tests/unittests/prim/model/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..977f608929b6db23c8b26bdf1c52e6f0ba543ab8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/prim/model/CMakeLists.txt @@ -0,0 +1,15 @@ +file( + GLOB TEST_OPS + RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" + "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + +foreach(TEST_OP ${TEST_OPS}) + py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS ${GC_ENVS}) +endforeach() + +set_tests_properties(test_resnet_prim_cinn PROPERTIES TIMEOUT 300) + +if(WITH_CINN) + set_tests_properties(test_resnet_prim_cinn PROPERTIES LABELS "RUN_TYPE=CINN") +endif() diff --git a/python/paddle/fluid/tests/unittests/prim/model/test_comp_model_simple_net.py b/python/paddle/fluid/tests/unittests/prim/model/test_comp_model_simple_net.py new file mode 100644 index 0000000000000000000000000000000000000000..27b300e9afd44a5ee3b7dccf623ac6a4d329045e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/prim/model/test_comp_model_simple_net.py @@ -0,0 +1,120 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np +import parameterized as param + +import paddle +from paddle.fluid import core, framework + + +@param.parameterized_class( + ('name', 'primals', 'stop_gradients', 'cotangents', 'dtype'), + ( + ( + 'test_normal_case', + (np.random.rand(2, 3, 4), np.random.rand(2, 3, 4)), + (False, False), + (np.random.rand(2, 3, 4),), + np.float32, + ), + ( + 'test_broadcast_diff_rank', + (np.random.rand(2, 3, 1, 4), np.random.rand(3, 3, 4)), + (False, False), + (np.random.rand(2, 3, 3, 4),), + np.float32, + ), + ( + 'test_broadcast_same_rank', + (np.random.rand(2, 3, 1, 4), np.random.rand(2, 1, 3, 4)), + (False, False), + (np.random.rand(2, 3, 3, 4),), + np.float32, + ), + ( + 'test_stop_gradient', + (np.random.rand(2, 3, 1, 4), np.random.rand(2, 1, 3, 4)), + (False, True), + (np.random.rand(2, 3, 3, 4),), + np.float32, + ), + ), +) +class TestMultiplyGradComp(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.primals = tuple(primal.astype(cls.dtype) for primal in cls.primals) + cls.cotangents = tuple(co.astype(cls.dtype) for co in cls.cotangents) + + def setUp(self): + paddle.enable_static() + + def tearDown(self): + paddle.disable_static() + + def as_tuple(self, x): + return (x,) if isinstance(x, framework.Variable) else x + + def net(self): + primals, cotangents = self.primals, self.cotangents + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + primals = tuple( + paddle.static.data(f'primal{i}', primal.shape, primal.dtype) + for i, primal in enumerate(primals) + ) + for primal, flag in zip(primals, self.stop_gradients): + primal.stop_gradient = flag + cotangents = tuple( + paddle.static.data(f'cotangent{i}', co.shape, co.dtype) + for i, co in enumerate(cotangents) + ) + out = self.as_tuple(paddle.tanh(paddle.multiply(*primals))) + grads = paddle.static.gradients(out, primals) + exe = paddle.static.Executor() + exe.run(sp) + return exe.run( + program=mp, + feed={ + **{ + f'primal{i}': primal + for i, primal in enumerate(self.primals) + }, + **{f'cotangent{i}': co for i, co in enumerate(self.cotangents)}, + }, + fetch_list=[g for g in grads if g is not None], + ) + + def test_comp(self): + core._set_prim_backward_enabled(True) + actual = self.net() + + core._set_prim_backward_enabled(False) + desired = self.net() + + self.assertEqual(len(actual), len(desired)) + for i, j in zip(actual, desired): + np.testing.assert_allclose( + i, + j, + rtol=1e-6, + atol=0, + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/prim/model/test_resnet_prim_cinn.py b/python/paddle/fluid/tests/unittests/prim/model/test_resnet_prim_cinn.py new file mode 100644 index 0000000000000000000000000000000000000000..6ed35725077c946640e97aea0b2f6cf2faab4044 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/prim/model/test_resnet_prim_cinn.py @@ -0,0 +1,171 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import platform
+import time
+import unittest
+
+import numpy as np
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.vision.models import resnet50
+
+SEED = 2020
+base_lr = 0.001
+momentum_rate = 0.9
+l2_decay = 1e-4
+batch_size = 2
+epoch_num = 1
+
+if core.is_compiled_with_cuda():
+    paddle.set_flags({'FLAGS_cudnn_deterministic': True})
+
+
+def reader_decorator(reader):
+    def __reader__():
+        for item in reader():
+            img = np.array(item[0]).astype('float32').reshape(3, 224, 224)
+            label = np.array(item[1]).astype('int64').reshape(1)
+            yield img, label
+
+    return __reader__
+
+
+def optimizer_setting(parameter_list=None):
+    optimizer = fluid.optimizer.Momentum(
+        learning_rate=base_lr,
+        momentum=momentum_rate,
+        regularization=fluid.regularizer.L2Decay(l2_decay),
+        parameter_list=parameter_list,
+    )
+
+    return optimizer
+
+
+def train(to_static, enable_prim, enable_cinn):
+    if core.is_compiled_with_cuda():
+        paddle.set_device('gpu')
+    else:
+        paddle.set_device('cpu')
+    np.random.seed(SEED)
+    paddle.seed(SEED)
+    paddle.framework.random._manual_program_seed(SEED)
+    fluid.core._set_prim_all_enabled(
+        enable_prim and platform.system() == 'Linux'
+    )
+
+    train_reader = paddle.batch(
+        reader_decorator(paddle.dataset.flowers.train(use_xmap=False)),
+        batch_size=batch_size,
+        drop_last=True,
+    )
+    data_loader = fluid.io.DataLoader.from_generator(capacity=5, iterable=True)
+    data_loader.set_sample_list_generator(train_reader)
+
+    resnet = resnet50(False)
+    if to_static:
+        build_strategy = paddle.static.BuildStrategy()
+        if enable_cinn:
+            build_strategy.build_cinn_pass = True
+        resnet = paddle.jit.to_static(resnet, build_strategy=build_strategy)
+    optimizer = optimizer_setting(parameter_list=resnet.parameters())
+
+    for epoch in range(epoch_num):
+        total_acc1 = 0.0
+        total_acc5 = 0.0
+        total_sample = 0
+        losses = []
+
+        for batch_id, data in enumerate(data_loader()):
+            start_time = time.time()
+            img, label = data
+
+            pred = resnet(img)
+            avg_loss = paddle.nn.functional.cross_entropy(
+                input=pred,
+                label=label,
+                soft_label=False,
+                reduction='mean',
+                use_softmax=True,
+            )
+
+            acc_top1 = paddle.static.accuracy(input=pred, label=label, k=1)
+            acc_top5 = paddle.static.accuracy(input=pred, label=label, k=5)
+
+            avg_loss.backward()
+            optimizer.minimize(avg_loss)
+            resnet.clear_gradients()
+
+            total_acc1 += acc_top1
+            total_acc5 += acc_top5
+            total_sample += 1
+            losses.append(avg_loss.numpy())
+
+            end_time = time.time()
+            print(
+                "epoch %d | batch step %d, loss %0.8f, acc1 %0.3f, acc5 %0.3f, time %f"
+                % (
+                    epoch,
+                    batch_id,
+                    avg_loss,
+                    total_acc1.numpy() / total_sample,
+                    total_acc5.numpy() / total_sample,
+                    end_time - start_time,
+                )
+            )
+            if batch_id == 10:
+                # avoid dataloader throwing an abort signal
+                data_loader._reset()
+                break
+
+    return losses
+
+
+class TestResnet(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.dy2st = train(to_static=True, enable_prim=False, enable_cinn=False)
+
+    def test_prim(self):
+        dy2st_prim = train(to_static=True, enable_prim=True, enable_cinn=False)
+        # NOTE: Now dy2st is equal to dy2st_prim. With the splitting of kernels, the threshold here may need to be adjusted
+        np.testing.assert_allclose(self.dy2st, dy2st_prim, rtol=1e-6)
+
+    @unittest.skipIf(
+        not paddle.is_compiled_with_cinn(), "paddle is not compiled with CINN"
+    )
+    def test_cinn(self):
+        dy2st_cinn = train(to_static=True, enable_prim=False, enable_cinn=True)
+        # TODO(0x45f): The following are only temporary thresholds; the final thresholds need to be discussed
+        np.testing.assert_allclose(self.dy2st[0:2], dy2st_cinn[0:2], rtol=1e-3)
+        np.testing.assert_allclose(self.dy2st, dy2st_cinn, rtol=1e-1)
+
+    @unittest.skipIf(
+        not paddle.is_compiled_with_cinn(), "paddle is not compiled with CINN"
+    )
+    def test_prim_cinn(self):
+        dy2st_prim_cinn = train(
+            to_static=True, enable_prim=True, enable_cinn=True
+        )
+        # TODO(0x45f): The following are only temporary thresholds; the final thresholds need to be discussed
+        np.testing.assert_allclose(
+            self.dy2st[0:2], dy2st_prim_cinn[0:2], rtol=1e-2
+        )
+        np.testing.assert_allclose(self.dy2st, dy2st_prim_cinn, rtol=1e-1)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/prim/prim/flags/test_prim_flags.py b/python/paddle/fluid/tests/unittests/prim/prim/flags/test_prim_flags.py
index 8f3053af919e926c274a0f5d5bfba7d75ed12805..ef6c2951ff7dc8054a70313d9339986acbc7cb71 100644
--- a/python/paddle/fluid/tests/unittests/prim/prim/flags/test_prim_flags.py
+++ b/python/paddle/fluid/tests/unittests/prim/prim/flags/test_prim_flags.py
@@ -47,6 +47,16 @@ class TestPrimFlags(unittest.TestCase):
         core.check_and_set_prim_all_enabled()
         self.assertFalse(core._is_fwd_prim_enabled())
 
+        del os.environ['FLAGS_prim_backward']
+        core.check_and_set_prim_all_enabled()
+        self.assertFalse(core._is_bwd_prim_enabled())
+        del os.environ['FLAGS_prim_forward']
+        core.check_and_set_prim_all_enabled()
+        self.assertFalse(core._is_fwd_prim_enabled())
+
+        with self.assertRaises(TypeError):
+            core._test_use_sync("aaaa")
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_add_grad.py b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_add_grad.py
index 1673ff083e7cf4081b300ab7de6e585e7b7d1c21..50ef9f6f13036ac114b2c9e6a7e4be7bb82b24e6 100644
--- a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_add_grad.py
+++ b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_add_grad.py
@@ -91,7 +91,7 @@ class TestAddGradComp(unittest.TestCase):
     def test_cinn(self):
         paddle.disable_static()
         dy_res = self.train(use_prim=False, use_cinn=False)
-        comp_st_cinn_res = self.train(use_prim=True, use_cinn=False)
+        comp_st_cinn_res = self.train(use_prim=True, use_cinn=True)
 
         for i in range(len(dy_res)):
             np.testing.assert_allclose(
diff --git a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_add_tanh_grad.py b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_add_tanh_grad.py
index 5dd7417130bc1137b751ea420384d63350c216b0..b037cc73bfd545c9e21862d183d9f2759333d0b5 100644
--- a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_add_tanh_grad.py
+++ b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_add_tanh_grad.py
@@ -92,7 +92,7 @@ class TestDivGradComp(unittest.TestCase):
     def test_cinn(self):
         paddle.disable_static()
         dy_res = self.train(use_prim=False, use_cinn=False)
-        comp_st_cinn_res = self.train(use_prim=True, use_cinn=False)
+        comp_st_cinn_res = self.train(use_prim=True, use_cinn=True)
 
         for i in
range(len(dy_res)): np.testing.assert_allclose( diff --git a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_div_grad.py b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_div_grad.py index 95d3c3027fd9d28e4b054806959c8ad8ec391e9a..606b55b5a95c06fc097c03e0fe8964e38faadce6 100644 --- a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_div_grad.py +++ b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_div_grad.py @@ -91,7 +91,7 @@ class TestDivGradComp(unittest.TestCase): def test_cinn(self): paddle.disable_static() dy_res = self.train(use_prim=False, use_cinn=False) - comp_st_cinn_res = self.train(use_prim=True, use_cinn=False) + comp_st_cinn_res = self.train(use_prim=True, use_cinn=True) for i in range(len(dy_res)): np.testing.assert_allclose( diff --git a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_multiply_grad.py b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_multiply_grad.py index 2d1a10a6d4b5794d938b1685f03057ee9b63ca89..c2f15b6ab84a8f99843d16eec58f8236c9483b4b 100644 --- a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_multiply_grad.py +++ b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_multiply_grad.py @@ -56,7 +56,7 @@ from paddle.fluid import core, framework 'test_reduce_axe_empty', (np.random.rand(2, 3, 3, 4), np.random.rand(2, 1, 3, 4)), (False, False), - (np.random.rand(2, 1, 3, 1),), + (np.random.rand(2, 3, 3, 4),), np.float32, ), ), @@ -91,7 +91,7 @@ class TestMultiplyGradComp(unittest.TestCase): for i, co in enumerate(cotangents) ) out = self.as_tuple(paddle.multiply(*primals)) - grads = paddle.static.gradients(out, primals) + grads = paddle.static.gradients(out, primals, cotangents) exe = paddle.static.Executor() exe.run(sp) return exe.run( diff --git a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_sqrt_grad.py b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_sqrt_grad.py index 8df50c768c2b72e11d5de955b3b88e65183c0aad..8e623100dd09cb86b7aae0562035536944598b9f 100644 --- a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_sqrt_grad.py +++ b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_sqrt_grad.py @@ -70,7 +70,7 @@ class TestSqrtGradComp(unittest.TestCase): def test_cinn(self): paddle.disable_static() dy_res = self.train(use_prim=False, use_cinn=False) - comp_st_cinn_res = self.train(use_prim=True, use_cinn=False) + comp_st_cinn_res = self.train(use_prim=True, use_cinn=True) for i in range(len(dy_res)): np.testing.assert_allclose( diff --git a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_sub_grad.py b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_sub_grad.py index 693bf8b942bab23e9af6b10c5456b8e76936d38b..3245d118760b2ba3596af964428bafc0620badcf 100644 --- a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_sub_grad.py +++ b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_sub_grad.py @@ -92,7 +92,7 @@ class TestDivGradComp(unittest.TestCase): def test_cinn(self): paddle.disable_static() dy_res = self.train(use_prim=False, use_cinn=False) - comp_st_cinn_res = self.train(use_prim=True, use_cinn=False) + comp_st_cinn_res = self.train(use_prim=True, use_cinn=True) for i in range(len(dy_res)): np.testing.assert_allclose( diff --git a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_tanh_grad.py 
b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_tanh_grad.py index e643cf620a8118fb1c36202ada5543b60b3f0012..d28f84a685b0d0d83d085f1f76edfe003b6bd2fb 100644 --- a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_tanh_grad.py +++ b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_tanh_grad.py @@ -70,7 +70,7 @@ class TestTanhGradComp(unittest.TestCase): def test_cinn(self): paddle.disable_static() dy_res = self.train(use_prim=False, use_cinn=False) - comp_st_cinn_res = self.train(use_prim=True, use_cinn=False) + comp_st_cinn_res = self.train(use_prim=True, use_cinn=True) for i in range(len(dy_res)): np.testing.assert_allclose( diff --git a/python/paddle/fluid/tests/unittests/prim/test_comp_get_grad_op_desc_prim_enabled.py b/python/paddle/fluid/tests/unittests/prim/test_comp_get_grad_op_desc_prim_enabled.py index 18b445f38da3a8ea74f671b52ac74d33624be54a..c576be20388fb1b0dfec96d2753817041acd77a0 100644 --- a/python/paddle/fluid/tests/unittests/prim/test_comp_get_grad_op_desc_prim_enabled.py +++ b/python/paddle/fluid/tests/unittests/prim/test_comp_get_grad_op_desc_prim_enabled.py @@ -75,7 +75,6 @@ class TestGetGradOpDescPrimEnabled(unittest.TestCase): self.fwd, self.no_grad_var, self.grad_sub_block )[0] ) - print(actual) self.assertEquals(actual, self.desired_ops) core._set_prim_backward_enabled(False) diff --git a/python/paddle/fluid/tests/unittests/prim/test_comp_skip_op_set.py b/python/paddle/fluid/tests/unittests/prim/test_comp_skip_op_set.py new file mode 100644 index 0000000000000000000000000000000000000000..15648226e7859a4be83d525b3efbd5e5d55e0bc7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/prim/test_comp_skip_op_set.py @@ -0,0 +1,75 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import unittest + +import paddle +from paddle.fluid import core, framework + + +class TestGetGradOpDescPrimEnabled(unittest.TestCase): + def setUp(self): + self.fwd_type = 'tanh' + self.inputs = {'X': ['x']} + self.outputs = {'Out': ['y']} + self.no_grad_var = set() + self.grad_sub_block = tuple() + self.desired_ops = 'tanh_grad' + self.desired_ops_no_skip = ('pow', 'scale', 'elementwise_mul') + paddle.enable_static() + block = framework.Block(framework.Program(), 0) + block.append_op( + type=self.fwd_type, + inputs={ + n: [block.create_var(name=v, stop_gradient=False) for v in vs] + for n, vs in self.inputs.items() + }, + outputs={ + n: [block.create_var(name=v, stop_gradient=False) for v in vs] + for n, vs in self.outputs.items() + }, + ) + self.fwd = block.ops[0].desc + + def tearDown(self): + paddle.disable_static() + + def test_get_grad_op_desc_without_skip(self): + core._set_prim_backward_enabled(True) + actual = tuple( + desc.type() + for desc in core.get_grad_op_desc( + self.fwd, self.no_grad_var, self.grad_sub_block + )[0] + ) + self.assertEquals(actual, self.desired_ops_no_skip) + core._set_prim_backward_enabled(False) + + def test_get_grad_op_desc_with_skip(self): + core._set_prim_backward_enabled(True) + core._add_skip_comp_ops("tanh") + actual = tuple( + desc.type() + for desc in core.get_grad_op_desc( + self.fwd, self.no_grad_var, self.grad_sub_block + )[0] + ) + core._remove_skip_comp_ops("tanh") + self.assertEquals(actual[0], self.desired_ops) + core._set_prim_backward_enabled(False) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ps/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ps/CMakeLists.txt index cb566a41aaaab0fb15f6214dd90e87d01be8cb33..ce130243fefae98d2c4e3c7f3710cecd962509eb 100755 --- a/python/paddle/fluid/tests/unittests/ps/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ps/CMakeLists.txt @@ -9,3 +9,7 @@ foreach(TEST_OP ${TEST_OPS}) list(APPEND TEST_OPS ${TEST_OP}) set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 50) endforeach() + +if(WITH_PSCORE) + set_tests_properties(test_gpubox_ps PROPERTIES LABELS "RUN_TYPE=GPUPS") +endif() diff --git a/python/paddle/fluid/tests/unittests/ps/config_gpubox.yaml b/python/paddle/fluid/tests/unittests/ps/config_gpubox.yaml new file mode 100755 index 0000000000000000000000000000000000000000..a09de08d572a45d6aa7da303b2a9117c6a67c960 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ps/config_gpubox.yaml @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# workspace +#workspace: "models/rank/dnn" + + +runner: + train_data_dir: "data/sample_data/train" + train_reader_path: "criteo_reader" # importlib format + use_gpu: True + use_auc: False + train_batch_size: 32 + epochs: 3 + print_interval: 10 + model_save_path: "output_model_dnn_queue" + + sync_mode: "gpubox" + thread_num: 30 + reader_type: "InmemoryDataset" # DataLoader / QueueDataset / RecDataset / InmemoryDataset + pipe_command: "python3.7 dataset_generator_criteo.py" + dataset_debug: False + split_file_list: False + + infer_batch_size: 2 + infer_reader_path: "criteo_reader" # importlib format + test_data_dir: "data/sample_data/train" + infer_load_path: "output_model_dnn_queue" + infer_start_epoch: 0 + infer_end_epoch: 3 +# hyper parameters of user-defined network +hyper_parameters: + # optimizer config + optimizer: + class: Adam + learning_rate: 0.001 + strategy: async + # user-defined pairs + sparse_inputs_slots: 27 + sparse_feature_number: 1024 + sparse_feature_dim: 9 + dense_input_dim: 13 + fc_sizes: [512, 256, 128, 32] + distributed_embedding: 0 diff --git a/python/paddle/fluid/tests/unittests/ps/dataset_generator_criteo.py b/python/paddle/fluid/tests/unittests/ps/dataset_generator_criteo.py new file mode 100644 index 0000000000000000000000000000000000000000..3b095c80262052f4013eb7f8cf0dc00569a4146a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ps/dataset_generator_criteo.py @@ -0,0 +1,86 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
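Editor's note: the trainer added later in this patch reads these settings through dotted keys such as `config.get("runner.thread_num")`, using the `YamlHelper` imported from `ps_dnn_trainer`. The stand-in below flattens the nested YAML in the same spirit; it is an illustrative sketch assuming PyYAML is available, not the real helper:

```python
import yaml


def flatten(mapping, prefix=""):
    """Flatten nested dicts into dotted keys, e.g. runner.thread_num -> 30."""
    flat = {}
    for key, value in mapping.items():
        dotted = f"{prefix}.{key}" if prefix else str(key)
        if isinstance(value, dict):
            flat.update(flatten(value, dotted))
        else:
            flat[dotted] = value
    return flat


with open("config_gpubox.yaml") as f:
    config = flatten(yaml.safe_load(f))

print(config.get("runner.sync_mode"), config.get("runner.thread_num"))
```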
+import logging + +import paddle.distributed.fleet as fleet + +logging.basicConfig( + format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO +) +logger = logging.getLogger(__name__) + + +class Reader(fleet.MultiSlotDataGenerator): + def init(self): + padding = 0 + sparse_slots = "click 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26" + self.sparse_slots = sparse_slots.strip().split(" ") + self.dense_slots = ["dense_feature"] + self.dense_slots_shape = [13] + self.slots = self.sparse_slots + self.dense_slots + self.slot2index = {} + self.visit = {} + for i in range(len(self.slots)): + self.slot2index[self.slots[i]] = i + self.visit[self.slots[i]] = False + self.padding = padding + logger.info("pipe init success") + + def line_process(self, line): + line = line.strip().split(" ") + output = [(i, []) for i in self.slots] + for i in line: + slot_feasign = i.split(":") + slot = slot_feasign[0] + if slot not in self.slots: + continue + if slot in self.sparse_slots: + feasign = int(slot_feasign[1]) + else: + feasign = float(slot_feasign[1]) + output[self.slot2index[slot]][1].append(feasign) + self.visit[slot] = True + for i in self.visit: + slot = i + if not self.visit[slot]: + if i in self.dense_slots: + output[self.slot2index[i]][1].extend( + [self.padding] + * self.dense_slots_shape[self.slot2index[i]] + ) + else: + output[self.slot2index[i]][1].extend([self.padding]) + else: + self.visit[slot] = False + + return output + # return [label] + sparse_feature + [dense_feature] + + def generate_sample(self, line): + r"Dataset Generator" + + def reader(): + output_dict = self.line_process(line) + # {key, value} dict format: {'labels': [1], 'sparse_slot1': [2, 3], 'sparse_slot2': [4, 5, 6, 8], 'dense_slot': [1,2,3,4]} + # dict must match static_model.create_feed() + yield output_dict + + return reader + + +if __name__ == "__main__": + + r = Reader() + r.init() + r.run_from_stdin() diff --git a/python/paddle/fluid/tests/unittests/ps/download_criteo_data.sh b/python/paddle/fluid/tests/unittests/ps/download_criteo_data.sh new file mode 100755 index 0000000000000000000000000000000000000000..69bfd90bee0507f03b12640f213a11d8390fd25a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ps/download_criteo_data.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +wget --no-check-certificate https://paddlerec.bj.bcebos.com/benchmark/sample_train.txt +mkdir train_data +mv sample_train.txt train_data/ diff --git a/python/paddle/fluid/tests/unittests/ps/gpubox_run.sh b/python/paddle/fluid/tests/unittests/ps/gpubox_run.sh new file mode 100644 index 0000000000000000000000000000000000000000..d3cfcf38aa4c00e677967c7cfed7495083e1874b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ps/gpubox_run.sh @@ -0,0 +1,60 @@ +# !/bin/bash + +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if [ ! -d "./log" ]; then
+    mkdir ./log
+    echo "Create log folder to store running log"
+fi
+
+export FLAGS_LAUNCH_BARRIER=0
+export PADDLE_TRAINER_ID=0
+export PADDLE_PSERVER_NUMS=1
+export PADDLE_TRAINERS=1
+export PADDLE_TRAINERS_NUM=${PADDLE_TRAINERS}
+export POD_IP=127.0.0.1
+
+# set free port if 29011 is occupied
+export PADDLE_PSERVERS_IP_PORT_LIST="127.0.0.1:29011"
+export PADDLE_PSERVER_PORT_ARRAY=(29011)
+
+# set gpu numbers according to your device
+export FLAGS_selected_gpus="0,1,2,3,4,5,6,7"
+#export FLAGS_selected_gpus="0,1"
+
+# set your model yaml
+#SC="gpubox_ps_trainer.py"
+SC="static_gpubox_trainer.py"
+
+# run pserver
+export TRAINING_ROLE=PSERVER
+for((i=0;i<$PADDLE_PSERVER_NUMS;i++))
+do
+    cur_port=${PADDLE_PSERVER_PORT_ARRAY[$i]}
+    echo "PADDLE WILL START PSERVER "$cur_port
+    export PADDLE_PORT=${cur_port}
+    python3.7 -u $SC &> ./log/pserver.$i.log &
+done
+
+# run trainer
+export TRAINING_ROLE=TRAINER
+for((i=0;i<$PADDLE_TRAINERS;i++))
+do
+    echo "PADDLE WILL START Trainer "$i
+    export PADDLE_TRAINER_ID=$i
+    python3.7 -u $SC &> ./log/worker.$i.log
+done
+
+echo "Training log stored in ./log/"
diff --git a/python/paddle/fluid/tests/unittests/ps/static_gpubox_trainer.py b/python/paddle/fluid/tests/unittests/ps/static_gpubox_trainer.py
new file mode 100755
index 0000000000000000000000000000000000000000..13555241de7255a3354ba6743bb5fd597e40589f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ps/static_gpubox_trainer.py
@@ -0,0 +1,208 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
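Editor's note: the `pipe_command` in config_gpubox.yaml streams raw Criteo lines through the `Reader` defined in dataset_generator_criteo.py earlier in this patch. Each token is a `slot:feasign` pair; sparse slots carry integer feature signs and the single dense slot carries 13 floats. A condensed, self-contained version of that parsing (the sample line below is made up for illustration):

```python
sparse_slots = ["click"] + [str(i) for i in range(1, 27)]
dense_shape = {"dense_feature": 13}
padding = 0

# hypothetical sample line in the slot:feasign format the Reader expects
line = "click:0 1:1037 1:84 5:33033 26:7 dense_feature:0.05 dense_feature:1.5"

output = {slot: [] for slot in sparse_slots + list(dense_shape)}
for token in line.strip().split(" "):
    slot, feasign = token.split(":")
    if slot not in output:
        continue
    # sparse slots hold integer ids, the dense slot holds floats
    output[slot].append(int(feasign) if slot in sparse_slots else float(feasign))

# slots that never appeared are padded, the dense slot to its full width
for slot, values in output.items():
    if not values:
        output[slot] = [padding] * dense_shape.get(slot, 1)

print(output["click"], output["1"], output["26"], output["dense_feature"])
```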
+ +from __future__ import print_function + +import logging +import os +import sys +import time + +import paddle +import paddle.distributed.fleet as fleet +from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil + +fleet_util = FleetUtil() + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(os.path.abspath(os.path.join(__dir__, '..'))) + +logging.basicConfig( + format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO +) +logger = logging.getLogger(__name__) + + +def get_dataset(inputs, config): + dataset = paddle.distributed.InMemoryDataset() + dataset._set_use_ps_gpu(config.get('runner.use_gpu')) + pipe_cmd = config.get('runner.pipe_command') + dataset.init( + use_var=inputs, + pipe_command=pipe_cmd, + batch_size=32, + thread_num=int(config.get('runner.thread_num')), + fs_name=config.get("runner.fs_name", ""), + fs_ugi=config.get("runner.fs_ugi", ""), + ) + dataset.set_filelist(["train_data/sample_train.txt"]) + dataset.update_settings( + parse_ins_id=config.get("runner.parse_ins_id", False), + parse_content=config.get("runner.parse_content", False), + ) + + return dataset + + +class Main(object): + def __init__(self): + self.metrics = {} + self.input_data = None + self.reader = None + self.exe = None + self.model = None + self.PSGPU = None + self.train_result_dict = {} + self.train_result_dict["speed"] = [] + self.train_result_dict["auc"] = [] + + def run(self): + from ps_dnn_trainer import YamlHelper + + yaml_helper = YamlHelper() + config_yaml_path = 'config_gpubox.yaml' + self.config = yaml_helper.load_yaml(config_yaml_path) + + os.environ["CPU_NUM"] = str(self.config.get("runner.thread_num")) + fleet.init() + self.network() + if fleet.is_server(): + self.run_server() + elif fleet.is_worker(): + self.run_worker() + fleet.stop_worker() + logger.info("Run Success, Exit.") + logger.info("-" * 100) + + def network(self): + from ps_dnn_trainer import StaticModel, get_user_defined_strategy + + # self.model = get_model(self.config) + self.model = StaticModel(self.config) + self.input_data = self.model.create_feeds() + self.init_reader() + self.metrics = self.model.net(self.input_data) + self.inference_target_var = self.model.inference_target_var + logger.info("cpu_num: {}".format(os.getenv("CPU_NUM"))) + # self.model.create_optimizer(get_strategy(self.config) + user_defined_strategy = get_user_defined_strategy(self.config) + optimizer = paddle.optimizer.Adam(0.01, lazy_mode=True) + optimizer = fleet.distributed_optimizer( + optimizer, user_defined_strategy + ) + optimizer.minimize(self.model._cost) + logger.info("end network.....") + + def run_server(self): + logger.info("Run Server Begin") + fleet.init_server(self.config.get("runner.warmup_model_path")) + fleet.run_server() + + def run_worker(self): + logger.info("Run Worker Begin") + use_cuda = int(self.config.get("runner.use_gpu")) + use_auc = self.config.get("runner.use_auc", False) + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + self.exe = paddle.static.Executor(place) + ''' + with open("./{}_worker_main_program.prototxt".format( + fleet.worker_index()), 'w+') as f: + f.write(str(paddle.static.default_main_program())) + with open("./{}_worker_startup_program.prototxt".format( + fleet.worker_index()), 'w+') as f: + f.write(str(paddle.static.default_startup_program())) + ''' + self.exe.run(paddle.static.default_startup_program()) + fleet.init_worker() + ''' + save_model_path = self.config.get("runner.model_save_path") + if save_model_path and (not 
os.path.exists(save_model_path)): + os.makedirs(save_model_path) + ''' + reader_type = self.config.get("runner.reader_type", None) + epochs = int(self.config.get("runner.epochs")) + sync_mode = self.config.get("runner.sync_mode") + + gpus_env = os.getenv("FLAGS_selected_gpus") + self.PSGPU = paddle.framework.core.PSGPU() + gpuslot = [int(i) for i in range(1, self.model.sparse_inputs_slots)] + gpu_mf_sizes = [self.model.sparse_feature_dim - 1] * ( + self.model.sparse_inputs_slots - 1 + ) + self.PSGPU.set_slot_vector(gpuslot) + self.PSGPU.set_slot_dim_vector(gpu_mf_sizes) + self.PSGPU.init_gpu_ps([int(s) for s in gpus_env.split(",")]) + gpu_num = len(gpus_env.split(",")) + opt_info = paddle.static.default_main_program()._fleet_opt + if use_auc is True: + opt_info['stat_var_names'] = [ + self.model.stat_pos.name, + self.model.stat_neg.name, + ] + else: + opt_info['stat_var_names'] = [] + + for epoch in range(epochs): + epoch_start_time = time.time() + + self.dataset_train_loop(epoch) + + epoch_time = time.time() - epoch_start_time + + self.PSGPU.end_pass() + + fleet.barrier_worker() + self.reader.release_memory() + logger.info("finish {} epoch training....".format(epoch)) + self.PSGPU.finalize() + + def init_reader(self): + if fleet.is_server(): + return + # self.reader, self.file_list = get_reader(self.input_data, config) + self.reader = get_dataset(self.input_data, self.config) + + def dataset_train_loop(self, epoch): + start_time = time.time() + self.reader.load_into_memory() + print( + "self.reader.load_into_memory cost :{} seconds".format( + time.time() - start_time + ) + ) + + begin_pass_time = time.time() + self.PSGPU.begin_pass() + print( + "begin_pass cost:{} seconds".format(time.time() - begin_pass_time) + ) + + logger.info("Epoch: {}, Running Dataset Begin.".format(epoch)) + fetch_info = [ + "Epoch {} Var {}".format(epoch, var_name) + for var_name in self.metrics + ] + fetch_vars = [var for _, var in self.metrics.items()] + print_step = int(self.config.get("runner.print_interval")) + self.exe.train_from_dataset( + program=paddle.static.default_main_program(), + dataset=self.reader, + debug=self.config.get("runner.dataset_debug"), + ) + + +if __name__ == "__main__": + paddle.enable_static() + benchmark_main = Main() + benchmark_main.run() diff --git a/python/paddle/fluid/tests/unittests/ps/test_gpubox_ps.py b/python/paddle/fluid/tests/unittests/ps/test_gpubox_ps.py new file mode 100755 index 0000000000000000000000000000000000000000..8ebc6ba8a342414c8c123e2069ef68ca029de469 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ps/test_gpubox_ps.py @@ -0,0 +1,35 @@ +#!/bin/bash + +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import shlex # noqa: F401 +import unittest + + +class GpuBoxTest(unittest.TestCase): + def test_gpubox(self): + exitcode = os.system('sh gpubox_run.sh') + os.system('rm *_train_desc.prototxt') + if os.path.exists('./train_data'): + os.system('rm -rf train_data') + if os.path.exists('./log'): + os.system('rm -rf log') + + +if __name__ == '__main__': + if not os.path.exists('./train_data'): + os.system('sh download_criteo_data.sh') + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/simple_nets.py b/python/paddle/fluid/tests/unittests/simple_nets.py index a3ff2b686574477e8f12132c97c1c9e0e538cf5b..cf4372818ba1ea817f1aa9f150c94f4d83c6fb0b 100644 --- a/python/paddle/fluid/tests/unittests/simple_nets.py +++ b/python/paddle/fluid/tests/unittests/simple_nets.py @@ -26,7 +26,7 @@ def simple_fc_net_with_inputs(img, label, class_num=10): size=100, activation='relu', bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.0) + initializer=paddle.nn.initializer.Constant(value=1.0) ), ) prediction = paddle.static.nn.fc( @@ -53,7 +53,7 @@ def batchnorm_fc_with_inputs(img, label, class_num=10): size=200, activation='relu', bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.0) + initializer=paddle.nn.initializer.Constant(value=1.0) ), ) diff --git a/python/paddle/fluid/tests/unittests/standalone_executor/test_standalone_custom_stream.py b/python/paddle/fluid/tests/unittests/standalone_executor/test_standalone_custom_stream.py index 116aa60d052021e2328df4c63809f112e4e9e30f..4126f84ed1e8c73f409195e66168ce41973a3b91 100644 --- a/python/paddle/fluid/tests/unittests/standalone_executor/test_standalone_custom_stream.py +++ b/python/paddle/fluid/tests/unittests/standalone_executor/test_standalone_custom_stream.py @@ -50,10 +50,10 @@ class TestCustomStream(unittest.TestCase): ops = prog.global_block().ops for op_index in op_index_for_stream1: ops[op_index].dist_attr.execution_stream = "s1" - ops[op_index].dist_attr.stream_priority = -1 + ops[op_index].dist_attr.stream_priority = 0 for op_index in op_index_for_stream2: ops[op_index].dist_attr.execution_stream = "s2" - ops[op_index].dist_attr.stream_priority = -2 + ops[op_index].dist_attr.stream_priority = -1 def run_program(self, apply_custom_stream=False): paddle.seed(2022) diff --git a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py index d018c52506bff543a040bb298a7040da95f7c0d6..37048d7cd256b377ba8be08df3ac8658eb22884e 100644 --- a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py +++ b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py @@ -25,11 +25,9 @@ paddle.enable_static() def get_param_attr(weight, bias): weight_attr = paddle.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer(weight) - ) - bias_attr = paddle.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer(bias) + initializer=paddle.nn.initializer.Assign(weight) ) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Assign(bias)) return weight_attr, bias_attr diff --git a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py index 89cd0453d747bf9d1d39160068a35d273eb37029..4fca47635a1deeb756263756e5c089d1ae10b0d6 100644 --- a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py +++ 
b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py @@ -30,11 +30,9 @@ OUT_SIZE = 2 * MODEL_PARALLEL_SIZE def get_param_attr(weight, bias): weight_attr = paddle.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer(weight) - ) - bias_attr = paddle.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer(bias) + initializer=paddle.nn.initializer.Assign(weight) ) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Assign(bias)) return weight_attr, bias_attr diff --git a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_multi_transformer.py b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_multi_transformer.py index 047bd3ae8ad274bfa77a9821ec07c967382aaf1a..9c863d6d3be8b04748f4ae1daf1a79fc327aeb29 100644 --- a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_multi_transformer.py +++ b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_multi_transformer.py @@ -25,11 +25,9 @@ paddle.enable_static() def get_param_attr(weight, bias): weight_attr = paddle.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer(weight) - ) - bias_attr = paddle.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer(bias) + initializer=paddle.nn.initializer.Assign(weight) ) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Assign(bias)) return weight_attr, bias_attr diff --git a/python/paddle/fluid/tests/unittests/test_accuracy_op.py b/python/paddle/fluid/tests/unittests/test_accuracy_op.py index 431d8b24bcee2f7581c25a542a71fc9cb1a6340f..5627ead0a6b59e62a62bfb518842a50592453e9a 100755 --- a/python/paddle/fluid/tests/unittests/test_accuracy_op.py +++ b/python/paddle/fluid/tests/unittests/test_accuracy_op.py @@ -60,7 +60,7 @@ class TestAccuracyOpFp16(TestAccuracyOp): class TestAccuracyOpError(unittest.TestCase): - def test_errors(self): + def test_type_errors(self): with program_guard(Program(), Program()): # The input type of accuracy_op must be Variable. x1 = fluid.create_lod_tensor( @@ -75,12 +75,27 @@ class TestAccuracyOpError(unittest.TestCase): x2 = paddle.static.data(name='x2', shape=[-1, 4], dtype="int32") self.assertRaises(TypeError, paddle.static.accuracy, x2, label) self.assertRaises(TypeError, paddle.metric.accuracy, x2, label) + x3 = paddle.static.data( name='input', shape=[-1, 2], dtype="float16" ) paddle.static.accuracy(input=x3, label=label) paddle.metric.accuracy(input=x3, label=label) + def test_value_errors(self): + with program_guard(Program(), Program()): + paddle.disable_static() + + # The input rank of accuracy_op must be 2. 
+ with self.assertRaises(ValueError): + x3 = paddle.to_tensor([0.1], dtype='float32') + label3 = paddle.to_tensor( + np.reshape([0], [1, 1]), dtype='int32' + ) + paddle.metric.accuracy(x3, label3) + + paddle.enable_static() + class TestAccuracyAPI1(unittest.TestCase): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index d7052c94720a4f0f17da958679f1bb5a65479524..2983e5ca1958ef5b65aa280692017de3cc706802 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -767,12 +767,12 @@ class TestAdamOptimizer(unittest.TestCase): label_np = np.random.randint(2, size=(2, 1)).astype('int64') weight_attr1 = paddle.ParamAttr( name="weight1", - initializer=fluid.initializer.Constant(value=1.0), + initializer=paddle.nn.initializer.Constant(value=1.0), trainable=True, ) weight_attr2 = paddle.ParamAttr( name="weight2", - initializer=fluid.initializer.Constant(value=2.0), + initializer=paddle.nn.initializer.Constant(value=2.0), trainable=True, ) clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) @@ -926,7 +926,7 @@ class TestAdamOptimizer(unittest.TestCase): main = fluid.Program() weight_attr = paddle.ParamAttr( name="weight1", - initializer=fluid.initializer.Constant(value=1.0), + initializer=paddle.nn.initializer.Constant(value=1.0), regularizer=fluid.regularizer.L1DecayRegularizer( regularization_coeff=0.1 ), diff --git a/python/paddle/fluid/tests/unittests/test_add_n_op.py b/python/paddle/fluid/tests/unittests/test_add_n_op.py new file mode 100644 index 0000000000000000000000000000000000000000..3ca485b1419fd14e1d75b614e1d85c67fc5a159f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_add_n_op.py @@ -0,0 +1,64 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
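Editor's note: most of the remaining hunks in this patch swap the legacy `fluid.initializer.*` classes for the public `paddle.nn.initializer.*` API (`Constant`, `Assign`, `Normal(mean=..., std=...)`, `TruncatedNormal(mean=..., std=...)`). A minimal dygraph example of the new spelling, using a toy layer of my own rather than code taken from the tests:

```python
import numpy as np

import paddle

weight_attr = paddle.ParamAttr(
    # replaces fluid.initializer.NumpyArrayInitializer(...)
    initializer=paddle.nn.initializer.Assign(
        np.full((4, 2), 0.5, dtype='float32')
    )
)
bias_attr = paddle.ParamAttr(
    # replaces fluid.initializer.Constant(value=1.0)
    initializer=paddle.nn.initializer.Constant(value=1.0)
)
linear = paddle.nn.Linear(4, 2, weight_attr=weight_attr, bias_attr=bias_attr)
print(linear.weight.numpy())  # all 0.5
print(linear.bias.numpy())    # all 1.0
```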
+import unittest + +import numpy as np + +import paddle + + +class TestAddnOp(unittest.TestCase): + def setUp(self): + np.random.seed(20) + l = 32 + self.x_np = np.random.random([l, 16, 256]) + + def check_main(self, x_np, dtype, axis=None): + paddle.disable_static() + x = [] + for i in range(x_np.shape[0]): + val = paddle.to_tensor(x_np[i].astype(dtype)) + val.stop_gradient = False + x.append(val) + y = paddle.add_n(x) + x_g = paddle.grad(y, x) + y_np = y.numpy().astype('float32') + x_g_np = [] + for val in x_g: + x_g_np.append(val.numpy().astype('float32')) + paddle.enable_static() + return y_np, x_g_np + + def test_add_n_fp16(self): + if not paddle.is_compiled_with_cuda(): + return + y_np_16, x_g_np_16 = self.check_main(self.x_np, 'float16') + y_np_32, x_g_np_32 = self.check_main(self.x_np, 'float32') + + np.testing.assert_allclose(y_np_16, y_np_32, rtol=1e-03) + for i in range(len(x_g_np_32)): + np.testing.assert_allclose(x_g_np_16[i], x_g_np_32[i], rtol=1e-03) + + def test_add_n_api(self): + if not paddle.is_compiled_with_cuda(): + return + + y_np_32, x_g_np_32 = self.check_main(self.x_np, 'float32') + y_np_gt = np.sum(self.x_np, axis=0).astype('float32') + + np.testing.assert_allclose(y_np_32, y_np_gt, rtol=1e-06) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py index 25e4ab9aa8b4a48c9f4ab6349e5ba4bbb00cd67a..c15f647a380fe6580cda6d8b5a3ba35fb7b59dac 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py @@ -39,7 +39,6 @@ from paddle.distributed.auto_parallel.partitioner import Partitioner from paddle.distributed.auto_parallel.reshard import Resharder from paddle.distributed.fleet import auto from paddle.fluid import core -from paddle.fluid.initializer import NumpyArrayInitializer if os.getenv("CUDA_VISIBLE_DEVICES") is not None: os.environ["CUDA_VISIBLE_DEVICES"] = "" @@ -373,10 +372,18 @@ class MLPLayer(nn.Layer): arr1 = np.random.normal(0, 0.02, size=(dim_feedforward, d_model)) arr2 = np.random.normal(0, 0.02, size=(d_model, dim_feedforward)) arr3 = np.random.normal(0, 0.02, size=(dim_feedforward, d_model)) - weight_attr0 = paddle.ParamAttr(initializer=NumpyArrayInitializer(arr0)) - weight_attr1 = paddle.ParamAttr(initializer=NumpyArrayInitializer(arr1)) - weight_attr2 = paddle.ParamAttr(initializer=NumpyArrayInitializer(arr2)) - weight_attr3 = paddle.ParamAttr(initializer=NumpyArrayInitializer(arr3)) + weight_attr0 = paddle.ParamAttr( + initializer=paddle.nn.initializer.Assign(arr0) + ) + weight_attr1 = paddle.ParamAttr( + initializer=paddle.nn.initializer.Assign(arr1) + ) + weight_attr2 = paddle.ParamAttr( + initializer=paddle.nn.initializer.Assign(arr2) + ) + weight_attr3 = paddle.ParamAttr( + initializer=paddle.nn.initializer.Assign(arr3) + ) bias_attr = None self.linear0 = nn.Linear( d_model, dim_feedforward, weight_attr0, bias_attr=bias_attr diff --git a/python/paddle/fluid/tests/unittests/test_avoid_twice_initialization.py b/python/paddle/fluid/tests/unittests/test_avoid_twice_initialization.py index 77062eee5a376b891d82457df32dffa1af8894d4..efbf4a538e009cf194c721fc16614785931d91ea 100644 --- a/python/paddle/fluid/tests/unittests/test_avoid_twice_initialization.py +++ b/python/paddle/fluid/tests/unittests/test_avoid_twice_initialization.py @@ -14,6 +14,7 @@ import unittest +import paddle import paddle.fluid as fluid @@ -22,7 +23,7 @@ class 
TestAvoidTwiceInitialization(unittest.TestCase): cur_program = fluid.Program() cur_block = cur_program.current_block() var = cur_block.create_parameter( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), shape=[2, 2], dtype='float32', name='var_a', @@ -40,7 +41,7 @@ class TestAvoidTwiceInitialization(unittest.TestCase): attrs={'ring_id': 0}, ) var2 = cur_block.create_parameter( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), shape=[2, 2], dtype='float32', name='var_a', diff --git a/python/paddle/fluid/tests/unittests/test_base_layer.py b/python/paddle/fluid/tests/unittests/test_base_layer.py index 05d9b71c1e4372dc7d7f870b433a1d17466c761d..7414c3732b18f13c0ee7f12b2bdb99e1cd8e03e1 100644 --- a/python/paddle/fluid/tests/unittests/test_base_layer.py +++ b/python/paddle/fluid/tests/unittests/test_base_layer.py @@ -26,7 +26,7 @@ class L1(fluid.Layer): def __init__(self): super().__init__() self._param_attr = fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.1) + initializer=paddle.nn.initializer.Constant(value=0.1) ) self.w1 = self.create_parameter( attr=self._param_attr, shape=[2, 2], dtype='float32', is_bias=False diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py index ece07889df4e97ee469a6694e1fdefd41beb9665..d6127ff5dd78a95ab18146c5954bd0366f3ff8d8 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py @@ -183,11 +183,11 @@ class TestBatchNorm(unittest.TestCase): shape[1], is_test=is_test, param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(1.0), + initializer=paddle.nn.initializer.Constant(1.0), trainable=False, ), bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(0.0), + initializer=paddle.nn.initializer.Constant(0.0), trainable=False, ), trainable_statistics=trainable_statistics, @@ -378,7 +378,7 @@ class TestBatchNormUseGlobalStats(unittest.TestCase): net1 = paddle.nn.BatchNorm( 6, param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(1.0) + initializer=paddle.nn.initializer.Constant(1.0) ), use_global_stats=self.use_global_stats, trainable_statistics=self.trainable_statistics, diff --git a/python/paddle/fluid/tests/unittests/test_calc_gradient.py b/python/paddle/fluid/tests/unittests/test_calc_gradient.py index d8d20e41aac26133798a2c7b630203ac0de3d922..38a1284f0ae4a06646d6725d5fb72f6158d9cd65 100644 --- a/python/paddle/fluid/tests/unittests/test_calc_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_calc_gradient.py @@ -48,7 +48,7 @@ class TestDoubleGrad(unittest.TestCase): name='x', shape=[1], dtype='float32', - default_initializer=fluid.initializer.Constant(3), + default_initializer=paddle.nn.initializer.Constant(3), ) (grad1,) = fluid.gradients(net(x), x) # 2x = 6 z = net(x - grad1) @@ -69,7 +69,7 @@ class TestDoubleGrad(unittest.TestCase): name='x', shape=[1], dtype='float32', - default_initializer=fluid.initializer.Constant(1), + default_initializer=paddle.nn.initializer.Constant(1), ) y = x * x (dx1,) = fluid.gradients(y, x) diff --git a/python/paddle/fluid/tests/unittests/test_clip_grad_norm_.py b/python/paddle/fluid/tests/unittests/test_clip_grad_norm_.py new file mode 100644 index 0000000000000000000000000000000000000000..308c59d094ec51ba6838f04cf6c1f8e4bcafb745 --- /dev/null +++ 
b/python/paddle/fluid/tests/unittests/test_clip_grad_norm_.py @@ -0,0 +1,121 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +import paddle +from paddle.nn.utils.clip_grad_norm_ import clip_grad_norm_ + + +class TestClipGradNorm(unittest.TestCase): + def test_basic(self): + run_test_equal( + self, + shape=[16, 16], + dtype=np.float32, + max_norm=5, + norm_type=2, + ) + run_test_equal( + self, + shape=(100,), + dtype=np.float32, + max_norm=1e20, + norm_type=2, + ) + run_test_equal( + self, + shape=[4, 8, 16], + dtype=np.float32, + max_norm=1.0, + norm_type=float("inf"), + ) + + def test_errors(self): + def TestValueError(): + input_pd = paddle.to_tensor( + np.random.random([1, 2]).astype(np.float32) + ) + input_pd.grad = paddle.to_tensor( + np.random.random([1, 2]).astype(np.float32) + ) + clip_grad_norm_(input_pd, max_norm=2, norm_type=float("-inf")) + + self.assertRaises(ValueError, TestValueError) + + def TestRuntimeError(): + input_pd = paddle.to_tensor( + np.random.random([1, 2]).astype(np.float32) + ) + input_pd.grad = paddle.full([1, 2], float("inf")) + clip_grad_norm_( + input_pd, max_norm=2, norm_type=2, error_if_nonfinite=True + ) + + self.assertRaises(RuntimeError, TestRuntimeError) + + def TestRuntimeErrorStaticMode(): + paddle.enable_static() + input_pd = paddle.to_tensor( + np.random.random([1, 2]).astype(np.float32) + ) + input_pd.grad = paddle.to_tensor( + np.random.random([1, 2]).astype(np.float32) + ) + clip_grad_norm_(input_pd, max_norm=2, norm_type=float("inf")) + paddle.disable_static() + + self.assertRaises(RuntimeError, TestRuntimeErrorStaticMode) + + +def run_test_equal( + self, + shape, + dtype, + max_norm, + norm_type: float = 2.0, + error_if_nonfinite: bool = False, +): + input = np.random.random(shape).astype(dtype) + grad = np.random.random(shape).astype(dtype) + input_pd = paddle.to_tensor(input) + input_pd.grad = paddle.to_tensor(grad) + + if norm_type == 2: + grad = grad.reshape(1, grad.size) + output = np.linalg.norm(grad, 'fro') + elif norm_type == np.inf: + output = np.amax(np.abs(grad)) + else: + output = np.linalg.norm(grad, norm_type) + clip_grad_norm_result = clip_grad_norm_( + input_pd, + max_norm=max_norm, + norm_type=norm_type, + error_if_nonfinite=error_if_nonfinite, + ) + + np.testing.assert_allclose( + clip_grad_norm_result.numpy(), + output, + rtol=1e-05, + atol=1e-05, + equal_nan=False, + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_communicator_geo.py b/python/paddle/fluid/tests/unittests/test_communicator_geo.py index 0e836dca1c2e515d95b9ff6c94f4383203779a5f..a82c0e023c6c07dab4186bb11c54a0885550af54 100644 --- a/python/paddle/fluid/tests/unittests/test_communicator_geo.py +++ b/python/paddle/fluid/tests/unittests/test_communicator_geo.py @@ -41,7 +41,7 @@ class TestCommunicatorGeoEnd2End(unittest.TestCase): size=[10000, 10], param_attr=fluid.ParamAttr( name="embedding", - 
initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), ), is_sparse=True, ) diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_layer.py b/python/paddle/fluid/tests/unittests/test_conv2d_layer.py index edd3d718c437e33b31b3942b1664c41043c2fe87..7cb0a066141db17b85c218924510a9272808b85d 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_layer.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_layer.py @@ -18,7 +18,6 @@ import numpy as np import paddle import paddle.fluid.dygraph as dg -import paddle.fluid.initializer as I import paddle.nn.functional as F from paddle import fluid, nn @@ -110,11 +109,11 @@ class Conv2DTestCase(unittest.TestCase): else (-1, self.num_channels, -1, -1) ) x_var = fluid.data("input", input_shape, dtype=self.dtype) - weight_attr = I.NumpyArrayInitializer(self.weight) + weight_attr = paddle.nn.initializer.Assign(self.weight) if self.bias is None: bias_attr = False else: - bias_attr = I.NumpyArrayInitializer(self.bias) + bias_attr = paddle.nn.initializer.Assign(self.bias) if self.padding_mode != 'zeros': x_var = F.pad( x_var, diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_layer.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_layer.py index 9eeb4fc82dfb149ed2424bb068d39d0a23339dce..50c80c3aa32d61bf61f588ea20c3f990ae98de99 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_layer.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_layer.py @@ -18,7 +18,6 @@ import numpy as np import paddle import paddle.fluid.dygraph as dg -import paddle.fluid.initializer as I import paddle.nn.functional as F from paddle import fluid, nn @@ -101,11 +100,11 @@ class Conv2DTransposeTestCase(unittest.TestCase): else (-1, self.num_channels, -1, -1) ) x_var = fluid.data("input", input_shape, dtype=self.dtype) - weight_attr = I.NumpyArrayInitializer(self.weight) + weight_attr = paddle.nn.initializer.Assign(self.weight) if self.bias is None: bias_attr = False else: - bias_attr = I.NumpyArrayInitializer(self.bias) + bias_attr = paddle.nn.initializer.Assign(self.bias) y_var = paddle.static.nn.conv2d_transpose( x_var, diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py index 89339303567f2a50d67475a6890a0a3d2578f75d..e2c80404a5db597d873ab46cbbe37a957dd7f995 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py @@ -1013,6 +1013,17 @@ class TestConv2DTransposeRepr(unittest.TestCase): paddle.enable_static() +class TestConv2dTranspose(unittest.TestCase): + def error_weight_input(self): + array = np.array([1], dtype=np.float32) + x = paddle.to_tensor(np.reshape(array, [1, 1, 1, 1]), dtype='float32') + weight = paddle.to_tensor(np.reshape(array, [1]), dtype='float32') + paddle.nn.functional.conv2d_transpose(x, weight, bias=0) + + def test_type_error(self): + self.assertRaises(ValueError, self.error_weight_input) + + class TestTensorOutputSize1(UnittestBase): def init_info(self): self.shapes = [[2, 3, 8, 8]] diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_layer.py b/python/paddle/fluid/tests/unittests/test_conv3d_layer.py index 78276fbf76db1fbfa567d0c59259b17e6b14fed1..8ef86daf69a034a815f9b03df5338d914c47f548 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_layer.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_layer.py @@ -18,7 +18,6 @@ 
import numpy as np import paddle import paddle.fluid.dygraph as dg -import paddle.fluid.initializer as I import paddle.nn.functional as F from paddle import fluid, nn @@ -97,11 +96,11 @@ class Conv3DTestCase(unittest.TestCase): else (-1, self.num_channels, -1, -1, -1) ) x_var = fluid.data("input", input_shape, dtype=self.dtype) - weight_attr = I.NumpyArrayInitializer(self.weight) + weight_attr = paddle.nn.initializer.Assign(self.weight) if self.bias is None: bias_attr = False else: - bias_attr = I.NumpyArrayInitializer(self.bias) + bias_attr = paddle.nn.initializer.Assign(self.bias) y_var = paddle.static.nn.conv3d( x_var, self.num_filters, diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_layer.py b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_layer.py index 1ea071142c6c733f5bf3a557ee8a1e0d48ccbc48..82c08348f4bf1edffd4837b7edac13c3d14ec524 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_layer.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_layer.py @@ -18,7 +18,6 @@ import numpy as np import paddle import paddle.fluid.dygraph as dg -import paddle.fluid.initializer as I import paddle.nn.functional as F from paddle import fluid, nn @@ -99,11 +98,11 @@ class Conv3DTransposeTestCase(unittest.TestCase): else (-1, self.num_channels, -1, -1, -1) ) x_var = fluid.data("input", input_shape, dtype=self.dtype) - weight_attr = I.NumpyArrayInitializer(self.weight) + weight_attr = paddle.nn.initializer.Assign(self.weight) if self.bias is None: bias_attr = False else: - bias_attr = I.NumpyArrayInitializer(self.bias) + bias_attr = paddle.nn.initializer.Assign(self.bias) y_var = paddle.static.nn.conv3d_transpose( x_var, self.num_filters, diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py index bf223b8d59fa58a68bea38318285b9d67b946747..aac41167399d5afb76d460b109b6e43bffe714ac 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py @@ -555,5 +555,18 @@ class TestCUDNNWithGroups_NHWC(TestWithGroups): self.op_type = "conv3d_transpose" +class TestConv3dTranspose(unittest.TestCase): + def error_weight_input(self): + array = np.array([1], dtype=np.float32) + x = paddle.to_tensor( + np.reshape(array, [1, 1, 1, 1, 1]), dtype='float32' + ) + weight = paddle.to_tensor(np.reshape(array, [1]), dtype='float32') + paddle.nn.functional.conv3d_transpose(x, weight, bias=0) + + def test_type_error(self): + self.assertRaises(ValueError, self.error_weight_input) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_crop_op.py b/python/paddle/fluid/tests/unittests/test_crop_op.py index 1050fb0ad5c57399cc2ed391f00dd81e538f2e9f..f5886edc3350ceca33f8084f0a9d4a740e88a376 100644 --- a/python/paddle/fluid/tests/unittests/test_crop_op.py +++ b/python/paddle/fluid/tests/unittests/test_crop_op.py @@ -149,6 +149,13 @@ class TestCropNoneShape(unittest.TestCase): self.assertEqual(crop.shape, (3, 6, 6)) +class TestCropError(unittest.TestCase): + def test_neg_offset_error(self): + with self.assertRaises(ValueError): + x = fluid.data(name='input2', shape=[1], dtype="float32") + out = paddle.crop(x, offsets=[-1]) + + if __name__ == '__main__': paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py b/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py index 
32f77ab290b88c87012eeea957f0433ac25012b1..b7a0c981bacd6741832b051d703efa3b916701e6 100644 --- a/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py +++ b/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py @@ -120,15 +120,15 @@ class TestGeneratorSeed(unittest.TestCase): result_1 = paddle.static.nn.fc( x, size=10, - weight_attr=fluid.initializer.TruncatedNormal( - loc=0.0, scale=2.0 + weight_attr=paddle.nn.initializer.TruncatedNormal( + mean=0.0, std=2.0 ), ) result_2 = paddle.static.nn.fc( x, size=10, - weight_attr=fluid.initializer.TruncatedNormal( - loc=0.0, scale=2.0 + weight_attr=paddle.nn.initializer.TruncatedNormal( + mean=0.0, std=2.0 ), ) diff --git a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py index e02282cb9bee18d2ef0db016bc39f0e77cd3e717..a5f193daa4c50d3399ea96bae32cf8339096db02 100644 --- a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py +++ b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py @@ -61,7 +61,7 @@ def simple_fc_net(places, use_legacy_py_reader, use_double_buffer): size=hidden_size, activation='tanh', bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.0) + initializer=paddle.nn.initializer.Constant(value=1.0) ), ) diff --git a/python/paddle/fluid/tests/unittests/test_deform_conv2d.py b/python/paddle/fluid/tests/unittests/test_deform_conv2d.py index 7a5093f872ceda2e0e88095d873ab1a2df763ef4..d484e140b6e1dbec20ac53f2cfda9712adb7dc78 100644 --- a/python/paddle/fluid/tests/unittests/test_deform_conv2d.py +++ b/python/paddle/fluid/tests/unittests/test_deform_conv2d.py @@ -726,5 +726,23 @@ class TestDeformConv2DFunctionalWithGroups(TestDeformConv2DFunctional): self.no_bias = False +class TestDeformConv2DError(unittest.TestCase): + def test_input_error(self): + def test_input_rank_error(): + paddle.enable_static() + x = paddle.static.data(name='error_x_1', shape=[0], dtype='float32') + offset = paddle.static.data( + name='error_offset_1', shape=[0], dtype='float32' + ) + mask = paddle.static.data( + name='error_mask_1', shape=[0, 0, 0], dtype='float32' + ) + out = paddle.static.nn.deform_conv2d( + x, offset, mask, 0, 0, deformable_groups=0 + ) + + self.assertRaises(ValueError, test_input_rank_error) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_desc_clone.py b/python/paddle/fluid/tests/unittests/test_desc_clone.py index ecb49c3172fb92a9bccb420030a51febe44b48e7..2bf68add10281c24b25e5564ff5e3bc6ae2ee311 100644 --- a/python/paddle/fluid/tests/unittests/test_desc_clone.py +++ b/python/paddle/fluid/tests/unittests/test_desc_clone.py @@ -58,9 +58,7 @@ def cnn_model(data): size=SIZE, activation="softmax", weight_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.NormalInitializer( - loc=0.0, scale=scale - ) + initializer=paddle.nn.initializer.Normal(loc=0.0, scale=scale) ), ) return predict diff --git a/python/paddle/fluid/tests/unittests/test_detach.py b/python/paddle/fluid/tests/unittests/test_detach.py index 1a01b7667feb1d3c8cf12b03fb1aef96194b10e3..d3622bd042de99b2aef74ce3c158e38aa26600c8 100644 --- a/python/paddle/fluid/tests/unittests/test_detach.py +++ b/python/paddle/fluid/tests/unittests/test_detach.py @@ -120,7 +120,7 @@ class Test_Detach(unittest.TestCase): initializer=paddle.nn.initializer.Constant(5.0) ) linear_b_param_attrs = fluid.ParamAttr( - initializer=fluid.initializer.Constant(6.0) + initializer=paddle.nn.initializer.Constant(6.0) ) 
linear = Linear( 4, @@ -132,7 +132,7 @@ class Test_Detach(unittest.TestCase): initializer=paddle.nn.initializer.Constant(7.0) ) linear1_b_param_attrs = fluid.ParamAttr( - initializer=fluid.initializer.Constant(8.0) + initializer=paddle.nn.initializer.Constant(8.0) ) linear1 = Linear( 10, diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py index c560dfa8dbb0b0d99bebcbb3f2098faa76e19ba6..f0f85e1645124226f782d4315893a2f6529cc2d6 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py @@ -58,7 +58,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): size=[1000000000, 100000], param_attr=paddle.fluid.ParamAttr( name="embedding", - initializer=paddle.fluid.initializer.Constant(value=0.01), + initializer=paddle.paddle.nn.initializer.Constant(value=0.01), ), is_sparse=True, ) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py index bc17b0d67f9908185921b996d5a793680044baeb..69b341a026762043028d082f337893bf1c60bc52 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py @@ -89,7 +89,7 @@ class TestDistFleetHeterProgram(unittest.TestCase): size=[100001, 10], param_attr=fluid.ParamAttr( name="SparseFeatFactors", - initializer=fluid.initializer.Uniform(), + initializer=paddle.nn.initializer.Uniform(), ), ) @@ -103,8 +103,8 @@ class TestDistFleetHeterProgram(unittest.TestCase): size=400, activation="relu", weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Normal( - scale=1 / math.sqrt(concated.shape[1]) + initializer=paddle.nn.initializer.Normal( + std=1 / math.sqrt(concated.shape[1]) ) ), name="fc1", @@ -116,8 +116,8 @@ class TestDistFleetHeterProgram(unittest.TestCase): size=400, activation="relu", weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Normal( - scale=1 / math.sqrt(fc1.shape[1]) + initializer=paddle.nn.initializer.Normal( + std=1 / math.sqrt(fc1.shape[1]) ) ), name="fc2", @@ -129,8 +129,8 @@ class TestDistFleetHeterProgram(unittest.TestCase): size=400, activation="relu", weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Normal( - scale=1 / math.sqrt(fc2.shape[1]) + initializer=paddle.nn.initializer.Normal( + std=1 / math.sqrt(fc2.shape[1]) ) ), name="fc3", @@ -142,8 +142,8 @@ class TestDistFleetHeterProgram(unittest.TestCase): size=2, activation="softmax", weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Normal( - scale=1 / math.sqrt(fc3.shape[1]) + initializer=paddle.nn.initializer.Normal( + std=1 / math.sqrt(fc3.shape[1]) ) ), ) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_minimize.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_minimize.py index b60ff0db63e7dd5c67b23fb3f18d3c2564e9d6a0..da63b75f50fa4057baac742403e8efbc88558623 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_minimize.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_minimize.py @@ -77,7 +77,7 @@ class TestPSMinimize(unittest.TestCase): input=q, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ 
-91,7 +91,7 @@ class TestPSMinimize(unittest.TestCase): x=q_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__q_fc__", learning_rate=base_lr, ), @@ -107,7 +107,7 @@ class TestPSMinimize(unittest.TestCase): input=pt, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -121,7 +121,7 @@ class TestPSMinimize(unittest.TestCase): x=pt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), @@ -136,7 +136,7 @@ class TestPSMinimize(unittest.TestCase): input=nt, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -150,7 +150,7 @@ class TestPSMinimize(unittest.TestCase): x=nt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py index a330b45b52228c8bc5ad2ddeeee7187a1705d2d1..ea30485e5aba0038bf8182a70e5f89199c623a4e 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py @@ -78,7 +78,7 @@ class TestPSPassWithBow(unittest.TestCase): is_distributed=is_distributed, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -93,7 +93,7 @@ class TestPSPassWithBow(unittest.TestCase): x=q_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__q_fc__", learning_rate=base_lr, ), @@ -110,7 +110,7 @@ class TestPSPassWithBow(unittest.TestCase): is_distributed=is_distributed, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -125,7 +125,7 @@ class TestPSPassWithBow(unittest.TestCase): x=pt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), @@ -141,7 +141,7 @@ class TestPSPassWithBow(unittest.TestCase): is_distributed=is_distributed, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -156,7 +156,7 @@ class TestPSPassWithBow(unittest.TestCase): x=nt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps11.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps11.py 
index 2143dc94d39e043e1a282fc679c794a231e60a01..861e015568370758bec8af30ba6ad95d7b6ae24d 100755 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps11.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps11.py @@ -77,7 +77,7 @@ class TestPSPassWithBow(unittest.TestCase): input=q, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -91,7 +91,7 @@ class TestPSPassWithBow(unittest.TestCase): x=q_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__q_fc__", learning_rate=base_lr, ), @@ -107,7 +107,7 @@ class TestPSPassWithBow(unittest.TestCase): input=pt, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -121,7 +121,7 @@ class TestPSPassWithBow(unittest.TestCase): x=pt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), @@ -136,7 +136,7 @@ class TestPSPassWithBow(unittest.TestCase): input=nt, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -150,7 +150,7 @@ class TestPSPassWithBow(unittest.TestCase): x=nt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py index bee3cd9eb2239e48d6df210eb1511445d25fbf1c..1ab2d5178241b5c00e62536d71b9a3e1185ea78c 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py @@ -80,7 +80,7 @@ class TestPSPassWithBow(unittest.TestCase): input=q, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -94,7 +94,7 @@ class TestPSPassWithBow(unittest.TestCase): x=q_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__q_fc__", learning_rate=base_lr, ), @@ -110,7 +110,7 @@ class TestPSPassWithBow(unittest.TestCase): input=pt, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -124,7 +124,7 @@ class TestPSPassWithBow(unittest.TestCase): x=pt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), @@ -139,7 +139,7 @@ class TestPSPassWithBow(unittest.TestCase): input=nt, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + 
initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -153,7 +153,7 @@ class TestPSPassWithBow(unittest.TestCase): x=nt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py index 58248d325b1452e0525f68f20276017e7ad7e814..b17451098f405553c5c4e58baf41b59bd72cab35 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py @@ -81,7 +81,7 @@ class TestPSPassWithBow(unittest.TestCase): input=q, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -95,7 +95,7 @@ class TestPSPassWithBow(unittest.TestCase): x=q_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__q_fc__", learning_rate=base_lr, ), @@ -111,7 +111,7 @@ class TestPSPassWithBow(unittest.TestCase): input=pt, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -125,7 +125,7 @@ class TestPSPassWithBow(unittest.TestCase): x=pt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), @@ -140,7 +140,7 @@ class TestPSPassWithBow(unittest.TestCase): input=nt, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -154,7 +154,7 @@ class TestPSPassWithBow(unittest.TestCase): x=nt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py index e207fb859de54a49a39977e6fdecad9938282a60..c9e6cb2035d699c6e58a913052374256f3f66a8b 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py @@ -80,7 +80,7 @@ class TestPSPassWithBow(unittest.TestCase): input=q, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -95,7 +95,7 @@ class TestPSPassWithBow(unittest.TestCase): x=q_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__q_fc__", learning_rate=base_lr, ), @@ -111,7 +111,7 @@ class TestPSPassWithBow(unittest.TestCase): input=pt, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", 
learning_rate=emb_lr, ), @@ -125,7 +125,7 @@ class TestPSPassWithBow(unittest.TestCase): x=pt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), @@ -140,7 +140,7 @@ class TestPSPassWithBow(unittest.TestCase): input=nt, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -154,7 +154,7 @@ class TestPSPassWithBow(unittest.TestCase): x=nt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py index 4093fc34cc998417fd0a187a43f2f42a84777791..2a5f845b93b6428e6034fd74d033bbdf2c51c0a3 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py @@ -78,7 +78,7 @@ class TestPSPassWithBow(unittest.TestCase): is_distributed=is_distributed, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -93,7 +93,7 @@ class TestPSPassWithBow(unittest.TestCase): x=q_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__q_fc__", learning_rate=base_lr, ), @@ -110,7 +110,7 @@ class TestPSPassWithBow(unittest.TestCase): is_distributed=is_distributed, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -125,7 +125,7 @@ class TestPSPassWithBow(unittest.TestCase): x=pt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), @@ -141,7 +141,7 @@ class TestPSPassWithBow(unittest.TestCase): is_distributed=is_distributed, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -156,7 +156,7 @@ class TestPSPassWithBow(unittest.TestCase): x=nt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py index 025b3e90b37d46a0f42eccafd8d6700a616d788c..094ea329672054995aef1aaa31237984b73116dc 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py @@ -77,7 +77,7 @@ class TestPSPassWithBow(unittest.TestCase): input=q, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ 
-91,7 +91,7 @@ class TestPSPassWithBow(unittest.TestCase): x=q_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__q_fc__", learning_rate=base_lr, ), @@ -107,7 +107,7 @@ class TestPSPassWithBow(unittest.TestCase): input=pt, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -121,7 +121,7 @@ class TestPSPassWithBow(unittest.TestCase): x=pt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), @@ -136,7 +136,7 @@ class TestPSPassWithBow(unittest.TestCase): input=nt, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -150,7 +150,7 @@ class TestPSPassWithBow(unittest.TestCase): x=nt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py index 51bf54b3241b488549012ef82626c5fc707e4ba9..40abc45e0ab32e7e35075dce94d1636a2858efa7 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py @@ -78,7 +78,7 @@ class TestPSPassWithBow(unittest.TestCase): is_distributed=is_distributed, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -93,7 +93,7 @@ class TestPSPassWithBow(unittest.TestCase): x=q_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__q_fc__", learning_rate=base_lr, ), @@ -110,7 +110,7 @@ class TestPSPassWithBow(unittest.TestCase): is_distributed=is_distributed, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -125,7 +125,7 @@ class TestPSPassWithBow(unittest.TestCase): x=pt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), @@ -141,7 +141,7 @@ class TestPSPassWithBow(unittest.TestCase): is_distributed=is_distributed, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__tmp_", learning_rate=emb_lr, ), @@ -156,7 +156,7 @@ class TestPSPassWithBow(unittest.TestCase): x=nt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py 
b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py index 165a8b6240aafac399dee3ad1803bc75b82b385c..a5811d4e0f12abb891119ab57297f3c95d1f3f65 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py @@ -77,7 +77,7 @@ class TestPSPassWithBow(unittest.TestCase): input=q, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -91,7 +91,7 @@ class TestPSPassWithBow(unittest.TestCase): x=q_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__q_fc__", learning_rate=base_lr, ), @@ -107,7 +107,7 @@ class TestPSPassWithBow(unittest.TestCase): input=pt, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -121,7 +121,7 @@ class TestPSPassWithBow(unittest.TestCase): x=pt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), @@ -136,7 +136,7 @@ class TestPSPassWithBow(unittest.TestCase): input=nt, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -150,7 +150,7 @@ class TestPSPassWithBow(unittest.TestCase): x=nt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_sparse_embedding_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_sparse_embedding_ctr.py index 517232fa54eb84647b01cd5d0e42822d5d24768a..fae692f8fd57c7e5c35afe12d7cb600dccda59f2 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_sparse_embedding_ctr.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_sparse_embedding_ctr.py @@ -211,7 +211,7 @@ class TestDistMnistAsync2x2WithGauss(TestFleetBase): datas = [dnn_data, lr_data, label] inference = True - init = fluid.initializer.Uniform() + init = paddle.nn.initializer.Uniform() dnn_layer_dims = [128, 64, 32] dnn_embedding = fluid.contrib.layers.sparse_embedding( @@ -232,7 +232,7 @@ class TestDistMnistAsync2x2WithGauss(TestFleetBase): size=dim, activation="relu", weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), name='dnn-fc-%d' % i, ) @@ -245,7 +245,7 @@ class TestDistMnistAsync2x2WithGauss(TestFleetBase): is_test=inference, param_attr=fluid.ParamAttr( name="wide_embedding", - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), ), ) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_spmt.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_spmt.py index ba6e67a035095e08c0430a04c4e6e149bdc42511..ebcbfb9e4c4a680a11531dab1f0c3a8623beb44a 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_spmt.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_spmt.py @@ -75,7 +75,7 
@@ class TestSPMT(unittest.TestCase): input=q, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -89,7 +89,7 @@ class TestSPMT(unittest.TestCase): x=q_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__q_fc__", learning_rate=base_lr, ), @@ -105,7 +105,7 @@ class TestSPMT(unittest.TestCase): input=pt, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -119,7 +119,7 @@ class TestSPMT(unittest.TestCase): x=pt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), @@ -134,7 +134,7 @@ class TestSPMT(unittest.TestCase): input=nt, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -148,7 +148,7 @@ class TestSPMT(unittest.TestCase): x=nt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), diff --git a/python/paddle/fluid/tests/unittests/test_dist_sparse_load_ps0.py b/python/paddle/fluid/tests/unittests/test_dist_sparse_load_ps0.py index 368be77fdbbfb923789ac1a4c87b26b6b3d4f7ad..5ab7ad21dbdc9371a16271fe64140148f0ec1c9c 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_sparse_load_ps0.py +++ b/python/paddle/fluid/tests/unittests/test_dist_sparse_load_ps0.py @@ -38,9 +38,7 @@ class SparseLoadOp(unittest.TestCase): size=[10, 10], param_attr=fluid.ParamAttr( name="embedding", - initializer=fluid.initializer.NumpyArrayInitializer( - emb_array - ), + initializer=paddle.nn.initializer.Assign(emb_array), ), ) @@ -50,9 +48,7 @@ class SparseLoadOp(unittest.TestCase): activation="relu", weight_attr=fluid.ParamAttr( name='fc', - initializer=fluid.initializer.NumpyArrayInitializer( - fc_array - ), + initializer=paddle.nn.initializer.Assign(fc_array), ), ) loss = paddle.mean(fc1) diff --git a/python/paddle/fluid/tests/unittests/test_dist_train.py b/python/paddle/fluid/tests/unittests/test_dist_train.py index 828b07baf7bbc9220a40a508bd5d8eb5eec7d613..548f2bf8a0c83d56a03b345dad24545b775e5a30 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_train.py +++ b/python/paddle/fluid/tests/unittests/test_dist_train.py @@ -87,7 +87,9 @@ class TestSendOp(unittest.TestCase): dtype='float32', name="X", ) - fluid.initializer.Constant(value=1.0)(x, main.global_block()) + paddle.nn.initializer.Constant(value=1.0)( + x, main.global_block() + ) ops._scale(x=x, scale=10.0, out=out_var) self.server_exe = fluid.Executor(place) @@ -108,7 +110,7 @@ class TestSendOp(unittest.TestCase): x = paddle.static.data(shape=[32, 32], dtype='float32', name='X') x.persistable = True - fluid.initializer.Constant(value=2.3)(x, main.global_block()) + paddle.nn.initializer.Constant(value=2.3)(x, main.global_block()) get_var = main.global_block().create_var( name="scale_0.tmp_0", # server side var @@ -116,7 +118,9 @@ class TestSendOp(unittest.TestCase): 
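# A minimal, self-contained sketch of the pattern TestSendOp switches to in the
# hunks above and below: a paddle.nn.initializer object is callable on a
# (variable, block) pair in static-graph mode, just like the old
# fluid.initializer classes. The program/variable names mirror the test itself.
import paddle

paddle.enable_static()
main = paddle.static.Program()
with paddle.static.program_guard(main):
    x = paddle.static.data(shape=[32, 32], dtype='float32', name='X')
    x.persistable = True
    # fill X with 2.3 by invoking the initializer directly on the variable
    paddle.nn.initializer.Constant(value=2.3)(x, main.global_block())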
persistable=False, shape=[32, 32], ) - fluid.initializer.Constant(value=2.3)(get_var, main.global_block()) + paddle.nn.initializer.Constant(value=2.3)( + get_var, main.global_block() + ) # NOTE(zjl): `Send` is async send, which means that the sent # variable would be needed even though `Send` op runs. @@ -135,7 +139,7 @@ class TestSendOp(unittest.TestCase): main = fluid.Program() with fluid.program_guard(main): x = paddle.static.data(shape=[32, 32], dtype='float32', name='X') - fluid.initializer.Constant(value=2.3)(x, main.global_block()) + paddle.nn.initializer.Constant(value=2.3)(x, main.global_block()) o = paddle.scale(x=x, scale=10.0) exe = fluid.Executor(place) self.local_out = exe.run(main, fetch_list=[o]) diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index e9b8f773c743bc8cef82694f2e1a682df3509620..e79a2f7276c00be600ec8567cd3c2a9f0653b082 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -356,7 +356,9 @@ class TestFakeInit(TranspilerTest): size=[dict_size, embedding_size], param_attr=fluid.ParamAttr( name='emb', - initializer=fluid.initializer.Uniform(-init_width, init_width), + initializer=paddle.nn.initializer.Uniform( + -init_width, init_width + ), ), ) @@ -365,7 +367,8 @@ class TestFakeInit(TranspilerTest): is_sparse=True, size=[dict_size, embedding_size], param_attr=fluid.ParamAttr( - name='emb_w', initializer=fluid.initializer.Constant(value=0.0) + name='emb_w', + initializer=paddle.nn.initializer.Constant(value=0.0), ), ) @@ -374,7 +377,8 @@ class TestFakeInit(TranspilerTest): is_sparse=True, size=[dict_size, 1], param_attr=fluid.ParamAttr( - name='emb_b', initializer=fluid.initializer.Constant(value=0.0) + name='emb_b', + initializer=paddle.nn.initializer.Constant(value=0.0), ), ) @@ -1327,7 +1331,7 @@ class TestRemoteNce(TestDistLookupTableBase): shape=[num_total_classes, 10], dtype='float32', name='nce_w', - initializer=fluid.initializer.ConstantInitializer(), + initializer=paddle.nn.initializer.Constant(), ) ) b_param = ( @@ -1337,7 +1341,7 @@ class TestRemoteNce(TestDistLookupTableBase): shape=[num_total_classes, 1], dtype='float32', name='nce_b', - initializer=fluid.initializer.ConstantInitializer(), + initializer=paddle.nn.initializer.Constant(), ) ) @@ -1405,7 +1409,7 @@ class TestRemoteHsigmoid(TestDistLookupTableBase): shape=[num_total_classes, 10], dtype='float32', name='hs_w', - initializer=fluid.initializer.ConstantInitializer(), + initializer=paddle.nn.initializer.Constant(), ) ) b_param = ( @@ -1415,7 +1419,7 @@ class TestRemoteHsigmoid(TestDistLookupTableBase): shape=[3, 1], dtype='float32', name='hs_b', - initializer=fluid.initializer.ConstantInitializer(), + initializer=paddle.nn.initializer.Constant(), ) ) @@ -1424,7 +1428,7 @@ class TestRemoteHsigmoid(TestDistLookupTableBase): is_sparse=is_sparse, size=[3, 3], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Normal( + initializer=paddle.nn.initializer.Normal( scale=1 / math.sqrt(num_total_classes) ) ), diff --git a/python/paddle/fluid/tests/unittests/test_dot_op.py b/python/paddle/fluid/tests/unittests/test_dot_op.py index d32057bfb0d2dd893722422775b64bbe6670b50d..aa61c1e177869024c23b8246049df894b6836357 100644 --- a/python/paddle/fluid/tests/unittests/test_dot_op.py +++ b/python/paddle/fluid/tests/unittests/test_dot_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest +from 
eager_op_test import OpTest import paddle import paddle.fluid as fluid @@ -38,7 +38,7 @@ class DotOp(OpTest): self.attrs = {} def test_check_output(self): - self.check_output(check_eager=True) + self.check_output() def test_check_grad_normal(self): if core.is_compiled_with_rocm(): @@ -46,10 +46,12 @@ class DotOp(OpTest): ['X', 'Y'], 'Out', user_defined_grads=[self.inputs['Y'], self.inputs['X']], - check_eager=True, ) else: - self.check_grad(['X', 'Y'], 'Out', check_eager=True) + self.check_grad( + ['X', 'Y'], + 'Out', + ) def test_check_grad_ingore_x(self): if core.is_compiled_with_rocm(): @@ -58,11 +60,12 @@ class DotOp(OpTest): 'Out', no_grad_set=set("X"), user_defined_grads=[self.inputs['X']], - check_eager=True, ) else: self.check_grad( - ['Y'], 'Out', no_grad_set=set("X"), check_eager=True + ['Y'], + 'Out', + no_grad_set=set("X"), ) def test_check_grad_ingore_y(self): @@ -72,11 +75,12 @@ class DotOp(OpTest): 'Out', no_grad_set=set('Y'), user_defined_grads=[self.inputs['Y']], - check_eager=True, ) else: self.check_grad( - ['X'], 'Out', no_grad_set=set('Y'), check_eager=True + ['X'], + 'Out', + no_grad_set=set('Y'), ) def init_input_output(self): @@ -187,7 +191,7 @@ class TestComplexDotOp(OpTest): self.grad_y = self.grad_out * np.conj(self.x) def test_check_output(self): - self.check_output(check_eager=True) + self.check_output() def test_check_grad_normal(self): self.check_grad( @@ -195,7 +199,6 @@ class TestComplexDotOp(OpTest): 'Out', user_defined_grads=[self.grad_x, self.grad_y], user_defined_grad_outputs=[self.grad_out], - check_eager=True, ) def test_check_grad_ingore_x(self): @@ -205,7 +208,6 @@ class TestComplexDotOp(OpTest): no_grad_set=set("X"), user_defined_grads=[self.grad_y], user_defined_grad_outputs=[self.grad_out], - check_eager=True, ) def test_check_grad_ingore_y(self): @@ -215,13 +217,13 @@ class TestComplexDotOp(OpTest): no_grad_set=set('Y'), user_defined_grads=[self.grad_x], user_defined_grad_outputs=[self.grad_out], - check_eager=True, ) class TestComplexDotOp2D(OpTest): def setUp(self): self.op_type = "dot" + self.python_api = paddle.dot self.init_base_dtype() self.init_input_output() self.init_grad_input_output() diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py index a12a17636bfc14dd2e83f365704421112d309967..46977b13d77001825891bdb7e3670433ef3362cf 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py @@ -41,7 +41,7 @@ def simple_fc_net(): size=200, activation='tanh', bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.0) + initializer=paddle.nn.initializer.Constant(value=1.0) ), ) prediction = paddle.static.nn.fc(hidden, size=10, activation='softmax') diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py index 5657eb174c30331838ece0977d6da423524a4323..05df1e96d75054ffe3f1c19f0fd1380d7d1187ed 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py @@ -129,7 +129,7 @@ def lm_model( [hidden_size * 2, hidden_size * 4], dtype="float32", name="fc_weight1_" + str(i), - default_initializer=fluid.initializer.UniformInitializer( + default_initializer=paddle.nn.initializer.Uniform( low=-init_scale, high=init_scale ), ) @@ -138,7 
+138,7 @@ def lm_model( [hidden_size * 4], dtype="float32", name="fc_bias1_" + str(i), - default_initializer=fluid.initializer.Constant(0.0), + default_initializer=paddle.nn.initializer.Constant(0.0), ) bias_arr.append(bias_1) @@ -250,7 +250,7 @@ def lm_model( [hidden_size * 2, hidden_size * 4], dtype="float32", name="fc_weight1_" + str(i), - default_initializer=fluid.initializer.UniformInitializer( + default_initializer=paddle.nn.initializer.Uniform( low=-init_scale, high=init_scale ), ) @@ -259,7 +259,7 @@ def lm_model( [hidden_size * 4], dtype="float32", name="fc_bias1_" + str(i), - default_initializer=fluid.initializer.Constant(0.0), + default_initializer=paddle.nn.initializer.Constant(0.0), ) bias_arr.append(bias_1) @@ -368,7 +368,7 @@ def lm_model( is_sparse=False, param_attr=fluid.ParamAttr( name='embedding_para', - initializer=fluid.initializer.UniformInitializer( + initializer=paddle.nn.initializer.Uniform( low=-init_scale, high=init_scale ), ), @@ -406,7 +406,7 @@ def lm_model( [hidden_size, vocab_size], dtype="float32", name="softmax_weight", - default_initializer=fluid.initializer.UniformInitializer( + default_initializer=paddle.nn.initializer.Uniform( low=-init_scale, high=init_scale ), ) @@ -414,7 +414,7 @@ def lm_model( [vocab_size], dtype="float32", name='softmax_bias', - default_initializer=fluid.initializer.UniformInitializer( + default_initializer=paddle.nn.initializer.Uniform( low=-init_scale, high=init_scale ), ) diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py index bd4e08819570f7838eeb51fa4861087a8d134d87..d9ce93c913017fb27025decc9ca0a9beba8e7e39 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py @@ -312,7 +312,7 @@ class EagerDeletionRecurrentOpTest2(EagerDeletionRecurrentOpTest1): size=self.input_dim, weight_attr=ParamAttr( name='W', - initializer=fluid.initializer.ConstantInitializer(1.0), + initializer=paddle.nn.initializer.Constant(1.0), ), bias_attr=False, ) @@ -321,7 +321,7 @@ class EagerDeletionRecurrentOpTest2(EagerDeletionRecurrentOpTest1): size=self.input_dim, weight_attr=ParamAttr( name='U', - initializer=fluid.initializer.ConstantInitializer(0.0), + initializer=paddle.nn.initializer.Constant(0.0), ), bias_attr=False, ) diff --git a/python/paddle/fluid/tests/unittests/test_eager_run_program.py b/python/paddle/fluid/tests/unittests/test_eager_run_program.py index 33472f85e7327fb652585b941222f55c4fd4d1cb..4863f46f48761b7eb053d0c1ace23d5db363a55d 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_run_program.py +++ b/python/paddle/fluid/tests/unittests/test_eager_run_program.py @@ -105,7 +105,7 @@ class TestRunProgram(unittest.TestCase): ) backward_program = _add_build_strategy_for( program, - main_program.desc.block(0).op_size() + 2, + main_program.desc.block(0).op_size() + 1, program.desc.block(0).op_size(), ) diff --git a/python/paddle/fluid/tests/unittests/test_egr_python_api.py b/python/paddle/fluid/tests/unittests/test_egr_python_api.py index 9f05d354c463de0a3ad8cc8592f3131bc296ad35..3fb03ac89f0d7979d9b459b6b6d7986b2fae4148 100644 --- a/python/paddle/fluid/tests/unittests/test_egr_python_api.py +++ b/python/paddle/fluid/tests/unittests/test_egr_python_api.py @@ -911,19 +911,19 @@ class EagerParamBaseUsageTestCase(unittest.TestCase): 1, 3, bias_attr=False, - weight_attr=paddle.fluid.initializer.Uniform(), + 
weight_attr=paddle.nn.initializer.Uniform(), ) linear3 = paddle.nn.Linear( 1, 3, bias_attr=False, - weight_attr=paddle.fluid.initializer.TruncatedNormalInitializer(), + weight_attr=paddle.nn.initializer.TruncatedNormal(), ) linear4 = paddle.nn.Linear( 1, 3, bias_attr=False, - weight_attr=paddle.fluid.initializer.MSRAInitializer(), + weight_attr=paddle.nn.initializer.KaimingUniform(), ) res = [ linear1.weight.numpy(), diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py index 3bf2b7cdcd703583e2cf0e00a9542b6130acef22..502ca504c1b8ebbe8cd3e4cc34cb62d64510bc85 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py @@ -15,15 +15,18 @@ import unittest import numpy as np +from eager_op_test import OpTest, convert_float_to_uint16, skip_check_grad_ci import paddle import paddle.fluid as fluid import paddle.fluid.core as core -from paddle.fluid.tests.unittests.op_test import ( - OpTest, - convert_float_to_uint16, - skip_check_grad_ci, -) + + +def broadcast_wrapper(shape=[1, 10, 12, 1]): + def add_wrapper(x, y, axis=-1): + return x + y.reshape(shape) + + return add_wrapper class TestElementwiseAddOp(OpTest): @@ -45,14 +48,13 @@ class TestElementwiseAddOp(OpTest): self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn} self.outputs = {'Out': self.out} - def check_eager(self): + def check_dygraph(self): return not self.use_mkldnn and self.axis == -1 def test_check_output(self): # TODO(wangzhongpu): support mkldnn op in dygraph mode self.check_output( - check_dygraph=(not self.use_mkldnn), - check_eager=self.check_eager(), + check_dygraph=self.check_dygraph(), ) def test_check_grad_normal(self): @@ -62,8 +64,7 @@ class TestElementwiseAddOp(OpTest): self.check_grad( ['X', 'Y'], 'Out', - check_dygraph=(not self.use_mkldnn), - check_eager=self.check_eager(), + check_dygraph=self.check_dygraph(), ) def test_check_grad_ingore_x(self): @@ -74,8 +75,7 @@ class TestElementwiseAddOp(OpTest): ['Y'], 'Out', no_grad_set=set("X"), - check_dygraph=(not self.use_mkldnn), - check_eager=self.check_eager(), + check_dygraph=self.check_dygraph(), ) def test_check_grad_ingore_y(self): @@ -86,8 +86,7 @@ class TestElementwiseAddOp(OpTest): ['X'], 'Out', no_grad_set=set('Y'), - check_dygraph=(not self.use_mkldnn), - check_eager=self.check_eager(), + check_dygraph=self.check_dygraph(), ) def init_input_output(self): @@ -136,7 +135,8 @@ class TestFP16ElementwiseAddOp(TestElementwiseAddOp): place = core.CUDAPlace(0) if core.is_float16_supported(place): self.check_output_with_place( - place, atol=1e-3, check_dygraph=(not self.use_mkldnn) + place, + atol=1e-3, ) @@ -149,6 +149,7 @@ class TestFP16ElementwiseAddOp(TestElementwiseAddOp): class TestBF16ElementwiseAddOp(OpTest): def setUp(self): self.op_type = "elementwise_add" + self.python_api = paddle.add self.dtype = np.uint16 self.x = np.random.uniform(0.1, 1, [13, 17]).astype(np.float32) @@ -170,23 +171,19 @@ class TestBF16ElementwiseAddOp(OpTest): def test_check_output(self): place = core.CUDAPlace(0) - self.check_output_with_place(place, check_eager=False) + self.check_output_with_place(place) def test_check_grad_normal(self): place = core.CUDAPlace(0) - self.check_grad_with_place(place, ['X', 'Y'], 'Out', check_eager=False) + self.check_grad_with_place(place, ['X', 'Y'], 'Out') def test_check_grad_ingore_x(self): place = core.CUDAPlace(0) - self.check_grad_with_place( - place, ['Y'], 'Out', 
no_grad_set=set("X"), check_eager=False - ) + self.check_grad_with_place(place, ['Y'], 'Out', no_grad_set=set("X")) def test_check_grad_ingore_y(self): place = core.CUDAPlace(0) - self.check_grad_with_place( - place, ['X'], 'Out', no_grad_set=set('Y'), check_eager=False - ) + self.check_grad_with_place(place, ['X'], 'Out', no_grad_set=set('Y')) @skip_check_grad_ci( @@ -248,6 +245,7 @@ class TestElementwiseAddOp_broadcast_0(TestElementwiseAddOp): self.x = np.random.rand(100, 2, 3).astype(self.dtype) self.y = np.random.rand(100).astype(self.dtype) self.out = self.x + self.y.reshape(100, 1, 1) + self.python_api = broadcast_wrapper(shape=[100, 1, 1]) def init_axis(self): self.axis = 0 @@ -258,6 +256,7 @@ class TestFP16ElementwiseAddOp_broadcast_0(TestFP16ElementwiseAddOp): self.x = np.random.rand(100, 2, 3).astype(self.dtype) self.y = np.random.rand(100).astype(self.dtype) self.out = self.x + self.y.reshape(100, 1, 1) + self.python_api = broadcast_wrapper(shape=[100, 1, 1]) def init_axis(self): self.axis = 0 @@ -268,6 +267,7 @@ class TestElementwiseAddOp_broadcast_1(TestElementwiseAddOp): self.x = np.random.rand(2, 100, 3).astype(self.dtype) self.y = np.random.rand(100).astype(self.dtype) self.out = self.x + self.y.reshape(1, 100, 1) + self.python_api = broadcast_wrapper(shape=[1, 100, 1]) def init_axis(self): self.axis = 1 @@ -278,6 +278,7 @@ class TestFP16ElementwiseAddOp_broadcast_1(TestFP16ElementwiseAddOp): self.x = np.random.rand(2, 100, 3).astype(self.dtype) self.y = np.random.rand(100).astype(self.dtype) self.out = self.x + self.y.reshape(1, 100, 1) + self.python_api = broadcast_wrapper(shape=[1, 100, 1]) def init_axis(self): self.axis = 1 @@ -288,6 +289,7 @@ class TestElementwiseAddOp_broadcast_2(TestElementwiseAddOp): self.x = np.random.rand(2, 3, 100).astype(self.dtype) self.y = np.random.rand(100).astype(self.dtype) self.out = self.x + self.y.reshape(1, 1, 100) + self.python_api = broadcast_wrapper(shape=[1, 1, 100]) class TestFP16ElementwiseAddOp_broadcast_2(TestFP16ElementwiseAddOp): @@ -295,6 +297,7 @@ class TestFP16ElementwiseAddOp_broadcast_2(TestFP16ElementwiseAddOp): self.x = np.random.rand(2, 3, 100).astype(self.dtype) self.y = np.random.rand(100).astype(self.dtype) self.out = self.x + self.y.reshape(1, 1, 100) + self.python_api = broadcast_wrapper(shape=[1, 1, 100]) class TestElementwiseAddOp_broadcast_3(TestElementwiseAddOp): @@ -302,6 +305,7 @@ class TestElementwiseAddOp_broadcast_3(TestElementwiseAddOp): self.x = np.random.rand(2, 10, 12, 1).astype(self.dtype) self.y = np.random.rand(10, 12).astype(self.dtype) self.out = self.x + self.y.reshape(1, 10, 12, 1) + self.python_api = broadcast_wrapper(shape=[1, 10, 12, 1]) def init_axis(self): self.axis = 1 @@ -312,6 +316,7 @@ class TestFP16ElementwiseAddOp_broadcast_3(TestFP16ElementwiseAddOp): self.x = np.random.rand(2, 10, 12, 3).astype(self.dtype) self.y = np.random.rand(10, 12).astype(self.dtype) self.out = self.x + self.y.reshape(1, 10, 12, 1) + self.python_api = broadcast_wrapper(shape=[1, 10, 12, 1]) def init_axis(self): self.axis = 1 @@ -322,6 +327,7 @@ class TestElementwiseAddOp_broadcast_4(TestElementwiseAddOp): self.x = np.random.rand(100, 2, 1, 2).astype(self.dtype) self.y = np.random.rand(100, 1).astype(self.dtype) self.out = self.x + self.y.reshape(100, 1, 1, 1) + self.python_api = broadcast_wrapper(shape=[100, 1, 1, 1]) def init_axis(self): self.axis = 0 @@ -332,6 +338,7 @@ class TestFP16ElementwiseAddOp_broadcast_4(TestFP16ElementwiseAddOp): self.x = np.random.rand(100, 2, 1, 2).astype(self.dtype) self.y = 
np.random.rand(100, 1).astype(self.dtype) self.out = self.x + self.y.reshape(100, 1, 1, 1) + self.python_api = broadcast_wrapper(shape=[100, 1, 1, 1]) def init_axis(self): self.axis = 0 @@ -597,6 +604,7 @@ class TestAddInplaceBroadcastError3(TestAddInplaceBroadcastError): class TestComplexElementwiseAddOp(OpTest): def setUp(self): self.op_type = "elementwise_add" + self.python_api = paddle.add self.dtype = np.float64 self.shape = (2, 3, 4, 5) self.init_input_output() @@ -629,7 +637,7 @@ class TestComplexElementwiseAddOp(OpTest): self.grad_y = self.grad_out def test_check_output(self): - self.check_output(check_eager=False) + self.check_output() def test_check_grad_normal(self): self.check_grad( diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py index 943486827237d9f36e5fb23e6ce8566ed1a14d2b..c17a41b0bfad58e80f438da5ec7aacddf2bac5f1 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py @@ -15,13 +15,20 @@ import unittest import numpy as np -from op_test import OpTest, convert_float_to_uint16, skip_check_grad_ci +from eager_op_test import OpTest, convert_float_to_uint16, skip_check_grad_ci import paddle from paddle import fluid from paddle.fluid import core +def broadcast_wrapper(shape=[1, 10, 12, 1]): + def div_wrapper(x, y, axis=-1): + return paddle.divide(x, y.reshape(shape)) + + return div_wrapper + + class ElementwiseDivOp(OpTest): def setUp(self): self.op_type = "elementwise_div" @@ -193,6 +200,7 @@ class TestElementwiseDivOpBroadcast0(ElementwiseDivOp): self.x_shape = [100, 3, 4] self.y_shape = [100] self.attrs = {'axis': 0} + self.python_api = broadcast_wrapper(shape=[100, 1, 1]) def compute_output(self, x, y): return x / y.reshape(100, 1, 1) @@ -209,6 +217,7 @@ class TestElementwiseDivOpBroadcast1(ElementwiseDivOp): self.x_shape = [2, 100, 4] self.y_shape = [100] self.attrs = {'axis': 1} + self.python_api = broadcast_wrapper(shape=[1, 100, 1]) def compute_output(self, x, y): return x / y.reshape(1, 100, 1) @@ -224,6 +233,7 @@ class TestElementwiseDivOpBroadcast2(ElementwiseDivOp): def init_shape(self): self.x_shape = [2, 3, 100] self.y_shape = [100] + self.python_api = broadcast_wrapper(shape=[1, 1, 100]) def compute_output(self, x, y): return x / y.reshape(1, 1, 100) @@ -240,6 +250,7 @@ class TestElementwiseDivOpBroadcast3(ElementwiseDivOp): self.x_shape = [2, 10, 12, 5] self.y_shape = [10, 12] self.attrs = {'axis': 1} + self.python_api = broadcast_wrapper(shape=[1, 10, 12, 1]) def compute_output(self, x, y): return x / y.reshape(1, 10, 12, 1) @@ -393,7 +404,7 @@ class TestComplexElementwiseDivOp(OpTest): self.grad_y = -self.grad_out * np.conj(self.x / self.y / self.y) def test_check_output(self): - self.check_output(check_eager=False) + self.check_output() def test_check_grad_normal(self): self.check_grad( diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py index c9835b5cb1566f671154a191f34ac9912bc20b50..02f1d1dd6d275b89e10261dab6e60e3af1261668 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest, skip_check_grad_ci +from eager_op_test import OpTest, skip_check_grad_ci import paddle import paddle.fluid as fluid @@ -25,6 +25,13 @@ from paddle 
import _legacy_C_ops paddle.enable_static() +def broadcast_wrapper(shape=[1, 10, 12, 1]): + def min_wrapper(x, y, axis=-1): + return paddle.minimum(x, y.reshape(shape)) + + return min_wrapper + + class TestElementwiseOp(OpTest): def setUp(self): self.op_type = "elementwise_min" @@ -39,16 +46,10 @@ class TestElementwiseOp(OpTest): self.outputs = {'Out': np.minimum(self.inputs['X'], self.inputs['Y'])} def test_check_output(self): - if hasattr(self, 'attrs'): - self.check_output(check_eager=False) - else: - self.check_output(check_eager=True) + self.check_output() def test_check_grad_normal(self): - if hasattr(self, 'attrs'): - self.check_grad(['X', 'Y'], 'Out', check_eager=False) - else: - self.check_grad(['X', 'Y'], 'Out', check_eager=True) + self.check_grad(['X', 'Y'], 'Out') def test_check_grad_ingore_x(self): self.check_grad( @@ -118,7 +119,7 @@ class TestElementwiseMinOp_Vector(TestElementwiseOp): class TestElementwiseMinOp_broadcast_0(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_min" - self.python_api = paddle.minimum + self.python_api = broadcast_wrapper(shape=[100, 1, 1]) x = np.random.uniform(0.5, 1, (100, 3, 2)).astype(np.float64) sgn = np.random.choice([-1, 1], (100,)).astype(np.float64) y = x[:, 0, 0] + sgn * np.random.uniform(1, 2, (100,)).astype( @@ -137,7 +138,7 @@ class TestElementwiseMinOp_broadcast_0(TestElementwiseOp): class TestElementwiseMinOp_broadcast_1(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_min" - self.python_api = paddle.minimum + self.python_api = broadcast_wrapper(shape=[1, 100, 1]) x = np.random.uniform(0.5, 1, (2, 100, 3)).astype(np.float64) sgn = np.random.choice([-1, 1], (100,)).astype(np.float64) y = x[0, :, 0] + sgn * np.random.uniform(1, 2, (100,)).astype( @@ -156,7 +157,7 @@ class TestElementwiseMinOp_broadcast_1(TestElementwiseOp): class TestElementwiseMinOp_broadcast_2(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_min" - self.python_api = paddle.minimum + self.python_api = broadcast_wrapper(shape=[1, 1, 100]) x = np.random.uniform(0.5, 1, (2, 3, 100)).astype(np.float64) sgn = np.random.choice([-1, 1], (100,)).astype(np.float64) y = x[0, 0, :] + sgn * np.random.uniform(1, 2, (100,)).astype( @@ -174,7 +175,7 @@ class TestElementwiseMinOp_broadcast_2(TestElementwiseOp): class TestElementwiseMinOp_broadcast_3(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_min" - self.python_api = paddle.minimum + self.python_api = broadcast_wrapper(shape=[1, 25, 4, 1]) x = np.random.uniform(0.5, 1, (2, 25, 4, 1)).astype(np.float64) sgn = np.random.choice([-1, 1], (25, 4)).astype(np.float64) y = x[0, :, :, 0] + sgn * np.random.uniform(1, 2, (25, 4)).astype( diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py index e34d9d0dfd32bf7c594b189736bfb22caae22d4f..4fe6a15ef8efc15f78946c248a1ea155a4e896e0 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py @@ -15,14 +15,24 @@ import unittest import numpy as np +from eager_op_test import OpTest, convert_float_to_uint16, skip_check_grad_ci import paddle import paddle.fluid.core as core -from paddle.fluid.tests.unittests.op_test import ( - OpTest, - convert_float_to_uint16, - skip_check_grad_ci, -) + + +def mul(x, y, axis=-1, use_mkldnn=False): + return x * y + + +setattr(paddle, "mul", mul) + + +def broadcast_wrapper(shape=[1, 10, 12, 1]): + def mul_wrapper(x, y, axis=-1): + 
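# y is reshaped to `shape` so that plain `*` broadcasting reproduces what the
# old integer `axis` attribute expressed; `axis` is accepted but unused here.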
return x * y.reshape(shape) + + return mul_wrapper class ElementwiseMulOp(OpTest): @@ -31,6 +41,7 @@ class ElementwiseMulOp(OpTest): def setUp(self): self.op_type = "elementwise_mul" + self.python_api = paddle.mul self.dtype = np.float64 self.axis = -1 self.init_dtype() @@ -107,6 +118,7 @@ class TestElementwiseMulOp_ZeroDim3(ElementwiseMulOp): class TestBF16ElementwiseMulOp(OpTest): def setUp(self): self.op_type = "elementwise_mul" + self.python_api = paddle.mul self.dtype = np.uint16 self.x = np.random.uniform(0.1, 1, [13, 17]).astype(np.float32) @@ -145,6 +157,7 @@ class TestBF16ElementwiseMulOp(OpTest): class TestElementwiseMulOp_scalar(ElementwiseMulOp): def setUp(self): self.op_type = "elementwise_mul" + self.python_api = paddle.mul self.inputs = { 'X': np.random.rand(10, 3, 4).astype(np.float64), 'Y': np.random.rand(1).astype(np.float64), @@ -156,6 +169,7 @@ class TestElementwiseMulOp_scalar(ElementwiseMulOp): class TestElementwiseMulOp_Vector(ElementwiseMulOp): def setUp(self): self.op_type = "elementwise_mul" + self.python_api = paddle.mul self.inputs = { 'X': np.random.random((100,)).astype("float64"), 'Y': np.random.random((100,)).astype("float64"), @@ -168,6 +182,7 @@ class TestElementwiseMulOp_broadcast_0(ElementwiseMulOp): def init_input_output(self): self.x = np.random.rand(100, 2, 3).astype(self.dtype) self.y = np.random.rand(100).astype(self.dtype) + self.python_api = broadcast_wrapper(shape=[100, 1, 1]) self.out = self.x * self.y.reshape(100, 1, 1) def init_axis(self): @@ -177,6 +192,7 @@ class TestElementwiseMulOp_broadcast_0(ElementwiseMulOp): class TestElementwiseMulOp_broadcast_1(ElementwiseMulOp): def setUp(self): self.op_type = "elementwise_mul" + self.python_api = broadcast_wrapper(shape=[1, 100, 1]) self.inputs = { 'X': np.random.rand(2, 100, 3).astype(np.float64), 'Y': np.random.rand(100).astype(np.float64), @@ -192,6 +208,7 @@ class TestElementwiseMulOp_broadcast_1(ElementwiseMulOp): class TestElementwiseMulOp_broadcast_2(ElementwiseMulOp): def setUp(self): self.op_type = "elementwise_mul" + self.python_api = broadcast_wrapper(shape=[1, 1, 100]) self.inputs = { 'X': np.random.rand(2, 3, 100).astype(np.float64), 'Y': np.random.rand(100).astype(np.float64), @@ -206,6 +223,7 @@ class TestElementwiseMulOp_broadcast_2(ElementwiseMulOp): class TestElementwiseMulOp_broadcast_3(ElementwiseMulOp): def setUp(self): self.op_type = "elementwise_mul" + self.python_api = broadcast_wrapper(shape=[1, 10, 12, 1]) self.inputs = { 'X': np.random.rand(2, 10, 12, 3).astype(np.float64), 'Y': np.random.rand(10, 12).astype(np.float64), @@ -221,6 +239,7 @@ class TestElementwiseMulOp_broadcast_3(ElementwiseMulOp): class TestElementwiseMulOp_broadcast_4(ElementwiseMulOp): def setUp(self): self.op_type = "elementwise_mul" + self.python_api = paddle.mul self.inputs = { 'X': np.random.rand(10, 2, 11).astype(np.float64), 'Y': np.random.rand(10, 1, 11).astype(np.float64), @@ -232,6 +251,7 @@ class TestElementwiseMulOp_broadcast_4(ElementwiseMulOp): class TestElementwiseMulOp_broadcast_5(ElementwiseMulOp): def setUp(self): self.op_type = "elementwise_mul" + self.python_api = paddle.mul self.inputs = { 'X': np.random.rand(10, 4, 2, 3).astype(np.float64), 'Y': np.random.rand(10, 4, 1, 3).astype(np.float64), @@ -251,6 +271,7 @@ class TestElementwiseMulOpFp16(ElementwiseMulOp): class TestElementwiseMulOp_commonuse_1(ElementwiseMulOp): def setUp(self): self.op_type = "elementwise_mul" + self.python_api = paddle.mul self.inputs = { 'X': np.random.rand(2, 3, 100).astype(np.float64), 'Y': 
np.random.rand(1, 1, 100).astype(np.float64), @@ -262,6 +283,7 @@ class TestElementwiseMulOp_commonuse_1(ElementwiseMulOp): class TestElementwiseMulOp_commonuse_2(ElementwiseMulOp): def setUp(self): self.op_type = "elementwise_mul" + self.python_api = paddle.mul self.inputs = { 'X': np.random.rand(30, 3, 1, 5).astype(np.float64), 'Y': np.random.rand(30, 1, 4, 1).astype(np.float64), @@ -273,6 +295,7 @@ class TestElementwiseMulOp_commonuse_2(ElementwiseMulOp): class TestElementwiseMulOp_xsize_lessthan_ysize(ElementwiseMulOp): def setUp(self): self.op_type = "elementwise_mul" + self.python_api = paddle.mul self.inputs = { 'X': np.random.rand(10, 10).astype(np.float64), 'Y': np.random.rand(2, 2, 10, 10).astype(np.float64), @@ -289,6 +312,7 @@ class TestElementwiseMulOp_xsize_lessthan_ysize(ElementwiseMulOp): class TestComplexElementwiseMulOp(OpTest): def setUp(self): self.op_type = "elementwise_mul" + self.python_api = paddle.mul self.init_base_dtype() self.init_input_output() self.init_grad_input_output() diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py index 398ef711e2feaa0c162a3ad7a76fbdb93d9bed8d..1391dd2e9da5eb7e8d49007ff24a75defb15f58f 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py @@ -15,15 +15,26 @@ import unittest import numpy as np -from op_test import OpTest, convert_float_to_uint16, skip_check_grad_ci +from eager_op_test import OpTest, convert_float_to_uint16, skip_check_grad_ci import paddle import paddle.fluid as fluid +def sub_wrapper(shape=None): + def inner_wrapper(x, y, axis=-1): + if shape is None: + return x - y + else: + return x - y.reshape(shape) + + return inner_wrapper + + class TestElementwiseOp(OpTest): def setUp(self): self.op_type = "elementwise_sub" + self.python_api = sub_wrapper() self.inputs = { 'X': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype("float64"), 'Y': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype("float64"), @@ -50,6 +61,7 @@ class TestElementwiseOp(OpTest): class TestElementwiseSubOp_ZeroDim1(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_sub" + self.python_api = sub_wrapper() self.inputs = { 'X': np.random.uniform(0.1, 1, []).astype("float64"), 'Y': np.random.uniform(0.1, 1, []).astype("float64"), @@ -60,6 +72,7 @@ class TestElementwiseSubOp_ZeroDim1(TestElementwiseOp): class TestElementwiseSubOp_ZeroDim2(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_sub" + self.python_api = sub_wrapper() self.inputs = { 'X': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype("float64"), 'Y': np.random.uniform(0.1, 1, []).astype("float64"), @@ -70,6 +83,7 @@ class TestElementwiseSubOp_ZeroDim2(TestElementwiseOp): class TestElementwiseSubOp_ZeroDim3(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_sub" + self.python_api = sub_wrapper() self.inputs = { 'X': np.random.uniform(0.1, 1, []).astype("float64"), 'Y': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype("float64"), @@ -80,6 +94,7 @@ class TestElementwiseSubOp_ZeroDim3(TestElementwiseOp): class TestBF16ElementwiseOp(OpTest): def setUp(self): self.op_type = "elementwise_sub" + self.python_api = sub_wrapper() self.dtype = np.uint16 x = np.random.uniform(0.1, 1, [13, 17]).astype(np.float32) y = np.random.uniform(0.1, 1, [13, 17]).astype(np.float32) @@ -110,6 +125,7 @@ class TestBF16ElementwiseOp(OpTest): class TestElementwiseSubOp_scalar(TestElementwiseOp): def setUp(self): 
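# Under eager_op_test, each case registers a dygraph reference through
# self.python_api (sub_wrapper below); the check_eager switches used with the
# old op_test are no longer needed.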
self.op_type = "elementwise_sub" + self.python_api = sub_wrapper() self.inputs = { 'X': np.random.rand(10, 3, 4).astype(np.float64), 'Y': np.random.rand(1).astype(np.float64), @@ -120,6 +136,7 @@ class TestElementwiseSubOp_scalar(TestElementwiseOp): class TestElementwiseSubOp_Vector(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_sub" + self.python_api = sub_wrapper() self.inputs = { 'X': np.random.random((100,)).astype("float64"), 'Y': np.random.random((100,)).astype("float64"), @@ -130,6 +147,7 @@ class TestElementwiseSubOp_Vector(TestElementwiseOp): class TestElementwiseSubOp_broadcast_0(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_sub" + self.python_api = sub_wrapper(shape=[100, 1, 1]) self.inputs = { 'X': np.random.rand(100, 3, 2).astype(np.float64), 'Y': np.random.rand(100).astype(np.float64), @@ -144,6 +162,7 @@ class TestElementwiseSubOp_broadcast_0(TestElementwiseOp): class TestElementwiseSubOp_broadcast_1(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_sub" + self.python_api = sub_wrapper(shape=[1, 100, 1]) self.inputs = { 'X': np.random.rand(2, 100, 3).astype(np.float64), 'Y': np.random.rand(100).astype(np.float64), @@ -158,6 +177,7 @@ class TestElementwiseSubOp_broadcast_1(TestElementwiseOp): class TestElementwiseSubOp_broadcast_2(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_sub" + self.python_api = sub_wrapper(shape=[1, 1, 100]) self.inputs = { 'X': np.random.rand(2, 3, 100).astype(np.float64), 'Y': np.random.rand(100).astype(np.float64), @@ -171,6 +191,7 @@ class TestElementwiseSubOp_broadcast_2(TestElementwiseOp): class TestElementwiseSubOp_broadcast_3(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_sub" + self.python_api = sub_wrapper(shape=[1, 10, 12, 1]) self.inputs = { 'X': np.random.rand(2, 10, 12, 3).astype(np.float64), 'Y': np.random.rand(10, 12).astype(np.float64), @@ -185,6 +206,7 @@ class TestElementwiseSubOp_broadcast_3(TestElementwiseOp): class TestElementwiseSubOp_broadcast_4(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_sub" + self.python_api = sub_wrapper() self.inputs = { 'X': np.random.rand(2, 5, 3, 12).astype(np.float64), 'Y': np.random.rand(2, 5, 1, 12).astype(np.float64), @@ -195,6 +217,7 @@ class TestElementwiseSubOp_broadcast_4(TestElementwiseOp): class TestElementwiseSubOp_commonuse_1(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_sub" + self.python_api = sub_wrapper() self.inputs = { 'X': np.random.rand(2, 3, 100).astype(np.float64), 'Y': np.random.rand(1, 1, 100).astype(np.float64), @@ -205,6 +228,7 @@ class TestElementwiseSubOp_commonuse_1(TestElementwiseOp): class TestElementwiseSubOp_commonuse_2(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_sub" + self.python_api = sub_wrapper() self.inputs = { 'X': np.random.rand(10, 3, 1, 4).astype(np.float64), 'Y': np.random.rand(10, 1, 12, 1).astype(np.float64), @@ -215,6 +239,11 @@ class TestElementwiseSubOp_commonuse_2(TestElementwiseOp): class TestElementwiseSubOp_xsize_lessthan_ysize(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_sub" + + def sub_func(x, y, axis=2): + return x.reshape([1, 1, 10, 12]) - y + + self.python_api = sub_func self.inputs = { 'X': np.random.rand(10, 12).astype(np.float64), 'Y': np.random.rand(2, 3, 10, 12).astype(np.float64), @@ -230,6 +259,7 @@ class TestElementwiseSubOp_xsize_lessthan_ysize(TestElementwiseOp): class TestComplexElementwiseSubOp(OpTest): def setUp(self): self.op_type = "elementwise_sub" + 
self.python_api = sub_wrapper() self.dtype = np.float64 self.shape = (2, 3, 4, 5) self.init_input_output() diff --git a/python/paddle/fluid/tests/unittests/test_fill_any_like_op.py b/python/paddle/fluid/tests/unittests/test_fill_any_like_op.py index 9fa333d623bf918df50c6f742ed23c425953f8c9..d87b47270d4fa70afd3deeeb86fcef2fe795cbab 100644 --- a/python/paddle/fluid/tests/unittests/test_fill_any_like_op.py +++ b/python/paddle/fluid/tests/unittests/test_fill_any_like_op.py @@ -15,15 +15,21 @@ import unittest import numpy as np -from op_test import OpTest, convert_float_to_uint16 +from eager_op_test import OpTest, convert_float_to_uint16 import paddle import paddle.fluid.core as core +def fill_any_like_wrapper(x, value): + x.fill_(value) + return x + + class TestFillAnyLikeOp(OpTest): def setUp(self): self.op_type = "fill_any_like" + self.python_api = fill_any_like_wrapper self.dtype = np.int32 self.value = 0.0 self.init() @@ -50,6 +56,7 @@ class TestFillAnyLikeOpFloat32(TestFillAnyLikeOp): class TestFillAnyLikeOpBfloat16(OpTest): def setUp(self): self.op_type = "fill_any_like" + self.python_api = fill_any_like_wrapper self.dtype = np.uint16 self.value = 0.0 self.inputs = {'X': np.random.random((219, 232)).astype(np.float32)} @@ -83,6 +90,7 @@ class TestFillAnyLikeOpValue3(TestFillAnyLikeOp): class TestFillAnyLikeOpType(TestFillAnyLikeOp): def setUp(self): self.op_type = "fill_any_like" + self.python_api = fill_any_like_wrapper self.dtype = np.int32 self.value = 0.0 self.init() diff --git a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py index 38ef0379747dbaa5aa2709efbc3033b9b1966b55..3151744aa4cc1f08a19437860598337c32059fd4 100644 --- a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py +++ b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest, convert_float_to_uint16 +from eager_op_test import OpTest, convert_float_to_uint16 import paddle import paddle.fluid as fluid @@ -24,11 +24,17 @@ from paddle.fluid import Program, program_guard from paddle.fluid.op import Operator +def fill_wrapper(shape, value=0.0): + out = paddle.full(shape=shape, fill_value=value) + return out + + # Situation 1: Attr(shape) is a list(without tensor) class TestFillConstantOp1(OpTest): def setUp(self): '''Test fill_constant op with specified value''' self.op_type = "fill_constant" + self.python_api = fill_wrapper self.inputs = {} self.attrs = {'shape': [123, 92], 'value': 3.8} @@ -42,6 +48,7 @@ class TestFillConstantOp2(OpTest): def setUp(self): '''Test fill_constant op with default value''' self.op_type = "fill_constant" + self.python_api = fill_wrapper self.inputs = {} self.attrs = {'shape': [123, 92]} @@ -55,6 +62,7 @@ class TestFillConstantOp3(OpTest): def setUp(self): '''Test fill_constant op with specified int64 value''' self.op_type = "fill_constant" + self.python_api = fill_wrapper self.inputs = {} self.attrs = {'shape': [123, 92], 'value': 10000000000} @@ -68,6 +76,7 @@ class TestFillConstantOp4(OpTest): def setUp(self): '''Test fill_constant op with specified int value''' self.op_type = "fill_constant" + self.python_api = fill_wrapper self.inputs = {} self.attrs = {'shape': [123, 92], 'value': 3} @@ -84,6 +93,7 @@ class TestFillConstantBF16Op(OpTest): def setUp(self): '''Test fill_constant op with specified value''' self.op_type = "fill_constant" + self.python_api = fill_wrapper self.dtype = np.uint16 self.inputs = {} self.attrs = { @@ 
-130,6 +140,7 @@ class TestFillConstantOp1_ShapeTensorList(OpTest): def setUp(self): '''Test fill_constant op with specified value''' self.op_type = "fill_constant" + self.python_api = fill_wrapper self.init_data() shape_tensor_list = [] for index, ele in enumerate(self.shape): @@ -154,6 +165,7 @@ class TestFillConstantOp2_ShapeTensorList(OpTest): def setUp(self): '''Test fill_constant op with default value''' self.op_type = "fill_constant" + self.python_api = fill_wrapper self.init_data() shape_tensor_list = [] for index, ele in enumerate(self.shape): @@ -192,6 +204,7 @@ class TestFillConstantOp1_ShapeTensor(OpTest): def setUp(self): '''Test fill_constant op with specified value''' self.op_type = "fill_constant" + self.python_api = fill_wrapper self.init_data() self.inputs = {"ShapeTensor": np.array(self.shape).astype("int32")} @@ -211,6 +224,7 @@ class TestFillConstantOp1_ValueTensor(OpTest): def setUp(self): '''Test fill_constant op with specified value''' self.op_type = "fill_constant" + self.python_api = fill_wrapper self.init_data() self.inputs = { @@ -234,6 +248,7 @@ class TestFillConstantOp2_ValueTensor(OpTest): def setUp(self): '''Test fill_constant op with specified value''' self.op_type = "fill_constant" + self.python_api = fill_wrapper self.init_data() self.inputs = { @@ -452,6 +467,7 @@ class TestFillConstantOp_ValueTensorBf16(OpTest): def setUp(self): '''Test fill_constant op with specified value''' self.op_type = "fill_constant" + self.python_api = fill_wrapper self.init_data() self.inputs = { @@ -470,7 +486,8 @@ class TestFillConstantOp_ValueTensorBf16(OpTest): self.mkldnn_data_type = "bfloat16" def test_check_output(self): - self.check_output_with_place(core.CPUPlace()) + # no dynamic graph test for mkldnn + self.check_output_with_place(core.CPUPlace(), check_dygraph=False) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_fleet_executor_cond_interceptor.py b/python/paddle/fluid/tests/unittests/test_fleet_executor_cond_interceptor.py index 1ca8c869a96bdfbc2847df0aa81101a28a9e3042..f6418cdee2ccecda52f94b74c388233b3b8d8032 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_executor_cond_interceptor.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_executor_cond_interceptor.py @@ -165,19 +165,24 @@ class TestFleetExecutor(unittest.TestCase): lazy_initialize=True, ) + infinite_buff_size = -1 task_a.add_downstream_task(task_b.task_id(), 2) task_b.add_upstream_task(task_a.task_id(), 2) - task_b.add_downstream_task(task_c.task_id(), 100) - task_c.add_upstream_task(task_b.task_id(), 100) + task_b.add_downstream_task(task_c.task_id(), infinite_buff_size) + task_c.add_upstream_task(task_b.task_id(), infinite_buff_size) task_c.add_downstream_task(task_d.task_id(), 2) task_d.add_upstream_task(task_c.task_id(), 2) - task_d.add_downstream_task(task_b.task_id(), 100, core.DependType.LOOP) - task_b.add_upstream_task(task_d.task_id(), 100, core.DependType.LOOP) + task_d.add_downstream_task( + task_b.task_id(), infinite_buff_size, core.DependType.LOOP + ) + task_b.add_upstream_task( + task_d.task_id(), infinite_buff_size, core.DependType.LOOP + ) task_b.add_downstream_task( - task_e.task_id(), 100, core.DependType.STOP_LOOP + task_e.task_id(), infinite_buff_size, core.DependType.STOP_LOOP ) task_e.add_upstream_task( - task_b.task_id(), 100, core.DependType.STOP_LOOP + task_b.task_id(), infinite_buff_size, core.DependType.STOP_LOOP ) main_program._pipeline_opt = { diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv2d.py 
b/python/paddle/fluid/tests/unittests/test_functional_conv2d.py index ce83ba62acb97c8155bcb85ce4731b404ebfaf19..00cc6c07aac8bca521bad0e59e84e0db046db2ce 100644 --- a/python/paddle/fluid/tests/unittests/test_functional_conv2d.py +++ b/python/paddle/fluid/tests/unittests/test_functional_conv2d.py @@ -19,7 +19,6 @@ import numpy as np import paddle import paddle.fluid.dygraph as dg -import paddle.fluid.initializer as I import paddle.nn.functional as F from paddle import fluid @@ -97,10 +96,10 @@ class TestFunctionalConv2D(TestCase): padding=self.padding, dilation=self.dilation, groups=self.groups, - param_attr=I.NumpyArrayInitializer(self.weight), + param_attr=paddle.nn.initializer.Assign(self.weight), bias_attr=False if self.no_bias - else I.NumpyArrayInitializer(self.bias), + else paddle.nn.initializer.Assign(self.bias), act=self.act, data_format=self.data_format, ) @@ -515,10 +514,10 @@ class TestFunctionalConv2DErrorCase12(TestCase): padding=self.padding, dilation=self.dilation, groups=self.groups, - param_attr=I.NumpyArrayInitializer(self.filter), + param_attr=paddle.nn.initializer.Assign(self.filter), bias_attr=False if self.bias is None - else I.NumpyArrayInitializer(self.bias), + else paddle.nn.initializer.Assign(self.bias), act=None, data_format=self.data_format, ) diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py b/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py index f45cf48afbf0dd24d20db9b72f13c8cf4c962d1c..2981748cf61782a8986806a58c2801701b8c72a9 100644 --- a/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py +++ b/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py @@ -19,7 +19,6 @@ import numpy as np import paddle import paddle.fluid.dygraph as dg -import paddle.fluid.initializer as I import paddle.nn.functional as F from paddle import fluid @@ -99,10 +98,10 @@ class TestFunctionalConv2D(TestCase): padding=self.padding, dilation=self.dilation, groups=self.groups, - param_attr=I.NumpyArrayInitializer(self.weight), + param_attr=paddle.nn.initializer.Assign(self.weight), bias_attr=False if self.no_bias - else I.NumpyArrayInitializer(self.bias), + else paddle.nn.initializer.Assign(self.bias), data_format=self.data_format, ) exe = fluid.Executor(self.place) @@ -523,10 +522,10 @@ class TestFunctionalConv2DErrorCase10(TestCase): padding=self.padding, dilation=self.dilation, groups=self.groups, - param_attr=I.NumpyArrayInitializer(self.filter), + param_attr=paddle.nn.initializer.Assign(self.filter), bias_attr=False if self.bias is None - else I.NumpyArrayInitializer(self.bias), + else paddle.nn.initializer.Assign(self.bias), act=None, data_format=self.data_format, ) diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv3d.py b/python/paddle/fluid/tests/unittests/test_functional_conv3d.py index bdd8360f9717452f92cd4f881f280a6777d37a59..62322f8e3dc8f9114502ef4d1140fb12cf3dc7a4 100644 --- a/python/paddle/fluid/tests/unittests/test_functional_conv3d.py +++ b/python/paddle/fluid/tests/unittests/test_functional_conv3d.py @@ -19,7 +19,6 @@ import numpy as np import paddle import paddle.fluid.dygraph as dg -import paddle.fluid.initializer as I import paddle.nn.functional as F from paddle import fluid @@ -97,10 +96,10 @@ class TestFunctionalConv3D(TestCase): padding=self.padding, dilation=self.dilation, groups=self.groups, - param_attr=I.NumpyArrayInitializer(self.weight), + param_attr=paddle.nn.initializer.Assign(self.weight), bias_attr=False if self.no_bias - else 
I.NumpyArrayInitializer(self.bias), + else paddle.nn.initializer.Assign(self.bias), act=self.act, data_format=self.data_format, ) @@ -490,10 +489,10 @@ class TestFunctionalConv3DErrorCase11(TestCase): padding=self.padding, dilation=self.dilation, groups=self.groups, - param_attr=I.NumpyArrayInitializer(self.filter), + param_attr=paddle.nn.initializer.Assign(self.filter), bias_attr=False if self.bias is None - else I.NumpyArrayInitializer(self.bias), + else paddle.nn.initializer.Assign(self.bias), act=None, data_format=self.data_format, ) diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv3d_transpose.py b/python/paddle/fluid/tests/unittests/test_functional_conv3d_transpose.py index ae402c874e6398453273ebff67ea1b8eb31e1312..7a8549b1240aacdaaf64b1fc034d0f8a7c4802b0 100644 --- a/python/paddle/fluid/tests/unittests/test_functional_conv3d_transpose.py +++ b/python/paddle/fluid/tests/unittests/test_functional_conv3d_transpose.py @@ -19,7 +19,6 @@ import numpy as np import paddle import paddle.fluid.dygraph as dg -import paddle.fluid.initializer as I import paddle.nn.functional as F from paddle import fluid @@ -99,10 +98,10 @@ class TestFunctionalConv3DTranspose(TestCase): padding=self.padding, dilation=self.dilation, groups=self.groups, - param_attr=I.NumpyArrayInitializer(self.weight), + param_attr=paddle.nn.initializer.Assign(self.weight), bias_attr=False if self.no_bias - else I.NumpyArrayInitializer(self.bias), + else paddle.nn.initializer.Assign(self.bias), act=self.act, data_format=self.data_format, ) @@ -548,10 +547,10 @@ class TestFunctionalConv3DTransposeErrorCase10(TestCase): padding=self.padding, dilation=self.dilation, groups=self.groups, - param_attr=I.NumpyArrayInitializer(self.filter), + param_attr=paddle.nn.initializer.Assign(self.filter), bias_attr=False if self.bias is None - else I.NumpyArrayInitializer(self.bias), + else paddle.nn.initializer.Assign(self.bias), act=None, data_format=self.data_format, ) diff --git a/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py index 4d7fb60d4660e16947bb62734d66cd48d36691f0..83574bae6b462d59a7d3313696e96288dc8b73fb 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py @@ -37,11 +37,11 @@ class TestFuseBatchNormActPass(unittest.TestCase): ) param_attr = fluid.ParamAttr( name='batch_norm_w', - initializer=fluid.initializer.Constant(value=1.0), + initializer=paddle.nn.initializer.Constant(value=1.0), ) bias_attr = fluid.ParamAttr( name='batch_norm_b', - initializer=fluid.initializer.Constant(value=0.0), + initializer=paddle.nn.initializer.Constant(value=0.0), ) hidden2 = paddle.static.nn.batch_norm( input=hidden1, diff --git a/python/paddle/fluid/tests/unittests/test_fuse_bn_add_act_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_bn_add_act_pass.py index d981ccbe14ccbafd3bc7ec1f9a54316965c37908..c00f10d91d4b43bd69da652003e43409a48dc927 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_bn_add_act_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_bn_add_act_pass.py @@ -31,33 +31,33 @@ class TestFusedBnAddActAPI(unittest.TestCase): def setUp(self): self.conv_param_attr1 = fluid.ParamAttr( name='conv2d_1.weight', - initializer=fluid.initializer.Xavier(uniform=False), + initializer=paddle.nn.initializer.XavierNormal(), learning_rate=0.001, ) self.conv_param_attr2 = fluid.ParamAttr( name='conv2d_2.weight', - 
initializer=fluid.initializer.Xavier(uniform=False), + initializer=paddle.nn.initializer.XavierNormal(), learning_rate=0.001, ) self.bn_param_attr1 = fluid.ParamAttr( name='batch_norm_w_1', - initializer=fluid.initializer.Constant(value=1.0), + initializer=paddle.nn.initializer.Constant(value=1.0), ) self.bn_bias_attr1 = fluid.ParamAttr( name='batch_norm_b_1', - initializer=fluid.initializer.Constant(value=0.0), + initializer=paddle.nn.initializer.Constant(value=0.0), ) self.bn_param_attr2 = fluid.ParamAttr( name='batch_norm_w_2', - initializer=fluid.initializer.Constant(value=1.0), + initializer=paddle.nn.initializer.Constant(value=1.0), ) self.bn_bias_attr2 = fluid.ParamAttr( name='batch_norm_b_2', - initializer=fluid.initializer.Constant(value=0.0), + initializer=paddle.nn.initializer.Constant(value=0.0), ) self.fc_param_attr = fluid.ParamAttr( name='fc.weight', - initializer=fluid.initializer.Xavier(uniform=False), + initializer=paddle.nn.initializer.XavierNormal(), ) def build_fused_program( diff --git a/python/paddle/fluid/tests/unittests/test_fused_attention_no_dropout.py b/python/paddle/fluid/tests/unittests/test_fused_attention_no_dropout.py index c459f2dbb22e66c06bf5d0e8568c8f289c11dd9e..4f18abd79e0fee7809171d36f1117e73f5b0d55b 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_attention_no_dropout.py +++ b/python/paddle/fluid/tests/unittests/test_fused_attention_no_dropout.py @@ -192,5 +192,17 @@ class TestFusedAttentionNormalizeBefore(TestFusedAttention): self.normalize_before = True +class TestFusedAttentionAPIError(unittest.TestCase): + def test_invalid_x_rank(self): + def test_x_rank_1(): + with paddle.fluid.dygraph.guard(): + layer = FusedMultiHeadAttention(embed_dim=1, num_heads=1) + array = np.array([1.9], dtype=np.float32) + x = paddle.to_tensor(np.reshape(array, [1]), dtype='float32') + out = layer(x) + + self.assertRaises(ValueError, test_x_rank_1) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fused_attention_pass.py b/python/paddle/fluid/tests/unittests/test_fused_attention_pass.py index cce05d8747cdf87128725379c79266e296ece4e4..98085c223a0cbf9ba0bab9625cba4882432bfca5 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_attention_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fused_attention_pass.py @@ -31,7 +31,6 @@ class MultiHeadAttention(paddle.nn.Layer): num_heads, add_residual=True, pre_ln=True, - post_ln=False, attn_dropout=True, ): super(MultiHeadAttention, self).__init__() @@ -42,7 +41,6 @@ class MultiHeadAttention(paddle.nn.Layer): self.add_residual = add_residual self.pre_ln = pre_ln - self.post_ln = post_ln self.attn_dropout = attn_dropout self.head_dim = embed_dim // num_heads @@ -90,7 +88,7 @@ class MultiHeadAttention(paddle.nn.Layer): if self.add_residual: out = residual + out - if self.post_ln: + if not self.pre_ln: # post layer norm out = self.norm2(out) @@ -104,7 +102,6 @@ class TestFusedAttentionPass(unittest.TestCase): def setUp(self): self.add_residual = True self.pre_ln = True - self.post_ln = True self.attn_dropout = True self.add_mask = True @@ -120,6 +117,7 @@ class TestFusedAttentionPass(unittest.TestCase): ).astype('float32') main_prog = paddle.static.Program() + main_prog.random_seed = 1234 startup_prog = paddle.static.Program() with paddle.static.program_guard(main_prog, startup_prog): @@ -142,7 +140,6 @@ class TestFusedAttentionPass(unittest.TestCase): num_heads, add_residual=self.add_residual, pre_ln=self.pre_ln, - post_ln=self.post_ln, 
                 attn_dropout=self.attn_dropout,
             )

@@ -157,13 +154,23 @@ class TestFusedAttentionPass(unittest.TestCase):
         pass_manager.apply([main_prog], [startup_prog])

         ops = main_prog.global_block().ops
-        assert ops[2].type == 'reduce_mean'
-        assert ops[4].type == 'reduce_mean_grad'
+        assert ops[2].type == 'fused_attention'
+        assert ops[3].type == 'reduce_mean'
+        assert ops[5].type == 'reduce_mean_grad'
+        assert ops[6].type == 'fused_attention_grad'
         # two ops for linear, one op for reduce mean
         # one fill constant
         # one op for reduce mean grad, two ops for linear bwd
         # the eighth op should be the optimizer
-        assert ops[7].type == 'sgd'
+        assert ops[9].type == 'sgd'
+
+        exe = paddle.static.Executor()
+        exe.run(startup_prog)
+        rst = exe.run(
+            main_prog,
+            feed={'x': x_data, 'attn_mask': mask_data},
+            fetch_list=[loss],
+        )


 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py b/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py
index e3da925a01e4294d3556f09713a1fcea2da2cba9..9264c8f2e77c6761e70837a54c0035a6cf0e388d 100644
--- a/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py
@@ -53,7 +53,7 @@ class TestFusedMultiTransformerOp(OpTest):
         self.__class__.no_need_check_grad = False

         bias_attr = paddle.fluid.ParamAttr(
-            initializer=paddle.fluid.initializer.Constant(value=0.0005)
+            initializer=paddle.nn.initializer.Constant(value=0.0005)
         )
         self.q_proj = Linear(
             self.embed_dim,
@@ -1027,16 +1027,16 @@ class TestFusedMultiTransformerOpPreCacheStatic(TestFusedMultiTransformerOp):
         self.has_attn_mask = False
         self.x_type = np.float32
         self.weight_attr = paddle.ParamAttr(
-            initializer=paddle.fluid.initializer.Constant(0.0)
+            initializer=paddle.nn.initializer.Constant(0.0)
         )
         self.bias_attr = paddle.ParamAttr(
-            initializer=paddle.fluid.initializer.Constant(0.0005)
+            initializer=paddle.nn.initializer.Constant(0.0005)
         )
         self.ln_w_attr = paddle.ParamAttr(
-            initializer=paddle.fluid.initializer.Constant(1.0)
+            initializer=paddle.nn.initializer.Constant(1.0)
         )
         self.ln_b_attr = paddle.ParamAttr(
-            initializer=paddle.fluid.initializer.Constant(0.0)
+            initializer=paddle.nn.initializer.Constant(0.0)
         )

     def test_fused_multi_transformer_op(self):
@@ -1051,5 +1051,31 @@ class TestFusedMultiTransformerOpPreCacheStatic(TestFusedMultiTransformerOp):
         )


+class TestFusedMultiAttentionAPIError(unittest.TestCase):
+    def test_errors(self):
+        def test_invalid_input_dim():
+            array = np.array([1.9], dtype=np.float32)
+            x = paddle.to_tensor(np.reshape(array, [1]), dtype='float32')
+            layer = paddle.incubate.nn.FusedMultiHeadAttention(
+                embed_dim=1, num_heads=1
+            )
+            out = layer(x)
+
+        self.assertRaises(ValueError, test_invalid_input_dim)
+
+
+class TestFusedMultiTransformerAPIError(unittest.TestCase):
+    def test_errors(self):
+        def test_invalid_input_dim():
+            array = np.array([], dtype=np.float32)
+            x = paddle.to_tensor(np.reshape(array, [0]), dtype='int32')
+            layer = paddle.incubate.nn.FusedTransformerEncoderLayer(
+                108, 108, 108, 0.0, 'relu'
+            )
+            out = layer(x)
+
+        self.assertRaises(ValueError, test_invalid_input_dim)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_generator_dataloader.py b/python/paddle/fluid/tests/unittests/test_generator_dataloader.py
index fcbc91edee31eec09cb8cd57b94a19a7d39d7030..b0625050b889fbee81e2208ccda33ca32d466afd 100644
---
a/python/paddle/fluid/tests/unittests/test_generator_dataloader.py +++ b/python/paddle/fluid/tests/unittests/test_generator_dataloader.py @@ -62,7 +62,7 @@ def simple_fc_net(places, use_legacy_py_reader, use_double_buffer): size=hidden_size, activation='tanh', bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.0) + initializer=paddle.nn.initializer.Constant(value=1.0) ), ) diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py index 75e5d1ee2ee158528bcc120b347614c0a3d3dc59..abf0ba0ac2650d8283b2979a81d7fac06f811cb7 100644 --- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py +++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py @@ -20,7 +20,6 @@ from op_test import OpTest, skip_check_grad_ci import paddle import paddle.fluid as fluid -import paddle.fluid.initializer as I import paddle.nn.functional as F paddle.enable_static() @@ -302,7 +301,7 @@ class TestHSigmoidOpWithSparseGrad(unittest.TestCase): is_sparse=is_sparse, size=[3, 3], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Normal(scale=1 / math.sqrt(3)) + initializer=paddle.nn.initializer.Normal(std=1 / math.sqrt(3)) ), ) @@ -555,8 +554,8 @@ class TestHSigmoidLossAPI(unittest.TestCase): x, labels, self.num_classes, weight, bias, path_table, path_code ) - weight_attr = I.NumpyArrayInitializer(self.weight_np) - bias_attr = I.NumpyArrayInitializer(self.bias_np) + weight_attr = paddle.nn.initializer.Assign(self.weight_np) + bias_attr = paddle.nn.initializer.Assign(self.bias_np) m = paddle.nn.HSigmoidLoss( self.feature_size, self.num_classes, @@ -593,10 +592,10 @@ class TestHSigmoidLossAPI(unittest.TestCase): ) weight_attr = paddle.framework.ParamAttr( - initializer=I.NumpyArrayInitializer(self.weight_np) + initializer=paddle.nn.initializer.Assign(self.weight_np) ) bias_attr = paddle.framework.ParamAttr( - initializer=I.NumpyArrayInitializer(self.bias_np) + initializer=paddle.nn.initializer.Assign(self.bias_np) ) m = paddle.nn.HSigmoidLoss( self.feature_size, @@ -636,8 +635,8 @@ class TestHSigmoidLossAPI(unittest.TestCase): if self.is_custom: path_table = fluid.data('path_table', [-1, -1], 'int64') path_code = fluid.data('path_code', [-1, -1], 'int64') - weight_attr = I.NumpyArrayInitializer(self.weight_np) - bias_attr = I.NumpyArrayInitializer(self.bias_np) + weight_attr = paddle.nn.initializer.Assign(self.weight_np) + bias_attr = paddle.nn.initializer.Assign(self.bias_np) loss = paddle.nn.HSigmoidLoss( feature_size=x.shape[1], num_classes=self.num_classes, diff --git a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py index f34c8d6a2a85832a6c3490ebfe05aa9f63f16189..51e32c5259f452e1f3dedb9348eff10820087c46 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py @@ -123,7 +123,7 @@ class DeepCF(fluid.Layer): shape=matrix.shape, dtype=matrix.dtype, is_bias=False, - default_initializer=fluid.initializer.NumpyArrayInitializer(matrix), + default_initializer=paddle.nn.initializer.Assign(matrix), ) self._rating_matrix.stop_gradient = True diff --git a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py index 0eb037bc6a02e40d8f2734dfd0b051ad996d4204..af6e32ac6b897b7e9d41f1537c97cd7784d75e7a 100644 --- 
a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py @@ -47,7 +47,7 @@ class SimpleNet(fluid.Layer): sparse=is_sparse, weight_attr=fluid.ParamAttr( name='embedding_para', - initializer=fluid.initializer.UniformInitializer( + initializer=paddle.nn.initializer.Uniform( low=-init_scale, high=init_scale ), ), @@ -56,7 +56,7 @@ class SimpleNet(fluid.Layer): attr=fluid.ParamAttr(), shape=[self.vocab_size], dtype=dtype, - default_initializer=fluid.initializer.UniformInitializer( + default_initializer=paddle.nn.initializer.Uniform( low=-self.init_scale, high=self.init_scale ), ) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py index 12be3af2d9cf9164a7a865e4e4de79fd33299a22..5c48252cb0b7fdab719ac40c734771548fe8593a 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py @@ -77,12 +77,12 @@ class ConvBNPool(fluid.dygraph.Layer): filter_size = 3 conv_std_0 = (2.0 / (filter_size**2 * channels[0])) ** 0.5 conv_param_0 = fluid.ParamAttr( - initializer=fluid.initializer.Normal(0.0, conv_std_0) + initializer=paddle.nn.initializer.Normal(0.0, conv_std_0) ) conv_std_1 = (2.0 / (filter_size**2 * channels[1])) ** 0.5 conv_param_1 = fluid.ParamAttr( - initializer=fluid.initializer.Normal(0.0, conv_std_1) + initializer=paddle.nn.initializer.Normal(0.0, conv_std_1) ) self.conv_0_layer = paddle.nn.Conv2D( @@ -200,10 +200,11 @@ class EncoderNet(fluid.dygraph.Layer): super().__init__() self.rnn_hidden_size = rnn_hidden_size para_attr = fluid.ParamAttr( - initializer=fluid.initializer.Normal(0.0, 0.02) + initializer=paddle.nn.initializer.Normal(0.0, 0.02) ) bias_attr = fluid.ParamAttr( - initializer=fluid.initializer.Normal(0.0, 0.02), learning_rate=2.0 + initializer=paddle.nn.initializer.Normal(0.0, 0.02), + learning_rate=2.0, ) if fluid.framework._non_static_mode(): h_0 = np.zeros( diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py index 6eb0c9d6e6c0345d9cedd8273c5e4b2bbaeacee6..8917230d52c4e632ec76b48d9e8e258e9bf4a595 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py @@ -51,26 +51,26 @@ class SimpleLSTMRNN(fluid.Layer): for i in range(self._num_layers): weight_1 = self.create_parameter( attr=fluid.ParamAttr( - initializer=fluid.initializer.UniformInitializer( + initializer=paddle.nn.initializer.Uniform( low=-self._init_scale, high=self._init_scale ) ), shape=[self._hidden_size * 2, self._hidden_size * 4], dtype="float32", - default_initializer=fluid.initializer.UniformInitializer( + default_initializer=paddle.nn.initializer.Uniform( low=-self._init_scale, high=self._init_scale ), ) self.weight_1_arr.append(self.add_parameter('w_%d' % i, weight_1)) bias_1 = self.create_parameter( attr=fluid.ParamAttr( - initializer=fluid.initializer.UniformInitializer( + initializer=paddle.nn.initializer.Uniform( low=-self._init_scale, high=self._init_scale ) ), shape=[self._hidden_size * 4], dtype="float32", - default_initializer=fluid.initializer.Constant(0.0), + default_initializer=paddle.nn.initializer.Constant(0.0), ) self.bias_arr.append(self.add_parameter('b_%d' % i, bias_1)) @@ -176,7 +176,7 @@ 
class PtbModel(fluid.Layer): sparse=is_sparse, weight_attr=fluid.ParamAttr( name='embedding_para', - initializer=fluid.initializer.UniformInitializer( + initializer=paddle.nn.initializer.Uniform( low=-init_scale, high=init_scale ), ), @@ -185,7 +185,7 @@ class PtbModel(fluid.Layer): attr=fluid.ParamAttr(), shape=[self.hidden_size, self.vocab_size], dtype="float32", - default_initializer=fluid.initializer.UniformInitializer( + default_initializer=paddle.nn.initializer.Uniform( low=-self.init_scale, high=self.init_scale ), ) @@ -193,7 +193,7 @@ class PtbModel(fluid.Layer): attr=fluid.ParamAttr(), shape=[self.vocab_size], dtype="float32", - default_initializer=fluid.initializer.UniformInitializer( + default_initializer=paddle.nn.initializer.Uniform( low=-self.init_scale, high=self.init_scale ), ) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py index bc46ad12d3df0d7c78f1f7085f557d58ca4faa6f..2936b0730386fc82094aed9304fd25d5d6a31b92 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py @@ -226,7 +226,7 @@ class ResNet(fluid.Layer): self.pool2d_avg_output, class_dim, weight_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Uniform(-stdv, stdv) + initializer=paddle.nn.initializer.Uniform(-stdv, stdv) ), ) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py index 7fd322d358366c35460f54134869041d44691981..2ef0b8afcc5c7e12b4c90cd7a2176dbeb220927a 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py @@ -48,26 +48,26 @@ class SimpleLSTMRNN(fluid.Layer): for i in range(self._num_layers): weight_1 = self.create_parameter( attr=fluid.ParamAttr( - initializer=fluid.initializer.UniformInitializer( + initializer=paddle.nn.initializer.Uniform( low=-self._init_scale, high=self._init_scale ) ), shape=[self._hidden_size * 2, self._hidden_size * 4], dtype="float32", - default_initializer=fluid.initializer.UniformInitializer( + default_initializer=paddle.nn.initializer.Uniform( low=-self._init_scale, high=self._init_scale ), ) self.weight_1_arr.append(self.add_parameter('w_%d' % i, weight_1)) bias_1 = self.create_parameter( attr=fluid.ParamAttr( - initializer=fluid.initializer.UniformInitializer( + initializer=paddle.nn.initializer.Uniform( low=-self._init_scale, high=self._init_scale ) ), shape=[self._hidden_size * 4], dtype="float32", - default_initializer=fluid.initializer.Constant(0.0), + default_initializer=paddle.nn.initializer.Constant(0.0), ) self.bias_arr.append(self.add_parameter('b_%d' % i, bias_1)) @@ -172,7 +172,7 @@ class PtbModel(fluid.Layer): sparse=False, weight_attr=fluid.ParamAttr( name='embedding_para', - initializer=fluid.initializer.UniformInitializer( + initializer=paddle.nn.initializer.Uniform( low=-init_scale, high=init_scale ), ), @@ -182,7 +182,7 @@ class PtbModel(fluid.Layer): attr=fluid.ParamAttr(), shape=[self.hidden_size, self.vocab_size], dtype="float32", - default_initializer=fluid.initializer.UniformInitializer( + default_initializer=paddle.nn.initializer.Uniform( low=-self.init_scale, high=self.init_scale ), ) @@ -190,7 +190,7 @@ class PtbModel(fluid.Layer): attr=fluid.ParamAttr(), shape=[self.vocab_size], dtype="float32", - default_initializer=fluid.initializer.UniformInitializer( + 
default_initializer=paddle.nn.initializer.Uniform( low=-self.init_scale, high=self.init_scale ), ) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py index 647710fba61f1f4a0eef33840431cfa58e9b5544..fb833c6525846b72fdb5ca93b292ce855414f18d 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py @@ -48,7 +48,7 @@ class SimpleNet(fluid.Layer): sparse=is_sparse, weight_attr=fluid.ParamAttr( name='embedding_para', - initializer=fluid.initializer.UniformInitializer( + initializer=paddle.nn.initializer.Uniform( low=-init_scale, high=init_scale ), ), @@ -57,7 +57,7 @@ class SimpleNet(fluid.Layer): attr=fluid.ParamAttr(), shape=[self.hidden_size, self.hidden_size], dtype=dtype, - default_initializer=fluid.initializer.UniformInitializer( + default_initializer=paddle.nn.initializer.Uniform( low=-self.init_scale, high=self.init_scale ), ) @@ -65,7 +65,7 @@ class SimpleNet(fluid.Layer): attr=fluid.ParamAttr(), shape=[self.hidden_size], dtype=dtype, - default_initializer=fluid.initializer.UniformInitializer( + default_initializer=paddle.nn.initializer.Uniform( low=-self.init_scale, high=self.init_scale ), ) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py index e171899289aa4be06df5dd9c02d4a1fd4c69f169..46bd8890d21da879b587f205023800f3f864f553 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py @@ -399,10 +399,10 @@ class PrePostProcessLayer(Layer): self._layer_norm = paddle.nn.LayerNorm( normalized_shape=d_model, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(1.0) + initializer=paddle.nn.initializer.Constant(1.0) ), bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(0.0) + initializer=paddle.nn.initializer.Constant(0.0) ), ) @@ -662,7 +662,9 @@ class PrepareEncoderDecoderLayer(Layer): sparse=is_sparse, weight_attr=fluid.ParamAttr( name=word_emb_param_name, - initializer=fluid.initializer.Normal(0.0, src_emb_dim**-0.5), + initializer=paddle.nn.initializer.Normal( + 0.0, src_emb_dim**-0.5 + ), ), ) @@ -676,7 +678,7 @@ class PrepareEncoderDecoderLayer(Layer): sparse=is_sparse, weight_attr=fluid.ParamAttr( name=pos_enc_param_name, - initializer=fluid.initializer.NumpyArrayInitializer(pos_inp), + initializer=paddle.nn.initializer.Assign(pos_inp), trainable=False, ), ) diff --git a/python/paddle/fluid/tests/unittests/test_initializer.py b/python/paddle/fluid/tests/unittests/test_initializer.py index 07d9d7b48c29f6b777c428dc9e92a6d87de868e0..f87e62cb020981413b5bb13ef6746a76ab23ea64 100644 --- a/python/paddle/fluid/tests/unittests/test_initializer.py +++ b/python/paddle/fluid/tests/unittests/test_initializer.py @@ -20,7 +20,6 @@ import numpy as np import paddle import paddle.fluid as fluid import paddle.fluid.framework as framework -import paddle.fluid.initializer as initializer from paddle.fluid.core import VarDesc from paddle.regularizer import L2Decay @@ -67,7 +66,7 @@ class TestConstantInitializer(unittest.TestCase): shape=[5, 10], lod_level=0, name="param", - initializer=initializer.ConstantInitializer(), + initializer=paddle.nn.initializer.Constant(), 
) num_ops = 1 self.assertEqual(len(block.ops), num_ops) @@ -86,7 +85,7 @@ class TestConstantInitializer(unittest.TestCase): shape=[5, 10], lod_level=0, name="param", - initializer=initializer.ConstantInitializer(2.3), + initializer=paddle.nn.initializer.Constant(2.3), ) num_ops = 1 self.assertEqual(len(block.ops), num_ops) @@ -119,7 +118,7 @@ class TestUniformInitializer(unittest.TestCase): shape=[5, 10], lod_level=0, name="param", - initializer=initializer.UniformInitializer(), + initializer=paddle.nn.initializer.Uniform(), ) num_ops = 2 if dtype == "float16" else 1 self.assertEqual(len(block.ops), num_ops) @@ -141,14 +140,14 @@ class TestUniformInitializer(unittest.TestCase): shape=[5, 10], lod_level=0, name="param1", - initializer=initializer.UniformInitializer(), + initializer=paddle.nn.initializer.Uniform(), ) block.create_parameter( dtype="float32", shape=[5, 10], lod_level=0, name="param2", - initializer=initializer.UniformInitializer(seed=456), + initializer=paddle.nn.initializer.UniformInitializer(seed=456), ) init_op = block.ops[1] self.assertEqual(init_op.attr("seed"), 456) @@ -165,7 +164,9 @@ class TestUniformInitializer(unittest.TestCase): shape=[5, 10], lod_level=0, name="param", - initializer=initializer.UniformInitializer(-4.2, 3.1, 123), + initializer=paddle.nn.initializer.UniformInitializer( + -4.2, 3.1, 123 + ), ) num_ops = 2 if dtype == "float16" else 1 self.assertEqual(len(block.ops), num_ops) @@ -186,7 +187,9 @@ class TestUniformInitializer(unittest.TestCase): shape=[5, 10], lod_level=0, name="param", - initializer=initializer.UniformInitializer(-4.2, float(i), 123), + initializer=paddle.nn.initializer.UniformInitializer( + -4.2, float(i), 123 + ), ) num_ops = 2 if dtype == "float16" else 1 self.assertEqual(len(block.ops), num_ops) @@ -226,7 +229,7 @@ class TestNormalInitializer(unittest.TestCase): shape=[5, 10], lod_level=0, name="param", - initializer=initializer.NormalInitializer(), + initializer=paddle.nn.initializer.Normal(), ) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] @@ -245,7 +248,9 @@ class TestNormalInitializer(unittest.TestCase): shape=[5, 10], lod_level=0, name="param", - initializer=initializer.NormalInitializer(2.3, 1.9, 123), + initializer=paddle.nn.initializer.NormalInitializer( + 2.3, 1.9, 123 + ), ) num_ops = 1 self.assertEqual(len(block.ops), num_ops) @@ -278,7 +283,7 @@ class TestXavierInitializer(unittest.TestCase): shape=[5, 10], lod_level=0, name="param", - initializer=initializer.XavierInitializer(), + initializer=paddle.nn.initializer.XavierUniform(), ) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] @@ -300,7 +305,7 @@ class TestXavierInitializer(unittest.TestCase): shape=[5, 10, 15, 20], lod_level=0, name="param", - initializer=initializer.XavierInitializer(), + initializer=paddle.nn.initializer.XavierUniform(), ) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] @@ -325,7 +330,7 @@ class TestXavierInitializer(unittest.TestCase): shape=[5, 10], lod_level=0, name="param", - initializer=initializer.XavierInitializer(uniform=False), + initializer=paddle.nn.initializer.XavierNormal(), ) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] @@ -347,7 +352,7 @@ class TestXavierInitializer(unittest.TestCase): shape=[5, 10, 15, 20], lod_level=0, name="param", - initializer=initializer.XavierInitializer(uniform=False), + initializer=paddle.nn.initializer.XavierNormal(), ) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] @@ -372,7 +377,7 @@ class TestXavierInitializer(unittest.TestCase): shape=[5, 
10], lod_level=0, name="param", - initializer=initializer.XavierInitializer( + initializer=paddle.nn.initializer.XavierInitializer( uniform=uniform, fan_in=12, fan_out=23, seed=134 ), ) @@ -421,7 +426,7 @@ class TestMSRAInitializer(unittest.TestCase): shape=[5, 10], lod_level=0, name="param", - initializer=initializer.MSRAInitializer(), + initializer=paddle.nn.initializer.KaimingUniform(), ) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] @@ -443,7 +448,7 @@ class TestMSRAInitializer(unittest.TestCase): shape=[5, 10, 15, 20], lod_level=0, name="param", - initializer=initializer.MSRAInitializer(), + initializer=paddle.nn.initializer.KaimingUniform(), ) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] @@ -466,7 +471,7 @@ class TestMSRAInitializer(unittest.TestCase): shape=[5, 10], lod_level=0, name="param", - initializer=initializer.MSRAInitializer(uniform=False), + initializer=paddle.nn.initializer.KaimingNormal(), ) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] @@ -488,7 +493,7 @@ class TestMSRAInitializer(unittest.TestCase): shape=[5, 10, 15, 20], lod_level=0, name="param", - initializer=initializer.MSRAInitializer(uniform=False), + initializer=paddle.nn.initializer.KaimingNormal(), ) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] @@ -509,7 +514,9 @@ class TestMSRAInitializer(unittest.TestCase): shape=[5, 10], lod_level=0, name="param", - initializer=initializer.MSRAInitializer(fan_in=12, seed=134), + initializer=paddle.nn.initializer.MSRAInitializer( + fan_in=12, seed=134 + ), ) num_ops = 2 if dtype == "float16" else 1 self.assertEqual(len(block.ops), num_ops) @@ -542,7 +549,7 @@ class TestBilinearInitializer(unittest.TestCase): shape=[8, 1, 3, 3], lod_level=0, name="param", - initializer=initializer.BilinearInitializer(), + initializer=paddle.nn.initializer.Bilinear(), ) num_ops = 2 if dtype in ["float16", "uint16", "float64"] else 1 self.assertEqual(len(block.ops), num_ops) @@ -576,7 +583,7 @@ class TestBilinearInitializerDygraphAPI(unittest.TestCase): w_attr = paddle.ParamAttr( learning_rate=0.0, regularizer=L2Decay(0.0), - initializer=initializer.BilinearInitializer(), + initializer=paddle.nn.initializer.Bilinear(), ) data = paddle.rand([B, 3, H, W], dtype='float32') conv_up = paddle.nn.Conv2DTranspose( @@ -597,7 +604,7 @@ class TestBilinearInitializerDygraphAPI(unittest.TestCase): w_attr = paddle.ParamAttr( learning_rate=0.0, regularizer=L2Decay(0.0), - initializer=initializer.BilinearInitializer(), + initializer=paddle.nn.initializer.Bilinear(), ) conv2d = paddle.nn.Conv2D(1, 2, 3, weight_attr=w_attr) paddle.set_default_dtype("float32") @@ -632,7 +639,7 @@ class TestNumpyArrayInitializer(unittest.TestCase): shape=np_array.shape, lod_level=0, name="param", - initializer=initializer.NumpyArrayInitializer(np_array), + initializer=paddle.nn.initializer.Assign(np_array), ) num_ops = 2 if dtype in ["float16", "uint16"] else 1 self.assertEqual(len(block.ops), num_ops) @@ -657,7 +664,9 @@ class TestSetGlobalInitializer(unittest.TestCase): """Test Set Global Param initilizer with UniformInitializer""" main_prog = framework.Program() startup_prog = framework.Program() - fluid.set_global_initializer(initializer.Uniform(low=-0.5, high=0.5)) + fluid.set_global_initializer( + paddle.nn.initializer.Uniform(low=-0.5, high=0.5) + ) with fluid.program_guard(main_prog, startup_prog): x = fluid.data(name="x", shape=[1, 3, 32, 32]) # default initilizer of param in layers.conv2d is NormalInitializer @@ -683,8 +692,8 @@ class 
TestSetGlobalInitializer(unittest.TestCase): main_prog = framework.Program() startup_prog = framework.Program() fluid.set_global_initializer( - initializer.Uniform(low=-0.5, high=0.5), - bias_init=initializer.Normal(loc=0.0, scale=2.0), + paddle.nn.initializer.Uniform(low=-0.5, high=0.5), + bias_init=paddle.nn.initializer.Normal(0.0, 2.0), ) with fluid.program_guard(main_prog, startup_prog): x = fluid.data(name="x", shape=[1, 3, 32, 32]) @@ -746,9 +755,7 @@ class TestXavierInitializerDygraph(unittest.TestCase): tensor = paddle.zeros([1024, 1024, 16]) tensor.stop_gradient = False - xavier_ = paddle.fluid.initializer.XavierInitializer( - uniform=False, fan_in=3, fan_out=5 - ) + xavier_ = paddle.nn.initializer.XavierNormal(fan_in=3, fan_out=5) xavier_(tensor) hist, _ = output_hist(tensor.numpy()) @@ -771,9 +778,7 @@ class TestMSRAInitializerDygraph(unittest.TestCase): tensor = paddle.zeros([1024, 1024, 16]) tensor.stop_gradient = False - msra_ = paddle.fluid.initializer.MSRAInitializer( - uniform=False, fan_in=4 - ) + msra_ = paddle.nn.initializer.KaimingNormal(fan_in=4) msra_(tensor) hist, _ = output_hist(tensor.numpy()) @@ -1188,7 +1193,7 @@ class TestKaimingUniform(unittest.TestCase): def test_type_error(self): self.assertRaises( - ValueError, self.func_kaiminguniform_initializer_fan_in_zero + ZeroDivisionError, self.func_kaiminguniform_initializer_fan_in_zero ) diff --git a/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py b/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py index bc4ef3d386ccb2f0b0c94890b92382867bebe90f..7dcf964c41e31fc1e4c33b3c9c4ee57c50277d37 100644 --- a/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py +++ b/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py @@ -34,7 +34,7 @@ def fc_with_batchnorm(use_feed): size=200, activation='tanh', bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.0) + initializer=paddle.nn.initializer.Constant(value=1.0) ), ) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 51715e2ae1ce28ecaf6fb161332bbf5144891631..192585e6c16db0dabbc1b4e635d0221aa342e23f 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -138,7 +138,9 @@ class TestLayer(LayerTest): name='data', shape=[3, 32, 32], dtype='float32' ) linear = paddle.nn.Linear( - 32, 4, bias_attr=fluid.initializer.ConstantInitializer(value=1) + 32, + 4, + bias_attr=paddle.nn.initializer.Constant(value=1), ) ret = linear(t) static_ret = self.get_static_graph_result( @@ -147,7 +149,9 @@ class TestLayer(LayerTest): with self.dynamic_graph(): t = base.to_variable(inp) linear = paddle.nn.Linear( - 32, 4, bias_attr=fluid.initializer.ConstantInitializer(value=1) + 32, + 4, + bias_attr=paddle.nn.initializer.Constant(value=1), ) dy_ret = linear(t) dy_ret_value = dy_ret.numpy() @@ -162,7 +166,7 @@ class TestLayer(LayerTest): linear = paddle.nn.Linear( 32, 4, - bias_attr=fluid.initializer.ConstantInitializer(value=1), + bias_attr=paddle.nn.initializer.Constant(value=1), ) linear_ret1 = linear(inp) @@ -175,7 +179,7 @@ class TestLayer(LayerTest): linear = paddle.nn.Linear( 32, 4, - bias_attr=fluid.initializer.ConstantInitializer(value=1), + bias_attr=paddle.nn.initializer.Constant(value=1), ) linear_ret2 = linear(inp) @@ -248,7 +252,7 @@ class TestLayer(LayerTest): linear = paddle.nn.Linear( 32, 4, - bias_attr=fluid.initializer.ConstantInitializer(value=1), + 
bias_attr=paddle.nn.initializer.Constant(value=1), ) linear_ret1 = linear(inp) @@ -261,7 +265,7 @@ class TestLayer(LayerTest): linear = paddle.nn.Linear( 32, 4, - bias_attr=fluid.initializer.ConstantInitializer(value=1), + bias_attr=paddle.nn.initializer.Constant(value=1), ) linear_ret2 = linear(inp) @@ -396,7 +400,7 @@ class TestLayer(LayerTest): num_filters=10, filter_size=27, act='sigmoid', - bias_attr=fluid.initializer.ConstantInitializer(value=1), + bias_attr=paddle.nn.initializer.Constant(value=1), ) static_rlt = self.get_static_graph_result( feed={'pixel': inp_np}, fetch_list=[out] @@ -409,7 +413,7 @@ class TestLayer(LayerTest): 3, 10, 27, - bias_attr=fluid.initializer.ConstantInitializer(value=1), + bias_attr=paddle.nn.initializer.Constant(value=1), ) out = conv2d_transpose(img) out = paddle.nn.functional.sigmoid(out) @@ -421,7 +425,7 @@ class TestLayer(LayerTest): 3, 10, 27, - bias_attr=fluid.initializer.ConstantInitializer(value=1), + bias_attr=paddle.nn.initializer.Constant(value=1), ) dy_rlt = conv2d_transpose(base.to_variable(inp_np)) dy_rlt = paddle.nn.functional.sigmoid(dy_rlt) @@ -433,9 +437,7 @@ class TestLayer(LayerTest): images = np.ones([2, 3, 5, 5], dtype='float32') custom_weight = np.random.randn(3, 3, 2, 2).astype("float32") weight_attr = fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer( - custom_weight - ) + initializer=paddle.nn.initializer.Assign(custom_weight) ) conv2d1 = paddle.nn.Conv2DTranspose(3, 3, [2, 2]) conv2d2 = paddle.nn.Conv2DTranspose( @@ -503,7 +505,7 @@ class TestLayer(LayerTest): data_x, data_y, 6, - bias_attr=fluid.initializer.ConstantInitializer(value=1), + bias_attr=paddle.nn.initializer.Constant(value=1), act='sigmoid', ) @@ -518,7 +520,7 @@ class TestLayer(LayerTest): 3, 3, 6, - bias_attr=fluid.initializer.ConstantInitializer(value=1), + bias_attr=paddle.nn.initializer.Constant(value=1), ) out = btp(data_x, data_y) out = paddle.nn.functional.sigmoid(out) @@ -530,7 +532,7 @@ class TestLayer(LayerTest): 3, 3, 6, - bias_attr=fluid.initializer.ConstantInitializer(value=1), + bias_attr=paddle.nn.initializer.Constant(value=1), ) dy_rlt = btp(base.to_variable(inp_np_x), base.to_variable(inp_np_y)) dy_rlt = paddle.nn.functional.sigmoid(dy_rlt) @@ -566,9 +568,7 @@ class TestLayer(LayerTest): with self.dynamic_graph(): custom_weight = np.random.randn(6, 3, 3).astype("float32") weight_attr = fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer( - custom_weight - ) + initializer=paddle.nn.initializer.Assign(custom_weight) ) btp1 = paddle.nn.Bilinear(3, 3, 6) btp2 = paddle.nn.Bilinear(3, 3, 6, weight_attr=weight_attr) @@ -641,9 +641,7 @@ class TestLayer(LayerTest): with self.dynamic_graph(): custom_weight = np.random.randn(dict_size, 32).astype("float32") weight_attr = fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer( - custom_weight - ) + initializer=paddle.nn.initializer.Assign(custom_weight) ) emb1 = paddle.nn.Embedding(dict_size, 32, sparse=False) emb2 = paddle.nn.Embedding( @@ -741,9 +739,7 @@ class TestLayer(LayerTest): images = np.ones([2, 3, 6, 6, 6], dtype='float32') custom_weight = np.random.randn(3, 3, 2, 2, 2).astype("float32") weight_attr = fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer( - custom_weight - ) + initializer=paddle.nn.initializer.Assign(custom_weight) ) conv3d1 = paddle.nn.Conv3D( in_channels=3, out_channels=3, kernel_size=2 @@ -798,8 +794,8 @@ class TestLayer(LayerTest): ret = paddle.static.nn.group_norm( input=X, groups=2, - 
param_attr=fluid.initializer.Uniform(low=-0.5, high=0.5), - bias_attr=fluid.initializer.ConstantInitializer(value=1), + param_attr=paddle.nn.initializer.Uniform(low=-0.5, high=0.5), + bias_attr=paddle.nn.initializer.Constant(value=1), ) static_ret = self.get_static_graph_result( feed={ @@ -818,8 +814,8 @@ class TestLayer(LayerTest): groupNorm = paddle.nn.GroupNorm( num_channels=shape[1], num_groups=2, - weight_attr=fluid.initializer.Uniform(low=-0.5, high=0.5), - bias_attr=fluid.initializer.ConstantInitializer(value=1), + weight_attr=paddle.nn.initializer.Uniform(low=-0.5, high=0.5), + bias_attr=paddle.nn.initializer.Constant(value=1), ) ret = groupNorm(X) static_ret2 = self.get_static_graph_result( @@ -836,8 +832,8 @@ class TestLayer(LayerTest): groupNorm = paddle.nn.GroupNorm( num_channels=shape[1], num_groups=2, - weight_attr=fluid.initializer.Uniform(low=-0.5, high=0.5), - bias_attr=fluid.initializer.ConstantInitializer(value=1), + weight_attr=paddle.nn.initializer.Uniform(low=-0.5, high=0.5), + bias_attr=paddle.nn.initializer.Constant(value=1), ) dy_ret = groupNorm(base.to_variable(input)) dy_rlt_value = dy_ret.numpy() @@ -990,9 +986,7 @@ class TestLayer(LayerTest): images = np.ones([2, 3, 6, 6, 6], dtype='float32') custom_weight = np.random.randn(3, 3, 2, 2, 2).astype("float32") weight_attr = fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer( - custom_weight - ) + initializer=paddle.nn.initializer.Assign(custom_weight) ) conv3d1 = paddle.nn.Conv3DTranspose( in_channels=3, @@ -2213,13 +2207,13 @@ class TestBook(LayerTest): param_attr=fluid.ParamAttr( learning_rate=1.0, name="w_0", - initializer=fluid.initializer.Xavier(uniform=False), + initializer=paddle.nn.initializer.XavierNormal(), ), bias_size=[16, 10], bias_attr=fluid.ParamAttr( learning_rate=1.0, name="b_0", - initializer=fluid.initializer.Xavier(uniform=False), + initializer=paddle.nn.initializer.XavierNormal(), ), act="relu", ) @@ -2238,7 +2232,7 @@ class TestBook(LayerTest): rank_param_attr=fluid.ParamAttr( learning_rate=1.0, name="ubm_rank_param.w_0", - initializer=fluid.initializer.Xavier(uniform=False), + initializer=paddle.nn.initializer.XavierNormal(), ), max_rank=3, ) diff --git a/python/paddle/fluid/tests/unittests/test_linalg_eig_op.py b/python/paddle/fluid/tests/unittests/test_linalg_eig_op.py new file mode 100644 index 0000000000000000000000000000000000000000..18d95a4f383d982237509f53fdd99569c809f696 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_linalg_eig_op.py @@ -0,0 +1,34 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +import paddle + + +class TestEigAPIError(unittest.TestCase): + def test_errors(self): + # The size of input in Eig should not be 0. 
+ def test_0_size(): + array = np.array([], dtype=np.float32) + x = paddle.to_tensor(np.reshape(array, [0, 0]), dtype='float32') + paddle.linalg.eig(x) + + self.assertRaises(ValueError, test_0_size) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_linalg_lstsq_op.py b/python/paddle/fluid/tests/unittests/test_linalg_lstsq_op.py index 82576ab1bd1bf7dd0c5ac9deefebda4b028eab20..94dc901a56d0c92d47dd95f4fe1029e4919a2571 100644 --- a/python/paddle/fluid/tests/unittests/test_linalg_lstsq_op.py +++ b/python/paddle/fluid/tests/unittests/test_linalg_lstsq_op.py @@ -278,5 +278,38 @@ class LinalgLstsqTestCaseLarge2(LinalgLstsqTestCase): self._input_shape_2 = (50, 300) +class TestLinalgLstsqAPIError(unittest.TestCase): + def setUp(self): + pass + + def test_api_errors(self): + def test_x_bad_shape(): + x = paddle.to_tensor(np.random.random(size=(5)), dtype=np.float32) + y = paddle.to_tensor( + np.random.random(size=(5, 15)), dtype=np.float32 + ) + out = paddle.linalg.lstsq(x, y, driver='gelsy') + + def test_y_bad_shape(): + x = paddle.to_tensor( + np.random.random(size=(5, 10)), dtype=np.float32 + ) + y = paddle.to_tensor(np.random.random(size=(5)), dtype=np.float32) + out = paddle.linalg.lstsq(x, y, driver='gelsy') + + def test_shape_dismatch(): + x = paddle.to_tensor( + np.random.random(size=(5, 10)), dtype=np.float32 + ) + y = paddle.to_tensor( + np.random.random(size=(4, 15)), dtype=np.float32 + ) + out = paddle.linalg.lstsq(x, y, driver='gelsy') + + self.assertRaises(ValueError, test_x_bad_shape) + self.assertRaises(ValueError, test_y_bad_shape) + self.assertRaises(ValueError, test_shape_dismatch) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_linear.py b/python/paddle/fluid/tests/unittests/test_linear.py index 71f5c831ae4b6ddb8b87296ca6b8df1e96fdcef3..36496004b18d56b111846e22119c24a7a13271ec 100644 --- a/python/paddle/fluid/tests/unittests/test_linear.py +++ b/python/paddle/fluid/tests/unittests/test_linear.py @@ -50,14 +50,14 @@ class LinearTestCase(unittest.TestCase): learning_rate=1.0, trainable=False, regularizer=None, - initializer=paddle.fluid.initializer.ConstantInitializer(value=1.0), + initializer=paddle.nn.initializer.Constant(value=1.0), ) bias_attr = fluid.ParamAttr( name="linear_bias", learning_rate=1.0, trainable=False, regularizer=None, - initializer=paddle.fluid.initializer.ConstantInitializer(value=1.0), + initializer=paddle.nn.initializer.Constant(value=1.0), ) linear = paddle.nn.Linear( 2, 2, weight_attr=weight_attr, bias_attr=bias_attr diff --git a/python/paddle/fluid/tests/unittests/test_log_loss_op.py b/python/paddle/fluid/tests/unittests/test_log_loss_op.py index 25bede0af214b8bea9b65cd3b4e59b1a4f2b0f9c..cb1b50b49a85354fc40d11eae59324c2f7547e8d 100644 --- a/python/paddle/fluid/tests/unittests/test_log_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_log_loss_op.py @@ -15,7 +15,9 @@ import unittest import numpy as np -from op_test import OpTest +from eager_op_test import OpTest + +from paddle.nn import functional as F def sigmoid_array(x): @@ -25,6 +27,7 @@ def sigmoid_array(x): class TestLogLossOp(OpTest): def setUp(self): self.op_type = 'log_loss' + self.python_api = F.log_loss samples_num = 100 x = np.random.random((samples_num, 1)).astype("float32") diff --git a/python/paddle/fluid/tests/unittests/test_logspace.py b/python/paddle/fluid/tests/unittests/test_logspace.py index 2a0d466a600d89a813b7df95a52659f10bd67147..dee098dd5f34dc1c0d7b6669984475ea785d4adb 
100644 --- a/python/paddle/fluid/tests/unittests/test_logspace.py +++ b/python/paddle/fluid/tests/unittests/test_logspace.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest +from eager_op_test import OpTest import paddle @@ -23,6 +23,7 @@ import paddle class TestLogspaceOpCommonCase(OpTest): def setUp(self): self.op_type = "logspace" + self.python_api = paddle.logspace dtype = 'float32' self.inputs = { 'Start': np.array([0]).astype(dtype), @@ -41,6 +42,7 @@ class TestLogspaceOpCommonCase(OpTest): class TestLogspaceOpReverseCase(OpTest): def setUp(self): self.op_type = "logspace" + self.python_api = paddle.logspace dtype = 'float32' self.inputs = { 'Start': np.array([10]).astype(dtype), @@ -59,6 +61,7 @@ class TestLogspaceOpReverseCase(OpTest): class TestLogspaceOpNumOneCase(OpTest): def setUp(self): self.op_type = "logspace" + self.python_api = paddle.logspace dtype = 'float32' self.inputs = { 'Start': np.array([10]).astype(dtype), @@ -77,6 +80,7 @@ class TestLogspaceOpNumOneCase(OpTest): class TestLogspaceOpMinusBaseCase(OpTest): def setUp(self): self.op_type = "logspace" + self.python_api = paddle.logspace dtype = 'float32' self.inputs = { 'Start': np.array([0]).astype(dtype), @@ -95,6 +99,7 @@ class TestLogspaceOpMinusBaseCase(OpTest): class TestLogspaceOpZeroBaseCase(OpTest): def setUp(self): self.op_type = "logspace" + self.python_api = paddle.logspace dtype = 'float32' self.inputs = { 'Start': np.array([0]).astype(dtype), diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_bf16_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_bf16_op.py index cc11e96f5a9150502d20d2b2d25872ed8603ba11..649a2e5937c3c2f09dd216c0f7443147a096759c 100644 --- a/python/paddle/fluid/tests/unittests/test_lookup_table_bf16_op.py +++ b/python/paddle/fluid/tests/unittests/test_lookup_table_bf16_op.py @@ -217,7 +217,7 @@ class TestEmbeddingLayerBF16ConstantInitializer(unittest.TestCase): """ def set_initializer(self): - self.initializer = fluid.initializer.Constant(value=self.value) + self.initializer = paddle.nn.initializer.Constant(value=self.value) def setUp(self): self.ids_shape = [4, 1] diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_v2_bf16_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_v2_bf16_op.py index 8cbc6242b3af985fe497f0d3f8b283d24db49175..0f6affcd26c07990dd743f79111edc0eab982681 100644 --- a/python/paddle/fluid/tests/unittests/test_lookup_table_v2_bf16_op.py +++ b/python/paddle/fluid/tests/unittests/test_lookup_table_v2_bf16_op.py @@ -84,7 +84,7 @@ class TestEmbeddingLayerBF16ConstantInitializer(unittest.TestCase): """ def set_initializer(self): - self.initializer = fluid.initializer.Constant(value=self.value) + self.initializer = paddle.nn.initializer.Constant(value=self.value) def setUp(self): self.op_type = "lookup_table_v2" diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py index eb9c4c60893e06507e9733567b60f9a8aff92e05..6aea5ef118c1113dce95a4ee63138897335b41b7 100644 --- a/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest, skip_check_grad_ci +from eager_op_test import OpTest, skip_check_grad_ci import paddle import paddle.fluid as fluid @@ -57,10 +57,10 @@ class TestLookupTableOp(OpTest): return "int64" def test_check_output(self): - 
self.check_output(check_eager=True) + self.check_output() def test_check_grad(self): - self.check_grad(['W'], 'Out', no_grad_set=set('Ids'), check_eager=True) + self.check_grad(['W'], 'Out', no_grad_set=set('Ids')) class TestLookupTableOpInt16(OpTest): @@ -81,6 +81,7 @@ class TestLookupTableOpUInt8(OpTest): class TestLookupTableOpWithTensorIds(OpTest): def setUp(self): self.op_type = "lookup_table_v2" + self.python_api = paddle.nn.functional.embedding table = np.random.random((17, 31)).astype("float64") ids = np.random.randint(low=0, high=17, size=(2, 4, 5)).astype("int32") self.inputs = {'W': table, 'Ids': ids} @@ -208,9 +209,7 @@ class TestLookupTableIsSparse(unittest.TestCase): param_attr=fluid.ParamAttr( name="emb_weight", learning_rate=10, - initializer=fluid.initializer.NumpyArrayInitializer( - self.w_data - ), + initializer=paddle.nn.initializer.Assign(self.w_data), ), is_sparse=is_sparse, ) diff --git a/python/paddle/fluid/tests/unittests/test_lr_scheduler.py b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py index ec9a4d5e5ed6acfe6425dd83ce7e43d2b0a7cca0..92ec624614a83a866c1cffade8012e1d8f7beae3 100644 --- a/python/paddle/fluid/tests/unittests/test_lr_scheduler.py +++ b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py @@ -728,6 +728,17 @@ class TestLRScheduler(unittest.TestCase): step_size_down=-1, scale_mode='test', ) + # check empty boundaries + with self.assertRaises(ValueError): + paddle.optimizer.lr.PiecewiseDecay(boundaries=[], values=[]) + # check non-empty boundaries but empty values + with self.assertRaises(ValueError): + paddle.optimizer.lr.PiecewiseDecay(boundaries=[100, 200], values=[]) + # check boundaries and values has same length + with self.assertRaises(ValueError): + paddle.optimizer.lr.PiecewiseDecay( + boundaries=[100, 200], values=[0.5, 0.1] + ) func_api_kwargs = [ ( diff --git a/python/paddle/fluid/tests/unittests/test_lu_op.py b/python/paddle/fluid/tests/unittests/test_lu_op.py index 790ebb36f6d7c24a08acd171e38d34cf96829657..3e083c76b71df512a39e620e5ee90e45265552a4 100644 --- a/python/paddle/fluid/tests/unittests/test_lu_op.py +++ b/python/paddle/fluid/tests/unittests/test_lu_op.py @@ -303,6 +303,20 @@ class TestLUAPI(unittest.TestCase): run_lu_static(tensor_shape, dtype) +class TestLUAPIError(unittest.TestCase): + def test_errors(self): + with paddle.fluid.dygraph.guard(): + # The size of input in lu should not be 0. 
+ def test_0_size(): + array = np.array([], dtype=np.float32) + x = paddle.to_tensor( + np.reshape(array, [0, 0, 0]), dtype='float32' + ) + paddle.linalg.lu(x, get_infos=True) + + self.assertRaises(ValueError, test_0_size) + + if __name__ == "__main__": paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py index 8136425595a1cfad43b77fb9f9d12d9093a86e26..e78ea74260d1e9d0d159e2b651271f226b75b435 100644 --- a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py @@ -15,12 +15,12 @@ import unittest import numpy as np -from op_test import OpTest, convert_float_to_uint16, get_numeric_gradient +from eager_op_test import OpTest, convert_float_to_uint16, get_numeric_gradient +from testsuite import create_op import paddle import paddle.fluid as fluid import paddle.fluid.core as core -from paddle.fluid.tests.unittests.testsuite import create_op def reference_matmul(X, Y, transpose_X=False, transpose_Y=False): @@ -72,6 +72,7 @@ class TestMatMulV2Op(OpTest): self.init_kernel_type() self.config() self.op_type = "matmul_v2" + self.python_api = paddle.tensor.matmul if self.is_bfloat16_op(): x = np.random.random(self.x_shape).astype(np.float32) y = np.random.random(self.y_shape).astype(np.float32) @@ -102,15 +103,13 @@ class TestMatMulV2Op(OpTest): self.outputs = {'Out': result} def test_check_output(self): - self.check_output(check_eager=False) + self.check_output() def test_check_grad(self): if core.is_compiled_with_rocm(): - self.check_grad( - ['X', 'Y'], 'Out', max_relative_error=1e-2, check_eager=False - ) + self.check_grad(['X', 'Y'], 'Out', max_relative_error=1e-2) else: - self.check_grad(['X', 'Y'], 'Out', check_eager=False) + self.check_grad(['X', 'Y'], 'Out') class TestMatMulOp2(TestMatMulV2Op): @@ -344,9 +343,7 @@ def create_test_fp16_class(parent, atol=0.001, max_relative_error=1.0): if core.is_compiled_with_cuda(): place = core.CUDAPlace(0) if core.is_float16_supported(place): - self.check_output_with_place( - place, atol=atol, check_eager=False - ) + self.check_output_with_place(place, atol=atol) def test_check_grad(self): place = core.CUDAPlace(0) @@ -356,7 +353,6 @@ def create_test_fp16_class(parent, atol=0.001, max_relative_error=1.0): ['X', 'Y'], 'Out', max_relative_error=max_relative_error, - check_eager=False, ) cls_name = "{0}_{1}".format(parent.__name__, "Fp16") @@ -562,6 +558,7 @@ class TestMatMulV2API(unittest.TestCase): class TestComplexMatMulOp(OpTest): def setUp(self): self.op_type = "matmul_v2" + self.python_api = paddle.tensor.matmul self.init_base_dtype() self.init_input_output() self.init_grad_input_output() @@ -593,7 +590,7 @@ class TestComplexMatMulOp(OpTest): self.grad_y = np.matmul(np.conj(self.x).T, self.grad_out) def test_check_output(self): - self.check_output(check_eager=False) + self.check_output() def test_check_grad_normal(self): self.check_grad( @@ -601,7 +598,6 @@ class TestComplexMatMulOp(OpTest): 'Out', user_defined_grads=[self.grad_x, self.grad_y], user_defined_grad_outputs=[self.grad_out], - check_eager=False, ) def test_check_grad_ingore_x(self): @@ -611,7 +607,6 @@ class TestComplexMatMulOp(OpTest): no_grad_set=set("X"), user_defined_grads=[self.grad_y], user_defined_grad_outputs=[self.grad_out], - check_eager=False, ) def test_check_grad_ingore_y(self): @@ -621,13 +616,13 @@ class TestComplexMatMulOp(OpTest): no_grad_set=set('Y'), user_defined_grads=[self.grad_x], 
user_defined_grad_outputs=[self.grad_out], - check_eager=False, ) class TestComplexMatMulOpBroadcast(OpTest): def setUp(self): self.op_type = "matmul_v2" + self.python_api = paddle.tensor.matmul self.init_base_dtype() self.init_input_output() self.init_grad_input_output() @@ -661,7 +656,7 @@ class TestComplexMatMulOpBroadcast(OpTest): ) def test_check_output(self): - self.check_output(check_eager=False) + self.check_output() def test_check_grad_normal(self): self.check_grad( @@ -669,7 +664,6 @@ class TestComplexMatMulOpBroadcast(OpTest): 'Out', user_defined_grads=[self.grad_x, self.grad_y], user_defined_grad_outputs=[self.grad_out], - check_eager=False, ) def test_check_grad_ingore_x(self): @@ -679,7 +673,6 @@ class TestComplexMatMulOpBroadcast(OpTest): no_grad_set=set("X"), user_defined_grads=[self.grad_y], user_defined_grad_outputs=[self.grad_out], - check_eager=False, ) def test_check_grad_ingore_y(self): @@ -689,7 +682,6 @@ class TestComplexMatMulOpBroadcast(OpTest): no_grad_set=set('Y'), user_defined_grads=[self.grad_x], user_defined_grad_outputs=[self.grad_out], - check_eager=False, ) diff --git a/python/paddle/fluid/tests/unittests/test_matrix_power_op.py b/python/paddle/fluid/tests/unittests/test_matrix_power_op.py index 7f26a7170191fd6032c168943fb8f58ed96ed749..8296aa320f59bb2e853fda343f00cdeedc3d2660 100644 --- a/python/paddle/fluid/tests/unittests/test_matrix_power_op.py +++ b/python/paddle/fluid/tests/unittests/test_matrix_power_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest +from eager_op_test import OpTest import paddle import paddle.fluid as fluid @@ -32,6 +32,7 @@ class TestMatrixPowerOp(OpTest): def setUp(self): self.op_type = "matrix_power" + self.python_api = paddle.tensor.matrix_power self.config() np.random.seed(123) @@ -316,6 +317,12 @@ class TestMatrixPowerAPIError(unittest.TestCase): input = fluid.data(name="input_4", shape=[1, 1, 0, 0], dtype="float32") self.assertRaises(ValueError, paddle.linalg.matrix_power, input, 2) + # The size of input should not be 0 + input = fluid.data(name="input_5", shape=[0, 0], dtype="float32") + self.assertRaises( + ValueError, paddle.linalg.matrix_power, input, -956301312 + ) + class TestMatrixPowerSingularAPI(unittest.TestCase): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/test_min_op.py b/python/paddle/fluid/tests/unittests/test_min_op.py index c0e9803b140f01724163412890699289c19baf82..70d41e113216936bb282e3f30fde7ba24ce98c2d 100644 --- a/python/paddle/fluid/tests/unittests/test_min_op.py +++ b/python/paddle/fluid/tests/unittests/test_min_op.py @@ -20,6 +20,7 @@ from test_sum_op import TestReduceOPTensorAxisBase import paddle import paddle.fluid.core as core +from paddle import fluid class ApiMinTest(unittest.TestCase): @@ -117,5 +118,18 @@ class TestMinWithTensorAxis2(TestReduceOPTensorAxisBase): self.keepdim = True +class TestMinAPIWithEmptyTensor(unittest.TestCase): + def test_empty_tensor(self): + with fluid.dygraph.guard(): + with self.assertRaises(ValueError): + data = np.array([], dtype=np.float32) + data = np.reshape(data, [0, 0, 0, 0, 0, 0, 0]) + x = paddle.to_tensor(data, dtype='float64') + np_axis = np.array([0], dtype='int64') + tensor_axis = paddle.to_tensor(np_axis, dtype='int64') + + out = paddle.min(x, tensor_axis) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_static.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_static.py 
index a4dc9f33279db55cdd270e42bfbee8d730015273..bdc4af3bdcd3287376c81d37067f3ad121e019fa 100644 --- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_static.py +++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_static.py @@ -58,10 +58,10 @@ def simple_fc_net_static(): label = fluid.data(name='label', shape=[None, 1], dtype='int64') hidden = image param_attr = fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.8) + initializer=paddle.nn.initializer.Constant(value=0.8) ) bias_attr = fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.5) + initializer=paddle.nn.initializer.Constant(value=0.5) ) for hidden_size in [10, 20, 30]: hidden = paddle.static.nn.fc( diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_static.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_static.py index 24c008a60271f26169cc02f6ab597dfba5a50dd2..a38c77386a67ae74ed0e4ebcb4ba8593d88bb040 100644 --- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_static.py +++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_static.py @@ -58,10 +58,10 @@ def simple_fc_net_static(): label = fluid.data(name='label', shape=[None, 1], dtype='int64') hidden = image param_attr = fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.8) + initializer=paddle.nn.initializer.Constant(value=0.8) ) bias_attr = fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.5) + initializer=paddle.nn.initializer.Constant(value=0.5) ) for hidden_size in [10, 20, 30]: hidden = paddle.static.nn.fc( diff --git a/python/paddle/fluid/tests/unittests/test_nce.py b/python/paddle/fluid/tests/unittests/test_nce.py index e2923da7113df8035429bad5c4cffedbf85ccb74..8a4c555ad572080ade0013185e347f1195a0743d 100644 --- a/python/paddle/fluid/tests/unittests/test_nce.py +++ b/python/paddle/fluid/tests/unittests/test_nce.py @@ -19,7 +19,6 @@ from op_test import OpTest import paddle import paddle.fluid as fluid -import paddle.fluid.initializer as initializer from paddle.fluid import Program, program_guard @@ -199,7 +198,7 @@ class TestNCECase1SelectedRows(unittest.TestCase): shape=[num_total_classes, 10], dtype='float32', name='nce_w', - initializer=initializer.ConstantInitializer(), + initializer=paddle.nn.initializer.Constant(), ) ) b_param = ( @@ -209,7 +208,7 @@ class TestNCECase1SelectedRows(unittest.TestCase): shape=[num_total_classes, 1], dtype='float32', name='nce_b', - initializer=initializer.ConstantInitializer(), + initializer=paddle.nn.initializer.Constant(), ) ) @@ -331,6 +330,13 @@ class TestNCE_OpError(unittest.TestCase): TypeError, paddle.static.nn.nce, input4, label4, 5 ) + input5 = paddle.static.data(name='x', shape=[1], dtype='float32') + label5 = paddle.static.data(name='label', shape=[1], dtype='int64') + + self.assertRaises( + ValueError, paddle.static.nn.nce, input5, label5, 1 + ) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_static.py b/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_static.py index d89af631baa45b8829e9f38618acb1229aef02ba..95df8aa0be0ac1007787fe02c780c39872e290b1 100644 --- a/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_static.py +++ b/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_static.py @@ -27,7 +27,7 @@ class EmbeddingStatic(unittest.TestCase): with fluid.program_guard(prog): def test_bad_x(): - 
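The dataloader and NCE hunks above are part of a repo-wide swap from the deprecated fluid.initializer entry points to paddle.nn.initializer. A short reading aid for the mapping as it appears in this diff; the variable names are illustrative.

```python
# Reading aid for the initializer swap in these hunks (variable names are
# illustrative; only classes that actually appear in this diff are listed).
import numpy as np
import paddle
import paddle.fluid as fluid

w = np.random.random((4, 4)).astype("float32")

# fluid.initializer.Constant(value=0.5)        -> paddle.nn.initializer.Constant(value=0.5)
# fluid.initializer.ConstantInitializer(0.25)  -> paddle.nn.initializer.Constant(0.25)
# fluid.initializer.NumpyArrayInitializer(w)   -> paddle.nn.initializer.Assign(w)
# fluid.initializer.UniformInitializer(...)    -> paddle.nn.initializer.Uniform(...)
param_attr = fluid.ParamAttr(
    initializer=paddle.nn.initializer.Constant(value=0.5)
)
weight_attr = fluid.ParamAttr(
    initializer=paddle.nn.initializer.Assign(w)
)
```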
initializer = fluid.initializer.NumpyArrayInitializer( + initializer = paddle.nn.initializer.Assign( np.random.random(size=(128, 100)) ) @@ -59,7 +59,7 @@ class EmbeddingStatic(unittest.TestCase): with fluid.program_guard(prog): def test_bad_x(): - initializer = fluid.initializer.NumpyArrayInitializer( + initializer = paddle.nn.initializer.Assign( np.random.random(size=(128, 100)) ) diff --git a/python/paddle/fluid/tests/unittests/test_norm_all.py b/python/paddle/fluid/tests/unittests/test_norm_all.py index d70d0dd9f065d9e9731c2a457cc74eab33c0f494..beff458bd1b700762f6031d390135d48049007d1 100644 --- a/python/paddle/fluid/tests/unittests/test_norm_all.py +++ b/python/paddle/fluid/tests/unittests/test_norm_all.py @@ -655,6 +655,15 @@ class API_NormTest(unittest.TestCase): ValueError, paddle.norm, data, p='unspport', axis=[-3, -2, -1] ) + with fluid.dygraph.guard(): + # The size of input in Norm should not be 0. + def test_0_size(): + array = np.array([], dtype=np.float32) + x = paddle.to_tensor(np.reshape(array, [0, 0]), dtype='float32') + paddle.linalg.norm(x, axis=0) + + self.assertRaises(ValueError, test_0_size) + if __name__ == '__main__': paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_norm_op.py b/python/paddle/fluid/tests/unittests/test_norm_op.py index 73938f2d1b1c0e23a30c158a99ac78f7d58d82db..7b4b8dc60a02af8349b71cfcc32295d9d7a60fa9 100644 --- a/python/paddle/fluid/tests/unittests/test_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_norm_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest, skip_check_grad_ci +from eager_op_test import OpTest, skip_check_grad_ci import paddle import paddle.fluid as fluid @@ -29,10 +29,14 @@ def l2_norm(x, axis, epsilon): return y, r +def norm_wrapper(x, axis=1, epsilon=1e-12, is_test=False): + return paddle.nn.functional.normalize(x, axis=axis, epsilon=epsilon) + + class TestNormOp(OpTest): def setUp(self): self.op_type = "norm" - self.python_api = paddle.nn.functional.normalize + self.python_api = norm_wrapper self.init_test_case() self.init_dtype() x = np.random.random(self.shape).astype(self.dtype) @@ -40,6 +44,7 @@ class TestNormOp(OpTest): self.inputs = {'X': x} self.attrs = {'epsilon': self.epsilon, 'axis': self.axis} self.outputs = {'Out': y, 'Norm': norm} + self.python_out_sig = ['Out'] def test_check_output(self): self.check_output() @@ -126,19 +131,22 @@ class TestNormOp7(TestNormOp): class TestNormTestOp(OpTest): def setUp(self): self.op_type = "norm" + self.python_api = norm_wrapper self.init_test_case() x = np.random.random(self.shape).astype("float64") y, norm = l2_norm(x, self.axis, self.epsilon) self.inputs = {'X': x} self.attrs = { 'epsilon': self.epsilon, - 'axis': self.axis, + 'axis': int(self.axis), 'is_test': True, } self.outputs = {'Out': y} + self.python_out_sig = ["out"] def test_check_output(self): - self.check_output() + # dynamic graph just supports float tensor + self.check_output(check_dygraph=True) def test_check_grad(self): pass diff --git a/python/paddle/fluid/tests/unittests/test_normalize.py b/python/paddle/fluid/tests/unittests/test_normalize.py index 908360d29e692d03f17fb09aa9454747876fe864..a986dd90415d5125cfcbf799503b1432a0e2ecba 100644 --- a/python/paddle/fluid/tests/unittests/test_normalize.py +++ b/python/paddle/fluid/tests/unittests/test_normalize.py @@ -97,6 +97,18 @@ class TestNNFunctionalNormalize(unittest.TestCase): with fluid.program_guard(fluid.Program()): self.run_static(use_gpu=True) + def test_errors(self): + with 
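test_norm_op shows the wrapper variant of the python_api pattern: the op produces more outputs (and accepts more attributes) than the public API, so the test adapts the signature and restricts the comparison. Sketch, with the setUp lines shown as comments:

```python
# Wrapper pattern from the test_norm_op hunk: the C++ "norm" op emits both
# 'Out' and an auxiliary 'Norm' tensor, while the public API returns a single
# tensor and has no is_test flag, so the test adapts the signature and limits
# the comparison via python_out_sig.
import paddle


def norm_wrapper(x, axis=1, epsilon=1e-12, is_test=False):
    # is_test exists only as an op attribute; the dygraph API ignores it
    return paddle.nn.functional.normalize(x, axis=axis, epsilon=epsilon)


# inside setUp():
#   self.python_api = norm_wrapper
#   self.python_out_sig = ['Out']   # compare only the output the API exposes
```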
fluid.dygraph.guard(): + # The size of input in Normalize should not be 0. + def test_0_size(): + array = np.array([], dtype=np.float32) + x = paddle.to_tensor( + np.reshape(array, [1, 1, 0]), dtype='float32' + ) + paddle.nn.functional.normalize(x) + + self.assertRaises(ValueError, test_0_size) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_npscaler_to_tensor.py b/python/paddle/fluid/tests/unittests/test_npscaler_to_tensor.py new file mode 100644 index 0000000000000000000000000000000000000000..da6569d7d29730f051b2f966a2a28b94fa3cdff4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_npscaler_to_tensor.py @@ -0,0 +1,95 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +import paddle + +DTYPE_MAP = { + paddle.bool: np.bool_, + paddle.int32: np.int32, + paddle.int64: np.int64, + paddle.float16: np.float16, + paddle.float32: np.float32, + paddle.float64: np.float64, + paddle.complex64: np.complex64, +} + + +class NumpyScaler2Tensor(unittest.TestCase): + def setUp(self): + self.dtype = np.float32 + self.x_np = np.array([1], dtype=self.dtype)[0] + + def test_dynamic_scaler2tensor(self): + paddle.disable_static() + x = paddle.to_tensor(self.x_np) + self.assertEqual(DTYPE_MAP[x.dtype], self.dtype) + self.assertEqual(x.numpy(), self.x_np) + if self.dtype in [ + np.bool_ + ]: # bool is not supported convert to 0D-Tensor + return + self.assertEqual(len(x.shape), 0) + + def test_static_scaler2tensor(self): + if self.dtype in [np.float16, np.complex64]: + return + paddle.enable_static() + x = paddle.to_tensor(self.x_np) + self.assertEqual(DTYPE_MAP[x.dtype], self.dtype) + if self.dtype in [ + np.bool_, + np.float64, + ]: # bool is not supported convert to 0D-Tensor and float64 not supported in static mode + return + self.assertEqual(len(x.shape), 0) + + +class NumpyScaler2TensorBool(NumpyScaler2Tensor): + def setUp(self): + self.dtype = np.bool_ + self.x_np = np.array([1], dtype=self.dtype)[0] + + +class NumpyScaler2TensorFloat16(NumpyScaler2Tensor): + def setUp(self): + self.dtype = np.float16 + self.x_np = np.array([1], dtype=self.dtype)[0] + + +class NumpyScaler2TensorFloat64(NumpyScaler2Tensor): + def setUp(self): + self.dtype = np.float64 + self.x_np = np.array([1], dtype=self.dtype)[0] + + +class NumpyScaler2TensorInt32(NumpyScaler2Tensor): + def setUp(self): + self.dtype = np.int32 + self.x_np = np.array([1], dtype=self.dtype)[0] + + +class NumpyScaler2TensorInt64(NumpyScaler2Tensor): + def setUp(self): + self.dtype = np.int64 + self.x_np = np.array([1], dtype=self.dtype)[0] + + +class NumpyScaler2TensorComplex64(NumpyScaler2Tensor): + def setUp(self): + self.dtype = np.complex64 + self.x_np = np.array([1], dtype=self.dtype)[0] diff --git a/python/paddle/fluid/tests/unittests/test_numel_op.py b/python/paddle/fluid/tests/unittests/test_numel_op.py index 
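The new test_npscaler_to_tensor.py checks that a numpy scalar converts to a zero-dimensional Tensor. A standalone repro of the float32 case, assuming a dygraph-capable build:

```python
# Standalone repro of what the new NumpyScaler2Tensor cases assert for
# float32: a numpy scalar becomes a zero-dimensional Tensor in dygraph mode
# (bool is the documented exception and is not converted to a 0-D Tensor).
import numpy as np
import paddle

paddle.disable_static()
np_scalar = np.array([1], dtype=np.float32)[0]   # a numpy.float32 scalar
t = paddle.to_tensor(np_scalar)
assert len(t.shape) == 0        # 0-D tensor, not shape [1]
assert t.numpy() == np_scalar
```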
1878c8409f5a38e5dbd05b328d2c149efd3405bd..a2414ed369b9b28695409da3eb81d7f400035f44 100644 --- a/python/paddle/fluid/tests/unittests/test_numel_op.py +++ b/python/paddle/fluid/tests/unittests/test_numel_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest +from eager_op_test import OpTest import paddle import paddle.fluid as fluid @@ -24,6 +24,7 @@ import paddle.fluid as fluid class TestNumelOp(OpTest): def setUp(self): self.op_type = "size" + self.python_api = paddle.numel self.init() x = np.random.random((self.shape)).astype("float64") self.inputs = { diff --git a/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py b/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py index 30bb75e0fa7833ef213a9c9547fea0761aab940c..5d78b371b5fe9b2fc4bcce6d182261118b6079c2 100644 --- a/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest +from eager_op_test import OpTest import paddle import paddle.fluid as fluid @@ -23,9 +23,15 @@ import paddle.fluid.core as core from paddle.fluid.framework import Program, program_guard +def one_hot_wrapper(x, depth_tensor, **keargs): + return paddle.nn.functional.one_hot(x, depth_tensor) + + class TestOneHotOp(OpTest): def setUp(self): self.op_type = 'one_hot_v2' + self.python_api = one_hot_wrapper + self.python_out_sig = ['Out'] depth = 10 depth_np = np.array(10).astype('int32') dimension = 12 @@ -49,6 +55,7 @@ class TestOneHotOp(OpTest): class TestOneHotOp_attr(OpTest): def setUp(self): self.op_type = 'one_hot_v2' + self.python_api = one_hot_wrapper depth = 10 dimension = 12 x_lod = [[4, 1, 3, 3]] @@ -73,6 +80,7 @@ class TestOneHotOp_attr(OpTest): class TestOneHotOp_default_dtype(OpTest): def setUp(self): self.op_type = 'one_hot_v2' + self.python_api = one_hot_wrapper depth = 10 depth_np = np.array(10).astype('int32') dimension = 12 @@ -96,6 +104,7 @@ class TestOneHotOp_default_dtype(OpTest): class TestOneHotOp_default_dtype_attr(OpTest): def setUp(self): self.op_type = 'one_hot_v2' + self.python_api = one_hot_wrapper depth = 10 dimension = 12 x_lod = [[4, 1, 3, 3]] diff --git a/python/paddle/fluid/tests/unittests/test_optimizer_grad.py b/python/paddle/fluid/tests/unittests/test_optimizer_grad.py index fc5fbec82cd0acb7d38d85a292069333727337b0..626521577d173a6011c9fd508a6d083f9326023b 100644 --- a/python/paddle/fluid/tests/unittests/test_optimizer_grad.py +++ b/python/paddle/fluid/tests/unittests/test_optimizer_grad.py @@ -81,20 +81,20 @@ class SimpleNetWithCond: dtype="float32", shape=self.shape, attr=fluid.ParamAttr(learning_rate=self.param_lr, name="param_x"), - default_initializer=fluid.initializer.NumpyArrayInitializer(self.x), + default_initializer=paddle.nn.initializer.Assign(self.x), ) param_y = paddle.create_parameter( dtype="float32", shape=self.shape, attr=fluid.ParamAttr(learning_rate=self.param_lr, name="param_y"), - default_initializer=fluid.initializer.NumpyArrayInitializer(self.y), + default_initializer=paddle.nn.initializer.Assign(self.y), ) param_z = paddle.create_parameter( dtype="float32", shape=self.shape, attr=fluid.ParamAttr(learning_rate=self.param_lr, name="param_z"), - default_initializer=fluid.initializer.NumpyArrayInitializer(self.z), + default_initializer=paddle.nn.initializer.Assign(self.z), ) sum_xy = paddle.add(param_x, param_y, name='sum_xy') diff --git a/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py 
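test_one_hot_v2_op uses the same wrapper trick for an op whose depth can arrive as a separate tensor input. Sketch (the trailing print is only a usage hint):

```python
# Sketch of the one_hot_v2 wrapper: depth may arrive from the test harness as
# the DepthTensor input, so it is forwarded as num_classes and any op-only
# keyword arguments are discarded.
import paddle


def one_hot_wrapper(x, depth_tensor, **kwargs):
    return paddle.nn.functional.one_hot(x, depth_tensor)


print(one_hot_wrapper(paddle.to_tensor([1, 2]), 4).shape)  # [2, 4]
```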
b/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py index 3b32c9ca4ee7886dac1aa0f90786227b2d57507e..ab9b99d8cb2499b10edbb8a76e4be5f0ffe0daea 100644 --- a/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py +++ b/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py @@ -51,10 +51,10 @@ def static( size=FC_SIZE, activation='relu', weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.99) + initializer=paddle.nn.initializer.Constant(value=0.99) ), bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.5) + initializer=paddle.nn.initializer.Constant(value=0.5) ), name="hidden", ) @@ -64,10 +64,10 @@ def static( size=CLASS_NUM, activation='softmax', weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.2) + initializer=paddle.nn.initializer.Constant(value=1.2) ), bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.8) + initializer=paddle.nn.initializer.Constant(value=0.8) ), name="prediction", ) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index 850ddc379c6092738c1deb7197d9a130258b52c9..1f6429620f6898f579d4297130b3cd799b10964c 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -33,7 +33,7 @@ def simple_fc_net(use_feed): size=200, activation='tanh', bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.0) + initializer=paddle.nn.initializer.Constant(value=1.0) ), ) prediction = paddle.static.nn.fc(hidden, size=10, activation='softmax') @@ -56,7 +56,7 @@ def fc_with_batchnorm(use_feed): size=200, activation='tanh', bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.0) + initializer=paddle.nn.initializer.Constant(value=1.0) ), ) diff --git a/python/paddle/fluid/tests/unittests/test_parameter.py b/python/paddle/fluid/tests/unittests/test_parameter.py index 5ce6f313183959ad0da93da3f7df3b0f18d690fa..909feb2a48ff31c60269ae4f950a4348dbce209e 100644 --- a/python/paddle/fluid/tests/unittests/test_parameter.py +++ b/python/paddle/fluid/tests/unittests/test_parameter.py @@ -23,7 +23,6 @@ import paddle.fluid.io as io from paddle.fluid.dygraph import guard from paddle.fluid.executor import Executor from paddle.fluid.framework import ParamBase, Variable, default_main_program -from paddle.fluid.initializer import ConstantInitializer paddle.enable_static() main_program = default_main_program() @@ -38,7 +37,7 @@ class ParameterChecks(unittest.TestCase): name='fc.w', shape=shape, dtype='float32', - initializer=ConstantInitializer(val), + initializer=paddle.nn.initializer.Constant(val), ) self.assertIsNotNone(param) self.assertEqual('fc.w', param.name) diff --git a/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py b/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py index 9600f5a872c56a7734826b457f8a6eb5b916cc92..0ef6b3e77824ba8402d7add0b7c0adb4c5bed6f8 100644 --- a/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py +++ b/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest +from eager_op_test import OpTest import paddle import paddle.fluid as fluid @@ -85,10 +85,13 @@ class TestPixelShuffleOp(OpTest): self.format = "NCHW" def test_check_output(self): - self.check_output(check_eager=True) + 
self.check_output() def test_check_grad(self): - self.check_grad(['X'], 'Out', check_eager=True) + self.check_grad( + ['X'], + 'Out', + ) class TestChannelLast(TestPixelShuffleOp): diff --git a/python/paddle/fluid/tests/unittests/test_poisson_op.py b/python/paddle/fluid/tests/unittests/test_poisson_op.py index e2720edb013130de8aaeab38d06c6effd43276cd..ee66d578014c70395ec3525f8118d2780886458c 100644 --- a/python/paddle/fluid/tests/unittests/test_poisson_op.py +++ b/python/paddle/fluid/tests/unittests/test_poisson_op.py @@ -16,7 +16,7 @@ import math import unittest import numpy as np -from op_test import OpTest +from eager_op_test import OpTest import paddle @@ -41,6 +41,7 @@ def output_hist(out, lam, a, b): class TestPoissonOp1(OpTest): def setUp(self): self.op_type = "poisson" + self.python_api = paddle.tensor.poisson self.config() self.attrs = {} diff --git a/python/paddle/fluid/tests/unittests/test_pool1d_api.py b/python/paddle/fluid/tests/unittests/test_pool1d_api.py index 2c191bf4892b774065b1b390841bac009016b2df..e6712358696633487bb60cfe436b4f0a50339da3 100644 --- a/python/paddle/fluid/tests/unittests/test_pool1d_api.py +++ b/python/paddle/fluid/tests/unittests/test_pool1d_api.py @@ -274,7 +274,7 @@ class TestPool1D_API(unittest.TestCase): self.check_max_dygraph_return_index_results(place) -class TestPool2DError_API(unittest.TestCase): +class TestPool1DError_API(unittest.TestCase): def test_error_api(self): def run1(): with fluid.dygraph.guard(): @@ -417,6 +417,28 @@ class TestPool2DError_API(unittest.TestCase): self.assertRaises(ValueError, run_stride_out_of_range) + def run_zero_stride(): + with fluid.dygraph.guard(): + array = np.array([1], dtype=np.float32) + x = paddle.to_tensor( + np.reshape(array, [1, 1, 1]), dtype='float32' + ) + out = F.max_pool1d( + x, 1, stride=0, padding=1, return_mask=True, ceil_mode=True + ) + + self.assertRaises(ValueError, run_zero_stride) + + def run_zero_tuple_stride(): + with fluid.dygraph.guard(): + array = np.array([1], dtype=np.float32) + x = paddle.to_tensor( + np.reshape(array, [1, 1, 1]), dtype='float32' + ) + out = F.max_pool1d(x, 1, stride=(0)) + + self.assertRaises(ValueError, run_zero_tuple_stride) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_api.py b/python/paddle/fluid/tests/unittests/test_pool2d_api.py index 44ef18605ed2aed1f9d48a37d77b51e109cb0095..fcdec610a480ebff0dfc3cd9372fe1a34ee4288c 100644 --- a/python/paddle/fluid/tests/unittests/test_pool2d_api.py +++ b/python/paddle/fluid/tests/unittests/test_pool2d_api.py @@ -597,6 +597,30 @@ class TestPool2DError_API(unittest.TestCase): self.assertRaises(ValueError, run_stride_out_of_range) + def run_zero_stride(): + with fluid.dygraph.guard(): + array = np.array([1], dtype=np.float32) + x = paddle.to_tensor( + np.reshape(array, [1, 1, 1, 1]), dtype='float32' + ) + out = max_pool2d( + x, 1, stride=0, padding=1, return_mask=True, ceil_mode=True + ) + + self.assertRaises(ValueError, run_zero_stride) + + def run_zero_tuple_stride(): + with fluid.dygraph.guard(): + array = np.array([1], dtype=np.float32) + x = paddle.to_tensor( + np.reshape(array, [1, 1, 1, 1]), dtype='float32' + ) + out = max_pool2d( + x, 1, stride=(0, 0), return_mask=False, data_format='NHWC' + ) + + self.assertRaises(ValueError, run_zero_tuple_stride) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_api.py b/python/paddle/fluid/tests/unittests/test_pool3d_api.py index 
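The pool1d/pool2d/pool3d hunks all add the same pair of negative cases: a scalar stride of 0 and an all-zero stride tuple must raise ValueError. A dygraph repro of the 1-D variant:

```python
# Dygraph repro of the new zero-stride checks (pattern follows the
# run_zero_stride helper added above): the pooling functionals are expected
# to reject stride=0 with a ValueError after this change.
import numpy as np
import paddle
import paddle.nn.functional as F

paddle.disable_static()
x = paddle.to_tensor(np.ones([1, 1, 1], dtype=np.float32))
try:
    F.max_pool1d(x, 1, stride=0, padding=1, return_mask=True, ceil_mode=True)
except ValueError:
    pass  # expected: stride must be positive
```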
961f0b5c569f043f42705dd10719838229c89e02..e1d7543e7bce0dd5374faa29cfdee4859dfdc598 100644 --- a/python/paddle/fluid/tests/unittests/test_pool3d_api.py +++ b/python/paddle/fluid/tests/unittests/test_pool3d_api.py @@ -563,6 +563,28 @@ class TestPool3DError_API(unittest.TestCase): self.assertRaises(ValueError, run_size_out_of_range) + def run_zero_stride(): + with fluid.dygraph.guard(): + array = np.array([1], dtype=np.float32) + x = paddle.to_tensor( + np.reshape(array, [1, 1, 1, 1, 1]), dtype='float32' + ) + out = max_pool3d( + x, 1, stride=0, padding=1, return_mask=True, ceil_mode=True + ) + + self.assertRaises(ValueError, run_zero_stride) + + def run_zero_tuple_stride(): + with fluid.dygraph.guard(): + array = np.array([1], dtype=np.float32) + x = paddle.to_tensor( + np.reshape(array, [1, 1, 1, 1, 1]), dtype='float32' + ) + out = max_pool3d(x, 1, stride=(0, 0, 0), ceil_mode=False) + + self.assertRaises(ValueError, run_zero_tuple_stride) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_prelu_op.py b/python/paddle/fluid/tests/unittests/test_prelu_op.py index 9c95d5b946ce49508b190ab00be1747efed2798b..4a4d5921bbb941f65d3e3fadbf7971a0154a40fb 100644 --- a/python/paddle/fluid/tests/unittests/test_prelu_op.py +++ b/python/paddle/fluid/tests/unittests/test_prelu_op.py @@ -153,7 +153,7 @@ class TestNNPReluAPI(unittest.TestCase): x = paddle.to_tensor(self.x_np) m = paddle.nn.PReLU( weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(0.5) + initializer=paddle.nn.initializer.Constant(0.5) ) ) out = m(x) @@ -438,7 +438,7 @@ def prelu_t(x, mode, param_attr=None, name=None, data_format='NCHW'): shape=alpha_shape, dtype='float32', is_bias=False, - default_initializer=fluid.initializer.ConstantInitializer(0.25), + default_initializer=paddle.nn.initializer.Constant(0.25), ) out = helper.create_variable_for_type_inference(dtype) helper.append_op( diff --git a/python/paddle/fluid/tests/unittests/test_program_prune_backward.py b/python/paddle/fluid/tests/unittests/test_program_prune_backward.py index 5364dcaa6e14a3574fa1d776f9d9706c613bc542..885c8fa829aa98d1cd95c0d13c0312bd83ad067d 100755 --- a/python/paddle/fluid/tests/unittests/test_program_prune_backward.py +++ b/python/paddle/fluid/tests/unittests/test_program_prune_backward.py @@ -40,7 +40,7 @@ def simple_fc_net_with_accuracy(use_feed): size=200, activation='relu', bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.0) + initializer=paddle.nn.initializer.Constant(value=1.0) ), ) prediction = paddle.static.nn.fc(hidden, size=10, activation='softmax') diff --git a/python/paddle/fluid/tests/unittests/test_prune.py b/python/paddle/fluid/tests/unittests/test_prune.py index 30e3aefe0a738ca2af431aa89ec32a6db4ae5d19..a93516da417a435e563d1d288890473c88dc7645 100644 --- a/python/paddle/fluid/tests/unittests/test_prune.py +++ b/python/paddle/fluid/tests/unittests/test_prune.py @@ -170,7 +170,7 @@ class TestExecutorRunAutoPrune(unittest.TestCase): w_param_attrs = fluid.ParamAttr( name="fc_weight", learning_rate=0.5, - initializer=fluid.initializer.Constant(1.0), + initializer=paddle.nn.initializer.Constant(1.0), trainable=True, ) y = paddle.static.nn.fc( @@ -198,13 +198,13 @@ class TestExecutorRunAutoPrune(unittest.TestCase): w1_param_attrs = fluid.ParamAttr( name="fc_weight1", learning_rate=0.5, - initializer=fluid.initializer.Constant(1.0), + initializer=paddle.nn.initializer.Constant(1.0), trainable=True, ) w2_param_attrs = fluid.ParamAttr( name="fc_weight2", 
learning_rate=0.5, - initializer=fluid.initializer.Constant(1.0), + initializer=paddle.nn.initializer.Constant(1.0), trainable=True, ) y1 = paddle.static.nn.fc( diff --git a/python/paddle/fluid/tests/unittests/test_put_along_axis_op.py b/python/paddle/fluid/tests/unittests/test_put_along_axis_op.py index 3b2cf82fbfd391e9218b641944e8f0c9e7b3388d..7470dae1846ab353e31b6a113e93addc4481e0c3 100644 --- a/python/paddle/fluid/tests/unittests/test_put_along_axis_op.py +++ b/python/paddle/fluid/tests/unittests/test_put_along_axis_op.py @@ -16,7 +16,7 @@ import copy import unittest import numpy as np -from op_test import OpTest +from eager_op_test import OpTest import paddle from paddle.framework import core @@ -30,6 +30,7 @@ class TestPutAlongAxisOp(OpTest): self.reduce_op = "assign" self.dtype = 'float64' self.op_type = "put_along_axis" + self.python_api = paddle.tensor.put_along_axis self.xnp = np.random.random(self.x_shape).astype(self.x_type) # numpy put_along_axis is an inplace opearion. self.xnp_result = copy.deepcopy(self.xnp) diff --git a/python/paddle/fluid/tests/unittests/test_py_func_op.py b/python/paddle/fluid/tests/unittests/test_py_func_op.py index a90e37a4755c23632e9f54a2b2dbdffbca135d6f..526e08e9d5940da750eabdaf9658a35144cf321e 100644 --- a/python/paddle/fluid/tests/unittests/test_py_func_op.py +++ b/python/paddle/fluid/tests/unittests/test_py_func_op.py @@ -79,7 +79,7 @@ def simple_fc_net(img, label, use_py_func_op): hidden, size=200, bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.0) + initializer=paddle.nn.initializer.Constant(value=1.0) ), ) if not use_py_func_op: diff --git a/python/paddle/fluid/tests/unittests/test_random_seed.py b/python/paddle/fluid/tests/unittests/test_random_seed.py index 856b2be783d36cb49ae9eaf60a78ca29cdd35b73..0798fa8864f42c73dc19102e943253958d917ccc 100644 --- a/python/paddle/fluid/tests/unittests/test_random_seed.py +++ b/python/paddle/fluid/tests/unittests/test_random_seed.py @@ -378,15 +378,15 @@ class TestGeneratorSeed(unittest.TestCase): result_1 = paddle.static.nn.fc( x, size=10, - weight_attr=fluid.initializer.TruncatedNormal( - loc=0.0, scale=2.0 + weight_attr=paddle.nn.initializer.TruncatedNormal( + mean=0.0, std=2.0 ), ) result_2 = paddle.static.nn.fc( x, size=10, - weight_attr=fluid.initializer.TruncatedNormal( - loc=0.0, scale=2.0 + weight_attr=paddle.nn.initializer.TruncatedNormal( + mean=0.0, std=2.0 ), ) diff --git a/python/paddle/fluid/tests/unittests/test_recurrent_op.py b/python/paddle/fluid/tests/unittests/test_recurrent_op.py index 4ea5ed0e0d35fdce9b11f7e3df5402d172c67ea4..8991b143846c4a2b87b66bc763e787daab1912d7 100644 --- a/python/paddle/fluid/tests/unittests/test_recurrent_op.py +++ b/python/paddle/fluid/tests/unittests/test_recurrent_op.py @@ -301,7 +301,7 @@ class RecurrentOpTest2(RecurrentOpTest1): size=self.input_dim, weight_attr=ParamAttr( name='W', - initializer=fluid.initializer.ConstantInitializer(1.0), + initializer=paddle.nn.initializer.Constant(1.0), ), bias_attr=False, ) @@ -310,7 +310,7 @@ class RecurrentOpTest2(RecurrentOpTest1): size=self.input_dim, weight_attr=ParamAttr( name='U', - initializer=fluid.initializer.ConstantInitializer(0.0), + initializer=paddle.nn.initializer.Constant(0.0), ), bias_attr=False, ) @@ -686,7 +686,7 @@ class RecurrentOpStopGradientTest(RecurrentOpTest1): size=self.input_dim, weight_attr=ParamAttr( name="W", - initializer=fluid.initializer.ConstantInitializer(1.0), + initializer=paddle.nn.initializer.Constant(1.0), ), bias_attr=False, ) @@ -695,7 +695,7 @@ class 
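test_random_seed also picks up the argument rename that comes with the namespace move: fluid.initializer.TruncatedNormal(loc=..., scale=...) becomes paddle.nn.initializer.TruncatedNormal(mean=..., std=...). Minimal static-graph usage mirroring the hunk; the layer sizes are illustrative.

```python
# The namespace move also renames the TruncatedNormal arguments:
# loc -> mean, scale -> std.
import paddle

paddle.enable_static()
main = paddle.static.Program()
startup = paddle.static.Program()
with paddle.static.program_guard(main, startup):
    x = paddle.static.data(name='x', shape=[None, 10], dtype='float32')
    y = paddle.static.nn.fc(
        x,
        size=10,
        weight_attr=paddle.nn.initializer.TruncatedNormal(mean=0.0, std=2.0),
    )
```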
RecurrentOpStopGradientTest(RecurrentOpTest1): size=self.input_dim, weight_attr=ParamAttr( name="U", - initializer=fluid.initializer.ConstantInitializer(0.0), + initializer=paddle.nn.initializer.Constant(0.0), ), bias_attr=False, ) diff --git a/python/paddle/fluid/tests/unittests/test_row_conv_op.py b/python/paddle/fluid/tests/unittests/test_row_conv_op.py index d160a9982577fbc2183426dc0d628aca54a8e520..408a5f8a7405e0c6b2b6af5e0a686db3f1ae99fd 100644 --- a/python/paddle/fluid/tests/unittests/test_row_conv_op.py +++ b/python/paddle/fluid/tests/unittests/test_row_conv_op.py @@ -197,7 +197,7 @@ class TestRowConvLayer(unittest.TestCase): out = paddle.static.nn.row_conv( x, self.context_length, - param_attr=fluid.initializer.NumpyArrayInitializer(self.w), + param_attr=paddle.nn.initializer.Assign(self.w), ) place = fluid.CPUPlace() exe = fluid.Executor(place) diff --git a/python/paddle/fluid/tests/unittests/test_run_program_op.py b/python/paddle/fluid/tests/unittests/test_run_program_op.py index fe012ded3993e2c861b2cb8d5281bda9ea9291e9..35c31deb3f8c542dd3dbe9ce2171da0b83bf76da 100644 --- a/python/paddle/fluid/tests/unittests/test_run_program_op.py +++ b/python/paddle/fluid/tests/unittests/test_run_program_op.py @@ -131,7 +131,7 @@ class RunProgramOpTest(unittest.TestCase): forward_program = _add_build_strategy_for(program, 0, forward_op_num) backward_program = _add_build_strategy_for( program, - forward_op_num + 2 * output_num, + forward_op_num + output_num, program.desc.block(0).op_size(), ) return forward_program.desc, backward_program.desc @@ -403,7 +403,7 @@ class TestRunProgramOpWithFC(RunProgramOpTest): weight_attr = fluid.ParamAttr( name=self.input_names['Params'][0], learning_rate=0.5, - initializer=fluid.initializer.NumpyArrayInitializer( + initializer=paddle.nn.initializer.Assign( self.inputs['Params'][self.input_names['Params'][0]] ), trainable=True, @@ -411,7 +411,7 @@ class TestRunProgramOpWithFC(RunProgramOpTest): bias_attr = fluid.ParamAttr( name=self.input_names['Params'][1], learning_rate=0.5, - initializer=fluid.initializer.NumpyArrayInitializer( + initializer=paddle.nn.initializer.Assign( self.inputs['Params'][self.input_names['Params'][1]] ), trainable=True, @@ -469,7 +469,7 @@ class TestRunProgramOpWithEmbedding(RunProgramOpTest): param_attr=fluid.ParamAttr( name="emb_weight", learning_rate=10, - initializer=fluid.initializer.NumpyArrayInitializer( + initializer=paddle.nn.initializer.Assign( self.inputs['Params'][self.input_names['Params'][0]] ), ), diff --git a/python/paddle/fluid/tests/unittests/test_set_bool_attr.py b/python/paddle/fluid/tests/unittests/test_set_bool_attr.py index c599f08ae2bf99b7747a007ec0fdb6aadfc81b0a..3424d393952b3c6a34af97b510524ab31c8650b1 100644 --- a/python/paddle/fluid/tests/unittests/test_set_bool_attr.py +++ b/python/paddle/fluid/tests/unittests/test_set_bool_attr.py @@ -26,11 +26,11 @@ class TestAttrSet(unittest.TestCase): ) param_attr = fluid.ParamAttr( name='batch_norm_w', - initializer=fluid.initializer.Constant(value=1.0), + initializer=paddle.nn.initializer.Constant(value=1.0), ) bias_attr = fluid.ParamAttr( name='batch_norm_b', - initializer=fluid.initializer.Constant(value=0.0), + initializer=paddle.nn.initializer.Constant(value=0.0), ) bn = paddle.static.nn.batch_norm( input=x, param_attr=param_attr, bias_attr=bias_attr diff --git a/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py b/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py index 89515c931c2504c8fc2d915308f3dc9f3b069bdc..c63be2c6f2be84ce35ecfea63eb87175f7007589 
100644 --- a/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py +++ b/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py @@ -322,7 +322,7 @@ class TestSGDOpBF16API(unittest.TestCase): print(e) def _set_initializer(self): - self.initializer = fluid.initializer.Constant(value=self.value) + self.initializer = paddle.nn.initializer.Constant(value=self.value) def _data_reader(self): for sample in range(self.sample_count): diff --git a/python/paddle/fluid/tests/unittests/test_shard_index_op.py b/python/paddle/fluid/tests/unittests/test_shard_index_op.py index dfbb98a791372562d81c24ceca7285a8fdc4f63b..77cbecd641c14ca6730c334a11b796ea93311519 100644 --- a/python/paddle/fluid/tests/unittests/test_shard_index_op.py +++ b/python/paddle/fluid/tests/unittests/test_shard_index_op.py @@ -15,11 +15,14 @@ import unittest import numpy as np -from op_test import OpTest +from eager_op_test import OpTest + +import paddle def common_setup(self, index_num, nshards, shard_id, ignore_value): self.op_type = 'shard_index' + self.python_api = paddle.tensor.shard_index x_lod = [[i for i in range(10)]] N = sum(x_lod[0]) x = [np.random.randint(0, index_num - 1) for i in range(N)] diff --git a/python/paddle/fluid/tests/unittests/test_sign_op.py b/python/paddle/fluid/tests/unittests/test_sign_op.py index 7834736260d9dabd7b769f29f8416bcb6384b43e..1f7d3b8228a77eccf4f84b14e7ae056bbd458e35 100644 --- a/python/paddle/fluid/tests/unittests/test_sign_op.py +++ b/python/paddle/fluid/tests/unittests/test_sign_op.py @@ -28,6 +28,7 @@ from paddle.fluid import Program, program_guard class TestSignOp(OpTest): def setUp(self): self.op_type = "sign" + self.python_api = paddle.sign self.inputs = { 'X': np.random.uniform(-10, 10, (10, 10)).astype("float64") } diff --git a/python/paddle/fluid/tests/unittests/test_size_op.py b/python/paddle/fluid/tests/unittests/test_size_op.py index b3ae19b8ef20eb5528b81dbc478cc564c1e87e0c..edea44abf089057ac068bc4dd635b673d3e3b3f2 100644 --- a/python/paddle/fluid/tests/unittests/test_size_op.py +++ b/python/paddle/fluid/tests/unittests/test_size_op.py @@ -15,15 +15,20 @@ import unittest import numpy as np -from op_test import OpTest +from eager_op_test import OpTest import paddle import paddle.fluid as fluid +def size_wrapper(input): + return paddle.numel(paddle.to_tensor(input)) + + class TestSizeOp(OpTest): def setUp(self): self.op_type = "size" + self.python_api = size_wrapper self.shape = [] self.config() input = np.zeros(self.shape, dtype='bool') diff --git a/python/paddle/fluid/tests/unittests/test_slice_op.py b/python/paddle/fluid/tests/unittests/test_slice_op.py index 19aa669badf5c48bfc0720c1a053fd6f5bd50bde..157818e794301249bcd2ee5b7f7948dc39dad7c1 100644 --- a/python/paddle/fluid/tests/unittests/test_slice_op.py +++ b/python/paddle/fluid/tests/unittests/test_slice_op.py @@ -852,6 +852,27 @@ class TestInferShape(unittest.TestCase): paddle.slice(x, 0, starts, ends) +class TestSliceOpError(unittest.TestCase): + def test_dismatch_shape(self): + with fluid.dygraph.guard(): + with self.assertRaises(ValueError): + array = np.array([], dtype=np.float32) + x = paddle.to_tensor(np.reshape(array, [0]), dtype='float32') + paddle.slice(x, axes=[0], starts=[], ends=[]) + + with self.assertRaises(ValueError): + array = np.array([], dtype=np.float32) + x = paddle.to_tensor(np.reshape(array, [0]), dtype='float32') + paddle.slice(x, axes=[0], starts=[0], ends=[]) + + # if shape match, pass + array = np.array([], dtype=np.float32) + x = paddle.to_tensor(np.reshape(array, [0]), dtype='float32') + out = 
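test_size_op wraps paddle.numel rather than pointing python_api at it directly; the sketch below shows the wrapper on its own, with my reading of why the to_tensor conversion is there.

```python
# Sketch of the size-op wrapper: converting with paddle.to_tensor before
# counting keeps the call valid whether the harness hands the wrapper a
# Tensor or a raw numpy array.
import numpy as np
import paddle


def size_wrapper(input):
    return paddle.numel(paddle.to_tensor(input))


print(size_wrapper(np.zeros([2, 3], dtype='bool')))  # Tensor holding 6
```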
paddle.slice(x, axes=[0], starts=[0], ends=[0]) + self.assertEqual(out.numel(), 0) + # self.assertEqual(out.shape) + + @unittest.skipIf( not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ) diff --git a/python/paddle/fluid/tests/unittests/test_softmax_op.py b/python/paddle/fluid/tests/unittests/test_softmax_op.py index 290d72b2485b2ede9967f99c5b3e58464f4b75ad..8696cc532820f7946c03a2e3fcf34c3ae520b302 100644 --- a/python/paddle/fluid/tests/unittests/test_softmax_op.py +++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest, convert_float_to_uint16 +from eager_op_test import OpTest, convert_float_to_uint16 import paddle import paddle.fluid as fluid @@ -43,6 +43,12 @@ def ref_softmax(x, axis=None, dtype=None): return np.apply_along_axis(stable_softmax, axis, x_t) +def softmax_wrapper( + x, axis=-1, dtype=None, name=None, use_cudnn=False, use_mkldnn=False +): + return paddle.nn.functional.softmax(x, axis=axis, dtype=dtype) + + class TestSoftmaxOp(OpTest): def get_x_shape(self): return [10, 10] @@ -52,6 +58,7 @@ class TestSoftmaxOp(OpTest): def setUp(self): self.op_type = "softmax" + self.python_api = softmax_wrapper self.use_cudnn = False self.use_mkldnn = False # explicilty use float32 for ROCm, as MIOpen does not yet support float64 @@ -109,6 +116,7 @@ class TestSoftmaxOp(OpTest): class TestSoftmaxOp_ZeroDim1(TestSoftmaxOp): def setUp(self): self.op_type = "softmax" + self.python_api = softmax_wrapper self.use_cudnn = False self.use_mkldnn = False # explicilty use float32 for ROCm, as MIOpen does not yet support float64 @@ -133,6 +141,7 @@ class TestSoftmaxOp_ZeroDim1(TestSoftmaxOp): class TestSoftmaxOp_ZeroDim2(TestSoftmaxOp): def setUp(self): self.op_type = "softmax" + self.python_api = softmax_wrapper self.use_cudnn = True self.use_mkldnn = False # explicilty use float32 for ROCm, as MIOpen does not yet support float64 @@ -366,6 +375,7 @@ class TestSoftmaxFP16CUDNNOp2(TestSoftmaxFP16CUDNNOp): class TestSoftmaxBF16Op(OpTest): def setUp(self): self.op_type = "softmax" + self.python_api = softmax_wrapper self.use_cudnn = self.init_cudnn() self.use_mkldnn = False self.dtype = np.uint16 diff --git a/python/paddle/fluid/tests/unittests/test_sparse_embedding_op.py b/python/paddle/fluid/tests/unittests/test_sparse_embedding_op.py new file mode 100644 index 0000000000000000000000000000000000000000..0e0beda67971ecaeca6d0eef7e59f367c6fd5e4c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sparse_embedding_op.py @@ -0,0 +1,37 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import paddle + + +class TestSparseEmbeddingAPIError(unittest.TestCase): + def test_errors(self): + with paddle.fluid.dygraph.guard(): + # The size of input in sparse_embedding should not be 0. 
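test_softmax_op needs a wrapper for a different reason: op-level attributes with no counterpart in the functional API. Sketch plus a quick sanity check:

```python
# test_softmax_op wrapper: the softmax op carries kernel-selection attributes
# (use_cudnn, use_mkldnn) that the functional API does not accept, so the
# wrapper absorbs and ignores them.
import paddle


def softmax_wrapper(
    x, axis=-1, dtype=None, name=None, use_cudnn=False, use_mkldnn=False
):
    return paddle.nn.functional.softmax(x, axis=axis, dtype=dtype)


x = paddle.rand([2, 3])
print(softmax_wrapper(x).sum(axis=-1))  # each row sums to ~1.0
```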
+ def test_0_size(): + input = paddle.to_tensor([], dtype='int64') + paddle.static.nn.sparse_embedding( + input, + [2097152, 2097152, 2097152, 2097152], + padding_idx=2097152, + ) + + self.assertRaises(ValueError, test_0_size) + + +if __name__ == '__main__': + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_spectral_norm_op.py b/python/paddle/fluid/tests/unittests/test_spectral_norm_op.py index 033ee7908866d7468e773e68befe519f560cd68b..c60780f90c49bcf0569eb5802837c7b9589be247 100644 --- a/python/paddle/fluid/tests/unittests/test_spectral_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_spectral_norm_op.py @@ -15,9 +15,10 @@ import unittest import numpy as np -from op_test import OpTest, skip_check_grad_ci +from eager_op_test import OpTest, skip_check_grad_ci import paddle +from paddle import _C_ops from paddle.fluid.framework import Program, program_guard paddle.enable_static() @@ -47,6 +48,10 @@ def spectral_norm(weight, u, v, dim, power_iters, eps): return weight / sigma +def spectral_norm_wrapper(weight, u, v, dim, power_iters, eps): + return _C_ops.spectral_norm(weight, u, v, dim, power_iters, eps) + + @skip_check_grad_ci( reason="Spectral norm do not check grad when power_iters > 0 " "because grad is not calculated in power iterations, " @@ -56,6 +61,7 @@ class TestSpectralNormOpNoGrad(OpTest): def setUp(self): self.initTestCase() self.op_type = 'spectral_norm' + self.python_api = spectral_norm_wrapper weight = np.random.random(self.weight_shape).astype('float64') u = np.random.normal(0.0, 1.0, self.u_shape).astype('float64') v = np.random.normal(0.0, 1.0, self.v_shape).astype('float64') diff --git a/python/paddle/fluid/tests/unittests/test_split_op.py b/python/paddle/fluid/tests/unittests/test_split_op.py index 40e7bff55e0bcc85f97c4f658bf91433ed083e07..d250302165bcbdc936a6e87627586c72f4eae3f5 100644 --- a/python/paddle/fluid/tests/unittests/test_split_op.py +++ b/python/paddle/fluid/tests/unittests/test_split_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest, convert_float_to_uint16 +from eager_op_test import OpTest, convert_float_to_uint16 import paddle import paddle.fluid as fluid @@ -24,6 +24,8 @@ from paddle.fluid import Program, core, program_guard class TestSplitOp(OpTest): def setUp(self): + self.python_api = paddle.split + self.python_out_sig = ['out0', 'out1', 'out2'] self._set_op_type() self.dtype = self.get_dtype() axis = 1 @@ -62,6 +64,8 @@ class TestSplitOp(OpTest): # test with attr(num) class TestSplitOp_2(OpTest): def setUp(self): + self.python_api = paddle.split + self.python_out_sig = ['out0', 'out1', 'out2'] self._set_op_type() self.dtype = self.get_dtype() self.init_data() @@ -98,6 +102,8 @@ class TestSplitOp_2(OpTest): # attr(axis) is Tensor class TestSplitOp_AxisTensor(OpTest): def setUp(self): + self.python_api = paddle.split + self.python_out_sig = ['out0', 'out1', 'out2'] self._set_op_type() self.dtype = self.get_dtype() self.init_data() @@ -133,6 +139,8 @@ class TestSplitOp_AxisTensor(OpTest): # attr(sections) is list containing Tensor class TestSplitOp_SectionsTensor(OpTest): def setUp(self): + self.python_api = paddle.split + self.python_out_sig = ['out0', 'out1', 'out2'] self._set_op_type() self.dtype = self.get_dtype() self.init_data() @@ -178,6 +186,8 @@ class TestSplitOp_SectionsTensor(OpTest): class TestSplitOp_unk_section(OpTest): def setUp(self): + self.python_api = paddle.split + self.python_out_sig = ['out0', 'out1', 'out2'] self._set_op_type() self.dtype = 
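The split hunks set python_out_sig because the op has several named outputs while paddle.split returns a list. The snippet below spells out the correspondence and the equivalent dygraph call; the shapes are illustrative.

```python
# For multi-output ops such as split, python_out_sig names the op outputs so
# the list returned by the Python API can be matched one-to-one:
#   self.python_api = paddle.split
#   self.python_out_sig = ['out0', 'out1', 'out2']
# The dygraph call the checker effectively performs:
import numpy as np
import paddle

paddle.disable_static()
x = paddle.to_tensor(np.random.random((4, 6, 5)).astype('float64'))
out0, out1, out2 = paddle.split(x, num_or_sections=[2, 1, 3], axis=1)
assert out0.shape == [4, 2, 5]
assert out1.shape == [4, 1, 5]
assert out2.shape == [4, 3, 5]
```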
self.get_dtype() self.init_data() diff --git a/python/paddle/fluid/tests/unittests/test_static_save_load.py b/python/paddle/fluid/tests/unittests/test_static_save_load.py index d043e3785c4984fd3a1fdfd70a40109dc8bb58ca..0ac2644d90a112f2937b4167676df90e20b153e3 100644 --- a/python/paddle/fluid/tests/unittests/test_static_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_static_save_load.py @@ -59,26 +59,26 @@ class SimpleLSTMRNN(fluid.Layer): for i in range(self._num_layers): weight_1 = self.create_parameter( attr=fluid.ParamAttr( - initializer=fluid.initializer.UniformInitializer( + initializer=paddle.nn.initializer.Uniform( low=-self._init_scale, high=self._init_scale ) ), shape=[self._hidden_size * 2, self._hidden_size * 4], dtype="float32", - default_initializer=fluid.initializer.UniformInitializer( + default_initializer=paddle.nn.initializer.Uniform( low=-self._init_scale, high=self._init_scale ), ) self.weight_1_arr.append(self.add_parameter('w_%d' % i, weight_1)) bias_1 = self.create_parameter( attr=fluid.ParamAttr( - initializer=fluid.initializer.UniformInitializer( + initializer=paddle.nn.initializer.Uniform( low=-self._init_scale, high=self._init_scale ) ), shape=[self._hidden_size * 4], dtype="float32", - default_initializer=fluid.initializer.Constant(0.0), + default_initializer=paddle.nn.initializer.Constant(0.0), ) self.bias_arr.append(self.add_parameter('b_%d' % i, bias_1)) @@ -184,7 +184,7 @@ class PtbModel(fluid.Layer): embedding_dim=hidden_size, weight_attr=fluid.ParamAttr( name='embedding_para', - initializer=fluid.initializer.UniformInitializer( + initializer=paddle.nn.initializer.Uniform( low=-init_scale, high=init_scale ), ), @@ -193,7 +193,7 @@ class PtbModel(fluid.Layer): attr=fluid.ParamAttr(), shape=[self.hidden_size, self.vocab_size], dtype="float32", - default_initializer=fluid.initializer.UniformInitializer( + default_initializer=paddle.nn.initializer.Uniform( low=-self.init_scale, high=self.init_scale ), ) @@ -201,7 +201,7 @@ class PtbModel(fluid.Layer): attr=fluid.ParamAttr(), shape=[self.vocab_size], dtype="float32", - default_initializer=fluid.initializer.UniformInitializer( + default_initializer=paddle.nn.initializer.Uniform( low=-self.init_scale, high=self.init_scale ), ) diff --git a/python/paddle/fluid/tests/unittests/test_sum_op.py b/python/paddle/fluid/tests/unittests/test_sum_op.py index 6e9ff86cb8b7f83ad52c747375dd285ae989b2bc..b712b0bb161f6133f12282a8ee244f579096789e 100644 --- a/python/paddle/fluid/tests/unittests/test_sum_op.py +++ b/python/paddle/fluid/tests/unittests/test_sum_op.py @@ -19,6 +19,11 @@ import unittest import gradient_checker import numpy as np from decorator_helper import prog_scope +from eager_op_test import ( + OpTest, + convert_float_to_uint16, + convert_uint16_to_float, +) import paddle import paddle.fluid as fluid @@ -26,16 +31,19 @@ import paddle.fluid.core as core import paddle.inference as paddle_infer from paddle import enable_static from paddle.fluid.op import Operator -from paddle.fluid.tests.unittests.op_test import ( - OpTest, - convert_float_to_uint16, - convert_uint16_to_float, -) + + +def sum_wrapper(X, use_mkldnn=False): + res = 0 + for x in X: + res += x + return res class TestSumOp(OpTest): def setUp(self): self.op_type = "sum" + self.python_api = sum_wrapper self.init_kernel_type() self.use_mkldnn = False self.init_kernel_type() @@ -341,10 +349,14 @@ class TestSumBF16Op(OpTest): self.dtype = np.uint16 def test_check_output(self): - self.check_output() + # new dynamic graph mode does not support unit16 
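test_sum_op moves to eager_op_test and adds sum_wrapper, which folds the op's list input with plain addition. Standalone sketch:

```python
# Standalone sketch of sum_wrapper: the "sum" op receives a list of tensors,
# so the wrapper simply folds them with +. (The bfloat16 cases here keep
# check_dygraph=False because the uint16 storage dtype is not compared in
# the dygraph path.)
import paddle


def sum_wrapper(X, use_mkldnn=False):
    res = 0
    for x in X:
        res = res + x
    return res


a = paddle.ones([2, 2])
b = paddle.full([2, 2], 2.0)
print(sum_wrapper([a, b]))  # every element equals 3.0
```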
type + self.check_output(check_dygraph=False) def test_check_grad(self): - self.check_grad(['x0'], 'Out', numeric_grad_delta=0.5) + # new dynamic graph mode does not support unit16 type + self.check_grad( + ['x0'], 'Out', numeric_grad_delta=0.5, check_dygraph=False + ) class API_Test_Add_n(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_svd_op.py b/python/paddle/fluid/tests/unittests/test_svd_op.py index cf91162d9d8ca4200681b2efac320c594884a63d..a760fef4ff2528bef558c1a895c6e383b0cf16b4 100644 --- a/python/paddle/fluid/tests/unittests/test_svd_op.py +++ b/python/paddle/fluid/tests/unittests/test_svd_op.py @@ -320,6 +320,16 @@ class TestSvdAPI(unittest.TestCase): ) np.testing.assert_allclose(fetches[0], gt_s, rtol=1e-05) + def test_errors(self): + with paddle.fluid.dygraph.guard(): + # The size of input in svd should not be 0. + def test_0_size(): + array = np.array([], dtype=np.float32) + x = paddle.to_tensor(np.reshape(array, [0, 0]), dtype='float32') + paddle.linalg.svd(x, full_matrices=False) + + self.assertRaises(ValueError, test_0_size) + if __name__ == "__main__": paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_take_along_axis_op.py b/python/paddle/fluid/tests/unittests/test_take_along_axis_op.py index da3fa64417fe609f51c59790f962abd8100faccd..7abd86d19f676ae7abab5e7cbc5dbaa6051572ef 100644 --- a/python/paddle/fluid/tests/unittests/test_take_along_axis_op.py +++ b/python/paddle/fluid/tests/unittests/test_take_along_axis_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest +from eager_op_test import OpTest import paddle from paddle.framework import core @@ -27,6 +27,7 @@ class TestTakeAlongAxisOp(OpTest): def setUp(self): self.init_data() self.op_type = "take_along_axis" + self.python_api = paddle.tensor.take_along_axis self.xnp = np.random.random(self.x_shape).astype(self.x_type) self.target = np.take_along_axis(self.xnp, self.index, self.axis) broadcast_shape_list = list(self.x_shape) diff --git a/python/paddle/fluid/tests/unittests/test_tdm_child_op.py b/python/paddle/fluid/tests/unittests/test_tdm_child_op.py index 2481a48f01793cdeb8ef30b528589447fe76f6fd..eaa139714660f98119303a43803a7117f5bd6efe 100644 --- a/python/paddle/fluid/tests/unittests/test_tdm_child_op.py +++ b/python/paddle/fluid/tests/unittests/test_tdm_child_op.py @@ -151,9 +151,7 @@ class TestTDMChildShape(unittest.TestCase): node_nums=26, child_nums=2, param_attr=fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer( - tree_info_np - ) + initializer=paddle.nn.initializer.Assign(tree_info_np) ), ) diff --git a/python/paddle/fluid/tests/unittests/test_tdm_sampler_op.py b/python/paddle/fluid/tests/unittests/test_tdm_sampler_op.py index 217d84b4b9f8afc2d9a52adec74f50ab3e07e6d5..c54c6c0c9de028d03eccc08f6725d1bfa8ede2d4 100644 --- a/python/paddle/fluid/tests/unittests/test_tdm_sampler_op.py +++ b/python/paddle/fluid/tests/unittests/test_tdm_sampler_op.py @@ -290,12 +290,10 @@ class TestTDMSamplerShape(unittest.TestCase): layer_node_num_list, leaf_node_num, tree_travel_attr=fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer( - travel_array - ) + initializer=paddle.nn.initializer.Assign(travel_array) ), tree_layer_attr=fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer(layer_array) + initializer=paddle.nn.initializer.Assign(layer_array) ), output_positive=True, output_list=True, diff --git a/python/paddle/fluid/tests/unittests/test_unbind_op.py 
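test_svd_op joins the recurring "0-size input must raise ValueError" theme seen earlier for norm and normalize. A dygraph repro of that guard:

```python
# Repro of the recurring 0-size guard (here for svd, matching the test_0_size
# helper above): a [0, 0] input is expected to raise ValueError after this PR.
import numpy as np
import paddle

paddle.disable_static()
x = paddle.to_tensor(np.zeros([0, 0], dtype=np.float32))
try:
    paddle.linalg.svd(x, full_matrices=False)
except ValueError:
    pass  # expected: empty inputs are rejected
```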
b/python/paddle/fluid/tests/unittests/test_unbind_op.py index 6ec82a96bc16534572d59eb81c690839681be557..8cafc1b5a8e1b7d65d3e7c57f2964de7958dbf59 100644 --- a/python/paddle/fluid/tests/unittests/test_unbind_op.py +++ b/python/paddle/fluid/tests/unittests/test_unbind_op.py @@ -25,6 +25,7 @@ from paddle.fluid import Program, program_guard class TestUnbind(unittest.TestCase): def test_unbind(self): + paddle.enable_static() x_1 = fluid.data(shape=[2, 3], dtype='float32', name='x_1') [out_0, out_1] = tensor.unbind(input=x_1, axis=0) @@ -59,6 +60,7 @@ class TestUnbind(unittest.TestCase): class TestLayersUnbind(unittest.TestCase): def test_layers_unbind(self): + paddle.enable_static() x_1 = fluid.data(shape=[2, 3], dtype='float32', name='x_1') [out_0, out_1] = paddle.unbind(input=x_1, axis=0) @@ -214,6 +216,11 @@ class TestUnbindAxisError(unittest.TestCase): self.assertRaises(TypeError, test_table_Variable) + def test_invalid_axis(): + tensor.unbind(input=x, axis=2) + + self.assertRaises(ValueError, test_invalid_axis) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py index c31d763dbff7c50d085d8d73bc93ab7ec8e4e089..407d70b4dadf3aa7effbabb99e769b904db9f957 100644 --- a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py @@ -286,7 +286,7 @@ class TestUniformRandomOpApi(unittest.TestCase): y = paddle.static.nn.fc( x, size=16, - weight_attr=fluid.initializer.Uniform( + weight_attr=paddle.nn.initializer.UniformInitializer( low=-0.5, high=0.5, seed=10, diff --git a/python/paddle/fluid/tests/unittests/test_unique_consecutive_op.py b/python/paddle/fluid/tests/unittests/test_unique_consecutive_op.py index 86872aff9c7da308ee1d2a1f08962d63a4999ba8..1cf7714844ce80cb634f1a03c831049707f9be27 100644 --- a/python/paddle/fluid/tests/unittests/test_unique_consecutive_op.py +++ b/python/paddle/fluid/tests/unittests/test_unique_consecutive_op.py @@ -22,7 +22,9 @@ import paddle.fluid as fluid import paddle.fluid.core as core -def reference_unique_consecutive(X, return_inverse=False, return_counts=False): +def reference_unique_consecutive( + X, return_inverse=False, return_counts=False, axis=None +): """ Reference unique_consecutive implementation using python. Args: @@ -32,12 +34,14 @@ def reference_unique_consecutive(X, return_inverse=False, return_counts=False): return_counts(bool, optional): If True, also return the counts for each unique consecutive element. 
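The reference_unique_consecutive hunk that starts above (and continues below) guards the empty-input case, which previously wrote to inverse_vec[0] on an empty list. As a reading aid, a condensed re-implementation with that case covered:

```python
# Condensed re-implementation of the guarded reference (my own version, not
# the PR's code): fold runs of equal neighbours while tracking the inverse
# mapping and run counts; an empty input must yield three empty lists instead
# of indexing into inverse_vec[0].
def unique_consecutive_ref(xs):
    xs = list(xs)
    out, inverse, counts = [], [], []
    for x in xs:
        if out and out[-1] == x:
            counts[-1] += 1
        else:
            out.append(x)
            counts.append(1)
        inverse.append(len(out) - 1)
    return out, inverse, counts


assert unique_consecutive_ref([]) == ([], [], [])
assert unique_consecutive_ref([1, 1, 2, 2, 2, 3]) == (
    [1, 2, 3],
    [0, 0, 1, 1, 1, 2],
    [2, 3, 1],
)
```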
""" X = list(X) + is_empty = len(X) == 0 counts_vec = [1] * len(X) i = 0 counts = 1 last = 0 inverse_vec = [0] * len(X) - inverse_vec[last] = i + if not is_empty: + inverse_vec[last] = i cnt = 0 while i < len(X) - 1: if X[i] == X[i + 1]: @@ -271,6 +275,81 @@ class TestUniqueConsecutiveCase2API(unittest.TestCase): ) +class TestUniqueConsecutiveCase3API(unittest.TestCase): + def setUp(self): + self.places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(fluid.CUDAPlace(0)) + + def check_static_result(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + paddle.enable_static() + input_x = fluid.data( + name="input_x", + shape=[ + 100, + ], + dtype="float32", + ) + result, inverse, counts = paddle.unique_consecutive( + input_x, return_inverse=True, return_counts=True, axis=-1 + ) + x_np = np.random.randint(20, size=100).astype("float32") + exe = fluid.Executor(place) + fetches = exe.run( + fluid.default_main_program(), + feed={"input_x": x_np}, + fetch_list=[result], + ) + + def test_static(self): + for place in self.places: + self.check_static_result(place=place) + + def test_dygraph(self): + for place in self.places: + with fluid.dygraph.guard(place): + input_x = np.random.randint(20, size=100).astype("float64") + x = paddle.to_tensor(input_x) + result, inverse, counts = paddle.unique_consecutive( + x, return_inverse=True, return_counts=True, axis=-1 + ) + + +class TestUniqueConsecutiveEmptyInput(OpTest): + """empty input""" + + def config(self): + self.return_inverse = True + self.return_counts = True + self.python_api = paddle.unique_consecutive + + def init_kernel_type(self): + self.dtype = "float32" if core.is_compiled_with_rocm() else "float64" + + def setUp(self): + self.init_kernel_type() + self.config() + self.op_type = "unique_consecutive" + x = np.array([]).astype(self.dtype) + result = reference_unique_consecutive( + x, self.return_inverse, self.return_counts + ) + out = reference_unique_consecutive(x) + out = np.array(out).astype(self.dtype) + self.inputs = { + 'X': x, + } + self.python_out_sig = ["Out"] + self.attrs = {'dtype': int(core.VarDesc.VarType.INT32)} + self.outputs = { + 'Out': out, + } + + def test_check_output(self): + self.check_output(check_eager=True) + + if __name__ == "__main__": paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_unpool3d_op.py b/python/paddle/fluid/tests/unittests/test_unpool3d_op.py index abbabb43a51850306c4242b7700f38a68718d7be..aa4da0b7c107857eabc4c296074fd86d194a5970 100644 --- a/python/paddle/fluid/tests/unittests/test_unpool3d_op.py +++ b/python/paddle/fluid/tests/unittests/test_unpool3d_op.py @@ -195,6 +195,20 @@ class TestUnpool3DOpException(unittest.TestCase): ).astype("int32") F.max_unpool3d(data, indices, kernel_size=2, stride=2) + def x_rank_error(): + data = paddle.rand(shape=[1, 1, 3, 3]) + indices = paddle.reshape( + paddle.arange(0, 27), shape=[1, 1, 3, 3, 3] + ).astype("int32") + F.max_unpool3d(data, indices, kernel_size=2, stride=2) + + def indices_rank_error(): + data = paddle.rand(shape=[1, 1, 3, 3, 3]) + indices = paddle.reshape( + paddle.arange(0, 27), shape=[1, 3, 3, 3] + ).astype("int32") + F.max_unpool3d(data, indices, kernel_size=2, stride=2) + def indices_value_error(): data = paddle.rand(shape=[1, 1, 3, 3, 3]) indices = paddle.reshape( @@ -238,6 +252,16 @@ class TestUnpool3DOpException(unittest.TestCase): r"The dimensions of Input\(X\) must equal to", indices_size_error, ) + self.assertRaisesRegex( + ValueError, + r"The x 
should have \[N, C, D, H, W\] format", + x_rank_error, + ) + self.assertRaisesRegex( + ValueError, + r"The indices should have \[N, C, D, H, W\] format", + indices_rank_error, + ) if not core.is_compiled_with_cuda(): self.assertRaisesRegex( ValueError, diff --git a/python/paddle/fluid/tests/unittests/test_unpool_op.py b/python/paddle/fluid/tests/unittests/test_unpool_op.py index 4963f93c67e77852164d0b299eafc6dddbee16e6..e5eefc067e89206f518de39b5a4cce43c2c991ab 100644 --- a/python/paddle/fluid/tests/unittests/test_unpool_op.py +++ b/python/paddle/fluid/tests/unittests/test_unpool_op.py @@ -193,6 +193,20 @@ class TestUnpoolOpException(unittest.TestCase): ).astype("int32") F.max_unpool2d(data, indices, kernel_size=2, stride=2) + def x_rank_error(): + data = paddle.rand(shape=[1, 1, 3]) + indices = paddle.reshape( + paddle.arange(0, 9), shape=[1, 1, 3, 3] + ).astype("int32") + F.max_unpool2d(data, indices, kernel_size=2, stride=2) + + def indices_rank_error(): + data = paddle.rand(shape=[1, 1, 3, 3]) + indices = paddle.reshape( + paddle.arange(0, 9), shape=[1, 3, 3] + ).astype("int32") + F.max_unpool2d(data, indices, kernel_size=2, stride=2) + def indices_value_error(): data = paddle.rand(shape=[1, 1, 3, 3]) indices = paddle.reshape( @@ -232,6 +246,16 @@ class TestUnpoolOpException(unittest.TestCase): r"The dimensions of Input\(X\) must equal to", indices_size_error, ) + self.assertRaisesRegex( + ValueError, + r"The x should have \[N, C, H, W\] format", + x_rank_error, + ) + self.assertRaisesRegex( + ValueError, + r"The indices should have \[N, C, H, W\] format", + indices_rank_error, + ) if not core.is_compiled_with_cuda(): self.assertRaisesRegex( ValueError, diff --git a/python/paddle/fluid/tests/unittests/test_weight_normalization.py b/python/paddle/fluid/tests/unittests/test_weight_normalization.py index 17a05bdb01caab7e930ad37e8c703e0ea4f557fe..f649fe1a28152872519f62cabe910379111a5093 100644 --- a/python/paddle/fluid/tests/unittests/test_weight_normalization.py +++ b/python/paddle/fluid/tests/unittests/test_weight_normalization.py @@ -20,7 +20,6 @@ import numpy as np import paddle import paddle.fluid as fluid import paddle.fluid.core as core -from paddle.fluid.initializer import ConstantInitializer from paddle.fluid.param_attr import WeightNormParamAttr @@ -44,7 +43,7 @@ class TestWeightNormalization(unittest.TestCase): weight_attr=WeightNormParamAttr( dim=None, name='weight_norm_param', - initializer=ConstantInitializer(1.0), + initializer=paddle.nn.initializer.Constant(1.0), ), bias_attr=False, activation=None, diff --git a/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py b/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py index 2d07ab31334df753c246d698536a276ab8a0726b..9f005c943310408527f6361a6e8a5d2cff073d74 100644 --- a/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py @@ -84,6 +84,9 @@ unary_api_list = [ paddle.poisson, paddle.bernoulli, paddle.median, + paddle.nn.functional.softmax, + paddle.nn.functional.log_softmax, + paddle.nn.functional.gumbel_softmax, ] inplace_api_list = [ @@ -189,6 +192,8 @@ reduce_api_list = [ paddle.logsumexp, paddle.all, paddle.any, + paddle.argmax, + paddle.argmin, ] @@ -208,12 +213,13 @@ class TestReduceAPI(unittest.TestCase): out.retain_grads() out.backward() - out_empty_list = api(x, []) - self.assertEqual(out_empty_list, out) - self.assertEqual(x.shape, []) self.assertEqual(out.shape, []) - np.testing.assert_allclose(out.numpy(), x.numpy()) + if api not 
in [paddle.argmax, paddle.argmin]: + np.testing.assert_allclose(out.numpy(), x.numpy()) + out_empty_list = api(x, []) + self.assertEqual(out_empty_list, out) + if x.grad is not None: self.assertEqual(x.grad.shape, []) self.assertEqual(out.grad.shape, []) @@ -250,7 +256,9 @@ class TestReduceAPI(unittest.TestCase): res = exe.run(main_prog, fetch_list=fetch_list) self.assertEqual(res[0].shape, ()) self.assertEqual(res[1].shape, ()) - np.testing.assert_allclose(res[0], res[1]) + if api not in [paddle.argmax, paddle.argmin]: + np.testing.assert_allclose(res[0], res[1]) + if len(res) > 2: self.assertEqual(res[2].shape, ()) self.assertEqual(res[3].shape, ()) @@ -1496,6 +1504,26 @@ class TestSundryAPI(unittest.TestCase): self.assertEqual(out.grad.shape, []) self.assertEqual(x.grad.shape, []) + def test_prelu(self): + x = paddle.full([], 1.0, 'float32') + x.stop_gradient = False + + w1 = paddle.to_tensor([0.25], dtype='float32') + out1 = paddle.nn.functional.prelu(x, w1) + out1.retain_grads() + out1.backward() + self.assertEqual(out1.shape, []) + self.assertEqual(out1.grad.shape, []) + self.assertEqual(x.grad.shape, []) + + w2 = paddle.full([], 0.25, dtype='float32') + out2 = paddle.nn.functional.prelu(x, w2) + out2.retain_grads() + out2.backward() + self.assertEqual(out2.shape, []) + self.assertEqual(out2.grad.shape, []) + self.assertEqual(x.grad.shape, []) + class TestSundryAPIStatic(unittest.TestCase): def setUp(self): @@ -2398,6 +2426,38 @@ class TestSundryAPIStatic(unittest.TestCase): res = self.exe.run(prog, feed={"x": x_tensor}, fetch_list=[out]) self.assertEqual(res[0].shape, (3, 4, 2)) + def test_prelu(self): + x1 = paddle.full([], 1.0, 'float32') + x1.stop_gradient = False + w1 = paddle.to_tensor([0.25], dtype='float32') + out1 = paddle.nn.functional.prelu(x1, w1) + paddle.static.append_backward(out1.sum()) + + x2 = paddle.full([], 1.0, 'float32') + x2.stop_gradient = False + w2 = paddle.full([], 0.25, dtype='float32') + out2 = paddle.nn.functional.prelu(x2, w2) + paddle.static.append_backward(out2.sum()) + + prog = paddle.static.default_main_program() + res = self.exe.run( + prog, + fetch_list=[ + out1, + out2, + x1.grad_name, + x2.grad_name, + out1.grad_name, + out2.grad_name, + ], + ) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[2].shape, ()) + self.assertEqual(res[3].shape, ()) + self.assertEqual(res[4].shape, ()) + self.assertEqual(res[5].shape, ()) + # Use to test API whose zero-dim input tensors don't have grad and not need to test backward in OpTest. 
class TestNoBackwardAPI(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/transformer_model.py b/python/paddle/fluid/tests/unittests/transformer_model.py index 42436b6e242b445bfceee93996c03dac7f13233b..d847ac9ee443346fa9538d25ef001b236078d94d 100644 --- a/python/paddle/fluid/tests/unittests/transformer_model.py +++ b/python/paddle/fluid/tests/unittests/transformer_model.py @@ -76,8 +76,8 @@ def multi_head_attention( q = paddle.static.nn.fc( x=queries, size=d_key * n_head, - weight_attr=fluid.initializer.Xavier( - uniform=False, fan_in=d_model * d_key, fan_out=n_head * d_key + weight_attr=paddle.nn.initializer.XavierNormal( + fan_in=d_model * d_key, fan_out=n_head * d_key ), bias_attr=False, num_flatten_dims=2, @@ -85,8 +85,8 @@ def multi_head_attention( k = paddle.static.nn.fc( x=keys, size=d_key * n_head, - weight_attr=fluid.initializer.Xavier( - uniform=False, fan_in=d_model * d_key, fan_out=n_head * d_key + weight_attr=paddle.nn.initializer.XavierNormal( + fan_in=d_model * d_key, fan_out=n_head * d_key ), bias_attr=False, num_flatten_dims=2, @@ -94,8 +94,7 @@ def multi_head_attention( v = paddle.static.nn.fc( x=values, size=d_value * n_head, - weight_attr=fluid.initializer.Xavier( - uniform=False, + weight_attr=paddle.nn.initializer.XavierNormal( fan_in=d_model * d_value, fan_out=n_head * d_value, ), @@ -187,7 +186,7 @@ def multi_head_attention( proj_out = paddle.static.nn.fc( x=out, size=d_model, - weight_attr=fluid.initializer.Xavier(uniform=False), + weight_attr=paddle.nn.initializer.XavierNormal(), bias_attr=False, num_flatten_dims=2, ) @@ -204,7 +203,7 @@ def positionwise_feed_forward(x, d_inner_hid, d_hid): x, size=d_inner_hid, num_flatten_dims=2, - weight_attr=fluid.initializer.Uniform( + weight_attr=paddle.nn.initializer.Uniform( low=-(d_hid**-0.5), high=(d_hid**-0.5) ), activation="relu", @@ -213,7 +212,7 @@ def positionwise_feed_forward(x, d_inner_hid, d_hid): x=hidden, size=d_hid, num_flatten_dims=2, - weight_attr=fluid.initializer.Uniform( + weight_attr=paddle.nn.initializer.Uniform( low=-(d_inner_hid**-0.5), high=(d_inner_hid**-0.5) ), ) @@ -235,8 +234,8 @@ def pre_post_process_layer(prev_out, out, process_cmd, dropout=0.0): out = paddle.static.nn.layer_norm( out, begin_norm_axis=len(out.shape) - 1, - param_attr=fluid.initializer.Constant(1.0), - bias_attr=fluid.initializer.Constant(0.0), + param_attr=paddle.nn.initializer.Constant(1.0), + bias_attr=paddle.nn.initializer.Constant(0.0), ) elif cmd == "d": # add dropout if dropout: @@ -269,7 +268,7 @@ def prepare_encoder( src_word, size=[src_vocab_size, src_emb_dim], padding_idx=src_pad_idx, - param_attr=fluid.initializer.Normal(0.0, 1.0), + param_attr=paddle.nn.initializer.Normal(0.0, 1.0), ) src_pos_enc = layers.embedding( src_pos, @@ -587,7 +586,7 @@ def transformer( x=paddle.static.nn.fc( x=dec_output, size=trg_vocab_size, - weight_attr=fluid.initializer.Xavier(uniform=False), + weight_attr=paddle.nn.initializer.XavierNormal(), bias_attr=False, num_flatten_dims=2, ), diff --git a/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py index d3909193cd6cee67547c5abff8f59d96922668d0..3ee0469b6145d193cd8cf427c69574fc14947fdf 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py @@ -369,7 +369,7 @@ class XPUTestBatchNormOp(XPUOpTestWrapper): net1 = paddle.nn.BatchNorm( 6, param_attr=fluid.ParamAttr( - 
initializer=fluid.initializer.Constant(1.0) + initializer=paddle.nn.initializer.Constant(1.0) ), use_global_stats=self.use_global_stats, trainable_statistics=self.trainable_statistics, diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fused_resnet_basic_block_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_fused_resnet_basic_block_op_xpu.py index 3518083d75678911e56cab2ec93593de09544704..1764400403f268c42fe05f6f9c0cd1c2fdfdef22 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_fused_resnet_basic_block_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_fused_resnet_basic_block_op_xpu.py @@ -73,34 +73,34 @@ class XPUTestResNetBasicBlockOp(XPUOpTestWrapper): paddle.disable_static() conv1_weight = fluid.ParamAttr( - initializer=fluid.initializer.Xavier(uniform=False), + initializer=paddle.nn.initializer.XavierNormal(), learning_rate=0.001, ) conv2_weight = fluid.ParamAttr( - initializer=fluid.initializer.Xavier(uniform=False), + initializer=paddle.nn.initializer.XavierNormal(), learning_rate=0.001, ) conv3_weight = fluid.ParamAttr( - initializer=fluid.initializer.Xavier(uniform=False), + initializer=paddle.nn.initializer.XavierNormal(), learning_rate=0.001, ) bn1_weight = fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.0) + initializer=paddle.nn.initializer.Constant(value=1.0) ) bn1_bias = fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.0) + initializer=paddle.nn.initializer.Constant(value=0.0) ) bn2_weight = fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.0) + initializer=paddle.nn.initializer.Constant(value=1.0) ) bn2_bias = fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.0) + initializer=paddle.nn.initializer.Constant(value=0.0) ) bn3_weight = fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.0) + initializer=paddle.nn.initializer.Constant(value=1.0) ) bn3_bias = fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.0) + initializer=paddle.nn.initializer.Constant(value=0.0) ) self.conv1 = nn.Conv2D( @@ -173,34 +173,34 @@ class XPUTestResNetBasicBlockOp(XPUOpTestWrapper): paddle.disable_static() fused_conv1_weight = fluid.ParamAttr( - initializer=fluid.initializer.Xavier(uniform=False), + initializer=paddle.nn.initializer.XavierNormal(), learning_rate=0.001, ) fused_conv2_weight = fluid.ParamAttr( - initializer=fluid.initializer.Xavier(uniform=False), + initializer=paddle.nn.initializer.XavierNormal(), learning_rate=0.001, ) fused_conv3_weight = fluid.ParamAttr( - initializer=fluid.initializer.Xavier(uniform=False), + initializer=paddle.nn.initializer.XavierNormal(), learning_rate=0.001, ) fused_bn1_weight = fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.0) + initializer=paddle.nn.initializer.Constant(value=1.0) ) fused_bn1_bias = fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.0) + initializer=paddle.nn.initializer.Constant(value=0.0) ) fused_bn2_weight = fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.0) + initializer=paddle.nn.initializer.Constant(value=1.0) ) fused_bn2_bias = fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.0) + initializer=paddle.nn.initializer.Constant(value=0.0) ) fused_bn3_weight = fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.0) + initializer=paddle.nn.initializer.Constant(value=1.0) ) fused_bn3_bias = fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.0) + initializer=paddle.nn.initializer.Constant(value=0.0) ) if 
self.has_shortcut: diff --git a/python/paddle/fluid/tests/unittests/xpu/test_grid_sampler_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_grid_sampler_op_xpu.py index 3b2deaf4396bbc95e740674c88e9c4532ad567d9..c92ddc9531b2121814830e33db0772645608b494 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_grid_sampler_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_grid_sampler_op_xpu.py @@ -170,6 +170,7 @@ class XPUTestGridSamplerOP(XPUOpTestWrapper): self.place = paddle.XPUPlace(0) self.init_dtype() self.op_type = 'grid_sampler' + self.epsilon_xpu2xpu = 0.000001 self.use_cudnn = False self.align_corners = True diff --git a/python/paddle/fluid/tests/unittests/xpu/test_prelu_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_prelu_op_xpu.py index 2f699ca3c026d69795b070afa191f08c8819b0b6..666c29f7fcaa8d521968b0fe092878669af0bd14 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_prelu_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_prelu_op_xpu.py @@ -163,7 +163,7 @@ def prelu_t(x, mode, param_attr=None, name=None, data_format='NCHW'): shape=alpha_shape, dtype='float32', is_bias=False, - default_initializer=fluid.initializer.ConstantInitializer(0.25), + default_initializer=paddle.nn.initializer.Constant(0.25), ) out = helper.create_variable_for_type_inference(dtype) helper.append_op( diff --git a/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py index f6f64aefe9db7cc936d06a2616b3c48171b36a3d..35e98e3cdaa75c60cb99a230492c6088d4d06dca 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py @@ -132,6 +132,8 @@ reduce_api_list = [ paddle.logsumexp, paddle.all, paddle.any, + paddle.argmax, + paddle.argmin, ] @@ -153,7 +155,8 @@ class TestReduceAPI(unittest.TestCase): self.assertEqual(x.shape, []) self.assertEqual(out.shape, []) - np.testing.assert_allclose(out.numpy(), x.numpy()) + if api not in [paddle.argmax, paddle.argmin]: + np.testing.assert_allclose(out.numpy(), x.numpy()) if x.grad is not None: self.assertEqual(x.grad.shape, []) self.assertEqual(out.grad.shape, []) diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 29901363dbeef809eeecc2eb135244117aafbde0..32486a8dadd2b0f8bcd9ded8f42a08faf6258df6 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -39,7 +39,7 @@ import logging import numpy as np from .ps_dispatcher import RoundRobin, PSDispatcher -from .. import core, framework, unique_name, initializer +from .. 
import core, framework, unique_name from ..framework import ( Program, default_main_program, @@ -2856,7 +2856,7 @@ WIKI: https://github.com/PaddlePaddle/Fleet/blob/develop/markdown_doc/transpiler dtype=var.dtype, shape=var.shape, persistable=var.persistable, - initializer=initializer.Constant(1), + initializer=paddle.nn.initializer.Constant(1), ) op_role_attr_name = ( core.op_proto_and_checker_maker.kOpRoleAttrName() diff --git a/python/paddle/incubate/asp/asp.py b/python/paddle/incubate/asp/asp.py index df1c81bffe83507e3442fae89fd952538f397b07..7bf04dc151c7fe5b40226f62e77a46898f2d4937 100644 --- a/python/paddle/incubate/asp/asp.py +++ b/python/paddle/incubate/asp/asp.py @@ -24,7 +24,6 @@ import numpy as np import paddle from paddle.fluid import core, global_scope, program_guard from paddle.fluid.framework import dygraph_only -from paddle.fluid.initializer import ConstantInitializer from paddle.incubate import asp from .supported_layer_list import ( @@ -882,7 +881,9 @@ class ASPHelper: name=ASPHelper._get_mask_name(param.name), shape=param.shape, dtype=param.dtype, - default_initializer=ConstantInitializer(value=1.0), + default_initializer=paddle.nn.initializer.Constant( + value=1.0 + ), ) mask_param.stop_gradient = True mask_param.trainable = False diff --git a/python/paddle/incubate/nn/functional/fused_transformer.py b/python/paddle/incubate/nn/functional/fused_transformer.py index 01d2161b22342d9c1f2fd70c814f0982fce74b4a..19ec0ad2458d77642d11dffad8de8468b77b7a70 100644 --- a/python/paddle/incubate/nn/functional/fused_transformer.py +++ b/python/paddle/incubate/nn/functional/fused_transformer.py @@ -615,6 +615,11 @@ def fused_multi_head_attention( 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode ) # semantic transfer + if x.ndim != 3: + raise ValueError( + f"The rank of the x should be 3, but received {x.ndim}." + ) + if _non_static_mode(): if default_main_program().random_seed != 0: seed = default_main_program().random_seed diff --git a/python/paddle/jit/dy2static/partial_program.py b/python/paddle/jit/dy2static/partial_program.py index 19d44b8e35c8ba5d9d6f414eacd4c4a266eac856..fd509d74e532abd82eaf5c0e0ba8c6132b5fef86 100644 --- a/python/paddle/jit/dy2static/partial_program.py +++ b/python/paddle/jit/dy2static/partial_program.py @@ -576,9 +576,7 @@ class PartialProgramLayer: core.check_and_set_prim_all_enabled() backward.gradients(targets=targets, inputs=[]) - start_idx = len(main_program.block(0).ops) + 2 * len( - self._outputs.tolist() - ) + start_idx = len(main_program.block(0).ops) + len(self._outputs.tolist()) self.prepare_gradient_aggregation(start_idx, main_program, program) @@ -753,7 +751,7 @@ class PartialProgramLayer: ): # NOTE(dev): We apply build_strategy for backward firstly to # avoid skipping more gc variables. 
- backward_start_op_index = forward_end_op_index + 2 * len( + backward_start_op_index = forward_end_op_index + len( self._outputs.var_ids ) backward_end_op_index = whole_program.desc.block(0).op_size() diff --git a/python/paddle/jit/dy2static/program_translator.py b/python/paddle/jit/dy2static/program_translator.py index 5a66cd103a7fe70efb455b2840b09e86fed67f53..f32168858b9d3a1120647921ec2ae5242004e527 100644 --- a/python/paddle/jit/dy2static/program_translator.py +++ b/python/paddle/jit/dy2static/program_translator.py @@ -16,6 +16,7 @@ import collections import inspect import textwrap import threading +import warnings import weakref from paddle.fluid import _non_static_mode, core, framework @@ -1077,11 +1078,56 @@ class ParametersRecorder: return id(program) +class FallbackProgramLayer(object): + __slots__ = [ + '_instance', + '_dy_func', + 'training', + '_cuda_graph_capture_mode', + '_cuda_graph_pool_id', + ] + + def __init__(self, instance, dy_func): + self._instance = instance + self._dy_func = dy_func + + def __call__(self, inputs): + return self._dy_func(*inputs) + + def __getattr__(self, key): + if key not in self.__slots__: + raise RuntimeError( + "There raises a exception after applying `@paddle.jit.to_static()` and already switch into fallback mode. \n" + "You can't get attribute for a fallback program layer. Please check `to_static.error` file for detail." + ) + elif key in ['training']: + if self._instance is not None: + return getattr(self._instance, key) + return + + return super().__getattr__(key) + + def __setattr__(self, key, value): + if key not in self.__slots__: + raise RuntimeError( + "There raises a exception after applying `@paddle.jit.to_static()` and already switch into fallback mode. \n" + "You can't get attribute for a fallback program layer. Please check `to_static.error` file for detail." + ) + elif key in ['training']: + if self._instance is not None: + return setattr(self._instance, key, value) + return + + return super().__setattr__(key, value) + + class ProgramCache: """ Wrapper class for the program functions defined by dygraph function. """ + dy2static_error_file = "to_static.error" + def __init__(self): # {hash_id : (concrete_program, partial_layer)} self._caches = collections.OrderedDict() @@ -1092,17 +1138,37 @@ class ProgramCache: def _build_once(self, cache_key): # TODO(Aurelius84): Need a gloabl FLAGS to enable/disable to_prim enable_prim = cache_key.kwargs['build_strategy'].build_cinn_pass + # NOTE(xiongkun): Need a global FLAGS to enable/disable fallback + enable_fallback = enable_prim if enable_prim: # TODO(Jiabin): Change this to True if we need this to be default option core.check_and_set_prim_all_enabled() + try: + concrete_program = ConcreteProgram.from_func_spec( + func_spec=cache_key.function_spec, + input_spec=cache_key.input_args_with_spec, + input_kwargs_spec=cache_key.input_kwargs_with_spec, + class_instance=cache_key.class_instance, + **cache_key.kwargs + ) + except Exception as e: + if enable_fallback: + warnings.warn( + "Exception is thrown while applying @paddle.jit.to_static. It will fallback into dygraph mode for training.\n" + "1. You can check `to_static.error` file in current workspace directory for detail.\n" + "2. In fallback mode, you can only do training, can't call paddle.jit.save(). Please modify model code according `to_static.error` firstly" + ) + # TODO(xiongkun) change different file name to avoid overwrite. 
+ with open(self.dy2static_error_file, "w") as fp: + fp.write(str(e)) - concrete_program = ConcreteProgram.from_func_spec( - func_spec=cache_key.function_spec, - input_spec=cache_key.input_args_with_spec, - input_kwargs_spec=cache_key.input_kwargs_with_spec, - class_instance=cache_key.class_instance, - **cache_key.kwargs - ) + fallback_layer = FallbackProgramLayer( + cache_key.class_instance, + cache_key.function_spec.dygraph_function, + ) + return fallback_layer, fallback_layer + else: + raise concrete_program._to_prim() return concrete_program, partial_program_from(concrete_program) diff --git a/python/paddle/jit/dy2static/utils.py b/python/paddle/jit/dy2static/utils.py index 4397728576ba755a618f706954cda16b45d3f6aa..cad4c29556121a1435684445cbc25cf9631a67f4 100644 --- a/python/paddle/jit/dy2static/utils.py +++ b/python/paddle/jit/dy2static/utils.py @@ -1512,12 +1512,11 @@ def _out_grad_names(program_desc, fwd_end_op_index, out_size): """ names = [] for i in range( - fwd_end_op_index + 1, - min(fwd_end_op_index + 2 * out_size, program_desc.block(0).op_size()), - 2, + fwd_end_op_index, + min(fwd_end_op_index + out_size, program_desc.block(0).op_size()), ): op = program_desc.block(0).op(i) - if op.type() == 'fill_constant': + if op.type() == 'fill_any_like': var_name = op.output('Out')[0] names.append(var_name) return names diff --git a/python/paddle/jit/translated_layer.py b/python/paddle/jit/translated_layer.py index c488c758f4a262fdb4537f8d9a2c26d6248e4bbe..45563584f166d00128858c10c40b56068f2ada3a 100644 --- a/python/paddle/jit/translated_layer.py +++ b/python/paddle/jit/translated_layer.py @@ -373,7 +373,7 @@ class _ProgramHolder: @switch_to_static_graph def _create_backward_train_program(self): whole_program = _build_program_by_desc(self._train_program_desc) - start_op_index = self._infer_program_desc.block(0).op_size() + 2 * len( + start_op_index = self._infer_program_desc.block(0).op_size() + len( self._output_descs ) end_op_index = whole_program.desc.block(0).op_size() diff --git a/python/paddle/nn/decode.py b/python/paddle/nn/decode.py index 4ad72077014f0998c5e6f3a9ae4621f56828942c..4ce504d8f8b6681b195582428494cc723ba5660f 100644 --- a/python/paddle/nn/decode.py +++ b/python/paddle/nn/decode.py @@ -19,8 +19,8 @@ import warnings import numpy as np import paddle +from paddle.common_ops_import import default_main_program from paddle.framework import _non_static_mode -from paddle.static import default_main_program from ..fluid.data_feeder import convert_dtype from ..fluid.layers.utils import flatten, map_structure diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index f4f4bb6b9f891d368a74097cd14061b1cb8a6ea5..ed1bbc0a54addf1103d22f58b536951e933bdba6 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -463,7 +463,7 @@ def prelu(x, weight, data_format="NCHW", name=None): Parameters: x (Tensor): The input Tensor with data type float32, float64. weight (Tensor): The learnable parameter with data type same as ``x``. - The weight shape is [1] or [in], where `in` is the input channel of ``x``. + The weight shape is [], [1] or [in], where `in` is the input channel of ``x``. name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. data_format(str, optional): Data format that specifies the layout of input. It may be "NC", "NCL", "NCHW", "NCDHW", "NLC", "NHWC" or "NDHWC". Default: "NCHW". 
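
The two activation.py hunks around this point relax `prelu`'s weight-shape check from 1-D only to 0-D or 1-D, matching the docstring change just above. A minimal sketch of the intended behaviour, assuming eager (dygraph) mode; the 0.25 slope mirrors the new tests earlier in this patch and is otherwise arbitrary:

    import paddle
    import paddle.nn.functional as F

    x = paddle.full([], -1.0, dtype='float32')          # 0-D input, as in the new tests

    # 1-element weight: the form that was already accepted before this patch.
    w_vec = paddle.to_tensor([0.25], dtype='float32')

    # 0-D weight: newly accepted by the relaxed shape assertion in the next hunk.
    w_scalar = paddle.full([], 0.25, dtype='float32')

    out_vec = F.prelu(x, w_vec)        # negative input scaled by the slope: -1.0 * 0.25
    out_scalar = F.prelu(x, w_scalar)

    # Both weight forms select 'all' mode, so the results should agree.
    assert paddle.allclose(out_vec, out_scalar).item()
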
@@ -495,12 +495,11 @@ def prelu(x, weight, data_format="NCHW", name=None): # [ 6. , 7. , 8. , 9. ]]]] """ assert ( - len(weight.shape) == 1 - ), "The dim count of weight shape should be 1 in prelu()." + len(weight.shape) == 0 or len(weight.shape) == 1 + ), "The dim count of weight shape should be 0 or 1 in prelu()." mode = 'all' - if weight.shape[0] > 1: - + if len(weight.shape) == 1 and weight.shape[0] > 1: true_data_format = [ 'NC', 'NCL', diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index 57a1e0023d4fcb6cc152dbdbde7e8e1211b73488..d8777d2c4779d9c8b106a9131643a9c132d2ddb3 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -16,10 +16,10 @@ import numpy import paddle from paddle import _C_ops, _legacy_C_ops +from paddle.common_ops_import import Variable, default_main_program from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layers.tensor import fill_constant from paddle.framework import core, in_dynamic_mode -from paddle.static import Variable, default_main_program from paddle.tensor.creation import full from ...fluid.data_feeder import ( diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index 74a97e25938ed300620dcb997205985176f74ca8..2ad865c47995fd802000e69759d7c23d50b75d7a 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -22,6 +22,7 @@ from paddle.device import ( from paddle.fluid.framework import _global_flags, in_dygraph_mode from paddle.tensor.math import _add_with_axis +from ...common_ops_import import Variable from ...device import get_cudnn_version from ...fluid.data_feeder import check_dtype, check_variable_and_dtype from ...fluid.layer_helper import LayerHelper @@ -32,7 +33,6 @@ from ...fluid.layers.utils import ( convert_to_list, ) from ...framework import no_grad -from ...static import Variable from ...tensor.manipulation import squeeze, unsqueeze __all__ = [] @@ -1204,6 +1204,12 @@ def conv2d_transpose( x.shape ) ) + if len(weight.shape) != 4: + raise ValueError( + "Input weight should be 4D tensor, but received weight with the shape of {}".format( + weight.shape + ) + ) num_channels = x.shape[channel_dim] if num_channels < 0: raise ValueError( @@ -1678,6 +1684,12 @@ def conv3d_transpose( x.shape ) ) + if len(weight.shape) != 5: + raise ValueError( + "Input weight should be 5D tensor, but received weight with the shape of {}".format( + weight.shape + ) + ) num_channels = x.shape[channel_dim] num_filters = weight.shape[1] if num_channels < 0: diff --git a/python/paddle/nn/functional/extension.py b/python/paddle/nn/functional/extension.py index 3b566b3de304428e14c47abed9e1e3b3e3d6f8e6..533bf138a1a49da2624ae27f160df9dca097f172 100644 --- a/python/paddle/nn/functional/extension.py +++ b/python/paddle/nn/functional/extension.py @@ -18,6 +18,7 @@ import numpy as np from paddle import _C_ops, _legacy_C_ops, in_dynamic_mode +from ...common_ops_import import Variable from ...fluid.data_feeder import ( check_dtype, check_type, @@ -26,7 +27,6 @@ from ...fluid.data_feeder import ( from ...fluid.framework import in_dygraph_mode from ...fluid.layer_helper import LayerHelper from ...framework import convert_np_dtype_to_dtype_, core -from ...static import Variable from ...tensor.creation import assign from ...tensor.layer_function_generator import templatedoc diff --git a/python/paddle/nn/functional/input.py b/python/paddle/nn/functional/input.py index 
8964b69df2a71a555d63b13ead3646a22a6fa837..eccaffcb729a84ed958cac8772ecb3cffc056a2c 100644 --- a/python/paddle/nn/functional/input.py +++ b/python/paddle/nn/functional/input.py @@ -14,10 +14,10 @@ from paddle import _C_ops +from ...common_ops_import import Variable from ...fluid.data_feeder import check_variable_and_dtype from ...fluid.framework import in_dygraph_mode from ...fluid.layer_helper import LayerHelper -from ...static import Variable __all__ = [] diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 90697cb63476fdb882f34496d9f68197762dc90f..a441183ca82344ddbdfc4a3e02501408666a9101 100644 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -20,10 +20,10 @@ from paddle import _C_ops, _legacy_C_ops, in_dynamic_mode from paddle.framework import core from paddle.utils import deprecated +from ...common_ops_import import Variable from ...fluid.data_feeder import check_variable_and_dtype from ...fluid.framework import _current_expected_place, in_dygraph_mode from ...fluid.layer_helper import LayerHelper -from ...static import Variable from ...tensor.manipulation import reshape __all__ = [] @@ -254,19 +254,41 @@ def fluid_softmax_with_cross_entropy( # [1.15328646]) """ if in_dygraph_mode(): - if core.is_compiled_with_npu(): - softmax, backprop, loss = _legacy_C_ops.softmax_with_cross_entropy( - logits, - label, - 'soft_label', - soft_label, - 'ignore_index', - ignore_index, - 'numeric_stable_mode', - numeric_stable_mode, - 'axis', - axis, - ) + if core.is_compiled_with_custom_device("npu"): + if not soft_label: + valid_label = ( + paddle.cast(label != ignore_index, dtype=label.dtype) + * label + ) + softmax, loss = _legacy_C_ops.softmax_with_cross_entropy( + logits, + valid_label, + 'soft_label', + soft_label, + 'ignore_index', + ignore_index, + 'numeric_stable_mode', + numeric_stable_mode, + 'axis', + axis, + 'use_softmax', + True, + ) + else: + softmax, loss = _legacy_C_ops.softmax_with_cross_entropy( + logits, + label, + 'soft_label', + soft_label, + 'ignore_index', + ignore_index, + 'numeric_stable_mode', + numeric_stable_mode, + 'axis', + axis, + 'use_softmax', + True, + ) else: softmax, loss = _C_ops.cross_entropy_with_softmax( logits, @@ -293,7 +315,9 @@ def fluid_softmax_with_cross_entropy( loss = helper.create_variable_for_type_inference(dtype=logits.dtype) outputs = {'Softmax': softmax, 'Loss': loss} - if core.is_compiled_with_npu() or core.is_compiled_with_mlu(): + if core.is_compiled_with_custom_device( + "npu" + ) or core.is_compiled_with_custom_device("mlu"): backprop = helper.create_variable_for_type_inference( dtype=logits.dtype ) @@ -2573,7 +2597,9 @@ def cross_entropy( valid_label = ( paddle.cast(label != ignore_index, dtype=label.dtype) * label ) - if core.is_compiled_with_npu() or core.is_compiled_with_mlu(): + if core.is_compiled_with_custom_device( + "npu" + ) or core.is_compiled_with_custom_device("mlu"): if not soft_label: _, _, out = _legacy_C_ops.softmax_with_cross_entropy( input, @@ -2744,7 +2770,9 @@ def cross_entropy( out = helper.create_variable_for_type_inference(dtype=input.dtype) outputs = {'Softmax': softmax, 'Loss': out} - if core.is_compiled_with_npu() or core.is_compiled_with_mlu(): + if core.is_compiled_with_custom_device( + "npu" + ) or core.is_compiled_with_custom_device("mlu"): backprop = helper.create_variable_for_type_inference( dtype=input.dtype ) diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index 
87723f6bb8f58c6d16ab77760f5408bbdec95208..0c194d45ed8d3e0d9b8e585963231ff61cc11444 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -77,6 +77,7 @@ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None): # [[0. , 0.24253564, 0.37139067], # [1. , 0.97014254, 0.92847669]]) """ + if in_dygraph_mode(): eps = fluid.dygraph.base.to_variable([epsilon], dtype=x.dtype) out = _C_ops.p_norm(x, float(p), axis, epsilon, True, False) diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py index 6aab78ec116e93d3e484310d1fb4c2df3549272b..03e7f202c531a6613543920ea07896b6a4e1d1ba 100755 --- a/python/paddle/nn/functional/pooling.py +++ b/python/paddle/nn/functional/pooling.py @@ -927,6 +927,15 @@ def max_unpool2d( # unpool_out shape: [1, 1, 7, 7] """ + if x.ndim != 4: + raise ValueError( + f'The x should have [N, C, H, W] format, but received {x.shape}.' + ) + if indices.ndim != 4: + raise ValueError( + f'The indices should have [N, C, H, W] format, but received {indices.shape}.' + ) + kernel_size = utils.convert_to_list(kernel_size, 2, 'pool_size') if stride is None: stride = kernel_size @@ -1061,6 +1070,15 @@ def max_unpool3d( # unpool_out shape: [1, 1, 4, 4, 6] """ + if x.ndim != 5: + raise ValueError( + f'The x should have [N, C, D, H, W] format, but received {x.shape}.' + ) + if indices.ndim != 5: + raise ValueError( + f'The indices should have [N, C, D, H, W] format, but received {indices.shape}.' + ) + kernel_size = utils.convert_to_list(kernel_size, 3, 'pool_size') if stride is None: stride = kernel_size diff --git a/python/paddle/nn/functional/vision.py b/python/paddle/nn/functional/vision.py index 4f164e991f3280af801d35ba881cfa564aa188ae..1178928acc2dabfe387f90f25076794027019a44 100644 --- a/python/paddle/nn/functional/vision.py +++ b/python/paddle/nn/functional/vision.py @@ -15,10 +15,10 @@ from paddle import _C_ops, _legacy_C_ops, in_dynamic_mode from paddle.fluid.framework import in_dygraph_mode +from ...common_ops_import import Variable from ...device import get_cudnn_version, is_compiled_with_rocm from ...fluid.data_feeder import check_variable_and_dtype from ...fluid.layer_helper import LayerHelper -from ...static import Variable __all__ = [] diff --git a/python/paddle/nn/initializer/Bilinear.py b/python/paddle/nn/initializer/Bilinear.py new file mode 100644 index 0000000000000000000000000000000000000000..b3a1766d07cccc35f16011fa91d97754fc80dbd3 --- /dev/null +++ b/python/paddle/nn/initializer/Bilinear.py @@ -0,0 +1,182 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from paddle import _C_ops + +from ...fluid import core, framework, unique_name +from ...fluid.framework import _current_expected_place, in_dygraph_mode +from .initializer import Initializer + +__all__ = [] + + +class Bilinear(Initializer): + """ + This initializer can be used in transposed convolution operator to + act as upsampling. 
Users can upsample a feature map with shape of + (B, C, H, W) by any integer factor. The usage is: + + Examples: + + .. code-block:: python + + import math + + import paddle + import paddle.nn as nn + from paddle.regularizer import L2Decay + + factor = 2 + C = 2 + B = 8 + H = W = 32 + w_attr = paddle.ParamAttr(learning_rate=0., + regularizer=L2Decay(0.), + initializer=nn.initializer.Bilinear()) + data = paddle.rand([B, 3, H, W], dtype='float32') + conv_up = nn.Conv2DTranspose(3, + out_channels=C, + kernel_size=2 * factor - factor % 2, + padding=int( + math.ceil((factor - 1) / 2.)), + stride=factor, + weight_attr=w_attr, + bias_attr=False) + x = conv_up(data) + + Where, `out_channels=C` and `groups=C` means this is channel-wise transposed + convolution. The filter shape will be (C, 1, K, K) where K is `kernel_size`, + This initializer will set a (K, K) interpolation kernel for every channel + of the filter identically. The resulting shape of the output feature map + will be (B, C, factor * H, factor * W). Note that the learning rate and the + weight decay are set to 0 in order to keep coefficient values of bilinear + interpolation unchanged during training. + + """ + + def __init__(self): + """Constructor for BilinearInitializer.""" + super().__init__() + + def forward(self, var, block=None): + """Initialize the input tensor with Bilinear initialization. + + Args: + var(Tensor): Tensor that needs to be initialized. + block(Block, optional): The block in which initialization ops + should be added. Used in static graph only, default None. + + Returns: + The initialization op + """ + block = self._check_block(block) + + if not isinstance(var, framework.Variable): + raise ValueError("var must be framework.Variable.") + + if not isinstance(block, framework.Block): + raise ValueError("block must be framework.Block.") + + shape = var.shape + if len(shape) != 4: + raise ValueError("the length of shape must be 4.") + if shape[2] != shape[3]: + raise ValueError("shape[2] must be equal to shape[3].") + + weight = np.zeros(np.prod(var.shape), dtype='float32') + size = shape[3] + # factor + f = np.ceil(size / 2.0) + # center + c = (2 * f - 1 - f % 2) / (2.0 * f) + for i in range(np.prod(shape)): + x = i % size + y = (i / size) % size + weight[i] = (1 - abs(x / f - c)) * (1 - abs(y / f - c)) + weight = np.reshape(weight, shape) + + # to be compatible of fp16 initalizers + if var.dtype in [ + core.VarDesc.VarType.FP16, + core.VarDesc.VarType.BF16, + core.VarDesc.VarType.FP64, + ]: + out_dtype = core.VarDesc.VarType.FP32 + out_var = block.create_var( + name=unique_name.generate( + ".".join(['bilinear_init', var.name, 'tmp']) + ), + shape=var.shape, + dtype=out_dtype, + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + ) + else: + out_dtype = var.dtype + out_var = var + + if out_dtype == core.VarDesc.VarType.FP32: + value_name = "fp32_values" + values = [float(v) for v in weight.flat] + else: + raise TypeError("Unsupported dtype %s", var.dtype) + + if np.prod(shape) > 1024 * 1024: + raise ValueError("The size of input is too big. 
") + + if in_dygraph_mode(): + _C_ops.assign_value_( + out_var, + list(shape), + out_dtype, + values, + _current_expected_place(), + ) + if var.dtype in [ + core.VarDesc.VarType.FP16, + core.VarDesc.VarType.BF16, + core.VarDesc.VarType.FP64, + ]: + var_tmp = _C_ops.cast(out_var, var.dtype) + var_tmp._share_underline_tensor_to(var) + else: + out_var._share_underline_tensor_to(var) + return None + else: + op = block.append_op( + type='assign_value', + outputs={'Out': [out_var]}, + attrs={ + 'dtype': out_dtype, + 'shape': list(shape), + value_name: values, + }, + ) + + if var.dtype in [ + core.VarDesc.VarType.FP16, + core.VarDesc.VarType.BF16, + core.VarDesc.VarType.FP64, + ]: + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype}, + ) + + var.op = op + return op diff --git a/python/paddle/nn/initializer/__init__.py b/python/paddle/nn/initializer/__init__.py index e078e19ed2b4db8b40c629bafab89451e61433b5..6ef516c8b6af5ae58e2c11b3eb2bf3474a83b9d2 100644 --- a/python/paddle/nn/initializer/__init__.py +++ b/python/paddle/nn/initializer/__init__.py @@ -13,9 +13,9 @@ # limitations under the License. # TODO: define the initializers to create a Parameter in neural network -from ...fluid.initializer import Bilinear # noqa: F401 from ...fluid.initializer import set_global_initializer # noqa: F401 -from ...fluid.initializer import calculate_gain # noqa: F401 + +from .Bilinear import Bilinear # noqa: F401 from .constant import Constant # noqa: F401 @@ -36,6 +36,15 @@ from .orthogonal import Orthogonal # noqa: F401 from .dirac import Dirac # noqa: F401 +from .initializer import Initializer, calculate_gain # noqa: F401 +from .uniform import UniformInitializer # noqa: F401 +from .constant import ConstantInitializer # noqa: F401 +from .normal import NormalInitializer # noqa: F401 +from .normal import TruncatedNormalInitializer # noqa: F401 +from .xavier import XavierInitializer # noqa: F401 +from .kaiming import MSRAInitializer # noqa: F401 +from .assign import NumpyArrayInitializer # noqa: F401 + __all__ = [ # noqa 'Bilinear', 'Constant', diff --git a/python/paddle/nn/initializer/assign.py b/python/paddle/nn/initializer/assign.py index 052da37af244e05a214a36f4ef2632c603925ca9..3ab5a896e463a7b63d6616f3db4a150fca8d728c 100644 --- a/python/paddle/nn/initializer/assign.py +++ b/python/paddle/nn/initializer/assign.py @@ -12,20 +12,134 @@ # See the License for the specific language governing permissions and # limitations under the License. import paddle +from paddle import _C_ops +from ...fluid import core, framework, unique_name from ...fluid.data_feeder import check_type -from ...fluid.initializer import NumpyArrayInitializer +from ...fluid.framework import _current_expected_place, in_dygraph_mode +from .initializer import Initializer __all__ = [] +class NumpyArrayInitializer(Initializer): + """Init an parameter with an numpy array + This api initialize the tensor by numpy array. + + Args: + value (numpy): numpy array to initialize the tensor + + Returns: + A Tensor initialized by numpy. + + """ + + def __init__(self, value): + import numpy + + assert isinstance(value, numpy.ndarray) + super().__init__() + self._value = value + + def forward(self, var, block=None): + """Initialize the input tensor with Numpy array. + + Args: + var(Tensor): Tensor that needs to be initialized. + block(Block, optional): The block in which initialization ops + should be added. Used in static graph only, default None. 
+ + Returns: + The initialization op + """ + block = self._check_block(block) + + assert isinstance(var, framework.Variable) + assert isinstance(block, framework.Block) + + # to be compatible of fp16 initalizers + if var.dtype in [core.VarDesc.VarType.FP16, core.VarDesc.VarType.BF16]: + out_dtype = core.VarDesc.VarType.FP32 + np_value = self._value.astype("float32") + out_var = block.create_var( + name=unique_name.generate( + ".".join(['numpy_array_init', var.name, 'tmp']) + ), + shape=var.shape, + dtype=out_dtype, + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + ) + else: + out_var = var + out_dtype = var.dtype + np_value = self._value + + if out_dtype == core.VarDesc.VarType.FP32: + value_name = "fp32_values" + values = [float(v) for v in np_value.flat] + elif out_dtype == core.VarDesc.VarType.INT32: + value_name = "int32_values" + values = [int(v) for v in np_value.flat] + else: + raise ValueError("Unsupported dtype %s", self._value.dtype) + if self._value.size > 1024 * 1024 * 1024: + raise ValueError( + "The size of input is too big. Please consider " + "saving it to file and 'load_op' to load it" + ) + + if in_dygraph_mode(): + _C_ops.assign_value_( + out_var, + list(self._value.shape), + out_dtype, + values, + _current_expected_place(), + ) + if var.dtype in [ + core.VarDesc.VarType.FP16, + core.VarDesc.VarType.BF16, + ]: + var_tmp = _C_ops.cast(out_var, var.dtype) + var_tmp._share_underline_tensor_to(var) + else: + out_var._share_underline_tensor_to(var) + return None + else: + op = block.append_op( + type='assign_value', + outputs={'Out': out_var}, + attrs={ + 'dtype': out_dtype, + 'shape': list(self._value.shape), + value_name: values, + }, + stop_gradient=True, + ) + + if var.dtype in [ + core.VarDesc.VarType.FP16, + core.VarDesc.VarType.BF16, + ]: + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype}, + ) + + var.op = op + return op + + class Assign(NumpyArrayInitializer): """Init an parameter with a numpy array, list, or tensor. Args: value (Tensor|numpy.ndarray|list|tuple): numpy array, list, tuple, or tensor to initialize the parameter. - name(str, optional): The default value is None. Normally there is no need for user to set this - property. For more information, please refer to :ref:`api_guide_Name`. + name(str, optional): Normally there is no need for user to set this + property. For more information, please refer to :ref:`api_guide_Name`. Default is None. Returns: A parameter initialized by the input numpy array, list, or tensor. diff --git a/python/paddle/nn/initializer/constant.py b/python/paddle/nn/initializer/constant.py index 637ae6299005cc7ab47bda43d3b231bfaa929f4e..0016467f117b048bb7e69dd57bc0e0851d3ed2d4 100644 --- a/python/paddle/nn/initializer/constant.py +++ b/python/paddle/nn/initializer/constant.py @@ -12,12 +12,75 @@ # See the License for the specific language governing permissions and # limitations under the License. +from paddle import _C_ops + +from ...fluid import core, framework +from ...fluid.framework import _current_expected_place, in_dygraph_mode + # TODO: define the initializers of Constant in neural network -from ...fluid.initializer import ConstantInitializer +from .initializer import Initializer __all__ = [] +class ConstantInitializer(Initializer): + """Implements the constant initializer + + Args: + value (float32, optional): constant value to initialize the variable. Default: 0.0. 
+ + """ + + def __init__(self, value=0.0, force_cpu=False): + assert value is not None + super().__init__() + self._value = value + self._force_cpu = force_cpu + + def forward(self, var, block=None): + """Initialize the input tensor with constant. + + Args: + var(Tensor): Tensor that needs to be initialized. + block(Block, optional): The block in which initialization ops + should be added. Used in static graph only, default None. + + Returns: + The initialization op + """ + block = self._check_block(block) + + assert isinstance(var, framework.Variable) or isinstance( + var, framework.EagerParamBase + ) + assert isinstance(block, framework.Block) + + if in_dygraph_mode(): + place = _current_expected_place() + if self._force_cpu: + place = core.CPUPlace() + _C_ops.full_( + var, var.shape, str(float(self._value)), var.dtype, place + ) + return None + else: + op = block.append_op( + type="fill_constant", + outputs={"Out": var}, + attrs={ + "shape": var.shape, + "dtype": int(var.dtype), + "value": float(self._value), + 'str_value': str(float(self._value)), + 'force_cpu': self._force_cpu, + }, + stop_gradient=True, + ) + + var.op = op + return op + + class Constant(ConstantInitializer): """Implement the constant initializer. diff --git a/python/paddle/nn/initializer/dirac.py b/python/paddle/nn/initializer/dirac.py index 0917859415d365df8b692e956e362491d3e4748d..3abcc300bc64e164d879088163670b3a0d2d90a6 100644 --- a/python/paddle/nn/initializer/dirac.py +++ b/python/paddle/nn/initializer/dirac.py @@ -20,7 +20,7 @@ from ...fluid import framework from ...fluid.core import VarDesc from ...fluid.data_feeder import check_variable_and_dtype from ...fluid.framework import _current_expected_place -from ...fluid.initializer import Initializer +from .initializer import Initializer __all__ = [] diff --git a/python/paddle/nn/initializer/initializer.py b/python/paddle/nn/initializer/initializer.py new file mode 100644 index 0000000000000000000000000000000000000000..c320fa68cd114b1b72e051da67866948568384af --- /dev/null +++ b/python/paddle/nn/initializer/initializer.py @@ -0,0 +1,159 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import math + +import numpy as np + +from ...fluid.framework import default_main_program, in_dygraph_mode +from ...fluid.lazy_init import lazy_init_helper + +__all__ = [] + + +class Initializer: + """Base class for parameter initializers + + Defines the common interface of parameter initializers. + They add operations to the init program that are used + to initialize parameter. Users should not use this class + directly, but need to use one of its implementations. 
+ """ + + def __init__(self): + pass + + def __call__(self, param, block=None): + if not lazy_init_helper().state: + return self.forward(param, block) + + return self._lazy_init(param, block) + + def forward(self, param, block=None): + """Add corresponding initialization operations to the network""" + raise NotImplementedError() + + def _lazy_init(self, param, block=None): + """ + Apply lazy initialization + """ + assert in_dygraph_mode() + + def init_op_creator(forward, param, block): + new_var = param._to_static_var(True, block=block) + # Record initializer operator + with lazy_init_helper(): + forward(new_var, block) + + # Add hook function for initializing param in dygraph mode + param.set_init_func(functools.partial(self.forward, param, block)) + param._init_op_creator = functools.partial( + init_op_creator, self.forward, param + ) + + return param + + def _check_block(self, block): + if block is None: + block = default_main_program().global_block() + + return block + + def _compute_fans(self, var): + """Compute the fan_in and the fan_out for layers + + This method computes the fan_in and the fan_out + for neural network layers, if not specified. It is + not possible to perfectly estimate fan_in and fan_out. + This method will estimate it correctly for matrix multiply and + convolutions. + + Args: + var: variable for which fan_in and fan_out have to be computed + + Returns: + tuple of two integers (fan_in, fan_out) + """ + shape = var.shape + if not shape or len(shape) == 0: + fan_in = fan_out = 1 + elif len(shape) == 1: + fan_in = fan_out = shape[0] + elif len(shape) == 2: + # This is the case for simple matrix multiply + fan_in = shape[0] + fan_out = shape[1] + else: + # Assume this to be a convolutional kernel + # In PaddlePaddle, the shape of the kernel is like: + # [num_filters, num_filter_channels, ...] where the remaining + # dimensions are the filter_size + receptive_field_size = np.prod(shape[2:]) + fan_in = shape[1] * receptive_field_size + fan_out = shape[0] * receptive_field_size + + return (fan_in, fan_out) + + +def calculate_gain(nonlinearity, param=None): + """ + Get the recommended ``gain`` value of some nonlinearity function. ``gain`` value can be used in some + ``paddle.nn.initializer`` api to adjust the initialization value. + + Args: + nonlinearity(str): name of nonlinearity activation function. If it is a linear function, such as: + `linear/conv1d/conv2d/conv3d/conv1d_transpose/conv2d_transpose/conv3d_transpose` , 1.0 will be returned. + param(bool|int|float, optional): optional parameter for somme nonlinearity function. Now, it only applies to + 'leaky_relu'. Default: None, it will be calculated as 0.01 in the formula. + + Returns: + A float value, which is the recommended gain for this nonlinearity function. + + Examples: + .. 
code-block:: python + + import paddle + gain = paddle.nn.initializer.calculate_gain('tanh') # 5.0 / 3 + gain = paddle.nn.initializer.calculate_gain('leaky_relu', param=1.0) # 1.0 = math.sqrt(2.0 / (1+param^2)) + initializer = paddle.nn.initializer.Orthogonal(gain) + + """ + if param is None: + param = 0.01 + else: + assert isinstance(param, (bool, int, float)) + param = float(param) + recommended_gain = { + 'sigmoid': 1, + 'linear': 1, + 'conv1d': 1, + 'conv2d': 1, + 'conv3d': 1, + 'conv1d_transpose': 1, + 'conv2d_transpose': 1, + 'conv3d_transpose': 1, + 'tanh': 5.0 / 3, + 'relu': math.sqrt(2.0), + 'leaky_relu': math.sqrt(2.0 / (1 + param**2)), + 'selu': 3.0 / 4, + } + if nonlinearity in recommended_gain.keys(): + return recommended_gain[nonlinearity] + else: + raise ValueError( + "nonlinearity function {} is not suppported now.".format( + nonlinearity + ) + ) diff --git a/python/paddle/nn/initializer/kaiming.py b/python/paddle/nn/initializer/kaiming.py index f214e46fa4b2b8b244ea53adb63043f1c52cb238..c3a8732315db397f1c39cecf78731b03c2d8e293 100644 --- a/python/paddle/nn/initializer/kaiming.py +++ b/python/paddle/nn/initializer/kaiming.py @@ -13,11 +13,185 @@ # limitations under the License. # TODO: define the initializers of Kaiming functions in neural network -from ...fluid.initializer import MSRAInitializer +import math + +from paddle import _C_ops + +from ...fluid import core, framework, unique_name +from ...fluid.framework import _current_expected_place, in_dygraph_mode +from .initializer import Initializer, calculate_gain __all__ = [] +class MSRAInitializer(Initializer): + r"""Implements the MSRA initializer a.k.a. Kaiming Initializer + + This class implements the weight initialization from the paper + `Delving Deep into Rectifiers: Surpassing Human-Level Performance on + ImageNet Classification `_ + by Kaiming He, Xiangyu Zhang, Shaoqing Ren and Jian Sun. This is a + robust initialization method that particularly considers the rectifier + nonlinearities. In case of Uniform distribution, the range is [-x, x], where + + .. math:: + + x = gain \times \sqrt{\frac{3}{fan\_in}} + + In case of Normal distribution, the mean is 0 and the standard deviation + is + + .. math:: + + \frac{gain}{\sqrt{{fan\_in}}} + + Args: + uniform (bool, optional): whether to use uniform or normal distribution. Default is True. + fan_in (float32|None, optional): fan_in (in_features) of trainable Tensor, If None, it will be infered automaticly. If you don't want to use in_features of the Tensor, you can set the value of 'fan_in' smartly by yourself. Default is None. + seed (int32, optional): random seed. Default is 0. + negative_slope (float, optional): negative_slope (only used with leaky_relu). Default is 0.0. + nonlinearity(str, optional): the non-linear function. Default is relu. + + Note: + It is recommended to set fan_in to None for most cases. + + """ + + def __init__( + self, + uniform=True, + fan_in=None, + seed=0, + negative_slope=0, + nonlinearity='relu', + ): + """Constructor for MSRAInitializer""" + assert uniform is not None + assert seed is not None + super().__init__() + self._uniform = uniform + self._fan_in = fan_in + self._seed = seed + self._negative_slope = negative_slope + self._nonlinearity = nonlinearity + + def forward(self, var, block=None): + """Initialize the input tensor with MSRA initialization. + + Args: + var(Tensor): Tensor that needs to be initialized. + block(Block, optional): The block in which initialization ops + should be added. Used in static graph only, default None. 
+ + Returns: + The initialization op + """ + block = self._check_block(block) + + assert isinstance(var, framework.Variable) + assert isinstance(block, framework.Block) + f_in, f_out = self._compute_fans(var) + + # If fan_in is passed, use it + fan_in = f_in if self._fan_in is None else self._fan_in + + if self._seed == 0: + self._seed = block.program.random_seed + + # to be compatible of fp16 initalizers + if var.dtype == core.VarDesc.VarType.FP16 or ( + var.dtype == core.VarDesc.VarType.BF16 and not self._uniform + ): + out_dtype = core.VarDesc.VarType.FP32 + out_var = block.create_var( + name=unique_name.generate( + ".".join(['masra_init', var.name, 'tmp']) + ), + shape=var.shape, + dtype=out_dtype, + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + ) + else: + out_dtype = var.dtype + out_var = var + + if in_dygraph_mode(): + if self._uniform: + gain = calculate_gain(self._nonlinearity, self._negative_slope) + limit = gain * math.sqrt(3.0 / float(fan_in)) + out_var = _C_ops.uniform( + var.shape, + out_dtype, + -limit, + limit, + self._seed, + _current_expected_place(), + ) + else: + gain = calculate_gain(self._nonlinearity, self._negative_slope) + std = gain / math.sqrt(float(fan_in)) + place = _current_expected_place() + out_var = _C_ops.gaussian( + out_var.shape, 0.0, std, self._seed, out_dtype, place + ) + + if var.dtype == core.VarDesc.VarType.FP16 or ( + var.dtype == core.VarDesc.VarType.BF16 and not self._uniform + ): + var_tmp = _C_ops.cast(out_var, var.dtype) + var_tmp._share_underline_tensor_to(var) + else: + out_var._share_underline_tensor_to(var) + return None + else: + if self._uniform: + gain = calculate_gain(self._nonlinearity, self._negative_slope) + limit = gain * math.sqrt(3.0 / float(fan_in)) + op = block.append_op( + type="uniform_random", + inputs={}, + outputs={"Out": out_var}, + attrs={ + "shape": out_var.shape, + "dtype": int(out_dtype), + "min": -limit, + "max": limit, + "seed": self._seed, + }, + stop_gradient=True, + ) + + else: + gain = calculate_gain(self._nonlinearity, self._negative_slope) + std = gain / math.sqrt(float(fan_in)) + op = block.append_op( + type="gaussian_random", + outputs={"Out": out_var}, + attrs={ + "shape": out_var.shape, + "dtype": int(out_dtype), + "mean": 0.0, + "std": std, + "seed": self._seed, + }, + stop_gradient=True, + ) + + if var.dtype == core.VarDesc.VarType.FP16 or ( + var.dtype == core.VarDesc.VarType.BF16 and not self._uniform + ): + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype}, + ) + + var.op = op + return op + + class KaimingNormal(MSRAInitializer): r"""Implements the Kaiming Normal initializer @@ -36,9 +210,9 @@ class KaimingNormal(MSRAInitializer): \frac{gain}{\sqrt{{fan\_in}}} Args: - fan_in (float32|None, optional): fan_in (in_features) of trainable Tensor, If None, it will be infered automaticly. If you don't want to use in_features of the Tensor, you can set the value of 'fan_in' smartly by yourself. default is None. - negative_slope (float, optional): negative_slope (only used with leaky_relu). default is 0.0. - nonlinearity(str, optional): the non-linear function. default is relu. + fan_in (float32|None, optional): fan_in (in_features) of trainable Tensor, If None, it will be infered automaticly. If you don't want to use in_features of the Tensor, you can set the value of 'fan_in' smartly by yourself. Default is None. + negative_slope (float, optional): negative_slope (only used with leaky_relu). 
Default is 0.0. + nonlinearity(str, optional): the non-linear function. Default is relu. Note: It is recommended to set fan_in to None for most cases. @@ -84,9 +258,9 @@ class KaimingUniform(MSRAInitializer): x = gain \times \sqrt{\frac{3}{fan\_in}} Args: - fan_in (float32|None, optional): fan_in (in_features) of trainable Tensor, If None, it will be infered automaticly. If you don't want to use in_features of the Tensor, you can set the value of 'fan_in' smartly by yourself. default is None. - negative_slope (float, optional): negative_slope (only used with leaky_relu). default is 0.0. - nonlinearity(str, optional): the non-linear function. default is relu. + fan_in (float32|None, optional): fan_in (in_features) of trainable Tensor, If None, it will be infered automaticly. If you don't want to use in_features of the Tensor, you can set the value of 'fan_in' smartly by yourself. Default is None. + negative_slope (float, optional): negative_slope (only used with leaky_relu). Default is 0.0. + nonlinearity(str, optional): the non-linear function. Default is relu. Note: It is recommended to set fan_in to None for most cases. diff --git a/python/paddle/nn/initializer/normal.py b/python/paddle/nn/initializer/normal.py index 5ead30f4f1e3e0ac5881f86276229252a73400d5..030ec95940db69432c48759ee326c2f0da17db56 100644 --- a/python/paddle/nn/initializer/normal.py +++ b/python/paddle/nn/initializer/normal.py @@ -12,19 +12,99 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ...fluid.initializer import NormalInitializer, TruncatedNormalInitializer +from paddle import _C_ops + +from ...fluid import core, framework, unique_name +from ...fluid.data_feeder import check_variable_and_dtype +from ...fluid.framework import _current_expected_place, in_dygraph_mode +from .initializer import Initializer __all__ = [] +class NormalInitializer(Initializer): + """Implements the Random Normal(Gaussian) distribution initializer + + Args: + loc (float, optional): mean of the normal distribution. Default is 0.0. + scale (float, optional): standard deviation of the normal distribution. Default is 1.0. + seed (int, optional): random seed. Default is 0. + + """ + + def __init__(self, loc=0.0, scale=1.0, seed=0): + assert loc is not None + assert scale is not None + assert seed is not None + super().__init__() + self._mean = loc + self._std_dev = scale + self._seed = seed + + def forward(self, var, block=None): + """Initialize the input tensor with Normal distribution. + + Args: + var(Tensor): Tensor that needs to be initialized. + block(Block, optional): The block in which initialization ops + should be added. Used in static graph only, default None. 
+ + Returns: + The initialization op + """ + block = self._check_block(block) + + assert isinstance(block, framework.Block) + + check_variable_and_dtype( + var, + "Out", + ["uint16", "float16", "float32", "float64"], + "guassian_random", + ) + + if self._seed == 0: + self._seed = block.program.random_seed + + if in_dygraph_mode(): + place = _current_expected_place() + out_var = _C_ops.gaussian( + var.shape, + self._mean, + self._std_dev, + self._seed, + var.dtype, + place, + ) + out_var._share_underline_tensor_to(var) + return None + + else: + op = block.append_op( + type="gaussian_random", + outputs={"Out": var}, + attrs={ + "shape": var.shape, + "dtype": var.dtype, + "mean": self._mean, + "std": self._std_dev, + "seed": self._seed, + "use_mkldnn": False, + }, + stop_gradient=True, + ) + var.op = op + return op + + class Normal(NormalInitializer): """The Random Normal (Gaussian) distribution initializer. Args: - mean (float, optional): mean of the normal distribution. The default value is 0.0. - std (float, optional): standard deviation of the normal distribution. The default value is 1.0. + mean (float, optional): mean of the normal distribution. Default is 0.0. + std (float, optional): standard deviation of the normal distribution. Default is 1.0. name(str, optional): The default value is None. Normally there is no need for user to set this - property. For more information, please refer to :ref:`api_guide_Name`. + property. For more information, please refer to :ref:`api_guide_Name`. Default: None. Returns: A parameter initialized by Random Normal (Gaussian) distribution. @@ -58,12 +138,113 @@ class Normal(NormalInitializer): super().__init__(loc=mean, scale=std, seed=0) +class TruncatedNormalInitializer(Initializer): + """Implements the Random TruncatedNormal(Gaussian) distribution initializer + + Args: + loc (float, optional): Mean of the normal distribution. Default is :math:`0.0`. + scale (float, optional): Standard deviation of the normal distribution. Default is :math:`1.0`. + seed (int, optional): random seed. Default is 0. + + """ + + def __init__(self, loc=0.0, scale=1.0, seed=0): + assert loc is not None + assert scale is not None + assert seed is not None + super().__init__() + self._mean = loc + self._std_dev = scale + self._seed = seed + + def forward(self, var, block=None): + """Initialize the input tensor with TruncatedNormal distribution. + + Args: + var(Tensor): Tensor that needs to be initialized. + block(Block, optional): The block in which initialization ops + should be added. Used in static graph only, default None. 
+ + Returns: + The initialization op + """ + block = self._check_block(block) + + assert isinstance(var, framework.Variable) + assert isinstance(block, framework.Block) + + if self._seed == 0: + self._seed = block.program.random_seed + + # to be compatible of fp16 initalizers + if var.dtype in [core.VarDesc.VarType.FP16, core.VarDesc.VarType.BF16]: + out_dtype = core.VarDesc.VarType.FP32 + out_var = block.create_var( + name=unique_name.generate( + ".".join(['truncated_gaussian_random', var.name, 'tmp']) + ), + shape=var.shape, + dtype=out_dtype, + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + ) + else: + out_dtype = var.dtype + out_var = var + + if in_dygraph_mode(): + out_var = _C_ops.truncated_gaussian_random( + var.shape, + self._mean, + self._std_dev, + self._seed, + out_dtype, + _current_expected_place(), + ) + if var.dtype in [ + core.VarDesc.VarType.FP16, + core.VarDesc.VarType.BF16, + ]: + var_tmp = _C_ops.cast(out_var, var.dtype) + var_tmp._share_underline_tensor_to(var) + else: + out_var._share_underline_tensor_to(var) + return None + + else: + op = block.append_op( + type="truncated_gaussian_random", + outputs={"Out": out_var}, + attrs={ + "shape": var.shape, + "dtype": out_dtype, + "mean": self._mean, + "std": self._std_dev, + "seed": self._seed, + }, + stop_gradient=True, + ) + + if var.dtype in [ + core.VarDesc.VarType.FP16, + core.VarDesc.VarType.BF16, + ]: + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype}, + ) + var.op = op + return op + + class TruncatedNormal(TruncatedNormalInitializer): """The truncated normal distribution (Gaussian distribution) initializer. Args: - mean (float, optional): Mean of the normal distribution. The default value is :math:`0.0`. - std (float, optional): Standard deviation of the normal distribution. The default value is :math:`1.0`. + mean (float, optional): Mean of the normal distribution. Default is :math:`0.0`. + std (float, optional): Standard deviation of the normal distribution. Default is :math:`1.0`. name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: diff --git a/python/paddle/nn/initializer/orthogonal.py b/python/paddle/nn/initializer/orthogonal.py index 0bbfd9eaaaa86e19429f33ad2f85986b6ac59eb9..65a496f2b10696ddcff15618c5b9cb41269e72e8 100644 --- a/python/paddle/nn/initializer/orthogonal.py +++ b/python/paddle/nn/initializer/orthogonal.py @@ -18,7 +18,7 @@ from paddle.utils import unique_name from ...fluid import framework from ...fluid.data_feeder import check_variable_and_dtype from ...fluid.dygraph import no_grad -from ...fluid.initializer import Initializer +from .initializer import Initializer __all__ = [] diff --git a/python/paddle/nn/initializer/uniform.py b/python/paddle/nn/initializer/uniform.py index 011cb6eff6dfaaf95d9fb101248514fc7453c7fd..cd64a15b7519ed27e1950eeda4f162a722b3ccc2 100644 --- a/python/paddle/nn/initializer/uniform.py +++ b/python/paddle/nn/initializer/uniform.py @@ -12,17 +12,144 @@ # See the License for the specific language governing permissions and # limitations under the License. 
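The migrated `Normal` and `TruncatedNormal` initializers above are normally consumed through `ParamAttr` rather than called directly; the `forward()` shown in this hunk runs when a layer creates its parameters. A minimal usage sketch (not part of the patch, assuming the public `paddle.nn.initializer` API):

import paddle

# Truncated-normal weights, constant bias; the initializer's forward()
# above fills the Linear parameters when they are created.
weight_attr = paddle.ParamAttr(
    initializer=paddle.nn.initializer.TruncatedNormal(mean=0.0, std=0.02)
)
bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0))
linear = paddle.nn.Linear(4, 8, weight_attr=weight_attr, bias_attr=bias_attr)
print(linear.weight.shape)  # [4, 8]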
-from ...fluid.initializer import UniformInitializer +from paddle import _C_ops + +from ...fluid import core, framework, unique_name +from ...fluid.data_feeder import check_variable_and_dtype +from ...fluid.framework import _current_expected_place, in_dygraph_mode +from .initializer import Initializer __all__ = [] +class UniformInitializer(Initializer): + """Implements the random uniform distribution initializer + + Args: + low (float, optional): Lower boundary of the uniform distribution. Default is :math:`-1.0`. + high (float, optional): Upper boundary of the uniform distribution. Default is :math:`1.0`. + seed (int, optional): Random seed. Default is 0. + diag_num (int, optional): the number of diagonal elements to initialize. + If set to 0, diagonal initialization will be not performed. Default is 0. + diag_step (int, optional): Step size between two diagonal elements, + which is generally the width of the square matrix. Default is 0. + diag_val (float, optional): the value of the diagonal element to be initialized, + default 1.0. It takes effect only if the diag_num is greater than 0. Default is :math:`1.0`. + + """ + + def __init__( + self, low=-1.0, high=1.0, seed=0, diag_num=0, diag_step=0, diag_val=1.0 + ): + assert low is not None + assert high is not None + assert high >= low + assert seed is not None + assert diag_num is not None + assert diag_step is not None + assert diag_val is not None + if diag_num > 0 or diag_step > 0: + assert diag_num > 0 and diag_step > 0 + super().__init__() + self._low = low + self._high = high + self._seed = seed + self._diag_num = diag_num + self._diag_step = diag_step + self._diag_val = diag_val + + def forward(self, var, block=None): + """Initialize the input tensor with Uniform distribution. + + Args: + var(Tensor): Tensor that needs to be initialized. + block(Block, optional): The block in which initialization ops + should be added. Used in static graph only, default None. 
+ + Returns: + The initialization op + """ + block = self._check_block(block) + + assert isinstance(block, framework.Block) + if not in_dygraph_mode(): + check_variable_and_dtype( + var, + "Out", + ["uint16", "float16", "float32", "float64"], + "uniform_random", + ) + + if self._seed == 0: + self._seed = block.program.random_seed + + # to be compatible of fp16 initializers + if var.dtype == core.VarDesc.VarType.FP16: + out_dtype = core.VarDesc.VarType.FP32 + out_var = block.create_var( + name=unique_name.generate( + ".".join(['uniform_random', var.name, 'tmp']) + ), + shape=var.shape, + dtype=out_dtype, + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + ) + else: + out_dtype = var.dtype + out_var = var + + if in_dygraph_mode(): + out_var = _C_ops.uniform( + var.shape, + out_dtype, + self._low, + self._high, + self._seed, + _current_expected_place(), + ) + if var.dtype == core.VarDesc.VarType.FP16: + var_tmp = _C_ops.cast(out_var, var.dtype) + var_tmp._share_underline_tensor_to(var) + else: + out_var._share_underline_tensor_to(var) + return None + else: + op = block.append_op( + type="uniform_random", + inputs={}, + outputs={"Out": out_var}, + attrs={ + "shape": var.shape, + "dtype": out_dtype, + "min": self._low, + "max": self._high, + "seed": self._seed, + "diag_num": self._diag_num, + "diag_step": self._diag_step, + "diag_val": self._diag_val, + }, + stop_gradient=True, + ) + + if var.dtype == core.VarDesc.VarType.FP16: + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype}, + ) + + var.op = op + return op + + class Uniform(UniformInitializer): """The uniform distribution initializer. Args: - low (float, optional): Lower boundary of the uniform distribution. The default value is :math:`-1.0`. - high (float, optional): Upper boundary of the uniform distribution. The default value is :math:`1.0`. + low (float, optional): Lower boundary of the uniform distribution. Default is :math:`-1.0`. + high (float, optional): Upper boundary of the uniform distribution. Default is :math:`1.0`. name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: diff --git a/python/paddle/nn/initializer/xavier.py b/python/paddle/nn/initializer/xavier.py index 35e104edba11130821d9e67bd6e621ba358c5c7e..6d17c029f587c23da2d50a844bd4d20654de4990 100644 --- a/python/paddle/nn/initializer/xavier.py +++ b/python/paddle/nn/initializer/xavier.py @@ -12,11 +12,183 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ...fluid.initializer import XavierInitializer +import math + +from paddle import _C_ops + +from ...fluid import core, framework, unique_name +from ...fluid.data_feeder import check_variable_and_dtype +from ...fluid.framework import _current_expected_place, in_dygraph_mode +from .initializer import Initializer __all__ = [] +class XavierInitializer(Initializer): + r""" + This class implements the Xavier weight initializer from the paper + `Understanding the difficulty of training deep feedforward neural + networks `_ + by Xavier Glorot and Yoshua Bengio. + + This initializer is designed to keep the scale of the gradients + approximately same in all the layers. In case of Uniform distribution, + the range is [-x, x], where + + .. math:: + + x = \sqrt{\\frac{6.0}{fan\_in + fan\_out}} + + In case of Normal distribution, the mean is 0 and the standard deviation + is + + .. 
math:: + + \sqrt{\\frac{2.0}{fan\_in + fan\_out}} + + + Args: + uniform (bool, optional): whether to use uniform ,if False use normal distribution. Default is True. + fan_in (float, optional): fan_in for Xavier initialization. If None, it is + inferred from the variable. Default is None. + fan_out (float, optional): fan_out for Xavier initialization. If None, it is + inferred from the variable. Default is None. + seed (int, optional): Random seed. Default is 0. + + Note: + It is recommended to set fan_in and fan_out to None for most cases. + + """ + + def __init__(self, uniform=True, fan_in=None, fan_out=None, seed=0): + assert uniform is not None + assert seed is not None + super().__init__() + self._uniform = uniform + self._fan_in = fan_in + self._fan_out = fan_out + self._seed = seed + + def forward(self, var, block=None): + """Initialize the input tensor with Xavier initialization. + + Args: + var(Tensor): Tensor that needs to be initialized. + block(Block, optional): The block in which initialization ops + should be added. Used in static graph only, default None. + + Returns: + The initialization op + """ + block = self._check_block(block) + + assert isinstance(block, framework.Block) + check_variable_and_dtype( + var, + "Out", + ["uint16", "float16", "float32", "float64"], + "xavier_init", + ) + + f_in, f_out = self._compute_fans(var) + + # If fan_in and fan_out are passed, use them + fan_in = f_in if self._fan_in is None else self._fan_in + fan_out = f_out if self._fan_out is None else self._fan_out + + if self._seed == 0: + self._seed = block.program.random_seed + + # to be compatible of fp16 initalizers + if var.dtype == core.VarDesc.VarType.FP16 or ( + var.dtype == core.VarDesc.VarType.BF16 and not self._uniform + ): + out_dtype = core.VarDesc.VarType.FP32 + out_var = block.create_var( + name=unique_name.generate( + ".".join(['xavier_init', var.name, 'tmp']) + ), + shape=var.shape, + dtype=out_dtype, + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + ) + else: + out_dtype = var.dtype + out_var = var + + if in_dygraph_mode(): + if self._uniform: + limit = math.sqrt(6.0 / float(fan_in + fan_out)) + out_var = _C_ops.uniform( + out_var.shape, + out_dtype, + -limit, + limit, + self._seed, + _current_expected_place(), + ) + else: + std = math.sqrt(2.0 / float(fan_in + fan_out)) + + place = _current_expected_place() + out_var = _C_ops.gaussian( + out_var.shape, 0.0, std, self._seed, out_dtype, place + ) + + if var.dtype == core.VarDesc.VarType.FP16 or ( + var.dtype == core.VarDesc.VarType.BF16 and not self._uniform + ): + var_tmp = _C_ops.cast(out_var, var.dtype) + var_tmp._share_underline_tensor_to(var) + else: + out_var._share_underline_tensor_to(var) + return None + else: + if self._uniform: + limit = math.sqrt(6.0 / float(fan_in + fan_out)) + op = block.append_op( + type="uniform_random", + inputs={}, + outputs={"Out": out_var}, + attrs={ + "shape": out_var.shape, + "dtype": out_dtype, + "min": -limit, + "max": limit, + "seed": self._seed, + }, + stop_gradient=True, + ) + else: + std = math.sqrt(2.0 / float(fan_in + fan_out)) + op = block.append_op( + type="gaussian_random", + outputs={"Out": out_var}, + attrs={ + "shape": out_var.shape, + "dtype": out_var.dtype, + "mean": 0.0, + "std": std, + "seed": self._seed, + }, + stop_gradient=True, + ) + + if var.dtype == core.VarDesc.VarType.FP16 or ( + var.dtype == core.VarDesc.VarType.BF16 and not self._uniform + ): + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": 
out_var.dtype, "out_dtype": var.dtype}, + ) + + var.op = op + return op + + class XavierNormal(XavierInitializer): r""" This class implements the Xavier weight initializer from the paper @@ -31,9 +203,9 @@ class XavierNormal(XavierInitializer): Args: fan_in (float, optional): fan_in for Xavier initialization, which is - inferred from the Tensor. The default value is None. + inferred from the Tensor. Default is None. fan_out (float, optional): fan_out for Xavier initialization, which is - inferred from the Tensor. The default value is None. + inferred from the Tensor. Default is None. name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: @@ -83,9 +255,9 @@ class XavierUniform(XavierInitializer): Args: fan_in (float, optional): fan_in for Xavier initialization, which is - inferred from the Tensor. The default value is None. + inferred from the Tensor. Default is None. fan_out (float, optional): fan_out for Xavier initialization, which is - inferred from the Tensor. The default value is None. + inferred from the Tensor. Default is None. name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py index 2617c76ae6e792dc73e4d17e98a03547cb33d3bb..4bf31ca30ea28f5a880c219b8f06da166d1a9507 100644 --- a/python/paddle/nn/layer/rnn.py +++ b/python/paddle/nn/layer/rnn.py @@ -20,15 +20,20 @@ import numpy as np import paddle from paddle import _C_ops, _legacy_C_ops, framework, in_dynamic_mode +from paddle.common_ops_import import Variable from paddle.fluid.data_feeder import check_type, check_variable_and_dtype -from paddle.fluid.framework import _non_static_mode, in_dygraph_mode +from paddle.fluid.framework import ( + _non_static_mode, + default_startup_program, + in_dygraph_mode, + program_guard, +) from paddle.fluid.layers import control_flow, sequence_lod, utils from paddle.fluid.layers.utils import flatten, map_structure from paddle.framework import core from paddle.nn import Layer from paddle.nn import functional as F from paddle.nn import initializer as I -from paddle.static import Variable, default_startup_program, program_guard from paddle.tensor.manipulation import tensor_array_to_tensor from .container import LayerList diff --git a/python/paddle/nn/utils/__init__.py b/python/paddle/nn/utils/__init__.py index 23e1e233cc0dcd9d064ac1f6fa0211c2d6961648..82b17c8c05d245e2d95f4f5a2d176aaf0285c8f2 100644 --- a/python/paddle/nn/utils/__init__.py +++ b/python/paddle/nn/utils/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
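For reference, the scale factors used by `XavierInitializer.forward` above reduce to two closed-form expressions. The following standalone sketch (an illustration only, using NumPy instead of the `_C_ops` kernels) reproduces the uniform bound `limit = sqrt(6 / (fan_in + fan_out))` and the normal `std = sqrt(2 / (fan_in + fan_out))`:

import math
import numpy as np

def xavier_sample(shape, fan_in, fan_out, uniform=True, seed=0):
    # Mirrors the two branches of XavierInitializer.forward: uniform draws
    # from [-limit, limit], normal draws from N(0, std^2).
    rng = np.random.default_rng(seed)
    if uniform:
        limit = math.sqrt(6.0 / float(fan_in + fan_out))
        return rng.uniform(-limit, limit, size=shape)
    std = math.sqrt(2.0 / float(fan_in + fan_out))
    return rng.normal(loc=0.0, scale=std, size=shape)

w = xavier_sample((128, 256), fan_in=128, fan_out=256)  # bound == 0.125 here
print(w.min(), w.max())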
@@ -19,6 +19,7 @@ from .transform_parameters import ( vector_to_parameters, _stride_column, ) # noqa: F401 +from .clip_grad_norm_ import clip_grad_norm_ # noqa: F401 __all__ = [ # noqa 'weight_norm', @@ -26,4 +27,5 @@ __all__ = [ # noqa 'spectral_norm', 'parameters_to_vector', 'vector_to_parameters', + 'clip_grad_norm_', ] diff --git a/python/paddle/nn/utils/clip_grad_norm_.py b/python/paddle/nn/utils/clip_grad_norm_.py new file mode 100644 index 0000000000000000000000000000000000000000..3a3ecb38b4428259ccb2cbd8faa5a1bf9ebf1ffa --- /dev/null +++ b/python/paddle/nn/utils/clip_grad_norm_.py @@ -0,0 +1,107 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle + +__all__ = ['clip_grad_norm_'] + + +def clip_grad_norm_( + parameters, + max_norm, + norm_type=2.0, + error_if_nonfinite=False, +): + r"""Clips gradient norm of the iteratable parameters. + + Norms are calculated together on all gradients, just as they are + connected into one vector. The gradient will be modified in place. + + This API can only run in dynamic graph mode, not static graph mode. + + Args: + parameters (Iterable[paddle.Tensor] or paddle.Tensor): Tensors or a single Tensor + that will be normalized gradients + max_norm (float or int): max norm of the gradients + norm_type (float or int): type of the used p-norm. Can be `inf` for + infinity norm. + error_if_nonfinite (bool): if True, throw an error if the total + norm of the gradients from :attr:`parameters` is `nan`, + `inf`, or `-inf`. + + Returns: + Total norm of the parameter gradients (treated as a single vector). + Example: + .. 
code-block:: python + import paddle + + x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32') + max_norm = float(5.0) + linear = paddle.nn.Linear(in_features=10, out_features=10) + out = linear(x) + loss = paddle.mean(out) + loss.backward() + + paddle.nn.utils.clip_grad_norm_(linear.parameters(), max_norm) + + sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters()) + sdg.step() + """ + if not paddle.in_dynamic_mode(): + raise RuntimeError('this API can only run in dynamic mode.') + + if isinstance(parameters, paddle.Tensor): + parameters = [parameters] + + support_norm_type = [float("inf"), 0, 1, 2] + if norm_type not in support_norm_type: + raise ValueError(f'norm_type only support {support_norm_type}') + + grads = [p.grad for p in parameters if p.grad is not None] + max_norm = float(max_norm) + norm_type = float(norm_type) + if len(grads) == 0: + return paddle.to_tensor(0.0) + if norm_type == float("inf"): + norms = [g.detach().abs().max() for g in grads] + total_norm = ( + norms[0] if len(norms) == 1 else paddle.max(paddle.stack(norms)) + ) + else: + total_norm = paddle.linalg.norm( + paddle.stack( + [paddle.linalg.norm(g.detach(), norm_type) for g in grads] + ), + norm_type, + ) + + if error_if_nonfinite and paddle.logical_or( + total_norm.isnan(), total_norm.isinf() + ): + raise RuntimeError( + f'The total norm of {norm_type} order of the gradients from ' + '`parameters` is non-finite, so it cannot be clipped. In any case, ' + 'disable this error and scale the gradient by non-finite norm, ' + 'set `error_if_nonfinite=False`' + ) + clip_coef = max_norm / (total_norm + 1e-6) + # Note: when the coef is clamped to 1, it is redundant to multiply the clamped coef, but this + # avoids the `if clip_coef < 1:` condition. + clip_coef_clamped = paddle.clip(clip_coef, max=1.0) + with paddle.no_grad(): + for _, p in enumerate(parameters): + g = p.grad + if g is not None: + p.grad = paddle.multiply(x=g, y=clip_coef_clamped) + return total_norm diff --git a/python/paddle/optimizer/lr.py b/python/paddle/optimizer/lr.py index bc5f9020b7f305a95a6294e571b787c5b750b1fe..a6698cfb735eea288c08a3ee13ac46f9b447e9a5 100644 --- a/python/paddle/optimizer/lr.py +++ b/python/paddle/optimizer/lr.py @@ -391,6 +391,14 @@ class PiecewiseDecay(LRScheduler): """ def __init__(self, boundaries, values, last_epoch=-1, verbose=False): + if len(boundaries) == 0: + raise ValueError('The boundaries cannot be empty.') + + if len(values) <= len(boundaries): + raise ValueError( + f'The values have one more element than boundaries, but received len(values) [{len(values)}] < len(boundaries) + 1 [{len(boundaries) + 1}].' 
+ ) + self.boundaries = boundaries self.values = values super().__init__(last_epoch=last_epoch, verbose=verbose) diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index cad226952be41807801d2e651f725e23879926d8..d9e1cd456042c409cb3694470f16133318cda81d 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -34,7 +34,6 @@ from paddle.fluid.framework import ( from ..fluid import framework, unique_name from ..fluid.backward import _get_no_grad_set_name, append_backward from ..fluid.framework import Parameter, program_guard -from ..fluid.initializer import Constant from ..fluid.layer_helper import LayerHelper from .lr import LRScheduler @@ -453,7 +452,8 @@ class Optimizer: lr_value = float(self._learning_rate()) self.helper.set_variable_initializer( - lr_var, initializer=Constant(value=lr_value) + lr_var, + initializer=paddle.nn.initializer.Constant(value=lr_value), ) elif isinstance(self._learning_rate, float): # only create global lr_var once @@ -726,7 +726,10 @@ class Optimizer: else: with device_guard(device): self.helper.set_variable_initializer( - var, initializer=Constant(value=float(fill_value)) + var, + initializer=paddle.nn.initializer.Constant( + value=float(fill_value) + ), ) if framework._non_static_mode(): diff --git a/python/paddle/static/nn/common.py b/python/paddle/static/nn/common.py index c43385a8e91409f90535ba428d1b83b9715ec503..ef49b5642a37ca89a0962b297ce98d08c78eb6fa 100644 --- a/python/paddle/static/nn/common.py +++ b/python/paddle/static/nn/common.py @@ -28,9 +28,9 @@ from paddle.common_ops_import import ( from paddle.fluid import core from paddle.fluid.data_feeder import check_dtype from paddle.fluid.framework import Variable, _non_static_mode, static_only -from paddle.fluid.initializer import Constant, Normal from paddle.fluid.layers.layer_function_generator import templatedoc from paddle.fluid.param_attr import ParamAttr +from paddle.nn.initializer import Constant, Normal __all__ = [] @@ -1012,7 +1012,7 @@ def conv2d( "filter size.".format(filter_elem_num) ) std = (2.0 / filter_elem_num) ** 0.5 - return Normal(0.0, std, 0) + return Normal(0.0, std) filter_param = helper.create_parameter( attr=helper.param_attr, @@ -1315,7 +1315,7 @@ def conv3d( ) std = (2.0 / filter_elem_num) ** 0.5 - return Normal(0.0, std, 0) + return Normal(0.0, std) filter_param = helper.create_parameter( attr=helper.param_attr, @@ -2244,6 +2244,11 @@ def deformable_conv( mask, 'mask', (paddle.static.Variable, type(None)), 'deformable_conv' ) + if input.ndim != 4: + raise ValueError( + f'The input should be of [N, C, H, W] format, but received {input.shape}' + ) + num_channels = input.shape[1] assert param_attr is not False, "param_attr should not be False here." 
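The new `PiecewiseDecay` checks introduced above enforce that `values` carries at least one more entry than `boundaries`, since each boundary opens a new learning-rate segment. A small illustration of the accepted shape (not part of the patch):

import paddle

# Three values cover the segments [0, 100), [100, 200) and [200, +inf).
scheduler = paddle.optimizer.lr.PiecewiseDecay(
    boundaries=[100, 200], values=[1.0, 0.5, 0.1]
)
# With this patch, mismatched lengths fail fast instead of erroring later:
# paddle.optimizer.lr.PiecewiseDecay(boundaries=[100, 200], values=[1.0, 0.5])
# -> ValueError (values must have one more element than boundaries)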
@@ -2281,7 +2286,7 @@ def deformable_conv( "filter size.".format(filter_elem_num) ) std = (2.0 / filter_elem_num) ** 0.5 - return paddle.nn.initializer.normal.NormalInitializer(0.0, std, 0) + return paddle.nn.initializer.normal.Normal(0.0, std) filter_param = helper.create_parameter( attr=helper.param_attr, @@ -2752,7 +2757,7 @@ def batch_norm( attr=helper.param_attr, shape=param_shape, dtype=dtype, - default_initializer=paddle.fluid.initializer.Constant(1.0), + default_initializer=paddle.nn.initializer.Constant(1.0), ) bias = helper.create_parameter( attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True @@ -2761,7 +2766,7 @@ def batch_norm( mean = helper.create_parameter( attr=paddle.ParamAttr( name=moving_mean_name, - initializer=paddle.fluid.initializer.Constant(0.0), + initializer=paddle.nn.initializer.Constant(0.0), trainable=False, do_model_average=do_model_average_for_mean_and_var, ), @@ -2773,7 +2778,7 @@ def batch_norm( variance = helper.create_parameter( attr=paddle.ParamAttr( name=moving_variance_name, - initializer=paddle.fluid.initializer.Constant(1.0), + initializer=paddle.nn.initializer.Constant(1.0), trainable=False, do_model_average=do_model_average_for_mean_and_var, ), diff --git a/python/paddle/static/nn/loss.py b/python/paddle/static/nn/loss.py index 20c7641e2d9deb69eb4fa77ba89bc2c290e48eea..41f32e4a63fab32d51b618efc050a06f669e62bf 100644 --- a/python/paddle/static/nn/loss.py +++ b/python/paddle/static/nn/loss.py @@ -16,12 +16,12 @@ import numpy as np from paddle.fluid.framework import static_only -from paddle.fluid.initializer import NumpyArrayInitializer # TODO: define loss functions of neural network from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layers.layer_function_generator import templatedoc from paddle.fluid.param_attr import ParamAttr +from paddle.nn.initializer import Assign from ...fluid.data_feeder import check_variable_and_dtype @@ -129,6 +129,11 @@ def nce( check_variable_and_dtype(input, 'input', ['float32', 'float64'], 'nce') check_variable_and_dtype(label, 'label', ['int64'], 'nce') + if input.ndim != 2: + raise ValueError( + f'The rank of `input` must be 2, but received {input.ndim}.' + ) + dim = input.shape[1] num_true_class = label.shape[1] w = helper.create_parameter( @@ -209,7 +214,7 @@ def nce( attr=ParamAttr(), shape=numpy_array.shape, dtype=numpy_array.dtype, - default_initializer=NumpyArrayInitializer(numpy_array), + default_initializer=Assign(numpy_array), ) ret.stop_gradient = True return ret diff --git a/python/paddle/static/nn/metric.py b/python/paddle/static/nn/metric.py index 7406525b9df0aa68c6031aa169d4d2cf462d63d6..bcb3cfc130fcd0706602b9ea98b652912f7f0e90 100644 --- a/python/paddle/static/nn/metric.py +++ b/python/paddle/static/nn/metric.py @@ -18,9 +18,9 @@ All layers just related to metric. 
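`nce()` above now seeds its constant distribution parameter with `paddle.nn.initializer.Assign` instead of the fluid `NumpyArrayInitializer`. A standalone sketch of the same pattern (illustrative only, outside the patched helper code):

import numpy as np
import paddle

dist = np.array([0.6, 0.3, 0.1], dtype='float32')
# Assign copies the numpy array into the parameter at creation time.
custom_dist = paddle.create_parameter(
    shape=list(dist.shape),
    dtype='float32',
    default_initializer=paddle.nn.initializer.Assign(dist),
)
custom_dist.stop_gradient = True
print(custom_dist.numpy())  # [0.6, 0.3, 0.1]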
from paddle import _legacy_C_ops from paddle.fluid.data_feeder import check_variable_and_dtype from paddle.fluid.framework import Variable, _non_static_mode, _varbase_creator -from paddle.fluid.initializer import Constant from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layers import tensor +from paddle.nn.initializer import ConstantInitializer __all__ = [] @@ -266,7 +266,8 @@ def auc( for var in [batch_stat_pos, batch_stat_neg, stat_pos, stat_neg]: helper.set_variable_initializer( - var, Constant(value=0.0, force_cpu=False) + var, + ConstantInitializer(value=0.0, force_cpu=False), ) # "InsTagWeight": [ins_tag_weight] diff --git a/python/paddle/static/quantization/quantization_pass.py b/python/paddle/static/quantization/quantization_pass.py index 83587563c493088558eb6802615e718064891863..c9094998dfe2455a6dc2aed2acd86e3f8492cc15 100644 --- a/python/paddle/static/quantization/quantization_pass.py +++ b/python/paddle/static/quantization/quantization_pass.py @@ -2890,6 +2890,19 @@ class AddQuantDequantPassV2: ) if in_node.persistable(): continue + + if in_node.dtype() not in [ + paddle.float64, + paddle.float32, + paddle.float16, + ]: + _logger.warning( + "Since the {} contains an input of type INT, the quantization of this layer is skipped.".format( + op_node.name() + ) + ) + break + if arg_name in dequantized_vars_map: dequant_var_node = dequantized_vars_map[arg_name] else: @@ -3137,7 +3150,7 @@ class QuantWeightPass: self._save_int_weight = save_int_weight assert self._scope is not None, "scope must not be None." assert self._place is not None, "place must not be None." - self._quantized_ops = set() + self._quantized_ops = {} def apply(self, graph): assert isinstance( @@ -3176,7 +3189,6 @@ class QuantWeightPass: quant_axis = _op.op().attr("quant_axis") bits_length = _op.op().attr("bit_length") if x_node.name() not in self._quantized_ops: - self._quantized_ops.add(x_node.name()) quantized_param_v = utils.quant_tensor( param_v.copy(), scale_v, @@ -3211,10 +3223,13 @@ class QuantWeightPass: self._scope, self._place, ) + self._quantized_ops[x_node.name()] = quant_weight_node for next_op_node in out_node.outputs: graph.update_input_link( - out_node, quant_weight_node, next_op_node + out_node, + self._quantized_ops[x_node.name()], + next_op_node, ) graph.safe_remove_nodes(_op) self._remove_unused_var_nodes(graph) @@ -3298,9 +3313,9 @@ class AddQuantDequantForInferencePass: op_node.outputs, var_name ) if out_node.dtype() not in [ - core.VarDesc.VarType.FP64, - core.VarDesc.VarType.FP32, - core.VarDesc.VarType.FP16, + paddle.float64, + paddle.float32, + paddle.float16, ]: continue if var_name in dequantized_vars_map: @@ -3319,7 +3334,10 @@ class AddQuantDequantForInferencePass: else: var_names = utils._get_op_input_var_names(op_node) for var_name in var_names: - if var_name in dequant_node_map: + if ( + var_name in dequant_node_map + and dequant_node_map[var_name] + ): in_node = graph._find_node_by_name( op_node.inputs, var_name ) @@ -3345,39 +3363,41 @@ class AddQuantDequantForInferencePass: shape=var_node.shape(), var_dtype=var_node.dtype(), ) - if not self._calibration_range_dict: - try: - scale_var_node = graph._find_node_by_name( - graph.all_persistable_nodes(), self._scale_name(var_name) + + try: + scale_var_node = graph._find_node_by_name( + graph.all_persistable_nodes(), self._scale_name(var_name) + ) + except: + if ( + self._calibration_range_dict + and var_name in self._calibration_range_dict + ): + scale_value = self._calibration_range_dict[var_name] + scale_var_node = 
graph.create_persistable_node( + name=self._scale_name(var_name), + var_type=var_node.type(), + shape=[1], + var_dtype=var_node.dtype(), ) - except: + data_type = ( + 'float64' + if var_node.dtype() == core.VarDesc.VarType.FP64 + else 'float32' + ) + _init_var_node( + scale_var_node, + np.array(scale_value, dtype=data_type), + self._scope, + self._place, + ) + else: _logger.warning( "Cannot find the target node {} in scope, so skip adding quant node.".format( var_name ) ) return None - elif var_name in self._calibration_range_dict: - scale_value = self._calibration_range_dict[var_name] - scale_var_node = graph.create_persistable_node( - name=self._scale_name(var_name), - var_type=var_node.type(), - shape=[1], - var_dtype=var_node.dtype(), - ) - data_type = ( - 'float64' - if var_node.dtype() == core.VarDesc.VarType.FP64 - else 'float32' - ) - _init_var_node( - scale_var_node, - np.array(scale_value, dtype=data_type), - self._scope, - self._place, - ) - else: - return None try: zero_point_node = graph._find_node_by_name( graph.all_persistable_nodes(), diff --git a/python/paddle/tensor/array.py b/python/paddle/tensor/array.py index 70b606c3c6fbea5a35843a7ad2b7931766c6fc89..84fc94b5eec85b91c4d97bc14677ee70473c60c2 100644 --- a/python/paddle/tensor/array.py +++ b/python/paddle/tensor/array.py @@ -14,9 +14,9 @@ # Define functions about array. +from ..common_ops_import import Variable from ..fluid.data_feeder import check_type, check_variable_and_dtype from ..framework import LayerHelper, core, in_dygraph_mode -from ..static import Variable __all__ = [] diff --git a/python/paddle/tensor/attribute.py b/python/paddle/tensor/attribute.py index 37a1aaf3c86d95785f4798cc3b587b06f887c289..c79c9553c2f0802db750d75c802877cdcbd2a2c6 100644 --- a/python/paddle/tensor/attribute.py +++ b/python/paddle/tensor/attribute.py @@ -19,10 +19,10 @@ import numpy as np import paddle from paddle import _C_ops +from ..common_ops_import import Variable from ..fluid.data_feeder import check_type, check_variable_and_dtype from ..fluid.framework import in_dygraph_mode from ..framework import LayerHelper, core -from ..static import Variable from .creation import _complex_to_real_dtype, assign __all__ = [] diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 9ad83ba74b7f5c920d96c191eda0c01106b269fc..808e4d86d60320b725eedee6c46ccfc1d991f126 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -35,7 +35,6 @@ from ..fluid.framework import ( _in_eager_without_dygraph_check, device_guard, ) -from ..fluid.initializer import Constant, Initializer from ..fluid.layers import utils from ..fluid.param_attr import ParamAttr from ..framework import ( @@ -140,7 +139,10 @@ def create_global_var( stop_gradient=True, ) helper.set_variable_initializer( - var, initializer=Constant(value=float(value), force_cpu=force_cpu) + var, + initializer=paddle.nn.initializer.ConstantInitializer( + value=float(value), force_cpu=force_cpu + ), ) return var @@ -214,7 +216,7 @@ def create_parameter( check_type( default_initializer, 'default_initializer', - (type(None), Initializer), + (type(None), paddle.nn.initializer.Initializer), 'create_parameter', ) @@ -533,6 +535,9 @@ def logspace(start, stop, num, base=10.0, dtype=None, name=None): def _to_tensor_non_static(data, dtype=None, place=None, stop_gradient=True): + if isinstance(data, np.number): # Special case for numpy scalars + data = np.array(data) + if not isinstance(data, np.ndarray): def _handle_dtype(data, dtype): @@ -627,6 +632,8 @@ 
def _to_tensor_static(data, dtype=None, stop_gradient=None): if isinstance(data, Variable) and (dtype is None or dtype == data.dtype): output = data else: + if isinstance(data, np.number): # Special case for numpy scalars + data = np.array(data) if not isinstance(data, np.ndarray): if np.isscalar(data) and not isinstance(data, str): @@ -690,6 +697,18 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): If the ``data`` is already a Tensor, copy will be performed and return a new tensor. If you only want to change stop_gradient property, please call ``Tensor.stop_gradient = stop_gradient`` directly. + .. code-block:: text + + We use the dtype conversion rules following this: + Keep dtype + np.number ───────────► paddle.Tensor + (0D-Tensor) + default_dtype + Python Number ───────────────► paddle.Tensor + (1D-Tensor) + Keep dtype + np.ndarray ───────────► paddle.Tensor + Args: data(scalar|tuple|list|ndarray|Tensor): Initial data for the tensor. Can be a scalar, list, tuple, numpy\.ndarray, paddle\.Tensor. diff --git a/python/paddle/tensor/layer_function_generator.py b/python/paddle/tensor/layer_function_generator.py index 299e41d2aea94e6428438a0113ef4e7d54119000..6d9c5fe2880579bd4f990810fae2ebff41217010 100644 --- a/python/paddle/tensor/layer_function_generator.py +++ b/python/paddle/tensor/layer_function_generator.py @@ -19,6 +19,7 @@ from io import StringIO from paddle import _C_ops, _legacy_C_ops +from ..common_ops_import import Variable from ..fluid.data_feeder import check_variable_and_dtype from ..fluid.proto import framework_pb2 from ..framework import ( @@ -28,7 +29,6 @@ from ..framework import ( core, in_dygraph_mode, ) -from ..static import Variable __all__ = [] diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 4cce1b01968a196250d9346ec4a4e173e21f6892..de8374d4ce2b4fc25715daf7af4c417cfded8dda 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -18,13 +18,13 @@ import paddle from paddle import _C_ops from paddle.common_ops_import import VarDesc +from ..common_ops_import import Variable from ..fluid.data_feeder import ( check_dtype, check_type, check_variable_and_dtype, ) from ..framework import LayerHelper, in_dygraph_mode -from ..static import Variable from .creation import full from .logic import logical_not from .manipulation import cast @@ -1921,6 +1921,7 @@ def svd(x, full_matrices=False, name=None): # U * UH == I # V * VH == I """ + if in_dygraph_mode(): return _C_ops.svd(x, full_matrices) else: @@ -2323,6 +2324,7 @@ def eig(x, name=None): # [ (16.50471283351188+0j) , (-5.5034820550763515+0j) , # (-0.21026087843552282+0j)]) """ + if in_dygraph_mode(): return _C_ops.eig(x) else: @@ -3171,13 +3173,26 @@ def lstsq(x, y, rcond=None, driver=None, name=None): else: raise RuntimeError("Only support lstsq api for CPU or CUDA device.") - if x.dtype == y.dtype and x.dtype in (paddle.float32, paddle.float64): - pass - else: + if not (x.dtype == y.dtype and x.dtype in (paddle.float32, paddle.float64)): raise ValueError( "Only support x and y have the same dtype such as 'float32' and 'float64'." ) + if x.ndim < 2: + raise ValueError( + f"The shape of x should be (*, M, N), but received ndim is [{x.ndim} < 2]" + ) + + if y.ndim < 2: + raise ValueError( + f"The shape of y should be (*, M, K), but received ndim is [{y.ndim} < 2]" + ) + + if x.shape[-2] != y.shape[-2]: + raise ValueError( + f"x with shape (*, M = {x.shape[-2]}, N) and y with shape (*, M = {y.shape[-2]}, K) should have same M." 
+ ) + if rcond is None: if x.dtype == paddle.float32: rcond = 1e-7 * max(x.shape[-2], x.shape[-1]) diff --git a/python/paddle/tensor/logic.py b/python/paddle/tensor/logic.py old mode 100755 new mode 100644 index 375f3614e5e30c827a15f25b21d9897f73805002..ad6c30e319a81b354c2e4c1b4e8ac3ce28ffc936 --- a/python/paddle/tensor/logic.py +++ b/python/paddle/tensor/logic.py @@ -16,9 +16,9 @@ import paddle +from ..common_ops_import import Variable from ..fluid.data_feeder import check_type, check_variable_and_dtype from ..fluid.framework import global_var -from ..static import Variable from .layer_function_generator import templatedoc if global_var._in_eager_mode_: diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 923e6923d6d63c66af55cb7f360411ce8bf88684..b9feee2fe1dd98513251c75ead51e2264ea49895 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -20,7 +20,7 @@ import paddle from paddle import _C_ops from paddle.utils.inplace_utils import inplace_apis_in_dygraph_only -from ..common_ops_import import fill_constant +from ..common_ops_import import Variable, fill_constant from ..fluid.data_feeder import ( check_dtype, check_type, @@ -35,7 +35,6 @@ from ..framework import ( dygraph_only, in_dygraph_mode, ) -from ..static import Variable from .creation import _complex_to_real_dtype, _real_to_complex_dtype, zeros __all__ = [] @@ -2755,14 +2754,19 @@ def unbind(input, axis=0): # x2.shape [3, 5] # x3.shape [3, 5] """ + if not isinstance(axis, (int)): + raise TypeError( + "The type of 'axis' must be int, but received %s." % (type(axis)) + ) + + if axis not in range(-input.ndim, input.ndim): + raise ValueError( + f'The axis must in range({-input.ndim}, {input.ndim}).' + ) + if in_dygraph_mode(): return _C_ops.unbind(input, axis) else: - if not isinstance(axis, (int)): - raise TypeError( - "The type of 'axis' must be int, but received %s." 
- % (type(axis)) - ) if isinstance(axis, np.generic): axis = np.asscalar(axis) input_shape = input.shape diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 81b092f4c38b4e177a47abc496c85ccae79d409c..6f797b82e1d083d02483f9529a2d0111d5db4c76 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -25,6 +25,7 @@ from paddle.common_ops_import import VarDesc, dygraph_only, dygraph_utils # TODO: define math functions from paddle.utils.inplace_utils import inplace_apis_in_dygraph_only +from ..common_ops_import import Variable from ..fluid.data_feeder import ( check_dtype, check_type, @@ -38,7 +39,6 @@ from ..framework import ( core, in_dygraph_mode, ) -from ..static import Variable from .creation import _complex_to_real_dtype from .layer_function_generator import generate_layer_fn, templatedoc from .manipulation import cast diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index 59958df236131d9ab4d1844f2fb7a97ea7ae80c0..ff48780423fd6317a1c10c7fc9f4a633c2910175 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -16,8 +16,8 @@ import paddle from paddle import _C_ops, _legacy_C_ops +from paddle.common_ops_import import Variable from paddle.fluid.framework import _current_expected_place, in_dygraph_mode -from paddle.static import Variable from ..fluid.data_feeder import ( check_dtype, diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index 3ec79b55b0bbe61daf8820520d4b07e6b10023a0..e16ac89953fb196bbe075012ac8f3317ff1eb7d6 100755 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -292,7 +292,7 @@ def index_select(x, index, axis=0, name=None): size as the length of ``index``; other dimensions have the same size as in the ``x`` tensor. Args: - x (Tensor): The input Tensor to be operated. The data of ``x`` can be one of float32, float64, int32, int64. + x (Tensor): The input Tensor to be operated. The data of ``x`` can be one of float16, float32, float64, int32, int64. index (Tensor): The 1-D Tensor containing the indices to index. The data type of ``index`` must be int32 or int64. axis (int, optional): The dimension in which we index. Default: if None, the ``axis`` is 0. name(str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. 
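The stricter `unbind()` validation added above rejects non-integer and out-of-range axes before dispatching to either the dygraph or static branch. A quick sketch of the resulting behaviour (illustrative, assuming a 2-D input):

import paddle

x = paddle.arange(6, dtype='float32').reshape([2, 3])
rows = paddle.unbind(x, axis=0)    # two tensors of shape [3]
cols = paddle.unbind(x, axis=-1)   # negative axes stay inside range(-2, 2)
# paddle.unbind(x, axis=2)   -> ValueError: axis must be in range(-2, 2)
# paddle.unbind(x, axis=1.0) -> TypeError: the type of 'axis' must be int
print(len(rows), len(cols))        # 2 3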
diff --git a/python/paddle/tensor/stat.py b/python/paddle/tensor/stat.py index cc94aee415541bd92b1c0e565e3a23d3e09a8369..f9784478393dcfaf34485ad0daa9bad6abfc45b0 100644 --- a/python/paddle/tensor/stat.py +++ b/python/paddle/tensor/stat.py @@ -18,9 +18,9 @@ import paddle from paddle import _C_ops, _legacy_C_ops from paddle.fluid.framework import in_dygraph_mode +from ..common_ops_import import Variable from ..fluid.data_feeder import check_type, check_variable_and_dtype from ..framework import LayerHelper, core -from ..static import Variable from .math import _get_reduce_axis_with_tensor from .search import where diff --git a/python/paddle/tests/test_callback_reduce_lr_on_plateau.py b/python/paddle/tests/test_callback_reduce_lr_on_plateau.py index 9e98ee484105fcc5dc21b919099287db695bc3e0..2333777a2cca61a90929965bb22be3361fa17108 100644 --- a/python/paddle/tests/test_callback_reduce_lr_on_plateau.py +++ b/python/paddle/tests/test_callback_reduce_lr_on_plateau.py @@ -88,7 +88,7 @@ class TestReduceLROnPlateau(unittest.TestCase): optim = paddle.optimizer.Adam( learning_rate=paddle.optimizer.lr.PiecewiseDecay( - [0.001, 0.0001], [5, 10] + [0.001, 0.0001], [5, 10, 10] ), parameters=net.parameters(), ) diff --git a/python/paddle/utils/profiler.py b/python/paddle/utils/profiler.py index 3ede624bfa57b654cf5131452b4b87ce53e6c8b5..6381ddc1456d137838f62e8361c746a3c874ef96 100644 --- a/python/paddle/utils/profiler.py +++ b/python/paddle/utils/profiler.py @@ -19,6 +19,7 @@ from ..fluid import core from ..fluid.profiler import cuda_profiler # noqa: F401 from ..fluid.profiler import profiler # noqa: F401 from ..fluid.profiler import reset_profiler, start_profiler, stop_profiler +from .deprecated import deprecated __all__ = [ # noqa 'Profiler', @@ -32,6 +33,12 @@ __all__ = [ # noqa ] +@deprecated( + since="2.4.2", + update_to="paddle.profiler.Profiler", + level=1, + reason="Please use new profiler tool, this profiler tool is no longer maintained.", +) class ProfilerOptions: def __init__(self, options=None): self.options = { @@ -72,6 +79,12 @@ class ProfilerOptions: _current_profiler = None +@deprecated( + since="2.4.2", + update_to="paddle.profiler.Profiler", + level=1, + reason="Please use new profiler tool, this profiler tool is no longer maintained.", +) class Profiler: def __init__(self, enabled=True, options=None): if options is not None: @@ -146,6 +159,12 @@ class Profiler: self.stop() +@deprecated( + since="2.4.2", + update_to="paddle.profiler.Profiler", + level=1, + reason="Please use new profiler tool, this profiler tool is no longer maintained.", +) def get_profiler(): global _current_profiler if _current_profiler is None: diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 0d43bd0fc54ce894aa9de9ae2c0f79bd24c6d5a0..2cd582884abf4ca5c0f45b59e49c16f51541bc09 100755 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -19,11 +19,11 @@ from paddle.tensor.math import _add_with_axis from ..fluid.data_feeder import check_type, check_variable_and_dtype from ..fluid.framework import Variable, in_dygraph_mode -from ..fluid.initializer import Normal from ..fluid.layer_helper import LayerHelper from ..fluid.layers import utils from ..framework import _current_expected_place from ..nn import BatchNorm2D, Conv2D, Layer, ReLU, Sequential +from ..nn.initializer import Normal __all__ = [ # noqa 'yolo_loss', @@ -1120,7 +1120,7 @@ class DeformConv2D(Layer): def _get_default_param_initializer(): filter_elem_num = np.prod(self._kernel_size) * self._in_channels std = (2.0 / 
filter_elem_num) ** 0.5 - return Normal(0.0, std, 0) + return Normal(0.0, std) self.weight = self.create_parameter( shape=filter_shape, diff --git a/tools/coverage/paddle_coverage_new.sh b/tools/coverage/paddle_coverage_new.sh new file mode 100644 index 0000000000000000000000000000000000000000..98de591fd154b2f6936e193284aaa23bf0fd9b36 --- /dev/null +++ b/tools/coverage/paddle_coverage_new.sh @@ -0,0 +1,287 @@ +#!/usr/bin/env bash + +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -xe + +PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../../" && pwd )" + +function lcov_init(){ + # install lcov + if [ ! -f "/root/.cache/lcov-1.14.tar.gz" ];then + wget -P /home https://paddle-ci.gz.bcebos.com/coverage/lcov-1.14.tar.gz --no-proxy --no-check-certificate || exit 101 + cp /home/lcov-1.14.tar.gz /root/.cache/lcov-1.14.tar.gz + else + cp /root/.cache/lcov-1.14.tar.gz /home/lcov-1.14.tar.gz + fi + tar -xf /home/lcov-1.14.tar.gz -C / + cd /lcov-1.14 + make install +} + +function gen_cpp_covinfo(){ + # run paddle coverage + cd /paddle/build + python3.7 ${PADDLE_ROOT}/tools/coverage/gcda_clean.py ${GIT_PR_ID} || exit 101 + lcov --capture -d ./ -o coverage.info --rc lcov_branch_coverage=0 +} + + +# full html report + +function gen_full_html_report() { + lcov --extract coverage.info \ + '/paddle/paddle/fluid/framework/*' \ + '/paddle/paddle/fluid/imperative/*' \ + '/paddle/paddle/fluid/inference/*' \ + '/paddle/paddle/fluid/memory/*' \ + '/paddle/paddle/fluid/operators/*' \ + '/paddle/paddle/fluid/recordio/*' \ + '/paddle/paddle/fluid/string/*' \ + '/paddle/paddle/fluid/eager/*' \ + '/paddle/paddle/phi/*' \ + '/paddle/paddle/utils/*' \ + -o coverage-full.tmp \ + --rc lcov_branch_coverage=0 + + mv -f coverage-full.tmp coverage-full.info + + lcov --remove coverage-full.info \ + '/paddle/paddle/fluid/framework/*_test*' \ + '/paddle/paddle/fluid/*/*test*' \ + '/paddle/paddle/fluid/*/*/*test*' \ + '/paddle/paddle/fluid/inference/tests/*' \ + '/paddle/paddle/fluid/inference/api/demo_ci/*' \ + '/paddle/paddle/fluid/eager/tests/*' \ + '/paddle/paddle/phi/tests/*' \ + -o coverage-full.tmp \ + --rc lcov_branch_coverage=0 + + mv -f coverage-full.tmp coverage-full.info +} + +function gen_full_html_report_xpu() { + lcov --extract coverage.info \ + '/paddle/paddle/fluid/operators/*xpu*' \ + '/paddle/paddle/phi/kernels/xpu/*' \ + -o coverage-full.tmp \ + --rc lcov_branch_coverage=0 + + mv -f coverage-full.tmp coverage-full.info + + lcov --remove coverage-full.info \ + '/paddle/paddle/fluid/framework/*_test*' \ + '/paddle/paddle/fluid/*/*test*' \ + '/paddle/paddle/fluid/*/*/*test*' \ + '/paddle/paddle/fluid/inference/tests/*' \ + '/paddle/paddle/fluid/inference/api/demo_ci/*' \ + -o coverage-full.tmp \ + --rc lcov_branch_coverage=0 + + mv -f coverage-full.tmp coverage-full.info +} + +function gen_full_html_report_npu() { + lcov --extract coverage.info \ + '/paddle/paddle/fluid/operators/*npu*' \ + -o coverage-full.tmp \ + --rc lcov_branch_coverage=0 
+ + mv -f coverage-full.tmp coverage-full.info + + lcov --remove coverage-full.info \ + '/paddle/paddle/fluid/framework/*_test*' \ + '/paddle/paddle/fluid/*/*test*' \ + '/paddle/paddle/fluid/*/*/*test*' \ + '/paddle/paddle/fluid/inference/tests/*' \ + '/paddle/paddle/fluid/inference/api/demo_ci/*' \ + -o coverage-full.tmp \ + --rc lcov_branch_coverage=0 + + mv -f coverage-full.tmp coverage-full.info +} + +# if [ ${WITH_XPU:-OFF} == "ON" ]; then +# gen_full_html_report_xpu || true +# elif [ ${WITH_ASCEND_CL:-OFF} == "ON" ]; then +# gen_full_html_report_npu || true +# else +# gen_full_html_report || true +# fi + +# diff html report + +function gen_diff_html_report() { + if [ "${GIT_PR_ID}" != "" ]; then + + COVERAGE_DIFF_PATTERN="`python3.7 ${PADDLE_ROOT}/tools/coverage/pull_request.py files ${GIT_PR_ID}`" + + python3.7 ${PADDLE_ROOT}/tools/coverage/pull_request.py diff ${GIT_PR_ID} > git-diff.out + fi + + lcov --extract coverage-full.info \ + ${COVERAGE_DIFF_PATTERN} \ + -o coverage-diff.info \ + --rc lcov_branch_coverage=0 + + python3.7 ${PADDLE_ROOT}/tools/coverage/coverage_diff.py coverage-diff.info git-diff.out > coverage-diff.tmp + + mv -f coverage-diff.tmp coverage-diff.info + + genhtml -o coverage-diff -t 'Diff Coverage' --no-function-coverage --no-branch-coverage coverage-diff.info +} + +# gen_diff_html_report || true + +function gen_py_covinfo(){ + # python coverage + + export COVERAGE_FILE=/paddle/build/python-coverage.data + coverage combine `$(ls python-coverage.data.*)` || NO_PYTHON_COVERAGE_DATA=1 + `$(coverage xml -i -o python-coverage.xml)` || [[ "${NO_PYTHON_COVERAGE_DATA}" == "1" ]] + sed -i 's/mnt\/paddle/paddle/g' python-coverage.xml + `$(python ${PADDLE_ROOT}/tools/coverage/python_coverage.py > python-coverage.info)` || [[ "${NO_PYTHON_COVERAGE_DATA}" == "1" ]] +} + + +# python full html report +# +function gen_python_full_html_report() { + lcov --extract python-coverage.info \ + '/paddle/python/*' \ + -o python-coverage-full.tmp \ + --rc lcov_branch_coverage=0 + + mv -f python-coverage-full.tmp python-coverage-full.info + + lcov --remove python-coverage-full.info \ + '/*/tests/*' \ + -o python-coverage-full.tmp \ + --rc lcov_branch_coverage=0 + + mv -f python-coverage-full.tmp python-coverage-full.info +} + +# gen_python_full_html_report || true + +# python diff html report + +function gen_python_diff_html_report() { + if [ "${GIT_PR_ID}" != "" ]; then + COVERAGE_DIFF_PATTERN="`python3.7 ${PADDLE_ROOT}/tools/coverage/pull_request.py files ${GIT_PR_ID}`" + + python3.7 ${PADDLE_ROOT}/tools/coverage/pull_request.py diff ${GIT_PR_ID} > python-git-diff.out + fi + + lcov --extract python-coverage-full.info \ + ${COVERAGE_DIFF_PATTERN} \ + -o python-coverage-diff.info \ + --rc lcov_branch_coverage=0 + + python3.7 ${PADDLE_ROOT}/tools/coverage/coverage_diff.py python-coverage-diff.info python-git-diff.out > python-coverage-diff.tmp + + mv -f python-coverage-diff.tmp python-coverage-diff.info + + genhtml -o python-coverage-diff \ + -t 'Python Diff Coverage' \ + --no-function-coverage \ + --no-branch-coverage \ + --ignore-errors source \ + python-coverage-diff.info +} + +# gen_python_diff_html_report || true + +# assert coverage lines + +function covinfo_combine_full(){ + if [ -f "other-coverage.info" ];then + if [ -f "infer-coverage.info" ];then + lcov -a other-coverage.info -a infer-coverage.info -o coverage.info + else + mv other-coverage.info coverage.info + fi + elif [ -f "infer-coverage.info" ];then + mv infer-coverage.info coverage.info + else + echo "Cannot found 
coverage.info" + fi + + if [ -f "other-python-coverage-full.info" ];then + if [ -f "infer-python-coverage-full.info" ];then + lcov -a other-python-coverage-full.info -a infer-python-coverage-full.info -o python-coverage-full.info + else + mv other-python-coverage-full.info python-coverage-full.info + fi + elif [ -f "infer-coverage.info" ];then + mv infer-python-coverage-full.info python-coverage-full.info + else + echo "Cannot found python coverage.info" + fi +} + +function cov_rate_judge(){ + echo "Assert CPP Diff Coverage" + python3.7 ${PADDLE_ROOT}/tools/coverage/coverage_lines.py coverage-diff.info 0.9 || COVERAGE_LINES_ASSERT=1 + + echo "Assert Python Diff Coverage" + + if [ ${WITH_XPU:-OFF} == "ON" ]; then + echo "XPU has no python coverage!" + elif [ ${WITH_ASCEND_CL:-OFF} == "ON" ]; then + echo "NPU has no python coverage!" + else + if [[ python-coverage-diff.info ]];then + python3.7 ${PADDLE_ROOT}/tools/coverage/coverage_lines.py python-coverage-diff.info 0.9 || PYTHON_COVERAGE_LINES_ASSERT=1 + fi + fi + if [ "$COVERAGE_LINES_ASSERT" = "1" ] || [ "$PYTHON_COVERAGE_LINES_ASSERT" = "1" ]; then + echo "exit 9" > /tmp/paddle_coverage.result + exit 9 + fi +} + +function print_usage() { + echo -e "\n${RED}Usage${NONE}: + ${BOLD}${SCRIPT_NAME}${NONE} [OPTION]" + + echo -e "\n${RED}Options${NONE}: + ${BLUE}gen_cov_info${NONE}: generate coverage info + ${BLUE}test${NONE}: coverage info combine + " +} + +function main () { + local CMD=$1 + lcov_init + case $CMD in + gen_cov_info) + gen_cpp_covinfo + gen_py_covinfo + ;; + combine_cov_info) + covinfo_combine_full + gen_diff_html_report + gen_python_diff_html_report + cov_rate_judge + ;; + *) + print_usage + exit 1 + ;; + esac +} + +main $@ diff --git a/tools/dockerfile/Dockerfile.centos b/tools/dockerfile/Dockerfile.centos index 1bc7727f304af5df814e784a7f2f883d49ae93dd..3452db468b3116e49c9a0c772eb5130f8d283611 100644 --- a/tools/dockerfile/Dockerfile.centos +++ b/tools/dockerfile/Dockerfile.centos @@ -50,17 +50,20 @@ RUN wget https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/re RUN LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install setuptools -U && \ LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.8.0/bin/pip3 install setuptools -U && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-3.9.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.9.0/bin/pip3 install setuptools -U + LD_LIBRARY_PATH=/opt/_internal/cpython-3.9.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.9.0/bin/pip3 install setuptools -U && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.10.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.10.0/bin/pip3 install setuptools -U RUN LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install -r /root/requirements.txt && \ LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.8.0/bin/pip3 install -r /root/requirements.txt && \ LD_LIBRARY_PATH=/opt/_internal/cpython-3.9.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.9.0/bin/pip3 install -r /root/requirements.txt && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.10.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.10.0/bin/pip3 install -r /root/requirements.txt && \ go get github.com/Masterminds/glide && \ rm -rf /root/requirements.txt RUN LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install pre-commit 
     LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.8.0/bin/pip3 install pre-commit 'ipython==5.3.0' && \
-    LD_LIBRARY_PATH=/opt/_internal/cpython-3.9.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.9.0/bin/pip3 install pre-commit 'ipython==5.3.0'
+    LD_LIBRARY_PATH=/opt/_internal/cpython-3.9.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.9.0/bin/pip3 install pre-commit 'ipython==5.3.0' && \
+    LD_LIBRARY_PATH=/opt/_internal/cpython-3.10.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.10.0/bin/pip3 install pre-commit 'ipython==5.3.0'
 
 RUN wget -O /opt/swig-2.0.12.tar.gz --no-check-certificate https://sourceforge.net/projects/swig/files/swig/swig-2.0.12/swig-2.0.12.tar.gz/download && \
     cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && cd /opt && rm swig-2.0.12.tar.gz
diff --git a/tools/dockerfile/Dockerfile.release18 b/tools/dockerfile/Dockerfile.release18
index cf343873d943a7fcb96fc0fc15a00819c4c42b78..42b24030c00a86433e403c1ce6a3e0f4a3646354 100644
--- a/tools/dockerfile/Dockerfile.release18
+++ b/tools/dockerfile/Dockerfile.release18
@@ -17,7 +17,9 @@ ENV HOME /root
 # Add bash enhancements
 COPY paddle/scripts/docker/root/ /root/
 
-RUN apt-get update && \
+RUN chmod 777 /tmp
+
+RUN apt-get update --allow-unauthenticated && \
     apt-get install -y software-properties-common && add-apt-repository ppa:deadsnakes/ppa && \
     apt-get update && \
     apt-get install -y curl wget vim git unzip unrar tar xz-utils libssl-dev bzip2 gzip \
@@ -48,7 +50,7 @@ ENV PATH=/home/cmake-3.16.0-Linux-x86_64/bin:$PATH
 
 
 RUN apt-get update && \
-    apt-get install -y python3.7 python3.7-dev && \
+    apt-get install -y python3.7 python3.7-dev python3.7-distutils && \
     mv /usr/bin/python /usr/bin/python.bak && ln -s /usr/bin/python3.7 /usr/bin/python && \
     mv /usr/bin/python3 /usr/bin/python3.bak && ln -s /usr/bin/python3.7 /usr/bin/python3
diff --git a/tools/dockerfile/build_scripts/build.sh b/tools/dockerfile/build_scripts/build.sh
index 4bbe8198556e3d4a5ea06af01fefeea137605c80..5822fa10160b71c3b55a3cb273fbcea1772db788 100644
--- a/tools/dockerfile/build_scripts/build.sh
+++ b/tools/dockerfile/build_scripts/build.sh
@@ -24,7 +24,7 @@ set -ex
 # remove others to expedite build and reduce docker image size. The original
 # manylinux docker image project builds many python versions.
 # NOTE We added back 3.5.1, since auditwheel requires python 3.3+
-CPYTHON_VERSIONS="3.9.0 3.8.0 3.7.0"
+CPYTHON_VERSIONS="3.10.0 3.9.0 3.8.0 3.7.0"
 
 # openssl version to build, with expected sha256 hash of .tar.gz
 # archive
@@ -80,11 +80,12 @@ build_cpythons $CPYTHON_VERSIONS
 PY37_BIN=/opt/python/cp37-cp37m/bin
 PY38_BIN=/opt/python/cp38-cp38m/bin
 PY39_BIN=/opt/python/cp39-cp39m/bin
+PY310_BIN=/opt/python/cp310-cp310m/bin
 # NOTE Since our custom manylinux image builds pythons with shared
 # libpython, we need to add libpython's dir to LD_LIBRARY_PATH before running
 # python.
 ORIGINAL_LD_LIBRARY_PATH="${LD_LIBRARY_PATH}"
-LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY37_BIN})/lib:$(dirname ${PY38_BIN})/lib:$(dirname ${PY39_BIN})/lib"
+LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY37_BIN})/lib:$(dirname ${PY38_BIN})/lib:$(dirname ${PY39_BIN})/lib:$(dirname ${PY310_BIN})/lib"
 
 # Our openssl doesn't know how to find the system CA trust store
 #   (https://github.com/pypa/manylinux/issues/53)
@@ -136,11 +137,13 @@ for PYTHON in /opt/python/*/bin/python; do
     # Add matching directory of libpython shared library to library lookup path
     LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname $(dirname ${PYTHON}))/lib"
 
-    # Smoke test to make sure that our Pythons work, and do indeed detect as
-    # being manylinux compatible:
-    LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname $(dirname ${PYTHON}))/lib" $PYTHON $MY_DIR/manylinux1-check.py
-    # Make sure that SSL cert checking works
-    LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname $(dirname ${PYTHON}))/lib" $PYTHON $MY_DIR/ssl-check.py
+    if [ "$(dirname $(dirname ${PYTHON}))" != "/opt/python/cp310-cp310" ]; then
+        # Smoke test to make sure that our Pythons work, and do indeed detect as
+        # being manylinux compatible:
+        LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname $(dirname ${PYTHON}))/lib" $PYTHON $MY_DIR/manylinux1-check.py
+        # Make sure that SSL cert checking works
+        LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname $(dirname ${PYTHON}))/lib" $PYTHON $MY_DIR/ssl-check.py
+    fi
 done
 
 # Restore LD_LIBRARY_PATH
diff --git a/tools/dockerfile/build_scripts/build_utils.sh b/tools/dockerfile/build_scripts/build_utils.sh
index 70071a9ccb07b8500043789e456c2c736d237bd0..76ad518ae24cce59ae7ccf39a0f2cbe261f21b90 100755
--- a/tools/dockerfile/build_scripts/build_utils.sh
+++ b/tools/dockerfile/build_scripts/build_utils.sh
@@ -92,7 +92,14 @@ function do_cpython_build {
     if [ -e ${prefix}/bin/python3.9 ]; then
         ln -s python3.9 ${prefix}/bin/python
     fi
+    if [ -e ${prefix}/bin/python3.10 ]; then
+        ln -s python3.10 ${prefix}/bin/python
+    fi
     # NOTE Make libpython shared library visible to python calls below
+    if [ -e ${prefix}/bin/python3.10 ]; then
+        LD_LIBRARY_PATH="/usr/local/ssl/lib:${prefix}/lib" ${prefix}/bin/python -m pip config set global.trusted-host mirrors.aliyun.com
+        LD_LIBRARY_PATH="/usr/local/ssl/lib:${prefix}/lib" ${prefix}/bin/python -m pip config set global.index-url http://mirrors.aliyun.com/pypi/simple/
+    fi
     LD_LIBRARY_PATH="/usr/local/ssl/lib:${prefix}/lib" ${prefix}/bin/python get-pip.py
     LD_LIBRARY_PATH="/usr/local/ssl/lib:${prefix}/lib" ${prefix}/bin/pip install wheel==0.32.2
     cd /
diff --git a/tools/dockerfile/build_scripts/install_cudnn.sh b/tools/dockerfile/build_scripts/install_cudnn.sh
index 0817634fa91afb480d73bf831c4ab3c032ac1708..2310370f223c8fe199b219f138dc581a87131d04 100644
--- a/tools/dockerfile/build_scripts/install_cudnn.sh
+++ b/tools/dockerfile/build_scripts/install_cudnn.sh
@@ -45,4 +45,12 @@ elif [[ "$1" == "cudnn821" && "$VERSION" == "11.2" ]]; then
     cp -r lib64 /usr && cd ../ && \
     rm -f cudnn-11.3-linux-x64-v8.2.1.32.tgz && \
     rm -rf cuda
+elif [[ "$1" == "cudnn841" && "$VERSION" == "11.7" ]]; then
+    wget -q https://paddle-ci.gz.bcebos.com/cudnn/cudnn-linux-x86_64-8.4.1.50_cuda11.6-archive.tar.xz --no-check-certificate
+    tar xJvf cudnn-linux-x86_64-8.4.1.50_cuda11.6-archive.tar.xz && \
+    cd cudnn-linux-x86_64-8.4.1.50_cuda11.6-archive && \
+    cp -r include /usr && \
+    cp -r lib /usr && cd ../ && \
+    rm -f cudnn-linux-x86_64-8.4.1.50_cuda11.6-archive.tar.xz && \
+    rm -rf cudnn-linux-x86_64-8.4.1.50_cuda11.6-archive
 fi
diff --git a/tools/dockerfile/build_scripts/install_nccl2.sh b/tools/dockerfile/build_scripts/install_nccl2.sh
index 6d44dbb90542fbb1eb7e19c70c7e0a7bea4b27ec..c21267807976d62f59f860df0f679f1f1e38d4b4 100644
--- a/tools/dockerfile/build_scripts/install_nccl2.sh
+++ b/tools/dockerfile/build_scripts/install_nccl2.sh
@@ -17,18 +17,18 @@ VERSION=$(nvcc --version | grep release | grep -oEi "release ([0-9]+)\.([0-9])"| sed "s/release //")
 if [ "$VERSION" == "10.0" ]; then
   DEB="nccl-repo-ubuntu1604-2.4.7-ga-cuda10.0_1-1_amd64.deb"
-elif [ "$VERSION" == "10.2" ] || [ "$VERSION" == "10.1" ] || [ "$VERSION" == "11.0" ] || [ "$VERSION" == "11.2" ] || [ "$VERSION" == "11.3" ] || [ "$VERSION" == "11.4" ] || [ "$VERSION" == "11.5" ] || [ "$VERSION" == "11.6" ]; then
+elif [ "$VERSION" == "10.2" ] || [ "$VERSION" == "10.1" ] || [ "$VERSION" == "11.0" ] || [ "$VERSION" == "11.2" ] || [ "$VERSION" == "11.3" ] || [ "$VERSION" == "11.4" ] || [ "$VERSION" == "11.5" ] || [ "$VERSION" == "11.6" ] || [ "$VERSION" == "11.7" ] || [ "$VERSION" == "11.8" ]; then
   if [ -f "/etc/redhat-release" ];then
-    rm -f /usr/local/lib/libnccl.so
-    wget --no-check-certificate -q https://nccl2-deb.cdn.bcebos.com/libnccl-2.7.8-1+cuda10.2.x86_64.rpm
-    wget --no-check-certificate -q https://nccl2-deb.cdn.bcebos.com/libnccl-devel-2.7.8-1+cuda10.2.x86_64.rpm
-    wget --no-check-certificate -q https://nccl2-deb.cdn.bcebos.com/libnccl-static-2.7.8-1+cuda10.2.x86_64.rpm
-    rpm -ivh libnccl-2.7.8-1+cuda10.2.x86_64.rpm
-    rpm -ivh libnccl-devel-2.7.8-1+cuda10.2.x86_64.rpm
-    rpm -ivh libnccl-static-2.7.8-1+cuda10.2.x86_64.rpm && rm -f libnccl-*
+    rm -f /usr/local/lib/libnccl.so
+    wget --no-check-certificate -q https://nccl2-deb.cdn.bcebos.com/libnccl-2.10.3-1+cuda11.4.x86_64.rpm
+    wget --no-check-certificate -q https://nccl2-deb.cdn.bcebos.com/libnccl-devel-2.10.3-1+cuda11.4.x86_64.rpm
+    wget --no-check-certificate -q https://nccl2-deb.cdn.bcebos.com/libnccl-static-2.10.3-1+cuda11.4.x86_64.rpm
+    rpm -Fivh libnccl-2.10.3-1+cuda11.4.x86_64.rpm
+    rpm -Fivh libnccl-devel-2.10.3-1+cuda11.4.x86_64.rpm
+    rpm -Fivh libnccl-static-2.10.3-1+cuda11.4.x86_64.rpm && rm -f libnccl-*
     exit 0
   fi
-  DEB="nccl-repo-ubuntu1604-2.7.8-ga-cuda10.2_1-1_amd64.deb"
+  DEB="nccl-repo-ubuntu1804-2.10.3-cuda11.4_1.0-1_amd64.deb"
 elif [ "$VERSION" == "9.0" ]; then
   DEB="nccl-repo-ubuntu1604-2.3.7-ga-cuda9.0_1-1_amd64.deb"
 else
diff --git a/tools/dockerfile/build_scripts/install_trt.sh b/tools/dockerfile/build_scripts/install_trt.sh
index 490b0af5289c5409e16ff077428f61200dc2b6c2..2129f92adbac30cecea00ad50a441cb1fb032848 100644
--- a/tools/dockerfile/build_scripts/install_trt.sh
+++ b/tools/dockerfile/build_scripts/install_trt.sh
@@ -36,6 +36,11 @@ elif [[ "$1" == "trt8406" ]];then
     tar -zxf TensorRT-8.4.0.6.Linux.x86_64-gnu.cuda-11.6.cudnn8.3.tar.gz -C /usr/local
     cp -rf /usr/local/TensorRT-8.4.0.6/include/* /usr/include/ && cp -rf /usr/local/TensorRT-8.4.0.6/lib/* /usr/lib/
     rm -f TensorRT-8.4.0.6.Linux.x86_64-gnu.cuda-11.6.cudnn8.3.tar.gz
+elif [[ "$1" == "trt8431" ]];then
+    wget -q https://paddle-ci.gz.bcebos.com/TRT/TensorRT-8.4.3.1.Linux.x86_64-gnu.cuda-11.6.cudnn8.4.tar.gz --no-check-certificate --no-proxy
+    tar -zxf TensorRT-8.4.3.1.Linux.x86_64-gnu.cuda-11.6.cudnn8.4.tar.gz -C /usr/local
+    cp -rf /usr/local/TensorRT-8.4.3.1/include/* /usr/include/ && cp -rf /usr/local/TensorRT-8.4.3.1/lib/* /usr/lib/
+    rm -f TensorRT-8.4.3.1.Linux.x86_64-gnu.cuda-11.6.cudnn8.4.tar.gz
 elif [[ "$VERSION" == "11.2" ]];then
     wget -q --no-proxy https://paddle-ci.gz.bcebos.com/TRT/TensorRT7-cuda11.1-cudnn8.1.tar.gz --no-check-certificate
     tar -zxf TensorRT7-cuda11.1-cudnn8.1.tar.gz -C /usr/local
diff --git a/tools/dockerfile/centos7_manylinux.sh b/tools/dockerfile/centos7_manylinux.sh
index 7b21a5104561863dd42079a22f086b81e8a9a92a..4beb8b3a592adcc45a94c63ff02acbd48737a83b 100755
--- a/tools/dockerfile/centos7_manylinux.sh
+++ b/tools/dockerfile/centos7_manylinux.sh
@@ -128,6 +128,12 @@ function make_cuda116cudnn840trt8406gcc82() {
   sed -i '/CMD/iRUN ldconfig' Dockerfile.tmp
 }
 
+function make_cuda117cudnn841() {
+  sed 's//11.7.0-devel-centos7/g' Dockerfile.centos >Dockerfile.tmp
+  sed -i "s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_gcc.sh gcc82 \nRUN mv /usr/bin/cc /usr/bin/cc.bak \&\& ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/cc \nENV PATH=/usr/local/gcc-8.2/bin:\$PATH \nRUN bash build_scripts/install_cudnn.sh cudnn841 \nENV CUDNN_VERSION=8.4.1 \nRUN bash build_scripts/build.sh#g" Dockerfile.tmp
+  sed -i '/CMD/iRUN ldconfig' Dockerfile.tmp
+}
+
 function main() {
   local CMD=$1
   case $CMD in
@@ -188,6 +194,9 @@ function main() {
     cuda116cudnn840trt8406gcc82)
       make_cuda116cudnn840trt8406gcc82
      ;;
+    cuda117cudnn841)
+      make_cuda117cudnn841
+     ;;
    *)
      echo "Make dockerfile error, Without this paramet."
      exit 1
diff --git a/tools/dockerfile/ci_dockerfile.sh b/tools/dockerfile/ci_dockerfile.sh
index 17ba5b3ee4c32418278a52e809a63868ef615531..16471f2edd8f2b0e9c5abef319f69e04646b0210 100644
--- a/tools/dockerfile/ci_dockerfile.sh
+++ b/tools/dockerfile/ci_dockerfile.sh
@@ -166,6 +166,33 @@ function make_unbuntu18_cu117_dockerfile(){
   sed -i 's# && rm /etc/apt/sources.list.d/nvidia-ml.list##g' ${dockerfile_name}
 }
 
+function make_ubuntu18_cu112_dockerfile(){
+  dockerfile_name="Dockerfile.cuda11.2_cudnn8.1_trt8.4_gcc8.2_ubuntu18"
+  sed "s##nvidia/cuda:11.2.0-cudnn8-devel-ubuntu18.04#g" ./Dockerfile.ubuntu18 >${dockerfile_name}
+  sed -i "s##ENV LD_LIBRARY_PATH=/usr/local/cuda-11.2/targets/x86_64-linux/lib:\$LD_LIBRARY_PATH #g" ${dockerfile_name}
+  sed -i "s#liblzma-dev#liblzma-dev openmpi-bin openmpi-doc libopenmpi-dev#g" ${dockerfile_name}
+  dockerfile_line=$(wc -l ${dockerfile_name}|awk '{print $1}')
+  sed -i 's#RUN bash /build_scripts/install_trt.sh#RUN bash /build_scripts/install_trt.sh trt8431#g' ${dockerfile_name}
+  sed -i "${dockerfile_line}i RUN wget --no-check-certificate -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz \&\& \
+    tar -xzf hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/" ${dockerfile_name}
+  sed -i "${dockerfile_line}i RUN apt remove git -y \&\& apt install -y libsndfile1 zstd pigz libcurl4-openssl-dev gettext zstd ninja-build \&\& wget -q https://paddle-ci.gz.bcebos.com/git-2.17.1.tar.gz \&\& \
+    tar -xvf git-2.17.1.tar.gz \&\& \
+    cd git-2.17.1 \&\& \
+    ./configure --with-openssl --with-curl --prefix=/usr/local \&\& \
+    make -j8 \&\& make install " ${dockerfile_name}
+  sed -i "${dockerfile_line}i RUN pip install wheel \&\& pip3 install PyGithub wheel \&\& pip3.7 install PyGithub distro \&\& pip3.8 install PyGithub distro" ${dockerfile_name}
+  sed -i 's###g' ${dockerfile_name}
+  sed -i "s##WORKDIR /usr/bin \\
+    COPY tools/dockerfile/build_scripts /build_scripts \\
+    RUN bash /build_scripts/install_gcc.sh gcc82 \&\& rm -rf /build_scripts \\
+    RUN cp gcc gcc.bak \&\& cp g++ g++.bak \&\& rm gcc \&\& rm g++ \\
+    RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/local/bin/gcc \\
+    RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/local/bin/g++ \\
+    RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/gcc \\
+    RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/bin/g++ \\
+    ENV PATH=/usr/local/gcc-8.2/bin:\$PATH #g" ${dockerfile_name}
+}
+
 function main() {
   make_ubuntu_dockerfile
   make_ubuntu_trt7_dockerfile
@@ -173,6 +200,7 @@ function main() {
   make_cinn_dockerfile
   make_ce_framework_dockcerfile
   make_unbuntu18_cu117_dockerfile
+  make_ubuntu18_cu112_dockerfile
 }
 
 main "$@"
diff --git a/tools/dockerfile/ubuntu18_release.sh b/tools/dockerfile/ubuntu18_release.sh
index 2c12d4b74c0731dd98ad65d261962eae36017e6c..4c4cc780ce8be52658bfa440641fe1b34374f8f1 100755
--- a/tools/dockerfile/ubuntu18_release.sh
+++ b/tools/dockerfile/ubuntu18_release.sh
@@ -80,7 +80,11 @@ function install_whl(){
 
 function set_cuda_env(){
   if [[ ${WITH_GPU} == "ON" ]]; then
-    sed -i "s##ENV LD_LIBRARY_PATH=/usr/local/cuda-${ref_CUDA_MAJOR}/targets/x86_64-linux/lib:\$LD_LIBRARY_PATH #g" Dockerfile.tmp
+    sed -i "s##ENV LD_LIBRARY_PATH=/usr/local/cuda-${ref_CUDA_MAJOR}/targets/x86_64-linux/lib:\$LD_LIBRARY_PATH \\
+\\
+RUN apt-key del 7fa2af80 \\
+RUN rm /etc/apt/sources.list.d/cuda.list \&\& rm /etc/apt/sources.list.d/nvidia-ml.list \\
+RUN apt-key adv --fetch-keys https://developer.download.nvidia.cn/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub #g" Dockerfile.tmp
   else
     sed -i 's###g' Dockerfile.tmp
   fi