diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index f7b7c73d4deed1affa69a794ea6b940d12e0a589..9cdc0e127c8d5177c819a5b4a97378e3c6eb77ab 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -195,6 +195,7 @@ function(create_dummy_static_lib TARGET_NAME)
   # the dummy target would be consisted of limit size libraries
   set(limit ${merge_LIMIT})
   list(LENGTH merge_LIBS libs_len)
+  message("libs_len ${libs_len}")
   foreach(lib ${merge_LIBS})
     list(APPEND merge_list ${lib})
     list(LENGTH merge_list listlen)
diff --git a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py
index b54f45363a00daa71b6de6ac87d9657fa0ff2c29..26b6fce08a40c6a0334f11ac9dbb91778c9aabf5 100644
--- a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py
+++ b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py
@@ -739,6 +739,14 @@ class DygraphFunctionGeneratorBase(FunctionGeneratorBase):
             self.backward_returns_list,
         ) = ParseYamlBackward(backward_args_str, backward_returns_str)
 
+        # Remove the output which is intermediate
+        if 'intermediate' in grad_api_contents:
+            backward_returns_list_new = []
+            for return_item in self.backward_returns_list:
+                if return_item[0] not in grad_api_contents['intermediate']:
+                    backward_returns_list_new.append(return_item)
+            self.backward_returns_list = backward_returns_list_new
+
     def CollectForwardInfoFromBackwardContents(self):
 
         backward_forward_str = self.backward_forward_str
@@ -1979,7 +1987,6 @@ class DygraphNodeGenerator(DygraphFunctionGeneratorBase):
                         fill_zero_str += f"{indent}egr::EagerUtils::FillZeroForEmptyGradInput(&grads[{fwd_position}], input_metas[{fwd_position}]);\n"
 
         inplace_grad_input_str = ""
-        inplaced_tensor_wrapper = False
         inplace_check_str = ""
         optional_inplace_var_name = []
         # Grad Ins from TensorWrappers
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index 3c2e8bf85a7992a9759f265dccaea4d5056889bc..06323119a7dc64a54682b201ec30d2c0cf03872b 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -105,6 +105,7 @@ pass_library(delete_fill_constant_op_pass inference)
 pass_library(constant_folding_pass inference)
 pass_library(auto_mixed_precision_pass inference)
 pass_library(conv2d_fusion_layout_transfer_pass inference)
+pass_library(silu_fuse_pass inference)
 pass_library(simplify_with_basic_ops_pass base)
 pass_library(fc_elementwise_layernorm_fuse_pass base)
 pass_library(skip_layernorm_fuse_pass base)
@@ -429,10 +430,6 @@ if(WITH_MKLDNN)
     test_conv_batch_norm_mkldnn_fuse_pass
     SRCS mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc
     DEPS ${TEST_CONV_BN_PASS_DEPS})
-  cc_test(
-    test_scale_matmul_fuse_pass
-    SRCS mkldnn/scale_matmul_fuse_pass_tester.cc
-    DEPS scale_matmul_fuse_pass)
   cc_test(
     test_mkldnn_placement_pass
     SRCS mkldnn/mkldnn_placement_pass_tester.cc
diff --git a/paddle/fluid/framework/ir/conv2d_fusion_layout_transfer_pass.cc b/paddle/fluid/framework/ir/conv2d_fusion_layout_transfer_pass.cc
index efed7dd6e637bc7e9421b3d4afb2090a1c47336c..dd4e0735600bec2a560a00faf338a944bacf702e 100644
--- a/paddle/fluid/framework/ir/conv2d_fusion_layout_transfer_pass.cc
+++ b/paddle/fluid/framework/ir/conv2d_fusion_layout_transfer_pass.cc
@@ -143,10 +143,16 @@ void Conv2dFusionLayoutTransferPass::ApplyImpl(ir::Graph *graph) const {
       static_cast<phi::DataType>(Get<int>("model_precision")) ==
           phi::DataType::FLOAT16 ||
       Get<bool>("enable_gpu_mixed");
-  bool cutlass_enable = false;
+  bool cutlass_enable = Get<bool>("use_cutlass");
 
 #ifdef PADDLE_WITH_CUTLASS
-  cutlass_enable = true;
+  const auto &prop = platform::GetDeviceProperties(Get<int>("gpu_device_id"));
+  int sm_version = prop.major * 10 + prop.minor;
+  // Now we only implement cutlass kernel on SM75.
+  if (sm_version == 75) {
+  } else {
+    cutlass_enable = false;
+  }
 #endif
 
   if (!(is_fp16_precision && cutlass_enable)) return;
@@ -184,10 +190,21 @@ void Conv2dFusionLayoutTransferPass::ApplyImpl(ir::Graph *graph) const {
     auto filter_names = op_node->Op()->Input("Filter");
     auto act_type = op_node->Op()->GetAttrIfExists<std::string>("activation");
     constexpr int CUTLASS_NHWC_ALIGNMENT = 8;
-    std::unordered_set<std::string> cutlass_act_set = {
+    // conv2d_fusion has two forms: conv + bias + act, conv + bias +
+    // elmentwise_add + act.
+    std::unordered_set<std::string> cutlass_cba_act_set = {
         "relu", "swish", "identity", "leaky_relu"};
-    if (!cutlass_act_set.count(act_type)) {
-      return false;
+    std::unordered_set<std::string> cutlass_cbaa_act_set = {"relu"};
+    bool is_residual = op_node->Op()->Input("ResidualData").size() >= 1UL;
+
+    if (is_residual) {
+      if (!cutlass_cbaa_act_set.count(act_type)) {
+        return false;
+      }
+    } else {
+      if (!cutlass_cba_act_set.count(act_type)) {
+        return false;
+      }
     }
 
     // If filter's channel is not multiple of 8, conv2d_fusion not run at nhwc.
diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass_tester.cc
index 324f707af1ec5d9e1a9a6fcc2d4704026e84df2c..021d372c2c89aa64863b8731430360e923285720 100644
--- a/paddle/fluid/framework/ir/conv_bn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass_tester.cc
@@ -32,7 +32,11 @@ void AddVarToScope(Scope* param_scope,
                    const DDim& dims) {
   auto* tensor = param_scope->Var(name)->GetMutable<phi::DenseTensor>();
   tensor->Resize(dims);
-  tensor->mutable_data<float>(platform::CPUPlace());
+  auto* data = tensor->mutable_data<float>(platform::CPUPlace());
+  int64_t numel = tensor->numel();
+  for (int64_t i = 0; i < numel; ++i) {
+    data[i] = 0;
+  }
 }
 
 Scope* CreateParamScope() {
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc
index 2f527ff1e707bb986aef0da8d721ab8920d6d048..ba18b04d9d04576532a786e940efc02b6d349fd3 100644
--- a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc
@@ -167,14 +167,19 @@ void ConvElementwiseAddActFusePass::ApplyImpl(ir::Graph* graph) const {
           phi::DataType::FLOAT16 ||
       Get<bool>("enable_gpu_mixed");
   constexpr int CUTLASS_NHWC_ALIGNMENT = 8;
-  if (is_fp16_precision) {
+  bool cutlass_enable = Get<bool>("use_cutlass");
+  if (is_fp16_precision && cutlass_enable) {
 #ifdef PADDLE_WITH_CUTLASS
-    // cutlass now support these activations
-    // cutlass_act_set.insert("swish");
-    // cutlass_act_set.insert("relu");
-    // cutlass_act_set.insert("identity");
-    // cutlass_act_set.insert("leaky_relu");
-
+    const auto& prop = platform::GetDeviceProperties(Get<int>("gpu_device_id"));
+    int sm_version = prop.major * 10 + prop.minor;
+    // Now we only implement cutlass kernel on SM75.
+    if (sm_version == 75) {
+      // Cutlass now support these cba activations.
+      cutlass_act_set.insert("swish");
+      cutlass_act_set.insert("relu");
+      cutlass_act_set.insert("identity");
+      cutlass_act_set.insert("leaky_relu");
+    }
     all_act_set.insert(cutlass_act_set.begin(), cutlass_act_set.end());
 #endif
   }
@@ -198,8 +203,8 @@ void ConvElementwiseAddActFusePass::ApplyImpl(ir::Graph* graph) const {
     auto* filter_var = scope->FindLocalVar(conv_filter->Name());
     auto* filter_tensor = filter_var->GetMutable<phi::DenseTensor>();
     CHECK_EQ(filter_tensor->dims().size() == 4UL, true);
-    // when this conv2d_fusion problem size is not supported by cutlass and not
-    // supported by cuDNN, we should not apply this pass
+    // When this conv2d_fusion problem size is not supported by cutlass and not
+    // supported by cuDNN, we should not apply this pass.
     int oc = filter_tensor->dims()[0];
     int ic = filter_tensor->dims()[1];
     bool cutlass_can_fuse = oc % CUTLASS_NHWC_ALIGNMENT == 0 &&
diff --git a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass_tester.cc
deleted file mode 100644
index ed6e63615f7c35f22264aea00ca7dc7d80b97abf..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass_tester.cc
+++ /dev/null
@@ -1,117 +0,0 @@
-// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <gtest/gtest.h>
-
-#include "paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.h"
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void SetOp(ProgramDesc* prog,
-           const std::string& type,
-           const std::vector<std::string>& inputs,
-           const std::vector<std::string>& outputs,
-           float scale = 1.0f,
-           float bias = 0.0f) {
-  auto* op = prog->MutableBlock(0)->AppendOp();
-
-  op->SetType(type);
-  if (type == "scale") {
-    op->SetInput("X", {inputs[0]});
-    op->SetAttr("scale", scale);
-    op->SetAttr("bias", bias);
-  } else if (type == "matmul") {
-    op->SetAttr("transpose_X", false);
-    op->SetAttr("transpose_Y", false);
-    op->SetInput("X", {inputs[0]});
-    if (inputs.size() > 1) op->SetInput("Y", {inputs[1]});
-    op->SetAttr("alpha", scale);
-  } else {
-    FAIL() << "Unexpected operator type.";
-  }
-  op->SetOutput("Out", {outputs[0]});
-}
-
-// a->scale->b
-// (b,c)->matmul->d
-ProgramDesc BuildProgramDesc(float scale, float bias, float alpha) {
-  ProgramDesc prog;
-
-  for (auto& v : std::vector<std::string>({"a", "b", "c", "d"})) {
-    prog.MutableBlock(0)->Var(v);
-  }
-  SetOp(&prog, "scale", {"a"}, {"b"}, scale, bias);
-  SetOp(&prog, "matmul", {"b", "c"}, {"d"}, alpha);
-  return prog;
-}
-
-void MainTest(const ProgramDesc& prog,
-              int removed_nodes_count,
-              const std::vector<std::string> scale_in_out,
-              const std::vector<std::string> matmul_in_out,
-              float alpha) {
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
-  int original_nodes_num = graph->Nodes().size();
-  auto pass = PassRegistry::Instance().Get("scale_matmul_fuse_pass");
-  graph.reset(pass->Apply(graph.release()));
-  int current_nodes_num = graph->Nodes().size();
-
-  for (auto* node : graph->Nodes()) {
-    if (node->IsOp()) {
-      auto* op = node->Op();
-      if (op->Type() == "scale") {
-        EXPECT_EQ(op->Input("X")[0], scale_in_out[0]);
-        EXPECT_EQ(op->Output("Out")[0], scale_in_out[1]);
-      } else if (op->Type() == "matmul") {
-        EXPECT_EQ(op->Input("X")[0], matmul_in_out[0]);
-        EXPECT_EQ(op->Input("Y")[0], matmul_in_out[1]);
-        EXPECT_EQ(op->Output("Out")[0], matmul_in_out[2]);
-        EXPECT_EQ(op->GetAttrIfExists<float>("alpha"), alpha);
-      }
-    }
-  }
-  EXPECT_EQ(original_nodes_num - removed_nodes_count, current_nodes_num);
-}
-
-TEST(ScaleMatmulFusePass, scale_matmul_with_no_bias) {
-  auto bias = 0.0f;
-  auto scale = 2.34f;
-  auto alpha = 3.45f;
-  int removed_nodes_count = 2;
-  MainTest(BuildProgramDesc(scale, bias, alpha),
-           removed_nodes_count,
-           {},
-           {"a", "c", "d"},
-           scale * alpha);
-}
-
-TEST(ScaleMatmulFusePass, scale_matmul_with_bias) {
-  auto bias = 1.0f;
-  auto scale = 2.34f;
-  auto alpha = 3.45f;
-  int removed_nodes_count = 0;
-  MainTest(BuildProgramDesc(scale, bias, alpha),
-           removed_nodes_count,
-           {"a", "b"},
-           {"b", "c", "d"},
-           alpha);
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
-
-USE_PASS(scale_matmul_fuse_pass);
diff --git a/paddle/fluid/framework/ir/silu_fuse_pass.cc b/paddle/fluid/framework/ir/silu_fuse_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..05817968b45c6fccdf0733ee67d7fa881c7ee99c
--- /dev/null
+++ b/paddle/fluid/framework/ir/silu_fuse_pass.cc
@@ -0,0 +1,85 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/silu_fuse_pass.h"
+#include <string>
+#include "paddle/fluid/framework/op_version_registry.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void SiluFusePass::ApplyImpl(ir::Graph* graph) const {
+  // This pass is used for cutlass, because cutlass can fuse conv + bias + silu
+  bool cutlass_enable = Get<bool>("use_cutlass");
+  if (!cutlass_enable) {
+    return;
+  }
+
+  const std::string pattern_name = "silu_fuse";
+  FusePassBase::Init(pattern_name, graph);
+
+  GraphPatternDetector gpd;
+
+  auto* sigmoid_in = gpd.mutable_pattern()->NewNode("sigmoid_in");
+  auto sigmoid_op =
+      gpd.mutable_pattern()->NewNode("sigmoid_op")->assert_is_op("sigmoid");
+  auto sigmoid_out = gpd.mutable_pattern()
+                         ->NewNode("sigmoid_out")
+                         ->assert_is_op_output("sigmoid")
+                         ->AsIntermediate();
+  auto elementwise_mul_op = gpd.mutable_pattern()
+                                ->NewNode("elementwise_mul_op")
+                                ->assert_is_op("elementwise_mul");
+
+  auto elementwise_mul_out = gpd.mutable_pattern()
+                                 ->NewNode("elementwise_mul_out")
+                                 ->assert_is_op_output("elementwise_mul")
+                                 ->AsOutput();
+
+  sigmoid_op->LinksFrom({sigmoid_in}).LinksTo({sigmoid_out});
+  elementwise_mul_op->LinksFrom({sigmoid_in, sigmoid_out})
+      .LinksTo({elementwise_mul_out});
+
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    Node* sigmoid_in_node = subgraph.at(sigmoid_in);
+    Node* sigmoid_op_node = subgraph.at(sigmoid_op);
+    Node* elementwise_mul_op_node = subgraph.at(elementwise_mul_op);
+    Node* elementwise_mul_out_node = subgraph.at(elementwise_mul_out);
+
+    OpDesc new_desc;
+    new_desc.SetType("swish");
+    new_desc.SetAttr("beta", 1.f);
+    new_desc.SetInput("X", {sigmoid_in_node->Name()});
+    new_desc.SetOutput("Out", {elementwise_mul_out_node->Name()});
+    new_desc.Flush();
+
+    std::unordered_set<const Node*> del_node_set;
+    del_node_set.insert(sigmoid_op_node);
+    del_node_set.insert(elementwise_mul_op_node);
+    GraphSafeRemoveNodes(graph, del_node_set);
+
+    auto fused_node = graph->CreateOpNode(&new_desc);
+    IR_NODE_LINK_TO(sigmoid_in_node, fused_node);
+    IR_NODE_LINK_TO(fused_node, elementwise_mul_out_node);
+  };
+  gpd(graph, handler);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(silu_fuse_pass, paddle::framework::ir::SiluFusePass);
diff --git a/paddle/fluid/framework/ir/silu_fuse_pass.h b/paddle/fluid/framework/ir/silu_fuse_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..6098c6c9b0bcebb49ca92cfdbe3bd62f50653f34
--- /dev/null
+++ b/paddle/fluid/framework/ir/silu_fuse_pass.h
@@ -0,0 +1,35 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class Graph;
+
+class SiluFusePass : public FusePassBase {
+ public:
+  virtual ~SiluFusePass() {}
+
+ protected:
+  void ApplyImpl(ir::Graph* graph) const override;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 991476dff55b9a4c7e8ff3c1093e18e0b3380fc1..dcb822afb4cadbfeac111e53cbd3cecedf05c77d 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1603,11 +1603,11 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   }
 #endif
 
-  auto exe_ctx = ExecutionContext(*this, scope, *dev_ctx, *runtime_ctx);
   // using cache
   if (kernel_type_.get()) {
     dev_ctx = pool.Get(kernel_type_->place_);
   }
+  auto exe_ctx = ExecutionContext(*this, scope, *dev_ctx, *runtime_ctx);
 
 // TODO(Liu-xiandong): Now we are using too much if-else and hard code in XPU
 // device, it's ugly, and we will refactor in the future.
@@ -2716,22 +2716,7 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
       static_cast<proto::VarType::Type>(-1);
   proto::VarType::Type data_type = dafault_data_type;
 
-  auto in_name_list = ctx.InNameList();
-  if (Info().HasOpProtoAndChecker()) {
-    for (auto& attr : Info().Proto().attrs()) {
-      auto it =
-          std::find_if(in_name_list.begin(),
-                       in_name_list.end(),
-                       [&attr](const std::string* name) {
-                         return attr.support_tensor() && *name == attr.name();
-                       });
-      if (it != in_name_list.end()) {
-        in_name_list.erase(it);
-      }
-    }
-  }
-
-  for (auto* name : in_name_list) {
+  for (auto* name : ctx.InNameList()) {
     if (ctx.InputSize(*name) == 1UL) {
       ParseInputDataType(ctx.InputVar(*name), *name, &data_type);
     } else {
diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index 002eb29b776ea083534f4db85c7ad8e2813356cd..f8a4df0617190c539dbf863e92e2bbe331dbdd43 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -202,6 +202,7 @@ struct Argument {
 
   // Passed from config.
   DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool);
+  DECL_ARGUMENT_FIELD(use_cutlass, UseCutlass, bool);
   DECL_ARGUMENT_FIELD(use_fc_padding, UseFcPadding, bool);
   DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int);
 
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index c184d94ba7fdf7b9890313a4f8c068c9066483fd..ed82dfbaa04e7d95b3306c9e4fe37ead37bba1f4 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -52,6 +52,7 @@ void IRPassManager::CreatePasses(Argument *argument,
   for (const std::string &pass_name : passes) {
     auto pass = framework::ir::PassRegistry::Instance().Get(pass_name);
     pass->Set("use_varseqlen", new bool(argument->tensorrt_use_varseqlen()));
+    pass->Set("use_cutlass", new bool(argument->use_cutlass()));
     pass->Set("with_interleaved",
               new bool(argument->tensorrt_with_interleaved()));
     pass->Set("tensorrt_transformer_posid",
@@ -80,6 +81,10 @@ void IRPassManager::CreatePasses(Argument *argument,
     pass->Set("optim_shape_tensor",
               new std::map<std::string, std::vector<int>>());
 
+    // This gpu_device_id is used by some fp16 precision passes, so move it
+    // here.
+    pass->Set("gpu_device_id", new int(argument->gpu_device_id()));
+
     // tuned trt dynamic_shape
     pass->Set("trt_tuned_dynamic_shape",
               new bool(argument->tensorrt_tuned_dynamic_shape()));
@@ -198,7 +203,6 @@ void IRPassManager::CreatePasses(Argument *argument,
             "model_opt_cache_dir",
             new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir)));
       }
-      pass->Set("gpu_device_id", new int(argument->gpu_device_id()));
       pass->Set("use_static_engine", new bool(use_static_engine));
       pass->Set("model_from_memory", new bool(argument->model_from_memory()));
       pass->Set("use_inspector", new bool(argument->tensorrt_use_inspector()));
diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
old mode 100644
new mode 100755
index 2ff82986e945caf3ecd0ee91bac02c9a9ad48272..40a8c5ce66a2a5b7c5f54784abdcbdc2c9e3e531
--- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
@@ -222,6 +222,51 @@ void MakeSimpleReusePlan(
   }
 }
 
+// Remove the inplace operation from the plan because it does not support memory
+// reuse
+void DelInplaceOpFromPlan(
+    Graph* graph,
+    std::unordered_map<std::string, std::string>* node2cluster,
+    int sort_kind) {
+  auto topo_nodes = TopologyVarientSort(
+      *graph, static_cast<framework::ir::SortKind>(sort_kind));
+  for (auto* op_node : topo_nodes) {
+    if (!op_node->IsOp()) continue;
+    auto input_tensors = op_node->inputs;
+    auto output_tensors = op_node->outputs;
+
+    std::unordered_set<std::string> in_names;
+    for (const Node* node : input_tensors) {
+      if (!node->Var()) continue;
+      if (node->Var()->Persistable()) continue;
+      std::string var = node->Name();
+      in_names.insert(var);
+    }
+
+    for (const Node* node : output_tensors) {
+      if (!node->Var()) continue;
+      if (node->Var()->Persistable()) continue;
+      std::string var = node->Name();
+      if (in_names.find(var) != in_names.end()) {
+        // delete key
+        if (node2cluster->count(var)) {
+          node2cluster->erase(var);
+        }
+        // delete value
+        std::string tmp_name = "";
+        for (auto it = node2cluster->begin(); it != node2cluster->end(); ++it) {
+          if (it->second == var) {
+            if (tmp_name == "") {
+              tmp_name = it->first;
+            }
+            it->second = tmp_name;
+          }
+        }
+      }
+    }
+  }
+}
+
 // NOTE The optimized opdesc doesn't match ir::Graph.
 void UpdateOpDescsByReuse(
     Graph* graph,
@@ -324,6 +369,7 @@ void MemoryOptimizePass::RunImpl(Argument* argument) {
   CollectLifeCycle(graph, &lifecycles, sort_kind);
   CollectVarMemorySize(graph, &space_table);
   MakeSimpleReusePlan(lifecycles, space_table, &node2cluster, &cluster_size);
+  DelInplaceOpFromPlan(graph, &node2cluster, sort_kind);
 
   auto* pass_res_info = PassResultInfoForRuntime::Instance();
   pass_res_info->Set(
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 246cfc44e81dc4b7e2d556cceb7901f07bffb0b6..5d71c7cee1d4356b3475eca7a5187175b3b16165 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -115,6 +115,17 @@ void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
   Update();
 }
 
+void AnalysisConfig::Exp_EnableUseCutlass() {
+#if defined(PADDLE_WITH_CUTLASS)
+  use_cutlass_ = true;
+#else
+  LOG(ERROR) << "Please compile with cutlass to EnableUseCutlass()";
+  use_cutlass_ = false;
+#endif
+
+  Update();
+}
+
 void AnalysisConfig::SetExecStream(void *stream) {
   PADDLE_ENFORCE_NOT_NULL(
       stream,
@@ -389,6 +400,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(use_fc_padding_);
   // GPU related.
   CP_MEMBER(use_gpu_);
+  CP_MEMBER(use_cutlass_);
   CP_MEMBER(use_external_stream_);
   CP_MEMBER(exec_stream_);
   CP_MEMBER(use_cudnn_);
@@ -1249,6 +1261,7 @@ std::string AnalysisConfig::Summary() {
   // gpu info
   os.InsertRow({"use_gpu", use_gpu_ ? "true" : "false"});
   if (use_gpu_) {
+    os.InsertRow({"use_cutlass", use_cutlass_ ? "true" : "false"});
     os.InsertRow({"gpu_device_id", std::to_string(gpu_device_id_)});
     os.InsertRow({"enable_gpu_mixed", std::to_string(enable_gpu_mixed_)});
     os.InsertRow({"memory_pool_init_size",
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 2fe3dbe13e71afa1ca4ead1495490507cc4a16bc..0fb11279ebdf9cb78b316acfcaa2e08d73048b6b 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -1088,6 +1088,7 @@ void AnalysisPredictor::PrepareArgument() {
   // Init std::unique_ptr argument_.
   argument_.reset(new Argument);
   argument_->SetUseGPU(config_.use_gpu());
+  argument_->SetUseCutlass(config_.use_cutlass_);
   argument_->SetUseFcPadding(config_.use_fc_padding());
   argument_->SetGPUDeviceId(config_.gpu_device_id());
   argument_->SetEnableIrOptim(config_.enable_ir_optim_);
@@ -2396,6 +2397,7 @@ USE_TRT_CONVERTER(cast)
 USE_TRT_CONVERTER(recover_padding)
 USE_TRT_CONVERTER(remove_padding)
 USE_TRT_CONVERTER(equal);
+USE_TRT_CONVERTER(not_equal);
 USE_TRT_CONVERTER(top_k)
 USE_TRT_CONVERTER(top_k_v2)
 USE_TRT_CONVERTER(range)
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index 41eea1fb98c319b4a70e2a961194df55fee4f35d..0adeaf356de0ac2a131de1e8845a2e6d66a0b44b 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -395,6 +395,12 @@ struct PD_INFER_DECL AnalysisConfig {
   ///
   bool use_gpu() const { return use_gpu_; }
   ///
+  /// \brief When running the fp16 model on Nvidia GPU, you can also try running
+  /// your model on cutlass.
+  ///
+  void Exp_EnableUseCutlass();
+  ///
+  ///
   /// \brief A boolean state telling whether the XPU is turned on.
   ///
   /// \return bool Whether the XPU is turned on.
@@ -1047,6 +1053,7 @@ struct PD_INFER_DECL AnalysisConfig {
 
   // GPU related.
   bool use_gpu_{false};
+  bool use_cutlass_{false};
   int gpu_device_id_{0};
   uint64_t memory_pool_init_size_mb_{100};  // initial size is 100MB.
   bool enable_gpu_mixed_{false};
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index 46eca6df552c6fd7705a2c1e8a70d75a28c6d8e7..b4018d883a028d11b116e8d33d9d846eafff807e 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -164,6 +164,7 @@ const std::vector<std::string> kLiteSubgraphPasses({
 const std::vector<std::string> kGpuLowerPrecisionPasses{
     "identity_scale_op_clean_pass",
     "simplify_with_basic_ops_pass",
+    "silu_fuse_pass",
     "delete_quant_dequant_linear_op_pass",
     "delete_weight_dequant_linear_op_pass",
     "map_depthwise_conv_to_conv_pass",
@@ -172,6 +173,7 @@ const std::vector<std::string> kGpuLowerPrecisionPasses{
     "conv_elementwise_add_act_fuse_pass",
     "conv_elementwise_add2_act_fuse_pass",
     "conv_elementwise_add_fuse_pass",
+    "conv2d_fusion_layout_transfer_pass",
     "multihead_matmul_fuse_pass_v2",
     "fused_multi_transformer_encoder_pass",
     "fused_multi_transformer_decoder_pass",
@@ -216,6 +218,7 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
         "delete_weight_dequant_linear_op_pass",                         //
         "map_depthwise_conv_to_conv_pass",                              //
         "constant_folding_pass",                                        //
+        "silu_fuse_pass",                                               //
         "conv_bn_fuse_pass",                                            //
         "conv_eltwiseadd_bn_fuse_pass",                                 //
         "embedding_eltwise_layernorm_fuse_pass",                        //
@@ -250,7 +253,8 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
 #endif                                         //
         "transpose_flatten_concat_fuse_pass",  //
         "constant_folding_pass",               //
-        "auto_mixed_precision_pass",           //
+        "conv2d_fusion_layout_transfer_pass",  //
+        "auto_mixed_precision_pass"
   });
 
   use_gpu_ = true;
diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
index b8f9b22fc7b2b1d184f31c13cf9e752443b0510e..314e5390bde8272c3ba585c06913230baef0a3cc 100644
--- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
@@ -142,7 +142,8 @@ void ConvertConv2d(TensorRTEngine* engine,
       layer,
       platform::errors::Fatal("TensorRT create conv2d/conv2d_transpose"
                               " layer failed."));
-  layer->setStride(nv_strides);
+  layer->setStrideNd(nv_strides);
+
   layer->setPrePadding(nv_pre_paddings);
   if (output_padding.size() > 0) {
     nv_post_paddings.d[0] -= output_padding[0];
@@ -189,7 +190,7 @@ class Conv2dOpConverter : public OpConverter {
             TensorRTEngine::Weight& weight,
             TensorRTEngine::Weight& bias) -> nvinfer1::IConvolutionLayer* {
           auto* layer = TRT_ENGINE_ADD_LAYER(engine_,
-                                             Convolution,
+                                             ConvolutionNd,
                                              *inputs,
                                              n_output,
                                              ksize,
diff --git a/paddle/fluid/inference/tensorrt/convert/equal_op.cc b/paddle/fluid/inference/tensorrt/convert/equal_op.cc
index 3a9627dc99a5c36800d2dea7aff152630b0d8706..d1b4b1c08c81b5100fe74471e61919bf74ab343e 100644
--- a/paddle/fluid/inference/tensorrt/convert/equal_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/equal_op.cc
@@ -35,7 +35,6 @@ class EqualOpConverter : public OpConverter {
   void operator()(const framework::proto::OpDesc& op,
                   const framework::Scope& scope,
                   bool test_mode) override {
-#if IS_TRT_VERSION_GE(8000)
     framework::OpDesc op_desc(op, nullptr);
     nvinfer1::ILayer* layer = nullptr;
 
@@ -79,11 +78,62 @@ class EqualOpConverter : public OpConverter {
     layer = TRT_ENGINE_ADD_LAYER(
         engine_, ElementWise, *X, *Y, nvinfer1::ElementWiseOperation::kEQUAL);
     RreplenishLayerAndOutput(layer, "equal", {output_name}, test_mode);
-#else
-    PADDLE_THROW(
-        platform::errors::Fatal("ElementWise Equal Operation is only supported "
-                                "on TRT 8 or higher version."));
-#endif
+  }
+};
+
+class NotEqualOpConverter : public OpConverter {
+ public:
+  NotEqualOpConverter() {}
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope,
+                  bool test_mode) override {
+    framework::OpDesc op_desc(op, nullptr);
+    nvinfer1::ILayer* layer = nullptr;
+
+    auto* X = engine_->GetITensor(op_desc.Input("X").front());
+    auto* Y = engine_->GetITensor(op_desc.Input("Y").front());
+    nvinfer1::Dims dims_x = X->getDimensions();
+    nvinfer1::Dims dims_y = Y->getDimensions();
+
+    int axis = PADDLE_GET_CONST(int, op_desc.GetAttr("axis"));
+    if (axis < 0) {
+      axis = std::abs(dims_x.nbDims - dims_y.nbDims);
+    }
+    auto output_name = op_desc.Output("Out")[0];
+    nvinfer1::IShuffleLayer* expand_layer = nullptr;
+    if (dims_x.nbDims > dims_y.nbDims) {
+      nvinfer1::Dims expand_shape;
+      expand_shape.nbDims = dims_x.nbDims;
+      for (int i = 0; i < expand_shape.nbDims; i++) {
+        expand_shape.d[i] = 1;
+      }
+      for (int i = 0; i < dims_y.nbDims; i++) {
+        expand_shape.d[i + axis] = dims_y.d[i];
+      }
+      expand_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *Y);
+      expand_layer->setReshapeDimensions(expand_shape);
+      Y = expand_layer->getOutput(0);
+    } else if (dims_x.nbDims < dims_y.nbDims) {
+      nvinfer1::Dims expand_shape;
+      expand_shape.nbDims = dims_y.nbDims;
+      for (int i = 0; i < expand_shape.nbDims; i++) {
+        expand_shape.d[i] = 1;
+      }
+      for (int i = 0; i < dims_x.nbDims; i++) {
+        expand_shape.d[i + axis] = dims_x.d[i];
+      }
+      expand_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X);
+      expand_layer->setReshapeDimensions(expand_shape);
+      X = expand_layer->getOutput(0);
+    }
+
+    layer = TRT_ENGINE_ADD_LAYER(
+        engine_, ElementWise, *X, *Y, nvinfer1::ElementWiseOperation::kEQUAL);
+
+    layer = TRT_ENGINE_ADD_LAYER(
+        engine_, Unary, *layer->getOutput(0), nvinfer1::UnaryOperation::kNOT);
+
+    RreplenishLayerAndOutput(layer, "not_equal", {output_name}, test_mode);
   }
 };
 
@@ -92,3 +142,4 @@ class EqualOpConverter : public OpConverter {
 }  // namespace paddle
 
 REGISTER_TRT_OP_CONVERTER(equal, EqualOpConverter);
+REGISTER_TRT_OP_CONVERTER(not_equal, NotEqualOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc
index e5e344e16cbb34379945d3e45fff64deda3800b8..66bfe56f355d9026bf6f648a376da06f147e6a45 100644
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -119,24 +119,21 @@ struct SimpleOpTypeSetTeller : public Teller {
 #endif
     }
 
-    // In static shape mode in TRT, we can't allow that op's input is a
-    // 1D-tensor So we filter it here. Some op like elementwise having "Y" too,
-    // but that is dealt with in the specified op, here just the common case
+    // In static shape in Paddle-TRT, we can't allow that one op has a
+    // 1D intermediate tensor as input.
     if (!with_dynamic_shape) {
-      std::string X_name;
       auto inputs = desc.Inputs();
-      if (inputs.count("X") && !desc.Input("X").empty()) {
-        X_name = desc.Input("X")[0];
-      } else if (inputs.count("Input") && !desc.Input("Input").empty()) {
-        X_name = desc.Input("Input")[0];
-      }
-      auto* block = desc.Block();
-      if (block) {
-        auto* x_var_desc = block->FindVar(X_name);
-        // Can't get feed op's TensorDesc
-        if (op_type != "feed" && x_var_desc && !x_var_desc->Persistable()) {
-          const auto x_shape = x_var_desc->GetShape();
-          if (x_shape.size() == 1) return false;
+      for (auto iter : inputs) {
+        for (auto var_name : iter.second) {
+          auto* block = desc.Block();
+          if (block) {
+            auto* var_desc = block->FindVar(var_name);
+            // Can't get feed op's TensorDesc
+            if (op_type != "feed" && var_desc && !var_desc->Persistable()) {
+              const auto shape = var_desc->GetShape();
+              if (shape.size() == 1) return false;
+            }
+          }
         }
       }
     }
@@ -2341,7 +2338,7 @@ struct SimpleOpTypeSetTeller : public Teller {
     }
 #endif
 
-    if (op_type == "equal") {
+    if (op_type == "equal" || op_type == "not_equal") {
 #if !IS_TRT_VERSION_GE(8000)
       VLOG(3) << "compare is not supported when TensorRT < 8.0";
       return false;
@@ -2493,6 +2490,7 @@ struct SimpleOpTypeSetTeller : public Teller {
       "elementwise_max",
       "elementwise_floordiv",
       "equal",
+      "not_equal",
       "less_than",
       "greater_than",
       "logical_or",
@@ -2639,6 +2637,7 @@ struct SimpleOpTypeSetTeller : public Teller {
       "elementwise_max",
       "elementwise_floordiv",
       "equal",
+      "not_equal",
       "less_than",
       "greater_than",
       "logical_or",
diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cc b/paddle/fluid/operators/fused/conv_fusion_op.cc
index 27440c9408baac4a2d999cf8785a395585d16047..022c21a205dd4ace957c330afbb17fc6378d278f 100644
--- a/paddle/fluid/operators/fused/conv_fusion_op.cc
+++ b/paddle/fluid/operators/fused/conv_fusion_op.cc
@@ -330,3 +330,13 @@ REGISTER_OPERATOR(
     ops::ConvOpInferVarType,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
+
+// This op is used by cutlass, conv2d_fusion_cutlass is a intermediate op
+// produced by conv2d_fusion_layout_transfer_pass.
+REGISTER_OPERATOR(
+    conv2d_fusion_cutlass,
+    ops::Conv2DFusionOp,
+    ops::Conv2DFusionOpMaker,
+    ops::ConvOpInferVarType,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
diff --git a/paddle/fluid/operators/generator/generate_op.py b/paddle/fluid/operators/generator/generate_op.py
index d7121a2aeb567ab31e3ba17e21460f8be0af1f48..a0f4cfacfde38f25b5aa30f4482206095c8e236e 100644
--- a/paddle/fluid/operators/generator/generate_op.py
+++ b/paddle/fluid/operators/generator/generate_op.py
@@ -131,9 +131,10 @@ def process_int_array(op_item, int_array_configs):
                 )
                 if attr_item['is_support_tensor']:
                     attr_item['typename'] = (
-                        data_type_map[int_array_config['data_type']]
+                        'int[]'
                         if 'data_type' in int_array_config
-                        else 'std::vector<int64_t>'
+                        and int_array_config['data_type'] == 'int'
+                        else 'int64_t[]'
                     )
                 else:
                     attr_item['data_type'] = (
@@ -153,21 +154,95 @@ def process_int_array(op_item, int_array_configs):
 
 
 # replace name of op and params for OpMaker
-def replace_compat_name(op_op_map, forward_op_dict, backward_op_dict):
-    def get_op_and_op_name(op_item):
+def replace_compat_name(op_fluid_map_list, forward_op_dict, backward_op_dict):
+    def get_phi_and_fluid_op_name(op_item):
         names = op_item.split('(')
         if len(names) == 1:
             return names[0].strip(), names[0].strip()
         else:
             return names[0].strip(), names[1].split(')')[0].strip()
 
-    def update_op_attr_name(attrs, attrs_alias_map):
-        for attr_item in attrs:
-            if attr_item['name'] in attrs_alias_map:
-                attr_item['name'] = attrs_alias_map[attr_item['name']]
+    def update_op_param_name(op_args, args_alias_map):
+        for item in op_args:
+            if item['name'] in args_alias_map:
+                item['name'] = args_alias_map[item['name']]
+
+    def update_grad_args_name(op_args, args_alias_map):
+        for item in op_args:
+            if (
+                item['name'].endswith('_grad')
+                and item['name'][:-5] in args_alias_map
+            ):
+                args_alias_map[item['name']] = (
+                    args_alias_map[item['name'][:-5]] + '_grad'
+                )
+                item['name'] = args_alias_map[item['name'][:-5]] + '_grad'
+
+    def get_param_list_alias(param_list, args_map):
+        return [
+            args_map[param] if param in args_map else param
+            for param in param_list
+        ]
 
-    for op_args in op_op_map:
-        new_op_name, op_name = get_op_and_op_name(op_args['op'])
+    def update_common_params_name(
+        op_item, args_name_map, scalar_configs, int_array_configs
+    ):
+        if 'inplace' in op_item and op_item['inplace']:
+            inplace_map = {}
+            for key, val in op_item['inplace'].items():
+                if key in args_map:
+                    key = args_map[key]
+                if val in args_map:
+                    val = args_map[val]
+                inplace_map[key] = val
+            op_item['inplace'] = inplace_map
+        if 'no_need_buffer' in op_item and op_item['no_need_buffer']:
+            op_item['no_need_buffer'] = get_param_list_alias(
+                op_item['no_need_buffer'], args_map
+            )
+
+        process_scalar(op_item, scalar_configs)
+        process_int_array(op_item, int_array_configs)
+
+        if 'invoke' in op_item:
+            op_item['invoke']['args'] = [
+                args_map[param.strip()]
+                if param.strip() in args_map
+                else param.strip()
+                for param in op_item['invoke']['args'].split(',')
+            ]
+            return
+        op_item['infer_meta']['param'] = get_param_list_alias(
+            op_item['infer_meta']['param'], args_name_map
+        )
+        op_item['kernel']['param'] = get_param_list_alias(
+            op_item['kernel']['param'], args_name_map
+        )
+        if op_item['kernel']['data_type']:
+            op_item['kernel']['data_type']['candidates'] = get_param_list_alias(
+                op_item['kernel']['data_type']['candidates'], args_name_map
+            )
+        if op_item['kernel']['backend']:
+            op_item['kernel']['backend']['candidates'] = get_param_list_alias(
+                op_item['kernel']['backend']['candidates'], args_name_map
+            )
+        if op_item['kernel']['layout']:
+            op_item['kernel']['layout']['candidates'] = get_param_list_alias(
+                op_item['kernel']['layout']['candidates'], args_name_map
+            )
+
+    def update_grad_op_compat_name(grad_op_item, args_name_map):
+        update_op_param_name(grad_op_item['inputs'], args_name_map)
+        update_op_param_name(grad_op_item['outputs'], args_name_map)
+        update_op_param_name(grad_op_item['attrs'], args_name_map)
+        update_op_param_name(grad_op_item['forward']['inputs'], args_name_map)
+        update_op_param_name(grad_op_item['forward']['outputs'], args_name_map)
+        update_op_param_name(grad_op_item['forward']['attrs'], args_name_map)
+        update_grad_args_name(grad_op_item['inputs'], args_map)
+        update_grad_args_name(grad_op_item['outputs'], args_map)
+
+    for op_args in op_fluid_map_list:
+        new_op_name, op_name = get_phi_and_fluid_op_name(op_args['op'])
         if new_op_name not in forward_op_dict:
             continue
         forward_op_item = forward_op_dict[new_op_name]
@@ -179,189 +254,102 @@ def replace_compat_name(op_op_map, forward_op_dict, backward_op_dict):
 
         scalar_configs = None
         int_array_configs = None
-
         if 'scalar' in op_args:
             scalar_configs = op_args['scalar']
         if 'int_array' in op_args:
             int_array_configs = op_args['int_array']
+        if 'extra' in op_args and 'outputs' in op_args['extra']:
+            for out_item in forward_op_item['outputs']:
+                if out_item['name'] in op_args['extra']['outputs']:
+                    out_item['is_extra'] = True
 
-        process_scalar(forward_op_item, scalar_configs)
-        process_int_array(forward_op_item, int_array_configs)
+        key_set = ['inputs', 'attrs', 'outputs']
+        args_map = {}
+        for key in key_set:
+            if key in op_args:
+                args_map.update(op_args[key])
+                for args_item in forward_op_item[key]:
+                    if args_item['name'] in op_args[key]:
+                        if (
+                            scalar_configs
+                            and args_item['name'] in scalar_configs
+                        ):
+                            scalar_configs[
+                                op_args[key][args_item['name']]
+                            ] = scalar_configs[args_item['name']]
+                        if (
+                            int_array_configs
+                            and args_item['name'] in int_array_configs
+                        ):
+                            int_array_configs[
+                                op_args[key][args_item['name']]
+                            ] = int_array_configs[args_item['name']]
+                        args_item['name'] = op_args[key][args_item['name']]
+                if has_backward:
+                    for args_item in backward_op_item['forward'][key]:
+                        if args_item['name'] in op_args[key]:
+                            args_item['name'] = op_args[key][args_item['name']]
+        forward_op_item["attr_dict"] = to_named_dict(forward_op_item["attrs"])
+        update_common_params_name(
+            forward_op_item, args_map, scalar_configs, int_array_configs
+        )
+
+        if has_backward:
+            update_grad_op_compat_name(backward_op_item, args_map)
+            update_common_params_name(
+                backward_op_item, args_map, scalar_configs, int_array_configs
+            )
+            backward_op_item["attr_dict"] = to_named_dict(
+                backward_op_item["attrs"]
+            )
+
+            if 'backward' not in op_args:
+                continue
 
-        if 'backward' in op_args and has_backward:
             backward_op_list = op_args['backward'].split(',')
-            _, bw_op_name = get_op_and_op_name(backward_op_list[0])
+            _, bw_op_name = get_phi_and_fluid_op_name(backward_op_list[0])
             forward_op_item['backward'] = bw_op_name
             backward_op_item['op_name'] = bw_op_name
 
-            process_scalar(backward_op_item, scalar_configs)
-            process_int_array(backward_op_item, int_array_configs)
-
             # for double grad
             if len(backward_op_list) > 1:
                 (
-                    new_double_grad_op_name,
+                    phi_double_grad_op_name,
                     double_grad_op_name,
-                ) = get_op_and_op_name(backward_op_list[1])
-                double_grad_item = backward_op_dict[new_double_grad_op_name]
+                ) = get_phi_and_fluid_op_name(backward_op_list[1])
+                double_grad_item = backward_op_dict[phi_double_grad_op_name]
                 backward_op_item['backward'] = double_grad_op_name
                 double_grad_item['op_name'] = double_grad_op_name
-                if 'attrs' in op_args:
-                    update_op_attr_name(
-                        double_grad_item['attrs'], op_args['attrs']
-                    )
-                    update_op_attr_name(
-                        double_grad_item['forward']['attrs'], op_args['attrs']
-                    )
-
-                process_scalar(double_grad_item, scalar_configs)
-                process_int_array(double_grad_item, int_array_configs)
+                update_grad_op_compat_name(double_grad_item, args_map)
+                update_common_params_name(
+                    double_grad_item,
+                    args_map,
+                    scalar_configs,
+                    int_array_configs,
+                )
+                double_grad_item["attr_dict"] = to_named_dict(
+                    double_grad_item["attrs"]
+                )
 
                 # for triple grad
                 if len(backward_op_list) > 2:
                     (
-                        new_triple_grad_op_name,
+                        phi_triple_grad_op_name,
                         triple_grad_op_name,
-                    ) = get_op_and_op_name(backward_op_list[2])
-                    triple_grad_item = backward_op_dict[new_triple_grad_op_name]
+                    ) = get_phi_and_fluid_op_name(backward_op_list[2])
+                    triple_grad_item = backward_op_dict[phi_triple_grad_op_name]
                     double_grad_item['backward'] = triple_grad_op_name
                     triple_grad_item['op_name'] = triple_grad_op_name
-                    if 'attrs' in op_args:
-                        update_op_attr_name(
-                            triple_grad_item['attrs'], op_args['attrs']
-                        )
-                        update_op_attr_name(
-                            triple_grad_item['forward']['attrs'],
-                            op_args['attrs'],
-                        )
-
-                    process_scalar(triple_grad_item, scalar_configs)
-                    process_int_array(triple_grad_item, int_array_configs)
-
-        key_set = ['inputs', 'attrs', 'outputs']
-        args_map = {}
-        for key in key_set:
-            if key in op_args:
-                args_map.update(op_args[key])
-                for args_item in forward_op_item[key]:
-                    if args_item['name'] in op_args[key]:
-                        args_item['name'] = op_args[key][args_item['name']]
-                if has_backward:
-                    for args_item in backward_op_item['forward'][key]:
-                        if args_item['name'] in op_args[key]:
-                            args_item['name'] = op_args[key][args_item['name']]
-        forward_op_item['infer_meta']['param'] = [
-            args_map[param] if param in args_map else param
-            for param in forward_op_item['infer_meta']['param']
-        ]
-        forward_op_item['kernel']['param'] = [
-            args_map[param] if param in args_map else param
-            for param in forward_op_item['kernel']['param']
-        ]
-        if forward_op_item['kernel']['data_type']:
-            forward_op_item['kernel']['data_type']['candidates'] = [
-                args_map[param] if param in args_map else param
-                for param in forward_op_item['kernel']['data_type'][
-                    'candidates'
-                ]
-            ]
-        if forward_op_item['kernel']['backend']:
-            forward_op_item['kernel']['backend']['candidates'] = [
-                args_map[param] if param in args_map else param
-                for param in forward_op_item['kernel']['backend']['candidates']
-            ]
-        if forward_op_item['kernel']['layout']:
-            forward_op_item['kernel']['layout']['candidates'] = [
-                args_map[param] if param in args_map else param
-                for param in forward_op_item['kernel']['layout']['candidates']
-            ]
-        if forward_op_item['inplace']:
-            inplace_map = {}
-            for key, val in forward_op_item['inplace'].items():
-                if key in args_map:
-                    key = args_map[key]
-                if val in args_map:
-                    val = args_map[val]
-                inplace_map[key] = val
-            forward_op_item['inplace'] = inplace_map
-
-        if has_backward:
-            for args_item in backward_op_item['inputs']:
-                if args_item['name'] in args_map:
-                    args_item['name'] = args_map[args_item['name']]
-                elif (
-                    args_item['name'].endswith('_grad')
-                    and args_item['name'][:-5] in args_map
-                ):
-                    args_map[args_item['name']] = (
-                        args_map[args_item['name'][:-5]] + '_grad'
+                    update_grad_op_compat_name(triple_grad_item, args_map)
+                    update_common_params_name(
+                        triple_grad_item,
+                        args_map,
+                        scalar_configs,
+                        int_array_configs,
                     )
-                    args_item['name'] = args_map[args_item['name']]
-            for args_item in backward_op_item['attrs']:
-                if args_item['name'] in args_map:
-                    args_item['name'] = args_map[args_item['name']]
-            for args_item in backward_op_item['outputs']:
-                if (
-                    args_item['name'].endswith('_grad')
-                    and args_item['name'][:-5] in args_map
-                ):
-                    args_map[args_item['name']] = (
-                        args_map[args_item['name'][:-5]] + '_grad'
+                    triple_grad_item["attr_dict"] = to_named_dict(
+                        triple_grad_item["attrs"]
                     )
-                    args_item['name'] = args_map[args_item['name']]
-
-            if 'invoke' in backward_op_item:
-                backward_op_item['invoke']['args'] = [
-                    args_map[param.strip()]
-                    if param.strip() in args_map
-                    else param.strip()
-                    for param in backward_op_item['invoke']['args'].split(',')
-                ]
-                continue
-
-            backward_op_item['infer_meta']['param'] = [
-                args_map[param] if param in args_map else param
-                for param in backward_op_item['infer_meta']['param']
-            ]
-            backward_op_item['kernel']['param'] = [
-                args_map[param] if param in args_map else param
-                for param in backward_op_item['kernel']['param']
-            ]
-            if backward_op_item['kernel']['data_type']:
-                backward_op_item['kernel']['data_type']['candidates'] = [
-                    args_map[param] if param in args_map else param
-                    for param in backward_op_item['kernel']['data_type'][
-                        'candidates'
-                    ]
-                ]
-            if backward_op_item['kernel']['backend']:
-                backward_op_item['kernel']['backend']['candidates'] = [
-                    args_map[param] if param in args_map else param
-                    for param in backward_op_item['kernel']['backend'][
-                        'candidates'
-                    ]
-                ]
-            if backward_op_item['kernel']['layout']:
-                backward_op_item['kernel']['layout']['candidates'] = [
-                    args_map[param] if param in args_map else param
-                    for param in backward_op_item['kernel']['layout'][
-                        'candidates'
-                    ]
-                ]
-            if backward_op_item['no_need_buffer']:
-                backward_op_item['no_need_buffer'] = [
-                    args_map[param] if param in args_map else param
-                    for param in backward_op_item['no_need_buffer']
-                ]
-            if backward_op_item['inplace']:
-                inplace_map = {}
-                for key, val in backward_op_item['inplace'].items():
-                    if key in args_map:
-                        key = args_map[key]
-                    if val in args_map:
-                        val = args_map[val]
-                    inplace_map[key] = val
-                backward_op_item['inplace'] = inplace_map
 
 
 def process_invoke_op(forward_op_dict, backward_op_dict):
@@ -372,6 +360,7 @@ def process_invoke_op(forward_op_dict, backward_op_dict):
             args_index = 0
             if invoke_op in forward_op_dict:
                 reuse_op = forward_op_dict[invoke_op]
+                bw_op['invoke']['func'] = reuse_op['op_name']
                 bw_op['invoke']['inputs'] = []
                 bw_op['invoke']['attrs'] = []
                 bw_op['invoke']['outputs'] = []
@@ -430,14 +419,14 @@ def main(
         forward_op_dict[op_version['op']]['version'] = op_version['version']
 
     with open(op_compat_yaml_path, "rt") as f:
-        op_op_map = yaml.safe_load(f)
+        op_fluid_map_list = yaml.safe_load(f)
 
     for op in ops:
         op['op_name'] = op['name']
     for bw_op in backward_ops:
         bw_op['op_name'] = bw_op['name']
 
-    replace_compat_name(op_op_map, forward_op_dict, backward_op_dict)
+    replace_compat_name(op_fluid_map_list, forward_op_dict, backward_op_dict)
 
     # prepare for invoke case
     process_invoke_op(forward_op_dict, backward_op_dict)
diff --git a/paddle/fluid/operators/generator/templates/operator_utils.c.j2 b/paddle/fluid/operators/generator/templates/operator_utils.c.j2
index 0b49721afcc9e0f7e15b2d77dba8f527c59a86ee..b28c8bdc1a297891945626b5f4d009f7d41e77cc 100644
--- a/paddle/fluid/operators/generator/templates/operator_utils.c.j2
+++ b/paddle/fluid/operators/generator/templates/operator_utils.c.j2
@@ -54,6 +54,10 @@ AddOutput({{name | to_opmaker_name}}, "({{typename}}), output {{i}} of {{op_name
 
     .AsIntermediate()
   {%- endif %}
+  {%- if "is_extra" in output and output["is_extra"] %}
+
+    .AsExtra()
+  {%- endif %}
 {%- endmacro %}
 
 {# add attribute, and process default value if needed #}
@@ -115,7 +119,7 @@ KernelSignature {{op["op_name"] | to_pascal_case }}OpArgumentMapping(const Argum
   paddle::small_vector<const char*> attrs;
   {% for attr in op["attrs"]%}
   {% filter indent(2)%}
-  {{get_an_attr(attr)}}
+  {{get_an_attr(attr, kernel_args)}}
   {% endfilter %}
   {% endfor %}
   {{get_output_list(op["outputs"], kernel_args)}};
@@ -170,7 +174,7 @@ KernelSignature {{op["op_name"] | to_pascal_case }}OpArgumentMapping(const Argum
   paddle::small_vector<const char*> attrs;
   {% for attr in op["attrs"]%}
   {% filter indent(2)%}
-  {{get_an_attr(attr)}}
+  {{get_an_attr(attr, kernel_args)}}
   {% endfilter %}
   {% endfor %}
   {{get_output_list(op["outputs"], kernel_args)}};
@@ -209,8 +213,9 @@ paddle::small_vector<const char*> inputs {
 }
 {%- endmacro %}
 
-{% macro get_an_attr(attr) %}{# inline #}
+{% macro get_an_attr(attr, kernel_args) %}{# inline #}
 {% set typename = attr["typename"] %}
+{%- if attr["name"] in kernel_args %}
 {% set name = attr["name"] %}
 {% if typename is scalar %}{# scalar correspond to a dispensable input and an attr in opmaker #}
 attrs.emplace_back(ctx.HasInput("{{attr | to_scalar_tensor_name}}") ? "{{attr | to_scalar_tensor_name}}" : "{{name}}");
@@ -236,6 +241,7 @@ attrs.emplace_back(
 {%- else %}
 attrs.emplace_back("{{name}}");
 {%- endif %}
+{%- endif %}
 {%- endmacro %}
 
 {% macro get_output_list(outputs, kernel_args) %}{# inline #}
@@ -502,10 +508,9 @@ OutputGrad({{name_in_forward_orig | to_opmaker_name}})
     {% set name_in_forward = name[:-5] %}
     {% set name_in_forward_orig = input_orig_names[input_names.index(name_in_forward)]%}
 InputGrad({{name_in_forward_orig | to_opmaker_name}})
-  {%- elif (name | to_input_name) in input_names %}
-    {% set name_in_forward = name | to_input_name %}
-    {% set name_in_forward_orig = input_orig_names[input_names.index(name_in_forward)]%}
-InputGrad({{name | to_input_name | to_opmaker_name}})
+  {%- elif (name) in input_names %}
+    {% set name_in_forward_orig = input_orig_names[input_names.index(name)]%}
+Input({{name  | to_opmaker_name}})
   {%- endif %}
 {%- endmacro %}
 
diff --git a/paddle/fluid/operators/pad_op.cc b/paddle/fluid/operators/pad_op.cc
index 4e6a10a912a8888337b44363ad166bb1dcc043bc..2951091508dd6d82af7ea0ccf5817e088d1f42c9 100644
--- a/paddle/fluid/operators/pad_op.cc
+++ b/paddle/fluid/operators/pad_op.cc
@@ -30,6 +30,13 @@ class PadOp : public framework::OperatorWithKernel {
     OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Pad");
     OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Pad");
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
+  }
 };
 
 class PadOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -98,6 +105,14 @@ class PadOpGrad : public framework::OperatorWithKernel {
       ctx->SetOutputDim(x_grad_name, dout_dims);
     }
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
+                                       ctx, framework::GradVarName("Out")),
+                                   ctx.GetPlace());
+  }
 };
 
 template <typename T>
diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
index 6e8b962488a56800eb0ad985acd0113fe9fd2422..e980aa66e7ca33467cfe216fbf04e3b5649d9c15 100644
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -114,11 +114,6 @@ class ReshapeOp : public framework::OperatorWithKernel {
       return;
     }
 
-    PADDLE_ENFORCE_EQ(!shape.empty(),
-                      true,
-                      platform::errors::InvalidArgument(
-                          "The parameter 'shape' in ReshapeOp must be set. "
-                          "But received 'shape' is empty."));
     auto x_dims = ctx->GetInputDim("X");
     auto out_dims = ValidateShape(shape, x_dims);
     ctx->SetOutputDim("Out", out_dims);
diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc
index bfadb45631210987410fc4b106859f4b146eabf3..7b023bcdf662cccfdfe89f9a2074c6a04bbfad33 100644
--- a/paddle/fluid/operators/squeeze_op.cc
+++ b/paddle/fluid/operators/squeeze_op.cc
@@ -195,17 +195,6 @@ class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
   }
 };
 
-class Squeeze2Op : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    auto input_data_type =
-        framework::OperatorWithKernel::IndicateVarDataType(ctx, "X");
-    return framework::OpKernelType(input_data_type, ctx.GetPlace());
-  }
-};
-
 template <typename T>
 class SqueezeGradOpMaker : public framework::SingleGradOpMaker<T> {
  public:
@@ -220,32 +209,6 @@ class SqueezeGradOpMaker : public framework::SingleGradOpMaker<T> {
   }
 };
 
-class Squeeze2GradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *context) const override {
-    OP_INOUT_CHECK(
-        context->HasInput("XShape"), "Input", "XShape", "Squeeze2Grad");
-    OP_INOUT_CHECK(context->HasInput(framework::GradVarName("Out")),
-                   "Input",
-                   framework::GradVarName("Out"),
-                   "Squeeze2Grad");
-    auto xshape_dims = context->GetInputDim("XShape");
-    auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size());
-    context->SetOutputDim(framework::GradVarName("X"), x_dims);
-    context->ShareLoD("XShape", framework::GradVarName("X"));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(
-        ctx, framework::GradVarName("Out"));
-    return framework::OpKernelType(input_data_type, ctx.GetPlace());
-  }
-};
-
 template <typename T>
 class SqueezeDoubleGradOpMaker : public framework::SingleGradOpMaker<T> {
  public:
@@ -259,82 +222,6 @@ class SqueezeDoubleGradOpMaker : public framework::SingleGradOpMaker<T> {
   }
 };
 
-// FIXME(zcd): squeeze2 adds an intermediate output(XShape) based on squeeze,
-// the XShape is used to carry the shape and lod of X which will be used in
-// squeeze_grad, in this way, the framework can reuse the memory of X
-// immediately the squeeze2_op is finished.
-// Considering compatibility issues, we could not fix squeeze2_op
-class Squeeze2OpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(Tensor). The input tensor of squeeze operator.");
-    AddOutput("Out", "(Tensor). The output tensor of squeeze operator.");
-    AddOutput("XShape",
-              "XShape is just used to store the shape and lod of X, which will "
-              "be used in SqueezeGradOp.")
-        .AsIntermediate()
-        .AsExtra();
-    AddAttr<std::vector<int>>("axes",
-                              "(std::vector<int>). List of integers,"
-                              " indicating the dimensions to squeeze.")
-        .SetDefault({})
-        .SupportTensor();
-    AddComment(R"DOC(
-        Squeeze2 Operator.
-
-        Remove single-dimensional entries from the shape of a tensor.
-        Takes a parameter axes with a list of axes to squeeze.
-        If axes is not provided, all the single dimensions will be removed from the shape.
-        If an axis is selected with shape entry not equal to one, an error is raised.
-
-        Examples:
-        Case 1:
-          Given
-            X.shape = (1, 3, 1, 5)
-          and
-            axes = [0]
-          we get:
-            Out.shape = (3, 1, 5)
-
-        Case 2:
-          Given
-            X.shape = (1, 3, 1, 5)
-          and
-            axes = []
-          we get:
-            Out.shape = (3, 5)
-    )DOC");
-  }
-};
-
-template <typename T>
-class Squeeze2GradOpMaker : public framework::SingleGradOpMaker<T> {
- public:
-  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
-
-  void Apply(GradOpPtr<T> grad_op) const override {
-    grad_op->SetType("squeeze2_grad");
-    grad_op->SetInput("XShape", this->Output("XShape"));
-    grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
-    grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
-    grad_op->SetAttrMap(this->Attrs());
-  }
-};
-
-template <typename T>
-class Squeeze2DoubleGradOpMaker : public framework::SingleGradOpMaker<T> {
- public:
-  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
-
-  void Apply(GradOpPtr<T> grad_op) const override {
-    grad_op->SetType("squeeze2");
-    grad_op->SetInput("X", this->OutputGrad(framework::GradVarName("X")));
-    grad_op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out")));
-    grad_op->SetOutput("XShape", this->Input("XShape"));
-    grad_op->SetAttrMap(this->Attrs());
-  }
-};
-
 DECLARE_INPLACE_OP_INFERER(SqueezeInplaceInferer, {"X", "Out"});
 DECLARE_INPLACE_OP_INFERER(SqueezeGradInplaceInferer,
                            {framework::GradVarName("Out"),
@@ -345,10 +232,6 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(SqueezeGradNoNeedBufferVarsInferer, "X");
 
 namespace ops = paddle::operators;
 
-DECLARE_INFER_SHAPE_FUNCTOR(squeeze2,
-                            SqueezeInferShapeFunctor,
-                            PD_INFER_META(phi::SqueezeWithXShapeInferMeta));
-
 REGISTER_OPERATOR(squeeze,
                   ops::SqueezeOp,
                   ops::SqueezeOpMaker,
@@ -360,19 +243,6 @@ REGISTER_OPERATOR(squeeze_grad,
                   ops::SqueezeDoubleGradOpMaker<paddle::imperative::OpBase>,
                   ops::SqueezeGradNoNeedBufferVarsInferer);
 
-REGISTER_OPERATOR(squeeze2,
-                  ops::Squeeze2Op,
-                  ops::Squeeze2OpMaker,
-                  ops::Squeeze2GradOpMaker<paddle::framework::OpDesc>,
-                  ops::Squeeze2GradOpMaker<paddle::imperative::OpBase>,
-                  ops::SqueezeInplaceInferer,
-                  SqueezeInferShapeFunctor);
-REGISTER_OPERATOR(squeeze2_grad,
-                  ops::Squeeze2GradOp,
-                  ops::Squeeze2DoubleGradOpMaker<paddle::framework::OpDesc>,
-                  ops::Squeeze2DoubleGradOpMaker<paddle::imperative::OpBase>,
-                  ops::SqueezeGradInplaceInferer);
-
 REGISTER_OP_CPU_KERNEL(
     squeeze,
     ops::SqueezeKernel<phi::CPUContext, float>,
diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc
index 8f28e0b606b0351d62c3a1429c3af8c988366a90..d092c03a56398488e44ab6bce5162b66568e607f 100644
--- a/paddle/fluid/operators/unsqueeze_op.cc
+++ b/paddle/fluid/operators/unsqueeze_op.cc
@@ -260,83 +260,6 @@ class UnsqueezeDoubleGradOpMaker : public framework::SingleGradOpMaker<T> {
   }
 };
 
-// FIXME(zcd): unsqueeze2 adds an intermediate output(XShape) based on
-// unsqueeze, the XShape is used to carry the shape and lod of X which
-// will be used in unsqueeze_grad, in this way, the framework can reuse
-// the memory of X immediately the unsqueeze2_op is finished.
-// Considering compatibility issues, we could not fix unsqueeze2_op
-class Unsqueeze2Op : public UnsqueezeOp {
- public:
-  using UnsqueezeOp::UnsqueezeOp;
-};
-
-class Unsqueeze2OpMaker : public UnsqueezeOpMaker {
- public:
-  void Make() override {
-    UnsqueezeOpMaker::Make();
-    AddOutput("XShape",
-              "XShape is just used to store the shape and lod of X, which will "
-              "be used in UnsqueezeGradOp.")
-        .AsIntermediate()
-        .AsExtra();
-  }
-};
-
-template <typename T>
-class Unsqueeze2GradOpMaker : public framework::SingleGradOpMaker<T> {
- public:
-  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
-
-  void Apply(GradOpPtr<T> grad_op) const override {
-    grad_op->SetType("unsqueeze2_grad");
-    grad_op->SetInput("XShape", this->Output("XShape"));
-    grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
-    grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
-    grad_op->SetAttrMap(this->Attrs());
-  }
-};
-
-class Unsqueeze2GradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext *context) const override {
-    PADDLE_ENFORCE_EQ(
-        context->HasInput("XShape"),
-        true,
-        platform::errors::InvalidArgument("Input(XShape) shouldn't be null."));
-    PADDLE_ENFORCE_EQ(context->HasInput(framework::GradVarName("Out")),
-                      true,
-                      platform::errors::InvalidArgument(
-                          "Input(Out@GRAD) shouldn't be null."));
-    auto xshape_dims = context->GetInputDim("XShape");
-    auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size());
-    context->SetOutputDim(framework::GradVarName("X"), x_dims);
-    context->ShareLoD("XShape", framework::GradVarName("X"));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
-                                       ctx, framework::GradVarName("Out")),
-                                   ctx.device_context());
-  }
-};
-
-template <typename T>
-class Unsqueeze2DoubleGradOpMaker : public framework::SingleGradOpMaker<T> {
- public:
-  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
-
-  void Apply(GradOpPtr<T> grad_op) const override {
-    grad_op->SetType("unsqueeze2");
-    grad_op->SetInput("X", this->OutputGrad(framework::GradVarName("X")));
-    grad_op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out")));
-    grad_op->SetOutput("XShape", this->Input("XShape"));
-    grad_op->SetAttrMap(this->Attrs());
-  }
-};
-
 DECLARE_INPLACE_OP_INFERER(UnsqueezeInplaceInferer, {"X", "Out"});
 DECLARE_INPLACE_OP_INFERER(UnsqueezeGradInplaceInferer,
                            {framework::GradVarName("Out"),
@@ -345,10 +268,6 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(UnsqueezeGradOpNoNeedBufferVarInferer, "X");
 }  // namespace operators
 }  // namespace paddle
 
-DECLARE_INFER_SHAPE_FUNCTOR(unsqueeze2,
-                            Unsqueeze2InferShapeFunctor,
-                            PD_INFER_META(phi::UnsqueezeWithXShapeInferMeta));
-
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(unsqueeze,
                   ops::UnsqueezeOp,
@@ -362,20 +281,6 @@ REGISTER_OPERATOR(unsqueeze_grad,
                   ops::UnsqueezeDoubleGradOpMaker<paddle::imperative::OpBase>,
                   ops::UnsqueezeGradOpNoNeedBufferVarInferer);
 
-REGISTER_OPERATOR(unsqueeze2,
-                  ops::Unsqueeze2Op,
-                  ops::Unsqueeze2OpMaker,
-                  ops::Unsqueeze2GradOpMaker<paddle::framework::OpDesc>,
-                  ops::Unsqueeze2GradOpMaker<paddle::imperative::OpBase>,
-                  Unsqueeze2InferShapeFunctor,
-                  ops::UnsqueezeInplaceInferer);
-
-REGISTER_OPERATOR(unsqueeze2_grad,
-                  ops::Unsqueeze2GradOp,
-                  ops::Unsqueeze2DoubleGradOpMaker<paddle::framework::OpDesc>,
-                  ops::Unsqueeze2DoubleGradOpMaker<paddle::imperative::OpBase>,
-                  ops::UnsqueezeGradInplaceInferer);
-
 REGISTER_OP_CPU_KERNEL(
     unsqueeze,
     ops::UnsqueezeKernel<phi::CPUContext, float>,
diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc
index 9a791e4f2e36243931216b409e03d83de8e26865..d314a9a7835190643b165ae287a52531d87b4b9d 100644
--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -646,6 +646,7 @@ void BindAnalysisConfig(py::module *m) {
            py::arg("memory_pool_init_size_mb"),
            py::arg("device_id") = 0,
            py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32)
+      .def("exp_enable_use_cutlass", &AnalysisConfig::Exp_EnableUseCutlass)
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       .def("set_exec_stream",
            [](AnalysisConfig &self, phi::CUDAStream &stream) {
diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt
index aec5c7632a8663f94830897a0bc6e0b7876ffa07..7be9e8fb187378e2e9028f5585e8c19cdf8792e8 100644
--- a/paddle/phi/CMakeLists.txt
+++ b/paddle/phi/CMakeLists.txt
@@ -44,11 +44,7 @@ set(PHI_DEPS
 get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS)
 set(PHI_DEPS ${PHI_DEPS} ${phi_kernels})
 
-if(APPLE AND WITH_ARM)
-  cc_library(phi DEPS ${PHI_DEPS})
-else()
-  create_dummy_static_lib(phi LIBS ${PHI_DEPS} LIMIT 100)
-endif()
+cc_library(phi DEPS ${PHI_DEPS})
 
 set(phi_extension_header_file
     ${CMAKE_CURRENT_SOURCE_DIR}/extension.h
diff --git a/paddle/phi/api/ext/tensor_compat.h b/paddle/phi/api/ext/tensor_compat.h
index 7233744c65c3fd482810608cb04b6be5092e7f7b..17c0dd3f8732dde96d371d99bc8798692146a3f3 100644
--- a/paddle/phi/api/ext/tensor_compat.h
+++ b/paddle/phi/api/ext/tensor_compat.h
@@ -19,7 +19,7 @@ limitations under the License. */
 
 // Note(chenweihang): In order to be compatible with the original custom
 // operator Tensor interface, only available to external users, the file
-// cannot be includeed in paddle
+// cannot be included in paddle
 
 namespace paddle {
 using Tensor = experimental::Tensor;
diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml
index dead42d03f7bcfdfc3fc6b0deabcf3ec8e4bb54d..8f107f02dafafad4c00ffcfcb5b51d3d551c5213 100644
--- a/paddle/phi/api/yaml/backward.yaml
+++ b/paddle/phi/api/yaml/backward.yaml
@@ -1186,6 +1186,26 @@
   backward : square_double_grad
   inplace : (out_grad -> x_grad)
 
+- backward_op : squeeze_double_grad
+  forward : squeeze_grad(Tensor xshape, Tensor grad_out, IntArray axis) -> Tensor(grad_x)
+  args : (Tensor grad_x_grad, IntArray axis)
+  output : Tensor(grad_out_grad), Tensor(xshape)
+  invoke: squeeze(grad_x_grad, axis)
+  intermediate : xshape
+
+- backward_op : squeeze_grad
+  forward : squeeze(Tensor x, IntArray axis) -> Tensor(out), Tensor(xshape)
+  args : (Tensor xshape, Tensor out_grad, IntArray axis)
+  output : Tensor(x_grad)
+  infer_meta :
+    func : KernelWithXShapeInferMeta
+    param: [xshape]
+  kernel :
+    func : squeeze_grad
+    data_type : out_grad
+  inplace : (out_grad -> x_grad)
+  backward: squeeze_double_grad
+
 - backward_op : svd_grad
   forward : svd (Tensor x, bool full_matrices = false) -> Tensor(u), Tensor(s), Tensor(vh)
   args : (Tensor x, Tensor u, Tensor vh, Tensor s, Tensor u_grad, Tensor vh_grad, Tensor s_grad, bool full_matrices)
@@ -1321,6 +1341,27 @@
     data_type : out_grad
   no_need_buffer : x
 
+- backward_op : unsqueeze_double_grad
+  forward : unsqueeze_grad(Tensor xshape, Tensor grad_out, IntArray axes) -> Tensor(grad_x)
+  args : (Tensor grad_x_grad, IntArray axes)
+  output : Tensor(grad_out_grad), Tensor(xshape)
+  invoke : unsqueeze(grad_x_grad, axes)
+  intermediate : xshape
+
+- backward_op : unsqueeze_grad
+  forward : unsqueeze(Tensor x, IntArray axes) -> Tensor(out), Tensor(xshape)
+  args : (Tensor xshape, Tensor out_grad, IntArray axes)
+  output : Tensor(x_grad)
+  infer_meta :
+    func : KernelWithXShapeInferMeta
+    param: [xshape]
+  kernel :
+    func : unsqueeze_grad
+    param : [xshape, out_grad]
+    data_type : out_grad
+  inplace : (out_grad -> x_grad)
+  backward : unsqueeze_double_grad
+
 - backward_op : unstack_grad
   forward : unstack (Tensor x, int axis=0, int num=0) -> Tensor[](out)
   args : (Tensor[] out_grad, int axis)
diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml
index 8d7af90a90a59a1b2e45d3ab235aa0c0c45a1c72..acc7b670ba524945fd32ff6d81ab351d18a3e268 100755
--- a/paddle/phi/api/yaml/legacy_backward.yaml
+++ b/paddle/phi/api/yaml/legacy_backward.yaml
@@ -1363,24 +1363,6 @@
   kernel :
     func : squared_l2_norm_grad
 
-- backward_op : squeeze_double_grad
-  forward : squeeze_grad(Tensor xshape, Tensor grad_out, IntArray axis) -> Tensor(grad_x)
-  args : (Tensor grad_x_grad, IntArray axis)
-  output : Tensor(grad_out_grad)
-  invoke: squeeze(grad_x_grad, axis)
-
-- backward_op : squeeze_grad
-  forward : squeeze(Tensor x, IntArray axis) -> Tensor(out), Tensor(xshape)
-  args : (Tensor xshape, Tensor out_grad, IntArray axis)
-  output : Tensor(x_grad)
-  infer_meta :
-    func : KernelWithXShapeInferMeta
-    param: [xshape]
-  kernel :
-    func : squeeze_grad
-  inplace : (out_grad -> x_grad)
-  backward: squeeze_double_grad
-
 - backward_op : stack_grad
   forward : stack (Tensor[] x, int axis) -> Tensor(out)
   args : (Tensor[] x, Tensor out_grad, int axis)
@@ -1574,25 +1556,6 @@
     func : uniform_inplace_grad
   inplace : (out_grad -> x_grad)
 
-- backward_op : unsqueeze_double_grad
-  forward : unsqueeze_grad(Tensor xshape, Tensor grad_out, IntArray axes) -> Tensor(grad_x)
-  args : (Tensor grad_x_grad, IntArray axes)
-  output : Tensor(grad_out_grad)
-  invoke : unsqueeze(grad_x_grad, axes)
-
-- backward_op : unsqueeze_grad
-  forward : unsqueeze(Tensor x, IntArray axes) -> Tensor(out), Tensor(xshape)
-  args : (Tensor xshape, Tensor out_grad, IntArray axes)
-  output : Tensor(x_grad)
-  infer_meta :
-    func : KernelWithXShapeInferMeta
-    param: [xshape]
-  kernel :
-    func : unsqueeze_grad
-    param: [xshape, out_grad]
-  inplace : (out_grad -> x_grad)
-  backward : unsqueeze_double_grad
-
 - backward_op : warpctc_grad
   forward : warpctc (Tensor logits, Tensor label, Tensor logits_length, Tensor labels_length, int blank, bool norm_by_times) -> Tensor(loss), Tensor(warpctcgrad)
   args : (Tensor logits, Tensor logits_length, Tensor warpctcgrad, Tensor loss_grad, int blank, bool norm_by_times)
diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml
index b93ca2944ab85f686a2fa6c83c5ba0455baeba92..6dfff5d510d65e5353aa7e79e29a159f0602dde6 100755
--- a/paddle/phi/api/yaml/legacy_ops.yaml
+++ b/paddle/phi/api/yaml/legacy_ops.yaml
@@ -1777,18 +1777,6 @@
     func : squared_l2_norm
   backward : squared_l2_norm_grad
 
-- op : squeeze
-  args : (Tensor x, IntArray axis)
-  output : Tensor(out), Tensor(xshape)
-  infer_meta :
-    func : SqueezeWithXShapeInferMeta
-  kernel :
-    func : squeeze_with_xshape
-  inplace : (x -> out)
-  view: (x -> out)
-  intermediate : xshape
-  backward : squeeze_grad
-
 - op : stack
   args : (Tensor[] x, int axis)
   output : Tensor
@@ -2022,18 +2010,6 @@
     data_type: x
   backward: unpool3d_grad
 
-- op : unsqueeze
-  args : (Tensor x, IntArray axis)
-  output : Tensor(out), Tensor(xshape)
-  infer_meta :
-    func : UnsqueezeWithXShapeInferMeta
-  kernel :
-    func : unsqueeze_with_xshape
-  inplace : (x -> out)
-  view: (x -> out)
-  intermediate : xshape
-  backward : unsqueeze_grad
-
 - op : update_loss_scaling_
   args : (Tensor[] x, Tensor found_infinite, Tensor prev_loss_scaling, Tensor in_good_steps, Tensor in_bad_steps, int incr_every_n_steps, int decr_every_n_nan_or_inf, float incr_ratio, float decr_ratio, Scalar stop_update)
   output : Tensor[](out){x.size()}, Tensor(loss_scaling), Tensor(out_good_steps), Tensor(out_bad_steps)
diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml
index 7e960d73bbbd71c2895b472cce0459280a54da13..cb6f67fbdf26641352086aa6cf3f9f475a65eea2 100644
--- a/paddle/phi/api/yaml/op_compat.yaml
+++ b/paddle/phi/api/yaml/op_compat.yaml
@@ -1270,9 +1270,20 @@
     attrs : [bool use_mkldnn = false, bool use_cudnn = false]
 
 - op : squeeze (squeeze2)
-  backward : squeeze_grad (squeeze2_grad)
+  backward : squeeze_grad (squeeze2_grad), squeeze_double_grad(squeeze2_double_grad)
+  inputs :
+    x : X
+  attrs :
+   axis : axes
+  outputs :
+    {out : Out, xshape : XShape}
+  int_array:
+    axis :
+      data_type : int
+      support_tensor : true
   extra :
     attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"]
+    outputs : [xshape]
 
 - op : stack
   backward : stack_grad
@@ -1389,6 +1400,22 @@
   outputs :
     out : Y
 
+- op : unsqueeze (unsqueeze2)
+  backward : unsqueeze_grad (unsqueeze2_grad), unsqueeze_double_grad(unsqueeze2_double_grad)
+  inputs :
+    x : X
+  attrs :
+   axis : axes
+  outputs :
+    {out : Out, xshape : XShape}
+  int_array:
+    axis :
+      data_type : int
+      tensor_name : AxesTensor
+      tensors_name : AxesTensorList
+  extra :
+    outputs : [xshape]
+
 - op : unstack
   backward : unstack_grad
   inputs :
diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml
index 0e85b2d8dffaf7db2256e7b0f6540a82e4c8220c..e5378ce07718b033921b668119f7751d9fa7e391 100644
--- a/paddle/phi/api/yaml/ops.yaml
+++ b/paddle/phi/api/yaml/ops.yaml
@@ -1054,6 +1054,19 @@
            square_sr {selected_rows -> selected_rows}
   backward : square_grad
 
+- op : squeeze
+  args : (Tensor x, IntArray axis={})
+  output : Tensor(out), Tensor(xshape)
+  infer_meta :
+    func : SqueezeWithXShapeInferMeta
+  kernel :
+    func : squeeze_with_xshape
+    data_type : x
+  inplace : (x -> out)
+  view: (x -> out)
+  intermediate : xshape
+  backward : squeeze_grad
+
 - op : svd
   args : (Tensor x, bool full_matrices = false)
   output : Tensor(u), Tensor(s), Tensor(vh)
@@ -1149,6 +1162,19 @@
     func : unfold
   backward : unfold_grad
 
+- op : unsqueeze
+  args : (Tensor x, IntArray axis = {})
+  output : Tensor(out), Tensor(xshape)
+  infer_meta :
+    func : UnsqueezeWithXShapeInferMeta
+  kernel :
+    func : unsqueeze_with_xshape
+    data_type : x
+  inplace : (x -> out)
+  view: (x -> out)
+  intermediate : xshape
+  backward : unsqueeze_grad
+
 - op : unstack
   args : (Tensor x, int axis=0, int num=0)
   output : Tensor[](out){num}
diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc
index 895d50c7bbd2a7b5c63295efa1aaea1b5bbd279d..8a5247ae64baa10c3a234d0c0b98749376c31073 100644
--- a/paddle/phi/infermeta/unary.cc
+++ b/paddle/phi/infermeta/unary.cc
@@ -917,9 +917,6 @@ void ExpandInferMeta(const MetaTensor& x,
   auto out_rank =
       std::max(static_cast<size_t>(x_dims.size()), expand_shape.size());
   std::vector<int64_t> out_shape(out_rank);
-  auto x_dim_vec = phi::vectorize<int>(x_dims);
-  auto diff = expand_shape.size() - x_dim_vec.size();
-  x_dim_vec.insert(x_dim_vec.begin(), diff, -1);
   for (size_t i = 0; i < expand_shape.size(); ++i) {
     if (x_dims[i] == -1) {
       out_shape[i] = -1;
diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt
index e12c5f10fd1c4cb5a0da65d044f606e4af9f709b..25bbd17c4feab2dbc4a57b6578ebc3662ab83fcc 100644
--- a/paddle/phi/kernels/CMakeLists.txt
+++ b/paddle/phi/kernels/CMakeLists.txt
@@ -106,8 +106,7 @@ file(
   "fusion/gpu/*.cu")
 
 if(WITH_CUTLASS)
-  file(GLOB cutlass_cu "fusion/cutlass/default_moe_fc_traits.h"
-       "fusion/cutlass/linear_combination_ft_gelu.h" "fusion/cutlass/moe*")
+  file(GLOB cutlass_cu "fusion/cutlass/conv2d/*.cu" "fusion/cutlass/*.cu")
   list(APPEND kernel_cu ${cutlass_cu})
 endif()
 
diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h
index 49020337e08d8c5033338fdf0174de90ee98d1bf..cf974bdbe333b69fe3e7f10620c324aea14a9d19 100644
--- a/paddle/phi/kernels/funcs/broadcast_function.h
+++ b/paddle/phi/kernels/funcs/broadcast_function.h
@@ -1023,15 +1023,20 @@ void BroadcastKernel(const KPDevice &ctx,
                      std::vector<DenseTensor *> *outs,
                      int axis,
                      Functor func) {
-  std::vector<int> dims_size;
-  dims_size.reserve(ins.size());
+  // When there are multiple inputs, the outputs's rank should be equal the
+  // maximum rank of all inputs.
+  int max_rank = 0;
+  int min_rank = phi::DDim::kMaxRank;
   for (auto *in : ins) {
-    dims_size.emplace_back(in->dims().size());
+    max_rank = std::max(max_rank, in->dims().size());
+    min_rank = std::min(min_rank, in->dims().size());
   }
-
-  axis = axis == -1 ? *std::max_element(dims_size.begin(), dims_size.end()) -
-                          *std::min_element(dims_size.begin(), dims_size.end())
-                    : axis;
+  if (ins.size() == 1) {
+    // When there is only 1 input, the input's rank may be less than outputs'
+    // rank.
+    max_rank = std::max(max_rank, (*outs)[0]->dims().size());
+  }
+  axis = axis == -1 ? max_rank - min_rank : axis;
   BroadcastKernelForDifferentVecSize<ET, InT, OutT, Functor, NumOuts>(
       ctx, ins, outs, axis, func);
 }
diff --git a/paddle/phi/kernels/funcs/dims_simplifier.h b/paddle/phi/kernels/funcs/dims_simplifier.h
index 3912357546944734dc7101122cda298d23a93b17..a52373c117e3ee113230d2e04f5604b9d58793b0 100644
--- a/paddle/phi/kernels/funcs/dims_simplifier.h
+++ b/paddle/phi/kernels/funcs/dims_simplifier.h
@@ -25,8 +25,8 @@ struct BroadcastDimsSimplifier {
   typedef void (*MergeFunctor)(
       bool &, std::vector<DimVector> &, DimVector &, int, int);
 
-  int64_t N;
-  int64_t rank;
+  int N;
+  int rank;
   DimVector out_dims;
   std::vector<DimVector> in_dims;
 
@@ -103,41 +103,43 @@ struct BroadcastDimsSimplifier {
   // To compensate the lackage of input_tensors' dimension with axis.
   void ExtendInputDimensions(int N, int axis) {
     for (auto &in_dim : in_dims) {
-      int64_t in_idx = 0;
       if (in_dim.size() < rank) {
-        DimVector tmp_dim(rank, 1);
-        for (; in_idx < in_dim.size();) {
-          if (in_dim[in_idx] == out_dims[axis] || in_dim[in_idx] == 1) {
-            tmp_dim[axis] = in_dim[in_idx];
-            in_idx++;
-            axis++;
+        DimVector extended_in_dim(rank, 1);
+        int out_idx = axis;
+        for (int in_idx = 0; in_idx < in_dim.size(); in_idx++) {
+          if (in_dim[in_idx] == out_dims[out_idx] || in_dim[in_idx] == 1) {
+            extended_in_dim[out_idx] = in_dim[in_idx];
+            out_idx++;
           } else {
             PADDLE_THROW(phi::errors::InvalidArgument(
                 "The %d-th dimension of input tensor is expected to be equal "
                 "with the %d-th dimension of output tensor %d or 1, but "
-                "received %d.",
-                in_idx + 1,
-                axis + 1,
+                "received %d. The input's shape is {%s}, the output's shape is "
+                "{%s}.",
+                in_idx,
+                out_idx,
                 out_dims[axis],
-                in_dim[in_idx]));
+                in_dim[in_idx],
+                phi::make_ddim(in_dim),
+                phi::make_ddim(out_dims)));
           }
         }
         in_dim.resize(rank);
-        std::copy(tmp_dim.begin(), tmp_dim.end(), in_dim.begin());
+        std::copy(
+            extended_in_dim.begin(), extended_in_dim.end(), in_dim.begin());
       } else {
-        for (; in_idx < rank;) {
-          if (in_dim[in_idx] == out_dims[in_idx] || in_dim[in_idx] == 1) {
-            in_idx++;
-          } else {
-            PADDLE_THROW(phi::errors::InvalidArgument(
-                "The %d-th dimension of input tensor is expected to be equal "
-                "with the %d-th dimension of output tensor %d or 1, but "
-                "received %d.",
-                in_idx + 1,
-                in_idx + 1,
-                out_dims[in_idx],
-                in_dim[in_idx]));
-          }
+        for (int in_idx = 0; in_idx < rank; in_idx++) {
+          PADDLE_ENFORCE_EQ(
+              in_dim[in_idx] == out_dims[in_idx] || in_dim[in_idx] == 1,
+              true,
+              phi::errors::InvalidArgument(
+                  "The %d-th dimension of input tensor is expected to be equal "
+                  "with the %d-th dimension of output tensor %d or 1, but "
+                  "received %d.",
+                  in_idx,
+                  in_idx,
+                  out_dims[in_idx],
+                  in_dim[in_idx]));
         }
       }
       std::reverse(in_dim.begin(), in_dim.end());
diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias.cu b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias.cu
new file mode 100644
index 0000000000000000000000000000000000000000..308fd276c12be527d8fb21078eb6e95ba2ee4e6b
--- /dev/null
+++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias.cu
@@ -0,0 +1,225 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <mutex>
+#include "cutlass/conv/kernel/default_conv2d_fprop.h"
+#include "cutlass/epilogue/thread/linear_combination_bias_relu.h"
+#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h"
+
+namespace phi {
+namespace fusion {
+namespace cutlass_internal {
+
+template <typename TShape, typename WShape, int Alignment = 8>
+cutlass::Status Conv2dBiasImpl(ConvAllParams params) {
+  using ElementAccumulator = float;
+  using ElementComputeEpilogue = float;
+  using ElementInputA = cutlass::half_t;
+  using ElementInputB = cutlass::half_t;
+  using ElementOutput = cutlass::half_t;
+  using LayoutInputA = cutlass::layout::TensorNHWC;
+  using LayoutInputB = cutlass::layout::TensorNHWC;
+  using LayoutOutput = cutlass::layout::TensorNHWC;
+  using MMAOp = cutlass::arch::OpClassTensorOp;
+  using SmArch = cutlass::arch::Sm75;
+  using ThreadblockShape = TShape;
+  using WarpShape = WShape;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using SwizzleThreadBlock =
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>;
+  constexpr int NumStages = 2;
+  static cutlass::conv::IteratorAlgorithm const IteratorAlgorithm =
+      cutlass::conv::IteratorAlgorithm::kOptimized;
+  using EpilogueOp =
+      cutlass::epilogue::thread::LinearCombination<ElementOutput,
+                                                   Alignment,
+                                                   float,
+                                                   ElementComputeEpilogue>;
+
+  using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
+      ElementInputA,
+      LayoutInputA,
+      ElementInputB,
+      LayoutInputB,
+      ElementOutput,
+      LayoutOutput,
+      ElementAccumulator,
+      MMAOp,
+      SmArch,
+      ThreadblockShape,
+      WarpShape,
+      InstructionShape,
+      EpilogueOp,
+      SwizzleThreadBlock,
+      NumStages,
+      cutlass::arch::OpMultiplyAdd,
+      IteratorAlgorithm,
+      cutlass::conv::StrideSupport::kStrided,
+      Alignment,
+      Alignment>::Kernel;
+  using ImplicitGemm =
+      cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
+
+  const half *input = params.input;
+  const half *weight = params.weight;
+  const half *bias = params.bias;
+  half *output = params.output;
+  int batch = params.batch;
+  int ic = params.ic;
+  int ih = params.ih;
+  int iw = params.iw;
+  int kh = params.kh;
+  int kw = params.kw;
+  int oc = params.oc;
+  int pad_h0 = params.pad_h0;
+  int pad_w0 = params.pad_w0;
+  int stride_h = params.stride_h;
+  int stride_w = params.stride_w;
+
+  int oh = params.oh;
+  int ow = params.ow;
+  int dilation_h = params.dilation_h;
+  int dilation_w = params.dilation_w;
+
+  cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation;
+  cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic},
+                                                {oc, kh, kw, ic},
+                                                {pad_h0, 0, pad_w0, 0},
+                                                {stride_h, stride_w},
+                                                {dilation_h, dilation_w},
+                                                {batch, oh, ow, oc},
+                                                mode,
+                                                1);
+
+  typename ImplicitGemm::Arguments arguments{
+      problem_size,
+      {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}},
+      {(cutlass::half_t *)(weight), {ic, ic * kw, ic * kw * kh}},
+      {(cutlass::half_t *)(bias), {0, 0, 0}},
+      {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}},
+      {1.f, 1.f}};
+
+  ImplicitGemm implicit_gemm_op;
+  size_t bytes = implicit_gemm_op.get_workspace_size(arguments);
+
+  auto ctx = params.ctx;
+  auto stream = ctx->stream();
+  paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
+      paddle::memory::Alloc(
+          ctx->GetPlace(),
+          bytes,
+          phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
+  void *workspace = tmp_gpu_ptrs_data->ptr();
+
+  cutlass::Status status = implicit_gemm_op.can_implement(arguments);
+  CUTLASS_CHECK(status);
+  status = implicit_gemm_op.initialize(arguments, workspace);
+  CUTLASS_CHECK(status);
+  status = implicit_gemm_op(stream);
+  CUTLASS_CHECK(status);
+  return status;
+}
+
+// config 0
+template cutlass::Status Conv2dBiasImpl<cutlass::gemm::GemmShape<64, 64, 64>,
+                                        cutlass::gemm::GemmShape<32, 32, 64>>(
+    ConvAllParams);
+// config 1
+template cutlass::Status Conv2dBiasImpl<cutlass::gemm::GemmShape<64, 32, 64>,
+                                        cutlass::gemm::GemmShape<32, 32, 64>>(
+    ConvAllParams);
+// config 2
+template cutlass::Status Conv2dBiasImpl<cutlass::gemm::GemmShape<128, 32, 64>,
+                                        cutlass::gemm::GemmShape<32, 32, 64>>(
+    ConvAllParams);
+// config 3
+template cutlass::Status Conv2dBiasImpl<cutlass::gemm::GemmShape<128, 64, 64>,
+                                        cutlass::gemm::GemmShape<32, 32, 64>>(
+    ConvAllParams);
+// config 4
+template cutlass::Status Conv2dBiasImpl<cutlass::gemm::GemmShape<64, 64, 32>,
+                                        cutlass::gemm::GemmShape<32, 32, 32>>(
+    ConvAllParams);
+// config 5
+template cutlass::Status Conv2dBiasImpl<cutlass::gemm::GemmShape<64, 128, 32>,
+                                        cutlass::gemm::GemmShape<32, 64, 32>>(
+    ConvAllParams);
+// config 6
+template cutlass::Status Conv2dBiasImpl<cutlass::gemm::GemmShape<64, 128, 64>,
+                                        cutlass::gemm::GemmShape<64, 64, 32>>(
+    ConvAllParams);
+// config 7
+template cutlass::Status Conv2dBiasImpl<cutlass::gemm::GemmShape<64, 256, 32>,
+                                        cutlass::gemm::GemmShape<64, 64, 32>>(
+    ConvAllParams);
+// config 8
+template cutlass::Status Conv2dBiasImpl<cutlass::gemm::GemmShape<128, 64, 32>,
+                                        cutlass::gemm::GemmShape<64, 32, 32>>(
+    ConvAllParams);
+
+std::vector<std::function<cutlass::Status(ConvAllParams)>>
+    conv2d_bias_all_func = {
+        Conv2dBiasImpl<cutlass::gemm::GemmShape<64, 64, 64>,
+                       cutlass::gemm::GemmShape<32, 32, 64>>,
+        Conv2dBiasImpl<cutlass::gemm::GemmShape<64, 32, 64>,
+                       cutlass::gemm::GemmShape<32, 32, 64>>,
+        Conv2dBiasImpl<cutlass::gemm::GemmShape<128, 32, 64>,
+                       cutlass::gemm::GemmShape<32, 32, 64>>,
+        Conv2dBiasImpl<cutlass::gemm::GemmShape<128, 64, 64>,
+                       cutlass::gemm::GemmShape<32, 32, 64>>,
+        Conv2dBiasImpl<cutlass::gemm::GemmShape<64, 64, 32>,
+                       cutlass::gemm::GemmShape<32, 32, 32>>,
+        Conv2dBiasImpl<cutlass::gemm::GemmShape<64, 128, 32>,
+                       cutlass::gemm::GemmShape<32, 64, 32>>,
+        Conv2dBiasImpl<cutlass::gemm::GemmShape<64, 128, 64>,
+                       cutlass::gemm::GemmShape<64, 64, 32>>,
+        Conv2dBiasImpl<cutlass::gemm::GemmShape<64, 256, 32>,
+                       cutlass::gemm::GemmShape<64, 64, 32>>,
+        Conv2dBiasImpl<cutlass::gemm::GemmShape<128, 64, 32>,
+                       cutlass::gemm::GemmShape<64, 32, 32>>};
+
+std::map<std::vector<int>, int> map_problem_conv2d_bias;
+std::mutex conv2d_bias_mutex;
+
+void Conv2dBias(ConvAllParams params) {
+  int batch = params.batch;
+  int ic = params.ic;
+  int ih = params.ih;
+  int iw = params.iw;
+  int kh = params.kh;
+  int kw = params.kw;
+  int oc = params.oc;
+  int pad_h0 = params.pad_h0;
+  int pad_w0 = params.pad_w0;
+  int stride_h = params.stride_h;
+  int stride_w = params.stride_w;
+
+  std::vector<int> problem_size = {
+      batch, ic, ih, iw, kh, kw, oc, pad_h0, pad_w0, stride_h, stride_w};
+
+  if (map_problem_conv2d_bias.count(problem_size)) {
+    conv2d_bias_all_func[map_problem_conv2d_bias.at(problem_size)](params);
+    return;
+  }
+
+  int best_config_index =
+      ProfileToGetBestConfig(conv2d_bias_all_func, params, CONV2D_BIAS);
+
+  std::lock_guard<std::mutex> guard(conv2d_bias_mutex);
+  map_problem_conv2d_bias[problem_size] = best_config_index;
+  conv2d_bias_all_func[best_config_index](params);
+}
+}  // namespace cutlass_internal
+}  // namespace fusion
+}  // namespace phi
diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_add_relu.cu b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_add_relu.cu
new file mode 100644
index 0000000000000000000000000000000000000000..3fac4f5673b7f0bb09b3d1afca4213610227a1c0
--- /dev/null
+++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_add_relu.cu
@@ -0,0 +1,248 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <mutex>
+#include "cutlass/conv/kernel/default_conv2d_fprop_with_broadcast.h"
+#include "cutlass/epilogue/thread/linear_combination_residual_block.h"
+#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h"
+
+namespace phi {
+namespace fusion {
+
+namespace cutlass_internal {
+
+template <typename TShape, typename WShape, int Alignment = 8>
+cutlass::Status Conv2dBiasAddReluImpl(ConvAllParams params) {
+  using EpilogueOp = cutlass::epilogue::thread::LinearCombinationResidualBlock<
+      cutlass::half_t,
+      float,
+      float,
+      cutlass::half_t,
+      Alignment,
+      cutlass::epilogue::thread::Identity,
+      cutlass::plus,
+      cutlass::epilogue::thread::ReLu>;
+
+  using Conv2dFpropKernel =
+      typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast<
+          cutlass::half_t,
+          cutlass::layout::TensorNHWC,
+          cutlass::half_t,
+          cutlass::layout::TensorNHWC,
+          cutlass::half_t,
+          cutlass::layout::TensorNHWC,
+          float,
+          cutlass::arch::OpClassTensorOp,
+          cutlass::arch::Sm75,
+          TShape,
+          WShape,
+          cutlass::gemm::GemmShape<16, 8, 8>,
+          EpilogueOp,
+          cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>,
+          2,
+          cutlass::arch::OpMultiplyAdd,
+          cutlass::conv::IteratorAlgorithm::kOptimized,
+          cutlass::conv::StrideSupport::kStrided,
+          Alignment,
+          Alignment>::Kernel;
+
+  using ImplicitGemm =
+      cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
+
+  const half *input = params.input;
+  const half *weight = params.weight;
+  const half *bias = params.bias;
+
+  half *output = params.output;
+  int batch = params.batch;
+  int ic = params.ic;
+  int ih = params.ih;
+  int iw = params.iw;
+  int kh = params.kh;
+  int kw = params.kw;
+  int oc = params.oc;
+  int pad_h0 = params.pad_h0;
+  int pad_w0 = params.pad_w0;
+  int stride_h = params.stride_h;
+  int stride_w = params.stride_w;
+  const half *residual = params.residual;
+
+  int oh = params.oh;
+  int ow = params.ow;
+  int dilation_h = params.dilation_h;
+  int dilation_w = params.dilation_w;
+
+  cutlass::conv::Conv2dProblemSize problem_size(
+      {batch, ih, iw, ic},
+      {oc, kh, kw, ic},
+      {pad_h0, 0, pad_w0, 0},
+      {stride_h, stride_w},
+      {dilation_h, dilation_w},
+      {batch, oh, ow, oc},
+      cutlass::conv::Mode::kCrossCorrelation,
+      1);
+
+  typename ImplicitGemm::Arguments arguments{
+      problem_size,
+      {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}},
+      {(cutlass::half_t *)weight, {ic, ic * kw, ic * kw * kh}},
+      {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}},
+      {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}},
+      {1.f, 1.f},
+      cutlass::conv::SplitKMode::kSerial,
+      (cutlass::half_t *)(bias),
+      nullptr,
+      0,
+      oc};
+
+  ImplicitGemm implicit_gemm_op;
+  size_t bytes = implicit_gemm_op.get_workspace_size(arguments);
+
+  auto ctx = params.ctx;
+  auto stream = ctx->stream();
+  paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
+      paddle::memory::Alloc(
+          ctx->GetPlace(),
+          bytes,
+          phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
+  void *workspace = tmp_gpu_ptrs_data->ptr();
+
+  cutlass::Status status = implicit_gemm_op.can_implement(arguments);
+  CUTLASS_CHECK(status);
+  status = implicit_gemm_op.initialize(arguments, workspace);
+  CUTLASS_CHECK(status);
+  status = implicit_gemm_op(stream);
+  CUTLASS_CHECK(status);
+  return status;
+}
+
+// config 0
+template cutlass::Status
+    Conv2dBiasAddReluImpl<cutlass::gemm::GemmShape<64, 64, 64>,
+                          cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams);
+// config 1
+template cutlass::Status
+    Conv2dBiasAddReluImpl<cutlass::gemm::GemmShape<64, 32, 64>,
+                          cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams);
+// config 2
+template cutlass::Status
+    Conv2dBiasAddReluImpl<cutlass::gemm::GemmShape<128, 32, 64>,
+                          cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams);
+// config 3
+template cutlass::Status
+    Conv2dBiasAddReluImpl<cutlass::gemm::GemmShape<128, 64, 64>,
+                          cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams);
+// config 4
+template cutlass::Status
+    Conv2dBiasAddReluImpl<cutlass::gemm::GemmShape<64, 64, 32>,
+                          cutlass::gemm::GemmShape<32, 32, 32>>(ConvAllParams);
+// config 5
+template cutlass::Status
+    Conv2dBiasAddReluImpl<cutlass::gemm::GemmShape<64, 128, 32>,
+                          cutlass::gemm::GemmShape<32, 64, 32>>(ConvAllParams);
+// config 6
+template cutlass::Status
+    Conv2dBiasAddReluImpl<cutlass::gemm::GemmShape<64, 128, 64>,
+                          cutlass::gemm::GemmShape<64, 64, 32>>(ConvAllParams);
+// config 7
+template cutlass::Status
+    Conv2dBiasAddReluImpl<cutlass::gemm::GemmShape<64, 256, 32>,
+                          cutlass::gemm::GemmShape<64, 64, 32>>(ConvAllParams);
+// config 8
+template cutlass::Status
+    Conv2dBiasAddReluImpl<cutlass::gemm::GemmShape<128, 64, 32>,
+                          cutlass::gemm::GemmShape<64, 32, 32>>(ConvAllParams);
+// config 9
+template cutlass::Status
+    Conv2dBiasAddReluImpl<cutlass::gemm::GemmShape<128, 128, 32>,
+                          cutlass::gemm::GemmShape<64, 64, 32>>(ConvAllParams);
+// config 10
+template cutlass::Status
+    Conv2dBiasAddReluImpl<cutlass::gemm::GemmShape<128, 256, 32>,
+                          cutlass::gemm::GemmShape<64, 64, 32>>(ConvAllParams);
+// config 11
+template cutlass::Status
+    Conv2dBiasAddReluImpl<cutlass::gemm::GemmShape<256, 64, 32>,
+                          cutlass::gemm::GemmShape<64, 64, 32>>(ConvAllParams);
+// config 12
+template cutlass::Status
+    Conv2dBiasAddReluImpl<cutlass::gemm::GemmShape<256, 128, 32>,
+                          cutlass::gemm::GemmShape<64, 64, 32>>(ConvAllParams);
+
+std::vector<std::function<cutlass::Status(ConvAllParams)>>
+    conv2d_bias_add_relu_all_func = {
+        Conv2dBiasAddReluImpl<cutlass::gemm::GemmShape<64, 64, 32>,
+                              cutlass::gemm::GemmShape<32, 32, 32>>,
+        Conv2dBiasAddReluImpl<cutlass::gemm::GemmShape<64, 64, 64>,
+                              cutlass::gemm::GemmShape<32, 32, 64>>,
+        Conv2dBiasAddReluImpl<cutlass::gemm::GemmShape<64, 32, 64>,
+                              cutlass::gemm::GemmShape<32, 32, 64>>,
+        Conv2dBiasAddReluImpl<cutlass::gemm::GemmShape<128, 32, 64>,
+                              cutlass::gemm::GemmShape<32, 32, 64>>,
+        Conv2dBiasAddReluImpl<cutlass::gemm::GemmShape<128, 64, 64>,
+                              cutlass::gemm::GemmShape<32, 32, 64>>,
+        Conv2dBiasAddReluImpl<cutlass::gemm::GemmShape<64, 128, 32>,
+                              cutlass::gemm::GemmShape<32, 64, 32>>,
+        Conv2dBiasAddReluImpl<cutlass::gemm::GemmShape<64, 128, 64>,
+                              cutlass::gemm::GemmShape<64, 64, 32>>,
+        Conv2dBiasAddReluImpl<cutlass::gemm::GemmShape<64, 256, 32>,
+                              cutlass::gemm::GemmShape<64, 64, 32>>,
+        Conv2dBiasAddReluImpl<cutlass::gemm::GemmShape<128, 64, 32>,
+                              cutlass::gemm::GemmShape<64, 32, 32>>,
+        Conv2dBiasAddReluImpl<cutlass::gemm::GemmShape<128, 128, 32>,
+                              cutlass::gemm::GemmShape<64, 64, 32>>,
+        Conv2dBiasAddReluImpl<cutlass::gemm::GemmShape<128, 256, 32>,
+                              cutlass::gemm::GemmShape<64, 64, 32>>,
+        Conv2dBiasAddReluImpl<cutlass::gemm::GemmShape<256, 64, 32>,
+                              cutlass::gemm::GemmShape<64, 64, 32>>,
+        Conv2dBiasAddReluImpl<cutlass::gemm::GemmShape<256, 128, 32>,
+                              cutlass::gemm::GemmShape<64, 64, 32>>};
+std::map<std::vector<int>, int> map_problem_conv2d_bias_add_relu;
+std::mutex conv2d_bias_add_relu_mutex;
+
+void Conv2dBiasAddRelu(ConvAllParams params) {
+  int batch = params.batch;
+  int ic = params.ic;
+  int ih = params.ih;
+  int iw = params.iw;
+  int kh = params.kh;
+  int kw = params.kw;
+  int oc = params.oc;
+  int pad_h0 = params.pad_h0;
+  int pad_w0 = params.pad_w0;
+  int stride_h = params.stride_h;
+  int stride_w = params.stride_w;
+
+  std::vector<int> problem_size = {
+      batch, ic, ih, iw, kh, kw, oc, pad_h0, pad_w0, stride_h, stride_w};
+
+  if (map_problem_conv2d_bias_add_relu.count(problem_size)) {
+    conv2d_bias_add_relu_all_func[map_problem_conv2d_bias_add_relu.at(
+        problem_size)](params);
+    return;
+  }
+
+  std::lock_guard<std::mutex> guard(conv2d_bias_add_relu_mutex);
+
+  // config 6's diff is large.
+  conv2d_bias_add_relu_all_func[6] = nullptr;
+
+  int best_config_index = ProfileToGetBestConfig(
+      conv2d_bias_add_relu_all_func, params, CONV2D_BIAS_ADD_RELU);
+  map_problem_conv2d_bias_add_relu[problem_size] = best_config_index;
+  conv2d_bias_add_relu_all_func[best_config_index](params);
+}
+}  // namespace cutlass_internal
+}  // namespace fusion
+}  // namespace phi
diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_leaky_relu.cu b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_leaky_relu.cu
new file mode 100644
index 0000000000000000000000000000000000000000..97ca75e477644ccddf83d4dd24b3b1b98cc04769
--- /dev/null
+++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_leaky_relu.cu
@@ -0,0 +1,226 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <mutex>
+#include "cutlass/conv/kernel/default_conv2d_fprop.h"
+#include "cutlass/epilogue/thread/linear_combination_leaky_relu.h"
+#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h"
+
+namespace phi {
+namespace fusion {
+namespace cutlass_internal {
+template <typename TShape, typename WShape, int Alignment = 8>
+cutlass::Status Conv2dBiasLeakyReluImpl(ConvAllParams params) {
+  using ElementAccumulator = float;
+  using ElementComputeEpilogue = float;
+  using ElementInputA = cutlass::half_t;
+  using ElementInputB = cutlass::half_t;
+  using ElementOutput = cutlass::half_t;
+  using LayoutInputA = cutlass::layout::TensorNHWC;
+  using LayoutInputB = cutlass::layout::TensorNHWC;
+  using LayoutOutput = cutlass::layout::TensorNHWC;
+  using MMAOp = cutlass::arch::OpClassTensorOp;
+  using SmArch = cutlass::arch::Sm75;
+  using ThreadblockShape = TShape;
+  using WarpShape = WShape;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using SwizzleThreadBlock =
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>;
+  constexpr int NumStages = 2;
+  static cutlass::conv::IteratorAlgorithm const IteratorAlgorithm =
+      cutlass::conv::IteratorAlgorithm::kOptimized;
+  using EpilogueOp = cutlass::epilogue::thread::LinearCombinationLeakyRelu<
+      ElementOutput,
+      Alignment,
+      float,
+      ElementComputeEpilogue>;
+
+  using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
+      ElementInputA,
+      LayoutInputA,
+      ElementInputB,
+      LayoutInputB,
+      ElementOutput,
+      LayoutOutput,
+      ElementAccumulator,
+      MMAOp,
+      SmArch,
+      ThreadblockShape,
+      WarpShape,
+      InstructionShape,
+      EpilogueOp,
+      SwizzleThreadBlock,
+      NumStages,
+      cutlass::arch::OpMultiplyAdd,
+      IteratorAlgorithm,
+      cutlass::conv::StrideSupport::kStrided,
+      Alignment,
+      Alignment>::Kernel;
+  using ImplicitGemm =
+      cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
+
+  const half *input = params.input;
+  const half *weight = params.weight;
+  const half *bias = params.bias;
+  half *output = params.output;
+  int batch = params.batch;
+  int ic = params.ic;
+  int ih = params.ih;
+  int iw = params.iw;
+  int kh = params.kh;
+  int kw = params.kw;
+  int oc = params.oc;
+  int pad_h0 = params.pad_h0;
+  int pad_w0 = params.pad_w0;
+  int stride_h = params.stride_h;
+  int stride_w = params.stride_w;
+  float alpha = params.alpha;
+
+  int oh = params.oh;
+  int ow = params.ow;
+  int dilation_h = params.dilation_h;
+  int dilation_w = params.dilation_w;
+
+  cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation;
+  cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic},
+                                                {oc, kh, kw, ic},
+                                                {pad_h0, 0, pad_w0, 0},
+                                                {stride_h, stride_w},
+                                                {dilation_h, dilation_w},
+                                                {batch, oh, ow, oc},
+                                                mode,
+                                                1);
+
+  typename ImplicitGemm::Arguments arguments{
+      problem_size,
+      {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}},
+      {(cutlass::half_t *)(weight), {ic, ic * kw, ic * kw * kh}},
+      {(cutlass::half_t *)(bias), {0, 0, 0}},
+      {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}},
+      {1.f, 1.f, alpha}};
+
+  ImplicitGemm implicit_gemm_op;
+  size_t bytes = implicit_gemm_op.get_workspace_size(arguments);
+
+  auto ctx = params.ctx;
+  auto stream = ctx->stream();
+  paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
+      paddle::memory::Alloc(
+          ctx->GetPlace(),
+          bytes,
+          phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
+  void *workspace = tmp_gpu_ptrs_data->ptr();
+
+  cutlass::Status status = implicit_gemm_op.can_implement(arguments);
+  CUTLASS_CHECK(status);
+  status = implicit_gemm_op.initialize(arguments, workspace);
+  CUTLASS_CHECK(status);
+  status = implicit_gemm_op(stream);
+  CUTLASS_CHECK(status);
+  return status;
+}
+
+// config 0
+template cutlass::Status Conv2dBiasLeakyReluImpl<
+    cutlass::gemm::GemmShape<64, 64, 64>,
+    cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams);
+// config 1
+template cutlass::Status Conv2dBiasLeakyReluImpl<
+    cutlass::gemm::GemmShape<64, 32, 64>,
+    cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams);
+// config 2
+template cutlass::Status Conv2dBiasLeakyReluImpl<
+    cutlass::gemm::GemmShape<128, 32, 64>,
+    cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams);
+// config 3
+template cutlass::Status Conv2dBiasLeakyReluImpl<
+    cutlass::gemm::GemmShape<128, 64, 64>,
+    cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams);
+// config 4
+template cutlass::Status Conv2dBiasLeakyReluImpl<
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<32, 32, 32>>(ConvAllParams);
+// config 5
+template cutlass::Status Conv2dBiasLeakyReluImpl<
+    cutlass::gemm::GemmShape<64, 128, 32>,
+    cutlass::gemm::GemmShape<32, 64, 32>>(ConvAllParams);
+// config 6
+template cutlass::Status Conv2dBiasLeakyReluImpl<
+    cutlass::gemm::GemmShape<64, 128, 64>,
+    cutlass::gemm::GemmShape<64, 64, 32>>(ConvAllParams);
+// config 7
+template cutlass::Status Conv2dBiasLeakyReluImpl<
+    cutlass::gemm::GemmShape<64, 256, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>>(ConvAllParams);
+// config 8
+template cutlass::Status Conv2dBiasLeakyReluImpl<
+    cutlass::gemm::GemmShape<128, 64, 32>,
+    cutlass::gemm::GemmShape<64, 32, 32>>(ConvAllParams);
+
+std::vector<std::function<cutlass::Status(ConvAllParams)>>
+    conv2d_bias_leaky_relu_all_func = {
+        Conv2dBiasLeakyReluImpl<cutlass::gemm::GemmShape<64, 64, 64>,
+                                cutlass::gemm::GemmShape<32, 32, 64>>,
+        Conv2dBiasLeakyReluImpl<cutlass::gemm::GemmShape<64, 32, 64>,
+                                cutlass::gemm::GemmShape<32, 32, 64>>,
+        Conv2dBiasLeakyReluImpl<cutlass::gemm::GemmShape<128, 32, 64>,
+                                cutlass::gemm::GemmShape<32, 32, 64>>,
+        Conv2dBiasLeakyReluImpl<cutlass::gemm::GemmShape<128, 64, 64>,
+                                cutlass::gemm::GemmShape<32, 32, 64>>,
+        Conv2dBiasLeakyReluImpl<cutlass::gemm::GemmShape<64, 64, 32>,
+                                cutlass::gemm::GemmShape<32, 32, 32>>,
+        Conv2dBiasLeakyReluImpl<cutlass::gemm::GemmShape<64, 128, 32>,
+                                cutlass::gemm::GemmShape<32, 64, 32>>,
+        Conv2dBiasLeakyReluImpl<cutlass::gemm::GemmShape<64, 128, 64>,
+                                cutlass::gemm::GemmShape<64, 64, 32>>,
+        Conv2dBiasLeakyReluImpl<cutlass::gemm::GemmShape<64, 256, 32>,
+                                cutlass::gemm::GemmShape<64, 64, 32>>,
+        Conv2dBiasLeakyReluImpl<cutlass::gemm::GemmShape<128, 64, 32>,
+                                cutlass::gemm::GemmShape<64, 32, 32>>};
+
+std::map<std::vector<int>, int> map_problem_conv2d_bias_leaky_relu;
+std::mutex conv2d_bias_leaky_relu_mutex;
+
+void Conv2dBiasLeakyRelu(ConvAllParams params) {
+  int batch = params.batch;
+  int ic = params.ic;
+  int ih = params.ih;
+  int iw = params.iw;
+  int kh = params.kh;
+  int kw = params.kw;
+  int oc = params.oc;
+  int pad_h0 = params.pad_h0;
+  int pad_w0 = params.pad_w0;
+  int stride_h = params.stride_h;
+  int stride_w = params.stride_w;
+
+  std::vector<int> problem_size = {
+      batch, ic, ih, iw, kh, kw, oc, pad_h0, pad_w0, stride_h, stride_w};
+
+  if (map_problem_conv2d_bias_leaky_relu.count(problem_size)) {
+    conv2d_bias_leaky_relu_all_func[map_problem_conv2d_bias_leaky_relu.at(
+        problem_size)](params);
+    return;
+  }
+
+  int best_config_index = ProfileToGetBestConfig(
+      conv2d_bias_leaky_relu_all_func, params, CONV2D_BIAS_LEAKY_RELU);
+
+  std::lock_guard<std::mutex> guard(conv2d_bias_leaky_relu_mutex);
+  map_problem_conv2d_bias_leaky_relu[problem_size] = best_config_index;
+  conv2d_bias_leaky_relu_all_func[best_config_index](params);
+}
+}  // namespace cutlass_internal
+}  // namespace fusion
+}  // namespace phi
diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_relu.cu b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_relu.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a5f5a9bee12c644b12d5b493407e17b7ccaef6e1
--- /dev/null
+++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_relu.cu
@@ -0,0 +1,225 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <mutex>
+#include "cutlass/conv/kernel/default_conv2d_fprop.h"
+#include "cutlass/epilogue/thread/linear_combination_bias_relu.h"
+#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h"
+
+namespace phi {
+namespace fusion {
+namespace cutlass_internal {
+template <typename TShape, typename WShape, int Alignment = 8>
+cutlass::Status Conv2dBiasReluImpl(ConvAllParams params) {
+  using ElementAccumulator = float;
+  using ElementComputeEpilogue = float;
+  using ElementInputA = cutlass::half_t;
+  using ElementInputB = cutlass::half_t;
+  using ElementOutput = cutlass::half_t;
+  using LayoutInputA = cutlass::layout::TensorNHWC;
+  using LayoutInputB = cutlass::layout::TensorNHWC;
+  using LayoutOutput = cutlass::layout::TensorNHWC;
+  using MMAOp = cutlass::arch::OpClassTensorOp;
+  using SmArch = cutlass::arch::Sm75;
+  using ThreadblockShape = TShape;
+  using WarpShape = WShape;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using SwizzleThreadBlock =
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>;
+  constexpr int NumStages = 2;
+  static cutlass::conv::IteratorAlgorithm const IteratorAlgorithm =
+      cutlass::conv::IteratorAlgorithm::kOptimized;
+  using EpilogueOp =
+      cutlass::epilogue::thread::LinearCombinationRelu<ElementOutput,
+                                                       Alignment,
+                                                       float,
+                                                       ElementComputeEpilogue>;
+
+  using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
+      ElementInputA,
+      LayoutInputA,
+      ElementInputB,
+      LayoutInputB,
+      ElementOutput,
+      LayoutOutput,
+      ElementAccumulator,
+      MMAOp,
+      SmArch,
+      ThreadblockShape,
+      WarpShape,
+      InstructionShape,
+      EpilogueOp,
+      SwizzleThreadBlock,
+      NumStages,
+      cutlass::arch::OpMultiplyAdd,
+      IteratorAlgorithm,
+      cutlass::conv::StrideSupport::kStrided,
+      Alignment,
+      Alignment>::Kernel;
+  using ImplicitGemm =
+      cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
+
+  const half *input = params.input;
+  const half *weight = params.weight;
+  const half *bias = params.bias;
+  half *output = params.output;
+  int batch = params.batch;
+  int ic = params.ic;
+  int ih = params.ih;
+  int iw = params.iw;
+  int kh = params.kh;
+  int kw = params.kw;
+  int oc = params.oc;
+  int pad_h0 = params.pad_h0;
+  int pad_w0 = params.pad_w0;
+
+  int stride_h = params.stride_h;
+  int stride_w = params.stride_w;
+  int oh = params.oh;
+  int ow = params.ow;
+  int dilation_h = params.dilation_h;
+  int dilation_w = params.dilation_w;
+
+  cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation;
+  cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic},
+                                                {oc, kh, kw, ic},
+                                                {pad_h0, 0, pad_w0, 0},
+                                                {stride_h, stride_w},
+                                                {dilation_h, dilation_w},
+                                                {batch, oh, ow, oc},
+                                                mode,
+                                                1);
+
+  typename ImplicitGemm::Arguments arguments{
+      problem_size,
+      {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}},
+      {(cutlass::half_t *)(weight), {ic, ic * kw, ic * kw * kh}},
+      {(cutlass::half_t *)(bias), {0, 0, 0}},
+      {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}},
+      {1.f, 1.f}};
+
+  ImplicitGemm implicit_gemm_op;
+  size_t bytes = implicit_gemm_op.get_workspace_size(arguments);
+
+  auto ctx = params.ctx;
+  auto stream = ctx->stream();
+  paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
+      paddle::memory::Alloc(
+          ctx->GetPlace(),
+          bytes,
+          phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
+  void *workspace = tmp_gpu_ptrs_data->ptr();
+
+  cutlass::Status status = implicit_gemm_op.can_implement(arguments);
+  CUTLASS_CHECK(status);
+  status = implicit_gemm_op.initialize(arguments, workspace);
+  CUTLASS_CHECK(status);
+  status = implicit_gemm_op(stream);
+  CUTLASS_CHECK(status);
+  return status;
+}
+
+// config 0
+template cutlass::Status
+    Conv2dBiasReluImpl<cutlass::gemm::GemmShape<64, 64, 64>,
+                       cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams);
+// config 1
+template cutlass::Status
+    Conv2dBiasReluImpl<cutlass::gemm::GemmShape<64, 32, 64>,
+                       cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams);
+// config 2
+template cutlass::Status
+    Conv2dBiasReluImpl<cutlass::gemm::GemmShape<128, 32, 64>,
+                       cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams);
+// config 3
+template cutlass::Status
+    Conv2dBiasReluImpl<cutlass::gemm::GemmShape<128, 64, 64>,
+                       cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams);
+// config 4
+template cutlass::Status
+    Conv2dBiasReluImpl<cutlass::gemm::GemmShape<64, 64, 32>,
+                       cutlass::gemm::GemmShape<32, 32, 32>>(ConvAllParams);
+// config 5
+template cutlass::Status
+    Conv2dBiasReluImpl<cutlass::gemm::GemmShape<64, 128, 32>,
+                       cutlass::gemm::GemmShape<32, 64, 32>>(ConvAllParams);
+// config 6
+template cutlass::Status
+    Conv2dBiasReluImpl<cutlass::gemm::GemmShape<64, 128, 64>,
+                       cutlass::gemm::GemmShape<64, 64, 32>>(ConvAllParams);
+// config 7
+template cutlass::Status
+    Conv2dBiasReluImpl<cutlass::gemm::GemmShape<64, 256, 32>,
+                       cutlass::gemm::GemmShape<64, 64, 32>>(ConvAllParams);
+// config 8
+template cutlass::Status
+    Conv2dBiasReluImpl<cutlass::gemm::GemmShape<128, 64, 32>,
+                       cutlass::gemm::GemmShape<64, 32, 32>>(ConvAllParams);
+
+std::vector<std::function<cutlass::Status(ConvAllParams)>>
+    conv2d_bias_relu_all_func = {
+        Conv2dBiasReluImpl<cutlass::gemm::GemmShape<64, 64, 64>,
+                           cutlass::gemm::GemmShape<32, 32, 64>>,
+        Conv2dBiasReluImpl<cutlass::gemm::GemmShape<64, 32, 64>,
+                           cutlass::gemm::GemmShape<32, 32, 64>>,
+        Conv2dBiasReluImpl<cutlass::gemm::GemmShape<128, 32, 64>,
+                           cutlass::gemm::GemmShape<32, 32, 64>>,
+        Conv2dBiasReluImpl<cutlass::gemm::GemmShape<128, 64, 64>,
+                           cutlass::gemm::GemmShape<32, 32, 64>>,
+        Conv2dBiasReluImpl<cutlass::gemm::GemmShape<64, 64, 32>,
+                           cutlass::gemm::GemmShape<32, 32, 32>>,
+        Conv2dBiasReluImpl<cutlass::gemm::GemmShape<64, 128, 32>,
+                           cutlass::gemm::GemmShape<32, 64, 32>>,
+        Conv2dBiasReluImpl<cutlass::gemm::GemmShape<64, 128, 64>,
+                           cutlass::gemm::GemmShape<64, 64, 32>>,
+        Conv2dBiasReluImpl<cutlass::gemm::GemmShape<64, 256, 32>,
+                           cutlass::gemm::GemmShape<64, 64, 32>>,
+        Conv2dBiasReluImpl<cutlass::gemm::GemmShape<128, 64, 32>,
+                           cutlass::gemm::GemmShape<64, 32, 32>>};
+std::map<std::vector<int>, int> map_problem_conv2d_bias_relu;
+std::mutex conv2d_bias_relu_mutex;
+
+void Conv2dBiasRelu(ConvAllParams params) {
+  int batch = params.batch;
+  int ic = params.ic;
+  int ih = params.ih;
+  int iw = params.iw;
+  int kh = params.kh;
+  int kw = params.kw;
+  int oc = params.oc;
+  int pad_h0 = params.pad_h0;
+  int pad_w0 = params.pad_w0;
+
+  int stride_h = params.stride_h;
+  int stride_w = params.stride_w;
+
+  std::vector<int> problem_size = {
+      batch, ic, ih, iw, kh, kw, oc, pad_h0, pad_w0, stride_h, stride_w};
+
+  if (map_problem_conv2d_bias_relu.count(problem_size)) {
+    conv2d_bias_relu_all_func[map_problem_conv2d_bias_relu.at(problem_size)](
+        params);
+    return;
+  }
+
+  int best_config_index = ProfileToGetBestConfig(
+      conv2d_bias_relu_all_func, params, CONV2D_BIAS_RELU);
+
+  std::lock_guard<std::mutex> guard(conv2d_bias_relu_mutex);
+  map_problem_conv2d_bias_relu[problem_size] = best_config_index;
+  conv2d_bias_relu_all_func[best_config_index](params);
+}
+}  // namespace cutlass_internal
+}  // namespace fusion
+}  // namespace phi
diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_relu_few_channels.cu b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_relu_few_channels.cu
new file mode 100644
index 0000000000000000000000000000000000000000..1acd191033529eb3b0aff8c616fec21be59f5265
--- /dev/null
+++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_relu_few_channels.cu
@@ -0,0 +1,218 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <mutex>
+#include "cutlass/conv/kernel/default_conv2d_fprop.h"
+#include "cutlass/epilogue/thread/linear_combination_bias_relu.h"
+#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h"
+
+namespace phi {
+namespace fusion {
+namespace cutlass_internal {
+template <typename TShape, typename WShape, int Alignment = 1>
+cutlass::Status Conv2dBiasReluFewChannelsImpl(ConvAllParams params) {
+  using ElementAccumulator = float;
+  using ElementComputeEpilogue = float;
+  using ElementInputA = cutlass::half_t;
+  using ElementInputB = cutlass::half_t;
+  using ElementOutput = cutlass::half_t;
+  using LayoutInputA = cutlass::layout::TensorNHWC;
+  using LayoutInputB = cutlass::layout::TensorNHWC;
+  using LayoutOutput = cutlass::layout::TensorNHWC;
+  using MMAOp = cutlass::arch::OpClassTensorOp;
+  using SmArch = cutlass::arch::Sm75;
+  using ThreadblockShape = TShape;
+  using WarpShape = WShape;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using SwizzleThreadBlock =
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>;
+  constexpr int NumStages = 2;
+  static cutlass::conv::IteratorAlgorithm const IteratorAlgorithm =
+      cutlass::conv::IteratorAlgorithm::kFewChannels;
+  using EpilogueOp =
+      cutlass::epilogue::thread::LinearCombinationRelu<ElementOutput,
+                                                       Alignment,
+                                                       float,
+                                                       ElementComputeEpilogue>;
+
+  using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
+      ElementInputA,
+      LayoutInputA,
+      ElementInputB,
+      LayoutInputB,
+      ElementOutput,
+      LayoutOutput,
+      ElementAccumulator,
+      MMAOp,
+      SmArch,
+      ThreadblockShape,
+      WarpShape,
+      InstructionShape,
+      EpilogueOp,
+      SwizzleThreadBlock,
+      NumStages,
+      cutlass::arch::OpMultiplyAdd,
+      IteratorAlgorithm,
+      cutlass::conv::StrideSupport::kStrided,
+      Alignment,
+      Alignment>::Kernel;
+  using ImplicitGemm =
+      cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
+
+  const half *input = params.input;
+  const half *weight = params.weight;
+  const half *bias = params.bias;
+  half *output = params.output;
+  int batch = params.batch;
+  int ic = params.ic;
+  int ih = params.ih;
+  int iw = params.iw;
+  int kh = params.kh;
+  int kw = params.kw;
+  int oc = params.oc;
+  int pad_h0 = params.pad_h0;
+  int pad_w0 = params.pad_w1;
+
+  int stride_h = params.stride_h;
+  int stride_w = params.stride_w;
+
+  int oh = params.oh;
+  int ow = params.ow;
+  int dilation_h = params.dilation_h;
+  int dilation_w = params.dilation_w;
+
+  cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation;
+  cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic},
+                                                {oc, kh, kw, ic},
+                                                {pad_h0, 0, pad_w0, 0},
+                                                {stride_h, stride_w},
+                                                {dilation_h, dilation_w},
+                                                {batch, oh, ow, oc},
+                                                mode,
+                                                1);
+
+  typename ImplicitGemm::Arguments arguments{
+      problem_size,
+      {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}},
+      {(cutlass::half_t *)(weight), {ic, ic * kw, ic * kw * kh}},
+      {(cutlass::half_t *)(bias), {0, 0, 0}},
+      {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}},
+      {1.f, 1.f}};
+
+  ImplicitGemm implicit_gemm_op;
+  size_t bytes = implicit_gemm_op.get_workspace_size(arguments);
+
+  auto ctx = params.ctx;
+  auto stream = ctx->stream();
+  paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
+      paddle::memory::Alloc(
+          ctx->GetPlace(),
+          bytes,
+          phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
+  void *workspace = tmp_gpu_ptrs_data->ptr();
+
+  cutlass::Status status = implicit_gemm_op.can_implement(arguments);
+  CUTLASS_CHECK(status);
+  status = implicit_gemm_op.initialize(arguments, workspace);
+  CUTLASS_CHECK(status);
+  status = implicit_gemm_op(stream);
+  CUTLASS_CHECK(status);
+  return status;
+}
+
+// config 0
+template cutlass::Status Conv2dBiasReluFewChannelsImpl<
+    cutlass::gemm::GemmShape<64, 64, 64>,
+    cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams);
+// config 1
+template cutlass::Status Conv2dBiasReluFewChannelsImpl<
+    cutlass::gemm::GemmShape<64, 32, 64>,
+    cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams);
+// config 2
+template cutlass::Status Conv2dBiasReluFewChannelsImpl<
+    cutlass::gemm::GemmShape<128, 32, 64>,
+    cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams);
+// config 3
+template cutlass::Status Conv2dBiasReluFewChannelsImpl<
+    cutlass::gemm::GemmShape<128, 64, 64>,
+    cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams);
+// config 4
+template cutlass::Status Conv2dBiasReluFewChannelsImpl<
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<32, 32, 32>>(ConvAllParams);
+// config 5
+template cutlass::Status Conv2dBiasReluFewChannelsImpl<
+    cutlass::gemm::GemmShape<64, 128, 32>,
+    cutlass::gemm::GemmShape<32, 64, 32>>(ConvAllParams);
+// config 6
+template cutlass::Status Conv2dBiasReluFewChannelsImpl<
+    cutlass::gemm::GemmShape<64, 128, 64>,
+    cutlass::gemm::GemmShape<64, 64, 32>>(ConvAllParams);
+// config 7
+template cutlass::Status Conv2dBiasReluFewChannelsImpl<
+    cutlass::gemm::GemmShape<64, 256, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>>(ConvAllParams);
+// config 8
+template cutlass::Status Conv2dBiasReluFewChannelsImpl<
+    cutlass::gemm::GemmShape<128, 64, 32>,
+    cutlass::gemm::GemmShape<64, 32, 32>>(ConvAllParams);
+
+std::vector<std::function<cutlass::Status(ConvAllParams)>>
+    conv2d_bias_relu_few_channels_all_func = {
+        Conv2dBiasReluFewChannelsImpl<cutlass::gemm::GemmShape<64, 64, 64>,
+                                      cutlass::gemm::GemmShape<32, 32, 64>>,
+        Conv2dBiasReluFewChannelsImpl<cutlass::gemm::GemmShape<64, 32, 64>,
+                                      cutlass::gemm::GemmShape<32, 32, 64>>,
+        Conv2dBiasReluFewChannelsImpl<cutlass::gemm::GemmShape<128, 32, 64>,
+                                      cutlass::gemm::GemmShape<32, 32, 64>>,
+        Conv2dBiasReluFewChannelsImpl<cutlass::gemm::GemmShape<128, 64, 64>,
+                                      cutlass::gemm::GemmShape<32, 32, 64>>,
+        Conv2dBiasReluFewChannelsImpl<cutlass::gemm::GemmShape<64, 64, 32>,
+                                      cutlass::gemm::GemmShape<32, 32, 32>>,
+        Conv2dBiasReluFewChannelsImpl<cutlass::gemm::GemmShape<64, 128, 32>,
+                                      cutlass::gemm::GemmShape<32, 64, 32>>,
+        Conv2dBiasReluFewChannelsImpl<cutlass::gemm::GemmShape<64, 128, 64>,
+                                      cutlass::gemm::GemmShape<64, 64, 32>>,
+        Conv2dBiasReluFewChannelsImpl<cutlass::gemm::GemmShape<64, 256, 32>,
+                                      cutlass::gemm::GemmShape<64, 64, 32>>,
+        Conv2dBiasReluFewChannelsImpl<cutlass::gemm::GemmShape<128, 64, 32>,
+                                      cutlass::gemm::GemmShape<64, 32, 32>>};
+std::map<std::vector<int>, int> map_problem_conv2d_bias_relu_few_channels;
+
+void Conv2dBiasReluFewChannels(ConvAllParams params) {
+  int batch = params.batch;
+  int ic = params.ic;
+  int ih = params.ih;
+  int iw = params.iw;
+  int kh = params.kh;
+  int kw = params.kw;
+  int oc = params.oc;
+  int pad_h0 = params.pad_h0;
+  int pad_w0 = params.pad_w1;
+  int stride_h = params.stride_h;
+  int stride_w = params.stride_w;
+
+  std::vector<int> problem_size = {
+      batch, ic, ih, iw, kh, kw, oc, pad_h0, pad_w0, stride_h, stride_w};
+
+  if (map_problem_conv2d_bias_relu_few_channels.count(problem_size)) {
+    conv2d_bias_relu_few_channels_all_func
+        [map_problem_conv2d_bias_relu_few_channels.at(problem_size)](params);
+    return;
+  }
+  //
+}
+}  // namespace cutlass_internal
+}  // namespace fusion
+}  // namespace phi
diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_silu.cu b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_silu.cu
new file mode 100644
index 0000000000000000000000000000000000000000..469585ccf8398b9e49d12eb2accc1481eb9b84af
--- /dev/null
+++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_silu.cu
@@ -0,0 +1,226 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <mutex>
+#include "cutlass/conv/kernel/default_conv2d_fprop.h"
+#include "cutlass/epilogue/thread/linear_combination_silu.h"
+#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h"
+
+namespace phi {
+namespace fusion {
+namespace cutlass_internal {
+template <typename TShape, typename WShape, int Alignment = 8>
+cutlass::Status Conv2dBiasSiluImpl(ConvAllParams params) {
+  using ElementAccumulator = float;
+  using ElementComputeEpilogue = float;
+  using ElementInputA = cutlass::half_t;
+  using ElementInputB = cutlass::half_t;
+  using ElementOutput = cutlass::half_t;
+  using LayoutInputA = cutlass::layout::TensorNHWC;
+  using LayoutInputB = cutlass::layout::TensorNHWC;
+  using LayoutOutput = cutlass::layout::TensorNHWC;
+  using MMAOp = cutlass::arch::OpClassTensorOp;
+  using SmArch = cutlass::arch::Sm75;
+  using ThreadblockShape = TShape;
+  using WarpShape = WShape;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using SwizzleThreadBlock =
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>;
+  constexpr int NumStages = 2;
+  static cutlass::conv::IteratorAlgorithm const IteratorAlgorithm =
+      cutlass::conv::IteratorAlgorithm::kOptimized;
+  using EpilogueOp =
+      cutlass::epilogue::thread::LinearCombinationSilu<ElementOutput,
+                                                       Alignment,
+                                                       float,
+                                                       ElementComputeEpilogue>;
+
+  using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
+      ElementInputA,
+      LayoutInputA,
+      ElementInputB,
+      LayoutInputB,
+      ElementOutput,
+      LayoutOutput,
+      ElementAccumulator,
+      MMAOp,
+      SmArch,
+      ThreadblockShape,
+      WarpShape,
+      InstructionShape,
+      EpilogueOp,
+      SwizzleThreadBlock,
+      NumStages,
+      cutlass::arch::OpMultiplyAdd,
+      IteratorAlgorithm,
+      cutlass::conv::StrideSupport::kStrided,
+      Alignment,
+      Alignment>::Kernel;
+  using ImplicitGemm =
+      cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
+
+  const half *input = params.input;
+  const half *weight = params.weight;
+  const half *bias = params.bias;
+  half *output = params.output;
+  int batch = params.batch;
+  int ic = params.ic;
+  int ih = params.ih;
+  int iw = params.iw;
+  int kh = params.kh;
+  int kw = params.kw;
+  int oc = params.oc;
+  int pad_h0 = params.pad_h0;
+  int pad_w0 = params.pad_w0;
+  int stride_h = params.stride_h;
+  int stride_w = params.stride_w;
+
+  int oh = params.oh;
+  int ow = params.ow;
+  int dilation_h = params.dilation_h;
+  int dilation_w = params.dilation_w;
+
+  cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation;
+  cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic},
+                                                {oc, kh, kw, ic},
+                                                {pad_h0, 0, pad_w0, 0},
+                                                {stride_h, stride_w},
+                                                {dilation_h, dilation_w},
+                                                {batch, oh, ow, oc},
+                                                mode,
+                                                1);
+
+  typename ImplicitGemm::Arguments arguments{
+      problem_size,
+      {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}},
+      {(cutlass::half_t *)(weight), {ic, ic * kw, ic * kw * kh}},
+      {(cutlass::half_t *)(bias), {0, 0, 0}},
+      {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}},
+      {1.f, 1.f}};
+
+  ImplicitGemm implicit_gemm_op;
+  size_t bytes = implicit_gemm_op.get_workspace_size(arguments);
+
+  auto ctx = params.ctx;
+  auto stream = ctx->stream();
+  paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
+      paddle::memory::Alloc(
+          ctx->GetPlace(),
+          bytes,
+          phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
+  void *workspace = tmp_gpu_ptrs_data->ptr();
+
+  cutlass::Status status = implicit_gemm_op.can_implement(arguments);
+  CUTLASS_CHECK(status);
+  status = implicit_gemm_op.initialize(arguments, workspace);
+  CUTLASS_CHECK(status);
+  status = implicit_gemm_op(stream);
+  CUTLASS_CHECK(status);
+  return status;
+}
+
+// config 0
+template cutlass::Status
+    Conv2dBiasSiluImpl<cutlass::gemm::GemmShape<64, 64, 64>,
+                       cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams);
+// config 1
+template cutlass::Status
+    Conv2dBiasSiluImpl<cutlass::gemm::GemmShape<64, 32, 64>,
+                       cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams);
+// config 2
+template cutlass::Status
+    Conv2dBiasSiluImpl<cutlass::gemm::GemmShape<128, 32, 64>,
+                       cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams);
+// config 3
+template cutlass::Status
+    Conv2dBiasSiluImpl<cutlass::gemm::GemmShape<128, 64, 64>,
+                       cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams);
+// config 4
+template cutlass::Status
+    Conv2dBiasSiluImpl<cutlass::gemm::GemmShape<64, 64, 32>,
+                       cutlass::gemm::GemmShape<32, 32, 32>>(ConvAllParams);
+// config 5
+template cutlass::Status
+    Conv2dBiasSiluImpl<cutlass::gemm::GemmShape<64, 128, 32>,
+                       cutlass::gemm::GemmShape<32, 64, 32>>(ConvAllParams);
+// config 6
+template cutlass::Status
+    Conv2dBiasSiluImpl<cutlass::gemm::GemmShape<64, 128, 64>,
+                       cutlass::gemm::GemmShape<64, 64, 32>>(ConvAllParams);
+// config 7
+template cutlass::Status
+    Conv2dBiasSiluImpl<cutlass::gemm::GemmShape<64, 256, 32>,
+                       cutlass::gemm::GemmShape<64, 64, 32>>(ConvAllParams);
+// config 8
+template cutlass::Status
+    Conv2dBiasSiluImpl<cutlass::gemm::GemmShape<128, 64, 32>,
+                       cutlass::gemm::GemmShape<64, 32, 32>>(ConvAllParams);
+
+std::vector<std::function<cutlass::Status(ConvAllParams)>>
+    conv2d_bias_silu_all_func = {
+        Conv2dBiasSiluImpl<cutlass::gemm::GemmShape<64, 64, 64>,
+                           cutlass::gemm::GemmShape<32, 32, 64>>,
+        Conv2dBiasSiluImpl<cutlass::gemm::GemmShape<64, 32, 64>,
+                           cutlass::gemm::GemmShape<32, 32, 64>>,
+        Conv2dBiasSiluImpl<cutlass::gemm::GemmShape<128, 32, 64>,
+                           cutlass::gemm::GemmShape<32, 32, 64>>,
+        Conv2dBiasSiluImpl<cutlass::gemm::GemmShape<128, 64, 64>,
+                           cutlass::gemm::GemmShape<32, 32, 64>>,
+        Conv2dBiasSiluImpl<cutlass::gemm::GemmShape<64, 64, 32>,
+                           cutlass::gemm::GemmShape<32, 32, 32>>,
+        Conv2dBiasSiluImpl<cutlass::gemm::GemmShape<64, 128, 32>,
+                           cutlass::gemm::GemmShape<32, 64, 32>>,
+        Conv2dBiasSiluImpl<cutlass::gemm::GemmShape<64, 128, 64>,
+                           cutlass::gemm::GemmShape<64, 64, 32>>,
+        Conv2dBiasSiluImpl<cutlass::gemm::GemmShape<64, 256, 32>,
+                           cutlass::gemm::GemmShape<64, 64, 32>>,
+        Conv2dBiasSiluImpl<cutlass::gemm::GemmShape<128, 64, 32>,
+                           cutlass::gemm::GemmShape<64, 32, 32>>};
+
+std::map<std::vector<int>, int> map_problem_conv2d_bias_silu;
+std::mutex conv2d_bias_silu_mutex;
+
+void Conv2dBiasSilu(ConvAllParams params) {
+  int batch = params.batch;
+  int ic = params.ic;
+  int ih = params.ih;
+  int iw = params.iw;
+  int kh = params.kh;
+  int kw = params.kw;
+  int oc = params.oc;
+  int pad_h0 = params.pad_h0;
+  int pad_w0 = params.pad_w0;
+  int stride_h = params.stride_h;
+  int stride_w = params.stride_w;
+
+  std::vector<int> problem_size = {
+      batch, ic, ih, iw, kh, kw, oc, pad_h0, pad_w0, stride_h, stride_w};
+
+  if (map_problem_conv2d_bias_silu.count(problem_size)) {
+    conv2d_bias_silu_all_func[map_problem_conv2d_bias_silu.at(problem_size)](
+        params);
+    return;
+  }
+
+  int best_config_index = ProfileToGetBestConfig(
+      conv2d_bias_silu_all_func, params, CONV2D_BIAS_SILU);
+
+  std::lock_guard<std::mutex> guard(conv2d_bias_silu_mutex);
+
+  map_problem_conv2d_bias_silu[problem_size] = best_config_index;
+  conv2d_bias_silu_all_func[best_config_index](params);
+}
+}  // namespace cutlass_internal
+}  // namespace fusion
+}  // namespace phi
diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_decl.h b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_decl.h
new file mode 100644
index 0000000000000000000000000000000000000000..b740d49fc1dc3fb0e021e80f436e783c5a392aea
--- /dev/null
+++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_decl.h
@@ -0,0 +1,61 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <cuda_fp16.h>
+#include <glog/logging.h>
+#include <map>
+#include <vector>
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+
+namespace phi {
+namespace fusion {
+namespace cutlass_internal {
+
+typedef struct {
+  const half *input;
+  const half *weight;
+  const half *bias;
+  const half *residual;
+  half *output;
+  int batch;
+  int ic;
+  int ih;
+  int iw;
+  int kh;
+  int kw;
+  int oc;
+  int pad_h0;
+  int pad_h1;
+  int pad_w0;
+  int pad_w1;
+  int stride_h;
+  int stride_w;
+  int dilation_h;
+  int dilation_w;
+  int oh;
+  int ow;
+  const phi::GPUContext *ctx;
+  float alpha;  // for leaky_relu use
+} ConvAllParams;
+
+// Below functions are provided by cutlass, they are called by phi.
+void Conv2dBiasAddRelu(ConvAllParams params);
+void Conv2dBiasRelu(ConvAllParams params);
+void Conv2dBiasLeakyRelu(ConvAllParams params);
+void Conv2dBiasSilu(ConvAllParams params);
+void Conv2dBias(ConvAllParams params);
+}  // namespace cutlass_internal
+}  // namespace fusion
+}  // namespace phi
diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.cu b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.cu
new file mode 100644
index 0000000000000000000000000000000000000000..174cb4aaa405956811c8b3203a1c08f97376be97
--- /dev/null
+++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.cu
@@ -0,0 +1,277 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h"
+
+namespace phi {
+namespace fusion {
+namespace cutlass_internal {
+struct logical_coord {
+  int n;
+  int c;
+  int h;
+  int w;
+};
+
+float diff(const half *c, const float *c_baseline, int n) {
+  float max_diff = -1.;
+  for (int i = 0; i < n; i++) {
+    float c_value = __half2float(c[i]);
+    if (std::abs(c_baseline[i] - c_value) > max_diff) {
+      max_diff = std::abs(c_baseline[i] - c_value);
+    }
+  }
+  return max_diff;
+}
+
+__device__ int gpu_nhwc(struct logical_coord shape,
+                        struct logical_coord index) {
+  return index.n * shape.h * shape.w * shape.c + index.h * shape.w * shape.c +
+         index.w * shape.c + index.c;
+}
+
+__global__ void naive_conv2d_kernel(const half *input,
+                                    const half *weight,
+                                    const half *bias,
+                                    float *output,
+                                    int batch,
+                                    int ic,
+                                    int ih,
+                                    int iw,
+                                    int kh,
+                                    int kw,
+                                    int oc,
+                                    int pad_h,
+                                    int pad_w,
+                                    int stride_h,
+                                    int stride_w,
+                                    int dilation_h,
+                                    int dilation_w,
+                                    int oh,
+                                    int ow,
+                                    const half *residual,
+                                    float alpha,  // for leaky_relu
+                                    OpType op_type) {
+  int M = batch * oh * ow;
+  int N = oc;
+  int K = ic * kh * kw;
+  int m_i = threadIdx.x + blockIdx.x * blockDim.x;
+  int n_i = threadIdx.y + blockIdx.y * blockDim.y;
+  if (m_i >= M || n_i >= N) return;
+
+  int batch_i = m_i / (oh * ow);
+  int oh_i = (m_i % (oh * ow)) / ow;
+  int ow_i = (m_i % (oh * ow)) % ow;
+  int oc_i = n_i;
+
+  struct logical_coord weight_shape = {oc, ic, kh, kw};
+  struct logical_coord input_shape = {batch, ic, ih, iw};
+  int out_offset = m_i * N + n_i;
+  float *out_ptr = output + out_offset;
+  float sum = 0.f;
+
+  for (int k_i = 0; k_i < K; k_i++) {
+    int ic_i = k_i / (kh * kw);
+    int kh_i = (k_i % (kh * kw)) / kw;
+    int kw_i = (k_i % (kh * kw)) % kw;
+
+    struct logical_coord weight_index = {oc_i, ic_i, kh_i, kw_i};
+
+    int ih_i = oh_i * stride_h - pad_h + kh_i * dilation_h;
+    int iw_i = ow_i * stride_w - pad_w + kw_i * dilation_w;
+
+    if (ih_i < 0 || ih_i >= ih) continue;
+    if (iw_i < 0 || iw_i >= iw) continue;
+
+    struct logical_coord input_index = {batch_i, ic_i, ih_i, iw_i};
+    const half *weight_ptr = weight + gpu_nhwc(weight_shape, weight_index);
+    const half *in_ptr = input + gpu_nhwc(input_shape, input_index);
+    sum += __half2float(*in_ptr) * __half2float(*weight_ptr);
+  }
+
+  sum += __half2float(*(bias + oc_i));
+  float x = sum;
+
+  switch (op_type) {
+    case CONV2D_BIAS:
+      *out_ptr = x;
+      break;
+    case CONV2D_BIAS_RELU:
+      *out_ptr = x > 0 ? x : 0;
+      break;
+    case CONV2D_BIAS_SILU:
+      *out_ptr = x * (1.f / (1 + exp(-x)));
+      break;
+    case CONV2D_BIAS_ADD_RELU:
+      x += __half2float(*(residual + out_offset));
+      *out_ptr = x > 0 ? x : 0;
+      break;
+    case CONV2D_BIAS_LEAKY_RELU:
+      *out_ptr = x > 0 ? x : (x * alpha);
+      break;
+    default:
+      break;
+  }
+}
+
+float conv2d_diff_gpu(ConvAllParams params, OpType op_type) {
+  const half *input = params.input;
+  const half *weight = params.weight;
+  const half *bias = params.bias;
+  half *output = params.output;
+  int batch = params.batch;
+  int ic = params.ic;
+  int ih = params.ih;
+  int iw = params.iw;
+  int kh = params.kh;
+  int kw = params.kw;
+  int oc = params.oc;
+  int pad_h = params.pad_h0;
+  int pad_w = params.pad_w0;
+  int stride_h = params.stride_h;
+  int stride_w = params.stride_w;
+  int dilation_h = params.dilation_h;
+  int dilation_w = params.dilation_w;
+  const half *residual = params.residual;
+
+  int oh = params.oh;
+  int ow = params.ow;
+  int M = batch * oh * ow;
+  int N = oc;
+
+  constexpr int blockM = 16;
+  constexpr int blockN = 16;
+  uint3 grid = {(M + blockM - 1) / blockM, (N + blockN - 1) / blockN, 1};
+  uint3 block = {blockM, blockN, 1};
+
+  int output_size = batch * oc * oh * ow;
+  half *output_from_cutlass =
+      reinterpret_cast<half *>(malloc(sizeof(half) * output_size));
+  cudaMemcpy(output_from_cutlass,
+             output,
+             output_size * sizeof(half),
+             cudaMemcpyDeviceToHost);
+
+  float *gpu_output;
+  cudaMalloc(&gpu_output, output_size * sizeof(float));
+  naive_conv2d_kernel<<<grid, block>>>(input,
+                                       weight,
+                                       bias,
+                                       gpu_output,
+                                       batch,
+                                       ic,
+                                       ih,
+                                       iw,
+                                       kh,
+                                       kw,
+                                       oc,
+                                       pad_h,
+                                       pad_w,
+                                       stride_h,
+                                       stride_w,
+                                       dilation_h,
+                                       dilation_w,
+                                       oh,
+                                       ow,
+                                       residual,
+                                       params.alpha,
+                                       op_type);
+  float *output_from_gpu =
+      reinterpret_cast<float *>(malloc(sizeof(float) * output_size));
+  cudaMemcpy(output_from_gpu,
+             gpu_output,
+             output_size * sizeof(float),
+             cudaMemcpyDeviceToHost);
+  float max_diff = diff(output_from_cutlass, output_from_gpu, output_size);
+
+  free(output_from_cutlass);
+  free(output_from_gpu);
+  cudaFree(gpu_output);
+  return max_diff;
+}
+
+std::string OpType2String(OpType op_type) {
+  switch (op_type) {
+    case CONV2D_BIAS:
+      return "conv2d_bias";
+      break;
+    case CONV2D_BIAS_RELU:
+      return "conv2d_bias_relu";
+      break;
+    case CONV2D_BIAS_SILU:
+      return "conv2d_bias_add_silu";
+      break;
+    case CONV2D_BIAS_ADD_RELU:
+      return "conv2d_bias_add_relu";
+      break;
+    case CONV2D_BIAS_LEAKY_RELU:
+      return "conv2d_bias_leaky_relu";
+    default:
+      break;
+  }
+  return "unnamed_op";
+}
+
+int ProfileToGetBestConfig(
+    const std::vector<std::function<cutlass::Status(ConvAllParams)>> &all_func,
+    ConvAllParams params,
+    OpType op_type) {
+  constexpr int WARMUP = 10;
+  constexpr int REPEAT = 100;
+  float min_time = 100000.f;
+  int min_time_index = -1;
+  for (int i = 0; i < all_func.size(); i++) {
+    cutlass::Status status;
+    auto func = all_func[i];
+    // When func has large diff, we will make it nullptr.
+    if (!func) continue;
+
+    for (int ii = 0; ii < WARMUP; ii++) {
+      status = func(params);
+    }
+
+    cudaEvent_t beg, end;
+    PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreate(&beg));
+    PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreate(&end));
+    PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(beg));
+    for (int ii = 0; ii < REPEAT; ii++) {
+      status = func(params);
+    }
+
+    PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(end));
+    PADDLE_ENFORCE_GPU_SUCCESS(cudaEventSynchronize(end));
+    float elapsed_time;
+    PADDLE_ENFORCE_GPU_SUCCESS(cudaEventElapsedTime(&elapsed_time, beg, end));
+    if (elapsed_time < min_time && status == cutlass::Status::kSuccess) {
+      min_time = elapsed_time;
+      min_time_index = i;
+    }
+    // debug code
+    VLOG(3) << OpType2String(op_type) << ": tactic " << i << " has max diff "
+            << conv2d_diff_gpu(params, op_type) << " compared with baseline.";
+  }
+
+  if (min_time_index < 0) {
+    PADDLE_THROW(
+        phi::errors::NotFound("Can't find any cutlass config for this %s op.",
+                              OpType2String(op_type).c_str()));
+  }
+  return min_time_index;
+}
+
+}  // namespace cutlass_internal
+}  // namespace fusion
+}  // namespace phi
diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..a5d0f83651488ee718de1e07ba2ae96b998c6c52
--- /dev/null
+++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h
@@ -0,0 +1,59 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <cuda_fp16.h>
+#include <vector>
+#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_decl.h"
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/device/gemm.h"
+
+#include "cutlass/conv/device/implicit_gemm_convolution.h"
+
+#include "paddle/fluid/memory/allocation/allocator.h"
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/enforce.h"
+
+namespace phi {
+namespace fusion {
+namespace cutlass_internal {
+#define CUTLASS_CHECK(status)                                                \
+  if (status != cutlass::Status::kSuccess) {                                 \
+    VLOG(3)                                                                  \
+        << "Cutlass can not deal with this problem size, skip this kernel!"; \
+    return status;                                                           \
+  }
+
+typedef enum {
+  CONV2D_BIAS,
+  CONV2D_BIAS_RELU,
+  CONV2D_BIAS_ADD_RELU,
+  CONV2D_BIAS_SILU,
+  CONV2D_BIAS_LEAKY_RELU
+} OpType;
+
+// conv2d_diff_gpu calculate diff of cutlass output and baseline output, you can
+// use them to debug. return value is the max diff between cutlass and baseline.
+float conv2d_diff_gpu(ConvAllParams params, OpType op_type);
+
+int ProfileToGetBestConfig(
+    const std::vector<std::function<cutlass::Status(ConvAllParams)>>& all_func,
+    ConvAllParams params,
+    OpType op_type);
+
+}  // namespace cutlass_internal
+}  // namespace fusion
+}  // namespace phi
diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d_fusion.cu b/paddle/phi/kernels/fusion/cutlass/conv2d_fusion.cu
new file mode 100644
index 0000000000000000000000000000000000000000..93c5581ce9db6f3b2d50c8b1872b07cc864124ba
--- /dev/null
+++ b/paddle/phi/kernels/fusion/cutlass/conv2d_fusion.cu
@@ -0,0 +1,141 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_decl.h"
+
+namespace phi {
+namespace fusion {
+namespace cutlass_internal {
+template <typename T, typename Context>
+void Conv2dFusionKernel(const Context& ctx,
+                        const DenseTensor& x,
+                        const DenseTensor& filter,
+                        const DenseTensor& bias,
+                        const paddle::optional<DenseTensor>& residual,
+                        const std::vector<int>& strides,
+                        const std::vector<int>& paddings,
+                        const std::string& padding_algorithm,
+                        int groups,
+                        const std::vector<int>& dilations,
+                        const std::string& data_format,
+                        const std::string& activation,
+                        float fuse_alpha,
+                        DenseTensor* output) {
+  ctx.template Alloc<T>(output);
+  auto in_dims = x.dims();
+  auto filter_dims = filter.dims();
+  auto out_dims = output->dims();
+  CHECK_EQ(in_dims.size() == 4UL, true);
+  CHECK_EQ(filter_dims.size() == 4UL, true);
+  CHECK_EQ(strides.size() == 2UL, true);
+  CHECK_EQ(dilations.size() == 2UL, true);
+  CHECK_EQ(groups == 1, true);
+  CHECK_EQ(padding_algorithm == "EXPLICIT", true);
+  const int batch = in_dims[0];
+  const int ic = in_dims[3];
+  const int ih = in_dims[1];
+  const int iw = in_dims[2];
+  int pad_h0 = 0;
+  int pad_h1 = 0;
+  int pad_w0 = 0;
+  int pad_w1 = 0;
+  if (paddings.size() == 2UL) {
+    pad_h0 = paddings[0];
+    pad_h1 = paddings[0];
+    pad_w0 = paddings[1];
+    pad_w1 = paddings[1];
+  } else if (paddings.size() == 4UL) {
+    pad_h0 = paddings[0];
+    pad_h1 = paddings[1];
+    pad_w0 = paddings[2];
+    pad_w1 = paddings[3];
+  } else {
+    PADDLE_THROW(paddle::platform::errors::InvalidArgument(
+        "Attr paddins in conv2d_fusion must have 2 or 4 elements, but now have "
+        "%u elements.",
+        paddings.size()));
+  }
+
+  const int stride_h = strides[0];
+  const int stride_w = strides[1];
+  const int dilation_h = dilations[0];
+  const int dilation_w = dilations[1];
+  const int oc = filter_dims[0];
+  const int kh = filter_dims[1];
+  const int kw = filter_dims[2];
+
+  CHECK_EQ(out_dims.size() == 4UL, true);
+  const int oh = out_dims[1];
+  const int ow = out_dims[2];
+
+  ConvAllParams params = {reinterpret_cast<const half*>(x.data<T>()),
+                          reinterpret_cast<const half*>(filter.data<T>()),
+                          reinterpret_cast<const half*>(bias.data<T>()),
+                          nullptr,
+                          reinterpret_cast<half*>(output->data<T>()),
+                          batch,
+                          ic,
+                          ih,
+                          iw,
+                          kh,
+                          kw,
+                          oc,
+                          pad_h0,
+                          pad_h1,
+                          pad_w0,
+                          pad_w1,
+                          stride_h,
+                          stride_w,
+                          dilation_h,
+                          dilation_w,
+                          oh,
+                          ow,
+                          &ctx};
+
+  if (residual) {
+    if (activation == "relu") {
+      params.residual = reinterpret_cast<const half*>(residual->data<T>());
+      Conv2dBiasAddRelu(params);
+    } else {
+      PADDLE_THROW(paddle::platform::errors::InvalidArgument(
+          "Cutlass now only support relu activation in a residual block"));
+    }
+  } else if (activation == "relu") {
+    Conv2dBiasRelu(params);
+  } else if (activation == "swish") {
+    Conv2dBiasSilu(params);
+  } else if (activation == "identity") {
+    Conv2dBias(params);
+  } else if (activation == "leaky_relu") {
+    params.alpha = fuse_alpha;
+    Conv2dBiasLeakyRelu(params);
+  } else {
+    PADDLE_THROW(phi::errors::InvalidArgument(
+        "Cutlass does not support this activation: %s.", activation.c_str()));
+  }
+  output->set_layout(DataLayout::NHWC);
+}
+}  // namespace cutlass_internal
+}  // namespace fusion
+}  // namespace phi
+
+PD_REGISTER_KERNEL(conv2d_fusion_cutlass,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::fusion::cutlass_internal::Conv2dFusionKernel,
+                   float,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/gpu/expand_grad_kernel.cu b/paddle/phi/kernels/gpu/expand_grad_kernel.cu
index e8729942b6e6c9458c072f4baccea272b5512978..35a6681b7afa3b133095b20e4ddc603847e583cd 100644
--- a/paddle/phi/kernels/gpu/expand_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/expand_grad_kernel.cu
@@ -17,7 +17,28 @@
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/impl/expand_grad_kernel_impl.h"
+#include "paddle/phi/kernels/funcs/reduce_function.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ExpandGradKernel(const Context& ctx,
+                      const DenseTensor& x,
+                      const DenseTensor& out_grad,
+                      const IntArray& shape,
+                      DenseTensor* x_grad) {
+  ctx.template Alloc<T>(x_grad);
+  if (x_grad->dims() == out_grad.dims()) {
+    phi::Copy(ctx, out_grad, ctx.GetPlace(), false, x_grad);
+  } else {
+    std::vector<int> reduce_dims =
+        funcs::GetReduceDim(x_grad->dims(), out_grad.dims(), -1);
+    funcs::ReduceKernel<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+        ctx, out_grad, x_grad, kps::IdentityFunctor<T>(), reduce_dims);
+  }
+}
+
+}  // namespace phi
 
 PD_REGISTER_KERNEL(expand_grad,
                    GPU,
@@ -26,5 +47,6 @@ PD_REGISTER_KERNEL(expand_grad,
                    float,
                    double,
                    phi::dtype::float16,
+                   phi::dtype::bfloat16,
                    int,
                    int64_t) {}
diff --git a/paddle/phi/kernels/gpu/expand_kernel.cu b/paddle/phi/kernels/gpu/expand_kernel.cu
index 27c4e82c6354440e892899f02e8e05171a504e1a..b2f973b0a8896a0eb24679bf7ee989446a7d25a7 100644
--- a/paddle/phi/kernels/gpu/expand_kernel.cu
+++ b/paddle/phi/kernels/gpu/expand_kernel.cu
@@ -18,7 +18,66 @@
 #include "paddle/phi/common/scalar.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/impl/expand_kernel_impl.h"
+#include "paddle/phi/kernels/funcs/broadcast_function.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ExpandKernel(const Context& ctx,
+                  const DenseTensor& x,
+                  const IntArray& shape,
+                  DenseTensor* out) {
+  auto expand_shape = shape.GetData();
+  auto diff = expand_shape.size() - x.dims().size();
+  auto out_shape = phi::vectorize<int64_t>(x.dims());
+  out_shape.insert(out_shape.begin(), diff, 1);
+  for (size_t i = 0; i < out_shape.size(); ++i) {
+    PADDLE_ENFORCE_NE(
+        expand_shape[i],
+        0,
+        phi::errors::InvalidArgument("The expanded size cannot be zero."));
+    if (i < diff) {
+      PADDLE_ENFORCE_GT(
+          expand_shape[i],
+          0,
+          phi::errors::InvalidArgument(
+              "The expanded size (%d) for non-existing dimensions must be "
+              "positive for expand kernel.",
+              expand_shape[i]));
+      out_shape[i] = expand_shape[i];
+    } else if (expand_shape[i] > 0) {
+      if (out_shape[i] != 1) {
+        PADDLE_ENFORCE_EQ(
+            out_shape[i],
+            expand_shape[i],
+            phi::errors::InvalidArgument(
+                "The value (%d) of the non-singleton dimension does not match"
+                " the corresponding value (%d) in shape for expand kernel.",
+                out_shape[i],
+                expand_shape[i]));
+      } else {
+        out_shape[i] = expand_shape[i];
+      }
+    } else {
+      PADDLE_ENFORCE_EQ(
+          expand_shape[i],
+          -1,
+          phi::errors::InvalidArgument(
+              "When the value in shape is negative for expand_v2 op, "
+              "only -1 is supported, but the value received is %d.",
+              expand_shape[i]));
+    }
+  }
+
+  out->Resize(phi::make_ddim(out_shape));
+  ctx.template Alloc<T>(out);
+  std::vector<const DenseTensor*> ins = {&x};
+  std::vector<DenseTensor*> outs = {out};
+  phi::funcs::BroadcastKernel<ElementwiseType::kUnary, T, T>(
+      ctx, ins, &outs, -1, kps::IdentityFunctor<T>());
+}
+
+}  // namespace phi
 
 PD_REGISTER_KERNEL(expand,
                    GPU,
@@ -27,6 +86,7 @@ PD_REGISTER_KERNEL(expand,
                    float,
                    double,
                    phi::dtype::float16,
+                   phi::dtype::bfloat16,
                    int,
                    int64_t,
                    bool) {}
diff --git a/paddle/phi/kernels/gpu/flip_kernel.cu b/paddle/phi/kernels/gpu/flip_kernel.cu
index 6e9dbf37a910015d9dfd7f701222d262f0a825b0..7945d6c8fcbafc9b70d91f5d3c34e14f1ca03fc6 100644
--- a/paddle/phi/kernels/gpu/flip_kernel.cu
+++ b/paddle/phi/kernels/gpu/flip_kernel.cu
@@ -101,6 +101,9 @@ void FlipKernel(const Context& dev_ctx,
                 DenseTensor* out) {
   const size_t total_dims = x.dims().size();
   switch (total_dims) {
+    case 0:
+      LaunchFlipCudaKernel<T, Context, 0>(dev_ctx, x, axis, out);
+      break;
     case 1:
       LaunchFlipCudaKernel<T, Context, 1>(dev_ctx, x, axis, out);
       break;
diff --git a/paddle/phi/ops/compat/conv2d_sig.cc b/paddle/phi/ops/compat/conv2d_sig.cc
index 22ff9b3e1a8347104bb4f7fdd4ec322c4b3dd6a6..6963d6a06d8203388a3de0fa3bbcd40cdc6a90bf 100644
--- a/paddle/phi/ops/compat/conv2d_sig.cc
+++ b/paddle/phi/ops/compat/conv2d_sig.cc
@@ -53,9 +53,24 @@ KernelSignature Conv2dDoubleGradOpArgumentMapping(
                          {"DInput", "DFilter", "DDOutput"});
 }
 
+KernelSignature Conv2dFusionArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("conv2d_fusion_cutlass",
+                         {"Input", "Filter", "Bias", "ResidualData"},
+                         {"strides",
+                          "paddings",
+                          "padding_algorithm",
+                          "groups",
+                          "dilations",
+                          "data_format",
+                          "activation",
+                          "fuse_alpha"},
+                         {"Output"});
+}
 }  // namespace phi
 
 PD_REGISTER_ARG_MAPPING_FN(conv2d, phi::Conv2dOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(conv2d_fusion_cutlass,
+                           phi::Conv2dFusionArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(conv2d_grad, phi::Conv2dGradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(conv2d_grad_grad,
                            phi::Conv2dDoubleGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/squeeze_sig.cc b/paddle/phi/ops/compat/squeeze_sig.cc
deleted file mode 100644
index 4ca45903acfa00386c9cbfed191ddb9b50443230..0000000000000000000000000000000000000000
--- a/paddle/phi/ops/compat/squeeze_sig.cc
+++ /dev/null
@@ -1,35 +0,0 @@
-
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/phi/core/compat/op_utils.h"
-
-namespace phi {
-
-KernelSignature SqueezeOpArgumentMapping(const ArgumentMappingContext& ctx) {
-  return KernelSignature(
-      "squeeze_with_xshape", {"X"}, {"axes"}, {"Out", "XShape"});
-}
-
-KernelSignature SqueezeGradOpArgumentMapping(
-    const ArgumentMappingContext& ctx) {
-  return KernelSignature(
-      "squeeze_grad", {"XShape", "Out@GRAD"}, {"axes"}, {"X@GRAD"});
-}
-
-}  // namespace phi
-PD_REGISTER_BASE_KERNEL_NAME(squeeze2, squeeze);
-PD_REGISTER_BASE_KERNEL_NAME(squeeze2_grad, squeeze_grad);
-PD_REGISTER_ARG_MAPPING_FN(squeeze2, phi::SqueezeOpArgumentMapping);
-PD_REGISTER_ARG_MAPPING_FN(squeeze2_grad, phi::SqueezeGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/unsqueeze_sig.cc b/paddle/phi/ops/compat/unsqueeze_sig.cc
deleted file mode 100644
index 568097298b7acc86584b2de962e9ea06d73a26f5..0000000000000000000000000000000000000000
--- a/paddle/phi/ops/compat/unsqueeze_sig.cc
+++ /dev/null
@@ -1,47 +0,0 @@
-
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/phi/core/compat/op_utils.h"
-
-namespace phi {
-
-KernelSignature UnsqueezeOpArgumentMapping(const ArgumentMappingContext& ctx) {
-  if (ctx.InputSize("AxesTensorList") > 0) {
-    VLOG(2) << "unsqueeze2 in AxesTensorList";
-    return KernelSignature(
-        "unsqueeze_with_xshape", {"X"}, {"AxesTensorList"}, {"Out", "XShape"});
-  } else if (ctx.InputSize("AxesTensor") > 0) {
-    VLOG(2) << "unsqueeze2 in AxesTensor";
-    return KernelSignature(
-        "unsqueeze_with_xshape", {"X"}, {"AxesTensor"}, {"Out", "XShape"});
-  } else {
-    VLOG(2) << "unsqueeze2 in axes";
-    return KernelSignature(
-        "unsqueeze_with_xshape", {"X"}, {"axes"}, {"Out", "XShape"});
-  }
-}
-
-KernelSignature UnsqueezeGradOpArgumentMapping(
-    const ArgumentMappingContext& ctx) {
-  return KernelSignature(
-      "unsqueeze_grad", {"XShape", "Out@GRAD"}, {}, {"X@GRAD"});
-}
-}  // namespace phi
-PD_REGISTER_BASE_KERNEL_NAME(unsqueeze2, unsqueeze);
-PD_REGISTER_BASE_KERNEL_NAME(unsqueeze2_grad, unsqueeze_grad);
-
-PD_REGISTER_ARG_MAPPING_FN(unsqueeze2, phi::UnsqueezeOpArgumentMapping);
-PD_REGISTER_ARG_MAPPING_FN(unsqueeze2_grad,
-                           phi::UnsqueezeGradOpArgumentMapping);
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 8afa300b57ac5807bee4bfb4527ebc339b2f78a8..a234e4906ff83fa3e16a4e804092717ffafb6b91 100644
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -1843,6 +1843,11 @@ function precise_card_test_single {
     for case in $(echo $testcases | tr "$|^" "\n" | awk '!/^$/')
     do
         cd ${PADDLE_ROOT}/build
+        
+        find paddle/fluid -name *.gcda | xargs rm -f 
+        find paddle/phi -name *.gcda | xargs rm -f 
+        find paddle/utils -name *.gcda | xargs rm -f 
+
         precise_card_test "^${case}$" $num
 
         #if test failed,continue,if test succeed ,go on 
@@ -1876,9 +1881,6 @@ function precise_card_test_single {
             fi
             mv python-coverage.data.* ${PADDLE_ROOT}/build/pytest/$case
         fi
-        find paddle/fluid -name *.gcda | xargs rm -f 
-        find paddle/phi -name *.gcda | xargs rm -f 
-        find paddle/utils -name *.gcda | xargs rm -f 
     done
 }
 
@@ -1988,6 +1990,10 @@ set +x
             fi
             read testcase <<< $(echo "$line"|grep -oEi "\w+$")
 
+            if [[ "$testcase" == "simple_precision_test" ]]; then
+                continue
+            fi
+
             if [[ "$is_multicard" == "" ]]; then
                 # trick: treat all test case with prefix "test_dist" as dist case, and would run on 2 GPUs
                 read is_multicard <<< $(echo "$testcase"|grep -oEi "test_dist_")
@@ -2032,6 +2038,8 @@ set -x
     mkdir -p ${PADDLE_ROOT}/build/ut_map
     mkdir -p ${PADDLE_ROOT}/build/pytest
     #run all unittest to get the coverage information of .c and .h files
+    precise_card_test_single "^simple_precision_test$" 1
+    wait;
     precise_card_test_single "$single_card_tests" 1
     precise_card_test_single "$single_card_tests_1" 1
     precise_card_test_single "$multiple_card_tests" 2
diff --git a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py
index 3f41ebaa96d07aa10de6e38bf2c80791f2c4b24d..5c97fe90a2e1771dd8edb47d54df54d3eaa51e99 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py
@@ -20,11 +20,11 @@ __all__ = []
 
 import paddle
 from paddle.common_ops_import import LayerHelper
-from paddle.fluid.clip import GradientClipByNorm, append_gradient_clip_ops
 from paddle.fluid.dygraph import base as imperative_base
 from paddle.fluid.framework import in_dygraph_mode
 from paddle.fluid.optimizer import Momentum, Optimizer
 from paddle.framework import core
+from paddle.nn.clip import ClipGradByNorm, append_gradient_clip_ops
 from paddle.static import create_global_var
 
 
@@ -76,9 +76,9 @@ class DGCMomentumOptimizer(Optimizer):
 
         self._dgc_clip_norm = None
         if grad_clip is not None:
-            if not isinstance(grad_clip, GradientClipByNorm):
+            if not isinstance(grad_clip, ClipGradByNorm):
                 raise TypeError(
-                    "The type of grad_clip should be 'GradientClipByNorm', because DGCMomentumOptimizer only support GradientClipByNorm"
+                    "The type of grad_clip should be 'ClipGradByNorm', because DGCMomentumOptimizer only support ClipGradByNorm"
                 )
             assert isinstance(num_trainers, int), (
                 "The type of num_trainers should be 'int', but received %s"
diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py
index 75f0061b2ca20be4c7f4f7dc10bf3c48a8374366..9eca2e667a8fd8c81aa3a4b1083ada9204cbecb6 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py
@@ -15,9 +15,8 @@
 import paddle
 from paddle import framework
 from paddle.autograd import no_grad
-from paddle.fluid import layers
-from paddle.fluid.clip import ClipGradByGlobalNorm
 from paddle.framework import core
+from paddle.nn import ClipGradByGlobalNorm, clip
 
 from ...base.topology import ParallelMode
 from ...utils.hybrid_parallel_util import (
@@ -62,8 +61,8 @@ class HybridParallelClipGrad:
                 continue
             merge_grad = g
             if g.type == core.VarDesc.VarType.SELECTED_ROWS:
-                merge_grad = layers.merge_selected_rows(g)
-                merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
+                merge_grad = clip.merge_selected_rows(g)
+                merge_grad = clip.get_tensor_from_selected_rows(merge_grad)
             square = paddle.square(merge_grad)
             sum_square = paddle.sum(square)
 
diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py
index b1a572d4edfc30d9fdccc45b1b056ef7411cf44d..9a25d7c4912bacc49c727c09958c1daaaf5c7c0c 100644
--- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py
+++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py
@@ -30,7 +30,7 @@ import paddle
 import paddle.distributed as dist
 from paddle.distributed import ParallelMode, fleet
 from paddle.fluid import core
-from paddle.fluid.clip import ClipGradByGlobalNorm
+from paddle.nn import ClipGradByGlobalNorm
 from paddle.optimizer import Optimizer
 
 HybridParallelClipGrad = (
diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py
index 3d3debb252d400ddf3962f064682cf1b829af131..d99683d481450309d95d13dfb26b0bc3471ea5e3 100644
--- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py
+++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py
@@ -25,8 +25,8 @@ import paddle.fluid.framework as framework
 from paddle import nn
 from paddle.autograd import PyLayer
 from paddle.distributed import collective
-from paddle.fluid.clip import ClipGradByGlobalNorm
 from paddle.fluid.framework import EagerParamBase
+from paddle.nn import ClipGradByGlobalNorm
 
 from .group_sharded_storage import GradStorage
 from .group_sharded_utils import GroupShardedClipGrad, Type, device_guard
diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
index 620540fea58761f8930b33bd8d65f6bafc7ff369..f8c86e02b7b52490dde4ad3c69068b9709c39250 100644
--- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
+++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
@@ -23,6 +23,7 @@ from paddle import _legacy_C_ops
 from paddle.fluid import core, layers
 from paddle.fluid.dygraph import to_variable
 from paddle.fluid.framework import dygraph_only
+from paddle.nn import clip
 
 
 class Taskflow:
@@ -65,8 +66,8 @@ class GroupShardedClipGrad:
 
             merge_grad = g
             if g.type == core.VarDesc.VarType.SELECTED_ROWS:
-                merge_grad = layers.get_tensor_from_selected_rows(
-                    layers.merge_selected_rows(g)
+                merge_grad = clip.get_tensor_from_selected_rows(
+                    clip.merge_selected_rows(g)
                 )
             square = paddle.square(merge_grad)
             sum_square = paddle.sum(square)
diff --git a/python/paddle/distributed/fleet/metrics/metric.py b/python/paddle/distributed/fleet/metrics/metric.py
index 8c15e47307381d862b15518cf860e34d4f9c4280..39284fa9f5a3f151747547b42409385d470571cd 100644
--- a/python/paddle/distributed/fleet/metrics/metric.py
+++ b/python/paddle/distributed/fleet/metrics/metric.py
@@ -159,7 +159,7 @@ def auc(stat_pos, stat_neg, scope=None, util=None):
         .. code-block:: python
 
           # in model.py
-          similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(output, min=-15.0, max=15.0))
+          similarity_norm = fluid.layers.sigmoid(paddle.clip(output, min=-15.0, max=15.0))
           binary_predict = fluid.layers.concat(
               input=[paddle.subtract(fluid.layers.ceil(similarity_norm), similarity_norm), similarity_norm], axis=1)
           self.auc, batch_auc, [batch_stat_pos, batch_stat_neg, stat_pos, stat_neg] =
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 53d50a8b4a3ed378aa203f9458a4dc440e080716..eaf64e6dc6c0bd3c4bd0f3642f32953e52a81ea3 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -90,7 +90,6 @@ from .transpiler import (
     DistributeTranspilerConfig,
 )
 from .lod_tensor import create_lod_tensor, create_random_int_lodtensor
-from . import clip
 from . import profiler
 from . import unique_name
 from . import parallel_executor
@@ -99,7 +98,6 @@ from . import compiler
 from .compiler import *
 from paddle.fluid.layers.math_op_patch import monkey_patch_variable
 from . import install_check
-from .dygraph.nn import *
 from .dygraph.layers import *
 from .dygraph.base import enable_dygraph, disable_dygraph
 from .io import save, load, load_program_state, set_program_state
@@ -165,7 +163,6 @@ __all__ = (
         'ParamAttr',
         'WeightNormParamAttr',
         'DataFeeder',
-        'clip',
         'profiler',
         'unique_name',
         'Scope',
diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py
deleted file mode 100644
index ffaa84ed3e53c5aadbb6dc3e8d51a48bc00a9fb6..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/clip.py
+++ /dev/null
@@ -1,944 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import copy
-import warnings
-
-import functools
-import paddle
-from . import layers
-from . import framework
-from . import core
-from . import name_scope
-from .dygraph import base as imperative_base
-from .data_feeder import check_variable_and_dtype
-from .framework import in_dygraph_mode
-from .layer_helper import LayerHelper
-from .framework import default_main_program
-from paddle import _C_ops, _legacy_C_ops
-
-__all__ = [
-    'set_gradient_clip',
-    'ErrorClipByValue',
-    'ClipGradByValue',
-    'ClipGradByNorm',
-    'ClipGradByGlobalNorm',
-]
-
-_clip_by_global_norm_using_mp_type_flag = False
-
-
-def _clip_by_global_norm_using_mp_type(*args):
-    global _clip_by_global_norm_using_mp_type_flag
-    assert len(args) <= 1
-    if len(args) == 1:
-        assert isinstance(args[0], bool)
-        old_value = _clip_by_global_norm_using_mp_type_flag
-        _clip_by_global_norm_using_mp_type_flag = args[0]
-        return old_value
-    else:
-        return _clip_by_global_norm_using_mp_type_flag
-
-
-def _cast_to_mp_type_if_enabled(x):
-    if (
-        x.dtype == core.VarDesc.VarType.FP16
-        or x.dtype == core.VarDesc.VarType.BF16
-    ) and _clip_by_global_norm_using_mp_type():
-        return x.astype(core.VarDesc.VarType.FP32)
-    else:
-        return x
-
-
-def _squared_l2_norm(x):
-    r"""
-    This OP returns the squared L2 norm of a tensor.
-    """
-
-    x = _cast_to_mp_type_if_enabled(x)
-    if (
-        core.is_compiled_with_xpu()
-        or x.dtype == core.VarDesc.VarType.FP16
-        or x.dtype == core.VarDesc.VarType.BF16
-    ):
-        square = paddle.square(x)
-        sum_square = paddle.sum(square)
-        return sum_square
-
-    if in_dygraph_mode():
-        return _C_ops.squared_l2_norm(x)
-    else:
-        op_type = 'squared_l2_norm'
-        check_variable_and_dtype(x, 'x', ['float32', 'float64'], op_type)
-        helper = LayerHelper(op_type, **locals())
-        out = helper.create_variable_for_type_inference(x.dtype)
-
-        inputs = {"X": x}
-        outputs = {'Out': out}
-        helper.append_op(type=op_type, inputs=inputs, outputs=outputs)
-        return out
-
-
-class BaseErrorClipAttr:
-    def __str__(self):
-        raise NotImplementedError()
-
-    def _append_clip_op(self, block, grad_name):
-        raise NotImplementedError()
-
-
-class ErrorClipByValue(BaseErrorClipAttr):
-    r"""
-    Clips tensor values to the range [min, max].
-
-    Given a tensor ``t`` (see Examples below), this operation clips its value \
-    to ``min`` and ``max`` inplace.
-
-    - Any values less than min are set to min.
-    - Any values greater than max are set to max.
-
-    Args:
-        max (float): The maximum value to clip by.
-        min (float, optional): The minimum value to clip by. if not set by user, \
-        will be set to ``-max`` by framework.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import paddle
-            paddle.enable_static()
-            BATCH_SIZE = 128
-            CLIP_MAX = 2e-6
-            CLIP_MIN = -1e-6
-            prog = fluid.framework.Program()
-            with fluid.program_guard(main_program=prog):
-                image = fluid.layers.data(
-                    name='x', shape=[784], dtype='float32')
-                hidden1 = fluid.layers.fc(input=image, size=128, act='relu')
-                hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu')
-                predict = fluid.layers.fc(
-                    input=hidden2, size=10, act='softmax')
-                label = fluid.layers.data(name='y', shape=[1], dtype='int64')
-                cost = paddle.nn.functional.cross_entropy(input=predict, label=label, reduction='none', use_softmax=False)
-                avg_cost = paddle.mean(cost)
-            prog_clip = prog.clone()
-            prog_clip.block(0).var(hidden1.name)._set_error_clip(
-                fluid.clip.ErrorClipByValue(
-                    max=CLIP_MAX, min=CLIP_MIN
-                )
-            )
-    """
-
-    def __init__(self, max, min=None):
-        max = float(max)
-        if min is None:
-            min = -max
-        else:
-            min = float(min)
-        self.max = max
-        self.min = min
-
-    def __str__(self):
-        return "ByValue, min=%f, max=%f" % (self.min, self.max)
-
-    def _append_clip_op(self, block, grad_name):
-        clip_op_desc = block.desc.append_op()
-        clip_op_desc.set_type("clip")
-        clip_op_desc.set_input("X", [grad_name])
-        clip_op_desc.set_output("Out", [grad_name])
-        clip_op_desc._set_attr("min", self.min)
-        clip_op_desc._set_attr("max", self.max)
-
-
-def error_clip_callback(block, context):
-    # the context is a grad_to_var map
-    grad_to_var = context
-    op_desc = block.desc.op(block.desc.op_size() - 1)
-    for grad_n in [n for n in op_desc.output_arg_names() if n in grad_to_var]:
-        fwd_var = block._var_recursive(grad_to_var[grad_n])
-        error_clip = getattr(fwd_var, "error_clip", None)
-        if not (
-            error_clip is None or isinstance(error_clip, BaseErrorClipAttr)
-        ):
-            raise TypeError(
-                "Variable's error_clip should be an instance of BaseErrorClipAttr or None."
-            )
-        if error_clip is not None:
-            error_clip._append_clip_op(block, grad_n)
-
-
-class ClipGradBase:
-    def __init__(self):
-        super().__init__()
-
-    def __str__(self):
-        raise NotImplementedError()
-
-    @imperative_base.no_grad
-    def _dygraph_clip(self, params_grads):
-        raise NotImplementedError
-
-    def _static_clip(self, params_grads):
-        raise NotImplementedError
-
-    def __call__(self, params_grads):
-        if in_dygraph_mode():
-            return self._dygraph_clip(params_grads)
-        else:
-            for p, g in params_grads:
-                if getattr(p, 'gradient_clip_attr', None) is not None:
-                    warnings.warn(
-                        "'set_gradient_clip' will be ineffective, because you have "
-                        "set 'need_clip' in 'ParamAttr'. So, 'set_gradient_clip' "
-                        "is redundant and you can remove it."
-                    )
-                    break
-            return self._static_clip(params_grads)
-
-    def _process_context(self, context, param, grad):
-        raise NotImplementedError()
-
-    def _create_operators(self, param, grad):
-        raise NotImplementedError()
-
-
-class ClipGradByValue(ClipGradBase):
-    """
-    Limit the value of multi-dimensional Tensor :math:`X` to the range [min, max].
-
-    - Any values less than min are set to ``min``.
-
-    - Any values greater than max are set to ``max``.
-
-    The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
-    If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.
-
-    Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer``
-    (for example: :ref:`api_paddle_optimizer_SGD`).
-
-    Note:
-        ``need_clip`` of ``ClipGradByValue`` HAS BEEN DEPRECATED since 2.0.
-        Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope.
-
-    Args:
-        max (float): The maximum value to clip by.
-        min (float, optional): The minimum value to clip by. if not set by user, it will be set to ``-max``
-            automatically. In this case, ``max`` must be greater than 0.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle
-
-            x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
-            linear = paddle.nn.Linear(in_features=10, out_features=10,
-                                      weight_attr=paddle.ParamAttr(need_clip=True),
-                                      bias_attr=paddle.ParamAttr(need_clip=False))
-            out = linear(x)
-            loss = paddle.mean(out)
-            loss.backward()
-
-            clip = paddle.nn.ClipGradByValue(min=-1, max=1)
-            sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
-            sdg.step()
-    """
-
-    def __init__(self, max, min=None):
-        super().__init__()
-        if min is None:
-            assert max > 0.0
-            min = -max
-        self.max = float(max)
-        self.min = float(min)
-
-    def __str__(self):
-        return "Clip Gradient By Value, min = %f, max=%f" % (self.min, self.max)
-
-    @imperative_base.no_grad
-    def _dygraph_clip(self, params_grads):
-        params_and_grads = []
-        for p, g in params_grads:
-            if g is None:
-                continue
-            if getattr(p, 'need_clip', True) is False:
-                params_and_grads.append((p, g))
-                continue
-            new_grad = paddle.clip(x=g, min=self.min, max=self.max)
-            params_and_grads.append((p, new_grad))
-        return params_and_grads
-
-    def _static_clip(self, params_grads):
-        params_and_grads = []
-        param_new_grad_name_dict = dict()
-        with framework.name_scope('gradient_clip'):
-            for p, g in params_grads:
-                if g is None:
-                    continue
-                if getattr(p, 'need_clip', True) is False:
-                    params_and_grads.append((p, g))
-                    continue
-
-                with p.block.program._optimized_guard([p, g]):
-                    new_grad = layers.clip(x=g, min=self.min, max=self.max)
-                params_and_grads.append((p, new_grad))
-                param_new_grad_name_dict[p.name] = new_grad.name
-        _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict)
-        return params_and_grads
-
-    def _process_context(self, context, param, grad):
-        pass
-
-    def _create_operators(self, param, grad):
-        new_grad = layers.clip(x=grad, min=self.min, max=self.max)
-        return param, new_grad
-
-
-class ClipGradByNorm(ClipGradBase):
-    r"""
-    Limit the l2 norm of multi-dimensional Tensor :math:`X` to ``clip_norm`` .
-
-    - If the l2 norm of :math:`X` is greater than ``clip_norm`` , :math:`X` will be compressed by a ratio.
-
-    - If the l2 norm of :math:`X` is less than or equal to ``clip_norm`` , nothing will be done.
-
-    The multidimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
-    If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.
-
-    Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer``
-    (for example: :ref:`api_paddle_optimizer_SGD`).
-
-    The clipping formula is:
-
-    .. math::
-        Out =
-        \left\{
-            \begin{array}{ccl}
-                X & & if (norm(X) \leq clip\_norm) \\
-                \frac{clip\_norm*X}{norm(X)} & & if (norm(X) > clip\_norm) \\
-        \end{array}
-        \right.
-
-
-    where :math:`norm(X)` represents the L2 norm of :math:`X`.
-
-    .. math::
-        norm(X) = ( \sum_{i=1}^{n}|x\_i|^2)^{ \frac{1}{2}}
-
-    Note:
-        ``need_clip`` of ``ClipGradByNorm`` HAS BEEN DEPRECATED since 2.0.
-        Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope.
-
-    Args:
-        clip_norm(float): The maximum norm value.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle
-
-            x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
-            linear = paddle.nn.Linear(in_features=10, out_features=10,
-                                      weight_attr=paddle.ParamAttr(need_clip=True),
-                                      bias_attr=paddle.ParamAttr(need_clip=False))
-            out = linear(x)
-            loss = paddle.mean(out)
-            loss.backward()
-
-            clip = paddle.nn.ClipGradByNorm(clip_norm=1.0)
-            sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
-            sdg.step()
-    """
-
-    def __init__(self, clip_norm):
-        super().__init__()
-        self.clip_norm = float(clip_norm)
-
-    def __str__(self):
-        return "Gradient Clip By Norm, clip_norm=%f" % self.clip_norm
-
-    @imperative_base.no_grad
-    def _dygraph_clip(self, params_grads):
-        params_and_grads = []
-        for p, g in params_grads:
-            if g is None:
-                continue
-            if getattr(p, 'need_clip', True) is False:
-                params_and_grads.append((p, g))
-                continue
-            new_grad = layers.clip_by_norm(x=g, max_norm=self.clip_norm)
-            params_and_grads.append((p, new_grad))
-        return params_and_grads
-
-    def _static_clip(self, params_grads):
-        params_and_grads = []
-        with framework.name_scope('gradient_clip'):
-            param_new_grad_name_dict = dict()
-            for p, g in params_grads:
-                if g is None:
-                    continue
-                if getattr(p, 'need_clip', True) is False:
-                    params_and_grads.append((p, g))
-                    continue
-
-                with p.block.program._optimized_guard([p, g]):
-                    new_grad = layers.clip_by_norm(x=g, max_norm=self.clip_norm)
-                param_new_grad_name_dict[p.name] = new_grad.name
-                params_and_grads.append((p, new_grad))
-        _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict)
-        return params_and_grads
-
-    def _process_context(self, context, param, grad):
-        pass
-
-    def _create_operators(self, param, grad):
-        new_grad = layers.clip_by_norm(x=grad, max_norm=self.clip_norm)
-        return param, new_grad
-
-
-_allow_pure_fp16_global_norm_clip_flag = False
-
-
-def _allow_pure_fp16_global_norm_clip(*args):
-    global _allow_pure_fp16_global_norm_clip_flag
-    if len(args) == 0:
-        return _allow_pure_fp16_global_norm_clip_flag
-    else:
-        assert len(args) == 1 and isinstance(args[0], bool)
-        old_value = _allow_pure_fp16_global_norm_clip_flag
-        _allow_pure_fp16_global_norm_clip_flag = args[0]
-        return old_value
-
-
-class ClipGradByGlobalNorm(ClipGradBase):
-    r"""
-    Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in
-    :math:`t\_list` , and limit it to ``clip_norm`` .
-
-    - If the global norm is greater than ``clip_norm`` , all elements of :math:`t\_list` will be compressed by a ratio.
-
-    - If the global norm is less than or equal to ``clip_norm`` , nothing will be done.
-
-    The list of Tensor :math:`t\_list` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
-    If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.
-
-    Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer``
-    (for example: :ref:`api_paddle_optimizer_SGD`).
-
-    The clipping formula is:
-
-    .. math::
-
-        t\_list[i] = t\_list[i] * \frac{clip\_norm}{\max(global\_norm, clip\_norm)}
-
-    where:
-
-    .. math::
-
-        global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2}
-
-    Note:
-        ``need_clip`` of ``ClipGradyGlobalNorm`` HAS BEEN DEPRECATED since 2.0.
-        Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope.
-
-    Args:
-        clip_norm (float): The maximum norm value.
-        group_name (str, optional): The group name for this clip. Default value is ``default_group``.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle
-
-            x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
-            linear = paddle.nn.Linear(in_features=10, out_features=10,
-                                      weight_attr=paddle.ParamAttr(need_clip=True),
-                                      bias_attr=paddle.ParamAttr(need_clip=False))
-            out = linear(x)
-            loss = paddle.mean(out)
-            loss.backward()
-
-            clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
-            sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
-            sdg.step()
-    """
-
-    def __init__(
-        self, clip_norm, group_name="default_group", auto_skip_clip=False
-    ):
-        super().__init__()
-        self.clip_norm = float(clip_norm)
-        self.group_name = group_name
-        assert isinstance(auto_skip_clip, bool)
-        self.auto_skip_clip = auto_skip_clip
-
-    def __str__(self):
-        return "Gradient Clip By GlobalNorm, global_norm=%f" % (self.clip_norm)
-
-    @imperative_base.no_grad
-    def _dygraph_clip(self, params_grads):
-        params_and_grads = []
-        sum_square_list = []
-        sum_square_list_fp16 = []
-        sum_square_list_fp32 = []
-        for p, g in params_grads:
-            if g is None:
-                continue
-            if getattr(p, 'need_clip', True) is False:
-                continue
-            merge_grad = g
-
-            if in_dygraph_mode() and g.is_selected_rows():
-                merge_grad = layers.merge_selected_rows(g)
-                merge_grad = merge_grad._get_tensor_from_selected_rows()
-
-            elif g.type == core.VarDesc.VarType.SELECTED_ROWS:
-                merge_grad = layers.merge_selected_rows(g)
-                merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
-
-            sum_square = _squared_l2_norm(merge_grad)
-            if (
-                sum_square.dtype == core.VarDesc.VarType.FP16
-                or sum_square.dtype == core.VarDesc.VarType.BF16
-            ):
-                sum_square_list_fp16.append(sum_square)
-            elif sum_square.dtype == core.VarDesc.VarType.FP32:
-                sum_square_list_fp32.append(sum_square)
-            else:
-                sum_square_list.append(sum_square)
-
-        # all parameters have been filterd out
-        if (
-            len(sum_square_list)
-            + len(sum_square_list_fp16)
-            + len(sum_square_list_fp32)
-            == 0
-        ):
-            return params_grads
-
-        sum_dtype = 'float64' if len(sum_square_list) > 0 else "float32"
-        global_norm_var = []
-        if len(sum_square_list_fp16) > 0:
-            global_norm_var_fp16 = paddle.add_n(sum_square_list_fp16)
-            global_norm_var.append(global_norm_var_fp16.astype(sum_dtype))
-        if len(sum_square_list_fp32) > 0:
-            global_norm_var_fp32 = paddle.add_n(sum_square_list_fp32)
-            if sum_dtype == 'float32':
-                global_norm_var.append(global_norm_var_fp32)
-            else:
-                global_norm_var.append(global_norm_var_fp32.astype(sum_dtype))
-        if len(sum_square_list) > 0:
-            global_norm_var_fp64 = paddle.add_n(sum_square_list)
-            global_norm_var.append(global_norm_var_fp64)
-        global_norm_var = paddle.add_n(global_norm_var)
-        global_norm_var = paddle.sqrt(global_norm_var)
-        max_global_norm = layers.fill_constant(
-            shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm
-        )
-
-        need_clip = False
-        if not self.auto_skip_clip:  # always apply clip
-            need_clip = True
-            clip_var = paddle.divide(
-                x=max_global_norm,
-                y=paddle.maximum(x=global_norm_var, y=max_global_norm),
-            )
-        elif global_norm_var > max_global_norm:
-            # only when global_norm_var > max_global_norm, grad need clip
-            need_clip = True
-            clip_var = paddle.divide(x=max_global_norm, y=global_norm_var)
-
-        for p, g in params_grads:
-            if g is None:
-                continue
-            if getattr(p, 'need_clip', True) is False:
-                params_and_grads.append((p, g))
-                continue
-            # TODO(wangxi): use inplace elementwise_mul
-            if need_clip:
-                clip_input = (
-                    clip_var.astype(g.dtype)
-                    if clip_var.dtype != g.dtype
-                    else clip_var
-                )
-                new_grad = paddle.multiply(g, clip_input)
-                params_and_grads.append((p, new_grad))
-            else:
-                params_and_grads.append((p, g))
-
-        return params_and_grads
-
-    def _static_clip(self, params_grads):
-        params_and_grads = []
-        sum_square_list = []
-        sum_square_list_fp16 = []
-        sum_square_list_fp32 = []
-        with framework.name_scope('gradient_clip'):
-            for p, g in params_grads:
-                if g is None:
-                    continue
-                if getattr(p, 'need_clip', True) is False:
-                    continue
-                merge_grad = g
-                with p.block.program._optimized_guard([p, g]):
-                    if g.type == core.VarDesc.VarType.SELECTED_ROWS:
-                        merge_grad = layers.merge_selected_rows(g)
-                        merge_grad = layers.get_tensor_from_selected_rows(
-                            merge_grad
-                        )
-                    sum_square = _squared_l2_norm(merge_grad)
-                    if sum_square.dtype == core.VarDesc.VarType.FP16:
-                        sum_square_list_fp16.append(sum_square)
-                    elif sum_square.dtype == core.VarDesc.VarType.FP32:
-                        sum_square_list_fp32.append(sum_square)
-                    else:
-                        sum_square_list.append(sum_square)
-
-            # all parameters have been filterd out
-            if (
-                len(sum_square_list)
-                + len(sum_square_list_fp16)
-                + len(sum_square_list_fp32)
-                == 0
-            ):
-                return params_grads
-
-            with p.block.program._optimized_guard([p, g]):
-                sum_dtype = 'float64' if len(sum_square_list) > 0 else "float32"
-
-                global_norm_var = []
-                if len(sum_square_list_fp16) > 0:
-                    global_norm_var_fp16 = layers.sums(sum_square_list_fp16)
-                    if (
-                        sum_square_list_fp32
-                        or sum_square_list
-                        or not _allow_pure_fp16_global_norm_clip()
-                    ):
-                        global_norm_var.append(
-                            global_norm_var_fp16.astype(sum_dtype)
-                        )
-                    else:
-                        global_norm_var.append(global_norm_var_fp16)
-                if len(sum_square_list_fp32) > 0:
-                    global_norm_var_fp32 = layers.sums(sum_square_list_fp32)
-                    if sum_dtype == 'float32':
-                        global_norm_var.append(global_norm_var_fp32)
-                    else:
-                        global_norm_var.append(
-                            global_norm_var_fp32.astype(sum_dtype)
-                        )
-                if len(sum_square_list) > 0:
-                    # fp64
-                    global_norm_var_other_dtype = layers.sums(sum_square_list)
-                    global_norm_var.append(global_norm_var_other_dtype)
-
-                global_norm_var = (
-                    layers.sums(global_norm_var)
-                    if len(global_norm_var) > 1
-                    else global_norm_var[0]
-                )
-                global_norm_var = paddle.sqrt(x=global_norm_var)
-                max_global_norm = layers.fill_constant(
-                    shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm
-                )
-                scale_var = paddle.divide(
-                    x=max_global_norm,
-                    y=paddle.maximum(x=max_global_norm, y=global_norm_var),
-                )
-            param_new_grad_name_dict = dict()
-            for p, g in params_grads:
-                if g is None:
-                    continue
-                if getattr(p, 'need_clip', True) is False:
-                    params_and_grads.append((p, g))
-                    continue
-
-                with p.block.program._optimized_guard([p, g]):
-                    new_g = _cast_to_mp_type_if_enabled(g)
-                    # inplace
-                    scale_input = (
-                        scale_var.astype('float16')
-                        if new_g.dtype == core.VarDesc.VarType.FP16
-                        and scale_var.dtype != core.VarDesc.VarType.FP16
-                        else scale_var
-                    )
-                    # NOTE(Yuang Liu): For pure dp with gradient merge, the p and g
-                    # will be in different blocks with the gradient clip related ops.
-                    # We need to handle the correct block, otherwise will encounter
-                    # a 'NotFoundError' during compile time.
-                    block = default_main_program().current_block()
-                    block.append_op(
-                        type='elementwise_mul',
-                        inputs={'X': new_g, 'Y': scale_input},
-                        outputs={'Out': new_g},
-                    )
-                    if new_g is not g:
-                        block.append_op(
-                            type='cast',
-                            inputs={'X': new_g},
-                            outputs={'Out': g},
-                            attrs={
-                                'in_dtype': new_g.dtype,
-                                'out_dtype': g.dtype,
-                            },
-                        )
-
-                param_new_grad_name_dict[p.name] = g.name
-                params_and_grads.append((p, g))
-
-        _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict)
-        return params_and_grads
-
-    def _process_context(self, context, param, grad):
-        if self.group_name not in context:
-            context[self.group_name] = []
-            context[self.group_name + "_clip_value"] = self.clip_norm
-            context[self.group_name + "_clip"] = layers.fill_constant(
-                shape=[1], dtype=grad.dtype, value=self.clip_norm
-            )
-        else:
-            if not self.clip_norm == context[self.group_name + "_clip_value"]:
-                raise ValueError(
-                    "All parameters' 'clip_norm' of a same group should be the same"
-                )
-
-        merge_grad = grad
-        if grad.type == core.VarDesc.VarType.SELECTED_ROWS:
-            merge_grad = layers.merge_selected_rows(grad)
-            merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
-
-        local_norm_var = _squared_l2_norm(merge_grad)
-        context[self.group_name].append(local_norm_var)
-
-        self.context = context
-
-    def _create_operators(self, param, grad):
-        group_scale_name = self.group_name + "_scale"
-        if group_scale_name not in self.context:
-            group_norm_var = layers.sums(input=self.context[self.group_name])
-            group_norm_var = paddle.sqrt(x=group_norm_var)
-            clip_var = self.context[self.group_name + "_clip"]
-            group_scale_var = paddle.divide(
-                x=clip_var,
-                y=paddle.maximum(x=clip_var, y=group_norm_var),
-            )
-            assert group_scale_var.shape == (1,)
-            self.context[group_scale_name] = group_scale_var
-
-        # inplace
-        param.block.append_op(
-            type='elementwise_mul',
-            inputs={'X': grad, 'Y': self.context[group_scale_name]},
-            outputs={'Out': grad},
-        )
-
-        return param, grad
-
-
-@framework.dygraph_not_support
-def set_gradient_clip(clip, param_list=None, program=None):
-    """
-    :api_attr: Static Graph
-
-    Warning:
-
-        This API must be used after building network, and before ``minimize`` ,
-        and it may be removed in future releases, so it is not recommended.
-        It is recommended to set ``grad_clip`` when initializing the ``optimizer`` ,
-        this is a better method to clip gradient. There are three clipping strategies:
-         :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
-         :ref:`api_fluid_clip_GradientClipByValue` .
-
-    To specify parameters that require gradient clip.
-
-    Args:
-        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
-            some derived class of ``GradientClipBase`` . There are three cliping strategies
-            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
-            :ref:`api_fluid_clip_GradientClipByValue` ). Default value: None, and there is no
-            gradient clipping.
-        param_list (list(Variable), optional): Parameters that require gradient clip.
-                It can be a list of parameter or a list of parameter's name.
-                Default None, meaning that all parameters in the program will be included.
-        program (Program, optional): The program where parameters are located.
-                Default None, meaning that using :ref:`api_fluid_default_main_program` .
-
-    Returns:
-        None
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-
-            def network():
-                image = fluid.data(name='image', shape=[
-                                   None, 28], dtype='float32')
-                param_attr1 = fluid.ParamAttr("fc1_param")
-                fc1 = fluid.layers.fc(image, size=10, param_attr=param_attr1)
-                param_attr2 = fluid.ParamAttr("fc2_param")
-                fc2 = fluid.layers.fc(fc1, size=10, param_attr=param_attr2)
-                loss = fluid.layers.reduce_mean(fc2)
-                return loss
-
-
-            # network 1: clip all parameter gradient
-            with fluid.program_guard(fluid.Program(), fluid.Program()):
-                loss = network()
-                fluid.clip.set_gradient_clip(
-                    fluid.clip.GradientClipByGlobalNorm(clip_norm=2.0))
-                sgd = fluid.optimizer.SGD(learning_rate=1e-3)
-                sgd.minimize(loss)
-
-            # network 2: clip parameter gradient by name
-            with fluid.program_guard(fluid.Program(), fluid.Program()):
-                loss = network()
-                fluid.clip.set_gradient_clip(
-                    fluid.clip.GradientClipByValue(min=-1.0, max=1.0),
-                    param_list=["fc1_param", "fc2_param"])
-                sgd = fluid.optimizer.SGD(learning_rate=1e-3)
-                sgd.minimize(loss)
-
-            # network 3: clip parameter gradient by value
-            with fluid.program_guard(fluid.Program(), fluid.Program()):
-                loss = network()
-                param_var1 = fluid.default_main_program().global_block().var("fc1_param")
-                param_var2 = fluid.default_main_program().global_block().var("fc2_param")
-                fluid.clip.set_gradient_clip(
-                    fluid.clip.GradientClipByValue(min=-1.0, max=1.0),
-                    param_list=[param_var1, param_var2])
-                sgd = fluid.optimizer.SGD(learning_rate=1e-3)
-                sgd.minimize(loss)
-
-            # network 4: use 'set_gradient_clip' and 'optimize(grad_clip=clip)' together
-            with fluid.program_guard(fluid.Program(), fluid.Program()):
-                loss = network()
-                clip1 = fluid.clip.GradientClipByValue(min=-1.0, max=1.0)
-                clip2 = fluid.clip.GradientClipByNorm(clip_norm=1.0)
-                # Set the gradient clipping strategy: clip1
-                fluid.clip.set_gradient_clip(clip1)
-                # Set the gradient clipping strategy: clip2
-                sgd = fluid.optimizer.SGD(learning_rate=1e-3, grad_clip=clip2)
-                sgd.minimize(loss)
-                # 'set_gradient_clip' will not take effect when setting has a conflict,
-                # and the gradient clipping strategy will be 'clip2'
-
-
-    """
-    warnings.warn(
-        "Caution! 'set_gradient_clip' is not recommended "
-        "and may be deprecated in future! "
-        "We recommend a new strategy: set 'grad_clip' "
-        "when initializing the 'optimizer'. "
-        "This method can reduce the mistakes, please "
-        "refer to documention of 'optimizer'."
-    )
-
-    if not isinstance(clip, ClipGradBase):
-        raise TypeError(
-            "'clip' should be an instance of ClipGradBase's derived class"
-        )
-    if program is None:
-        program = framework.default_main_program()
-
-    for op in program.block(0).ops:
-        if 'op_namescope' in op.all_attrs() and "optimizer" in op.attr(
-            "op_namescope"
-        ):
-            warnings.warn(
-                "'minimize' has been invoked before, this will make 'set_gradient_clip' "
-                "be ineffective! Please invoke 'set_gradient_clip' before 'minimize'."
-            )
-            break
-
-    if param_list is None:
-        param_list = program.block(0).all_parameters()
-    if all(isinstance(elem, str) for elem in param_list):
-        param_list = [program.block(0).var(elem) for elem in param_list]
-    if not all(isinstance(elem, framework.Parameter) for elem in param_list):
-        raise TypeError(
-            "'param_list' should be a list of Parameter or basestring(parameter's name)."
-        )
-
-    for param in param_list:
-        param.gradient_clip_attr = copy.deepcopy(clip)
-
-
-def append_gradient_clip_ops(param_grads):
-    context = dict()
-    for p, g in param_grads:
-        if g is None:
-            continue
-        with p.block.program._optimized_guard([p, g]), framework.name_scope(
-            'gradient_clip'
-        ):
-            clip_attr = getattr(p, 'gradient_clip_attr', None)
-            if clip_attr is None:
-                return param_grads
-            if not isinstance(clip_attr, ClipGradBase):
-                raise TypeError(
-                    "clip attribute should be an instance of GradientClipBase"
-                )
-
-            clip_attr._process_context(context=context, param=p, grad=g)
-
-    res = []
-    param_new_grad_name_dict = dict()
-    for p, g in param_grads:
-        if g is None:
-            continue
-        with p.block.program._optimized_guard([p, g]), framework.name_scope(
-            'gradient_clip'
-        ):
-            param, new_grad = clip_attr._create_operators(param=p, grad=g)
-            param_new_grad_name_dict[param.name] = new_grad.name
-            res.append([param, new_grad])
-
-    _correct_clip_op_role_var(res, param_new_grad_name_dict)
-    return res
-
-
-# change wrong mapping relation between param & grad in clip op
-# Note: This function is sensitive to the time cost of the network with gradient clipping
-# and should not be changed easily. If you must change, please test the time cost.
-def _correct_clip_op_role_var(params_grads, param_new_grad_name_dict):
-    block_id_list = []
-    if len(param_new_grad_name_dict) == 0:
-        return
-    for param, grad in params_grads:
-        if grad is None:
-            continue
-        block_id = param.block.idx
-        if block_id in block_id_list:
-            continue
-        block_id_list.append(block_id)
-        for op in param.block.program.global_block().ops:
-            if (
-                op.has_attr("op_namescope")
-                and "gradient_clip" in op.attr("op_namescope")
-                and op.attr('op_role_var')
-            ):
-                param_name = op.attr('op_role_var')[0]
-                if param_name in param_new_grad_name_dict:
-                    correct_p_g = [
-                        param_name,
-                        param_new_grad_name_dict[param_name],
-                    ]
-                    op._set_attr('op_role_var', correct_p_g)
-
-
-GradientClipBase = ClipGradBase
-GradientClipByValue = ClipGradByValue
-GradientClipByNorm = ClipGradByNorm
-GradientClipByGlobalNorm = ClipGradByGlobalNorm
diff --git a/python/paddle/fluid/dygraph/__init__.py b/python/paddle/fluid/dygraph/__init__.py
index aebcc09eaa14ba8242b7ea4041f5816a5259cc7c..b98c188ae4f6ab3ecd191940431bb86d84ddccc7 100644
--- a/python/paddle/fluid/dygraph/__init__.py
+++ b/python/paddle/fluid/dygraph/__init__.py
@@ -21,9 +21,6 @@ from .layers import *
 from . import container
 from .container import *
 
-from . import nn
-from .nn import *
-
 from . import tracer
 from .tracer import *
 
@@ -45,7 +42,6 @@ __all__ = []
 __all__ += layers.__all__
 __all__ += base.__all__
 __all__ += container.__all__
-__all__ += nn.__all__
 __all__ += parallel.__all__
 __all__ += checkpoint.__all__
 __all__ += learning_rate_scheduler.__all__
diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py
deleted file mode 100644
index f6009912bee9062a4a8478237a41c9168af50782..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/dygraph/nn.py
+++ /dev/null
@@ -1,322 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle
-from .. import core
-from ..layers import utils
-from ..layers import nn as F
-from .. import dygraph_utils
-from . import layers
-from ..framework import (
-    Variable,
-    OpProtoHolder,
-    Parameter,
-    _dygraph_tracer,
-    _varbase_creator,
-    default_main_program,
-    _global_flags,
-    in_dygraph_mode,
-)
-
-from ..data_feeder import (
-    convert_dtype,
-    check_variable_and_dtype,
-    check_type,
-    check_dtype,
-)
-
-from ..param_attr import ParamAttr
-from ..initializer import Normal, Constant, NumpyArrayInitializer
-from .. import unique_name
-from .layer_object_helper import LayerObjectHelper
-from ..data_feeder import check_variable_and_dtype, check_type
-import numpy as np
-import numbers
-import logging
-import os
-import paddle.utils.deprecated as deprecated
-from paddle import _C_ops, _legacy_C_ops
-
-__all__ = []
-
-
-class BatchNorm(layers.Layer):
-    r"""
-
-    This interface is used to construct a callable object of the ``BatchNorm`` class.
-    For more details, refer to code examples.
-    It implements the function of the Batch Normalization Layer and can be used
-    as a normalizer function for conv2d and fully connected operations.
-    The data is normalized by the mean and variance of the channel based on the current batch data.
-    Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing
-    Internal Covariate Shift <https://arxiv.org/pdf/1502.03167.pdf>`_
-    for more details.
-
-    When use_global_stats = False, the :math:`\mu_{\beta}`
-    and :math:`\sigma_{\beta}^{2}` are the statistics of one mini-batch.
-    Calculated as follows:
-
-    ..  math::
-
-        \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad &
-        //\ mini-batch\ mean \\
-        \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \mu_{\beta})^2 \qquad &
-        //\ mini-batch\ variance \\
-
-    - :math:`x` : mini-batch data
-    - :math:`m` : the size of the mini-batch data
-
-    When use_global_stats = True, the :math:`\\mu_{\\beta}`
-    and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch.
-    They are global or running statistics (moving_mean and moving_variance). It usually got from the
-    pre-trained model. Calculated as follows:
-
-    .. math::
-        moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\
-        moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\
-
-    The normalization function formula is as follows:
-
-    ..  math::
-
-        \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\
-        \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\
-        y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift
-
-
-    - :math:`\epsilon` : add a smaller value to the variance to prevent division by zero
-    - :math:`\gamma` : trainable proportional parameter
-    - :math:`\beta` : trainable deviation parameter
-
-    Parameters:
-        num_channels(int): Indicate the number of channels of the input ``Tensor``.
-        act(str, optional): Activation to be applied to the output of batch normalization. Default: None.
-        is_test (bool, optional): A flag indicating whether it is in test phrase or not.
-             This flag only has effect on static graph mode. For dygraph mode, please use ``eval()``.
-             Default: False.
-        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
-        epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5.
-        param_attr(ParamAttr, optional): The parameter attribute for Parameter `scale`
-             of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
-             will create ParamAttr as param_attr. If the Initializer of the param_attr
-             is not set, the parameter is initialized with Xavier. Default: None.
-        bias_attr(ParamAttr, optional): The parameter attribute for the bias of batch_norm.
-             If it is set to None or one attribute of ParamAttr, batch_norm
-             will create ParamAttr as bias_attr. If the Initializer of the bias_attr
-             is not set, the bias is initialized zero. Default: None.
-        dtype(str, optional): Indicate the data type of the input ``Tensor``,
-             which can be float32 or float64. Default: float32.
-        data_layout(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC", where `N` is batch size, `C` is the number of the feature map, `H` is the height of the feature map, `W` is the width of the feature map. Default: NCHW.
-        in_place(bool, optional): Make the input and output of batch norm reuse memory. Default: False.
-        moving_mean_name(str, optional): The name of moving_mean which store the global Mean. Default: None.
-        moving_variance_name(str, optional): The name of the moving_variance which store the global Variance. Default: None.
-        do_model_average_for_mean_and_var(bool, optional): Whether parameter mean and variance should do model
-            average when model average is enabled. Default: True.
-        use_global_stats(bool, optional): Whether to use global mean and
-            variance. In inference or test mode, set use_global_stats to true
-            or is_test to true, and the behavior is equivalent.
-            In train mode, when setting use_global_stats True, the global mean
-            and variance are also used during train period. Default: False.
-        trainable_statistics(bool, optional): Whether to calculate mean and var in eval mode. In eval mode, when
-            setting trainable_statistics True, mean and variance will be calculated by current batch statistics.
-            Default: False.
-
-    Returns:
-        None
-
-    Examples:
-        .. code-block:: python
-
-          import paddle
-          import paddle.fluid as fluid
-          from paddle.fluid.dygraph.base import to_variable
-
-          x = paddle.rand([3, 10, 3, 7], 'float32')
-          with fluid.dygraph.guard():
-              x = to_variable(x)
-              batch_norm = fluid.BatchNorm(10)
-              hidden1 = batch_norm(x)
-    """
-
-    def __init__(
-        self,
-        num_channels,
-        act=None,
-        is_test=False,
-        momentum=0.9,
-        epsilon=1e-05,
-        param_attr=None,
-        bias_attr=None,
-        dtype='float32',
-        data_layout='NCHW',
-        in_place=False,
-        moving_mean_name=None,
-        moving_variance_name=None,
-        do_model_average_for_mean_and_var=True,
-        use_global_stats=False,
-        trainable_statistics=False,
-    ):
-        super().__init__()
-        self._param_attr = param_attr
-        self._bias_attr = bias_attr
-        self._act = act
-        self._use_mkldnn = _global_flags()["FLAGS_use_mkldnn"]
-
-        assert (
-            bias_attr is not False
-        ), "bias_attr should not be False in batch_norm."
-
-        if dtype == "float16":
-            self._dtype = "float32"
-        else:
-            self._dtype = dtype
-
-        param_shape = [num_channels]
-
-        # create parameter
-        self.weight = self.create_parameter(
-            attr=self._param_attr,
-            shape=param_shape,
-            dtype=self._dtype,
-            default_initializer=Constant(1.0),
-        )
-        self.weight.stop_gradient = (
-            use_global_stats and self._param_attr.learning_rate == 0.0
-        )
-
-        self.bias = self.create_parameter(
-            attr=self._bias_attr,
-            shape=param_shape,
-            dtype=self._dtype,
-            is_bias=True,
-        )
-        self.bias.stop_gradient = (
-            use_global_stats and self._param_attr.learning_rate == 0.0
-        )
-
-        self._mean = self.create_parameter(
-            attr=ParamAttr(
-                name=moving_mean_name,
-                initializer=Constant(0.0),
-                trainable=False,
-                do_model_average=do_model_average_for_mean_and_var,
-            ),
-            shape=param_shape,
-            dtype=self._dtype,
-        )
-        self._mean.stop_gradient = True
-
-        self._variance = self.create_parameter(
-            attr=ParamAttr(
-                name=moving_variance_name,
-                initializer=Constant(1.0),
-                trainable=False,
-                do_model_average=do_model_average_for_mean_and_var,
-            ),
-            shape=param_shape,
-            dtype=self._dtype,
-        )
-        self._variance.stop_gradient = True
-
-        self._in_place = in_place
-        self._data_layout = data_layout
-        self._momentum = momentum
-        self._epsilon = epsilon
-        self._is_test = is_test
-        self._fuse_with_relu = False
-        self._use_global_stats = use_global_stats
-        self._trainable_statistics = trainable_statistics
-
-    def forward(self, input):
-        # create output
-        # mean and mean_out share the same memory
-        mean_out = self._mean
-        # variance and variance out share the same memory
-        variance_out = self._variance
-
-        if in_dygraph_mode():
-            batch_norm_out, t1, t2, t3, t4, _ = _C_ops.batch_norm(
-                input,
-                self._mean,
-                self._variance,
-                self.weight,
-                self.bias,
-                not self.training,
-                self._momentum,
-                self._epsilon,
-                self._data_layout,
-                self._use_global_stats,
-                self._trainable_statistics,
-            )
-            return dygraph_utils._append_activation_in_dygraph(
-                batch_norm_out, act=self._act, use_mkldnn=self._use_mkldnn
-            )
-        else:
-            check_variable_and_dtype(
-                input, 'input', ['float16', 'float32', 'float64'], 'BatchNorm'
-            )
-
-            attrs = {
-                "momentum": self._momentum,
-                "epsilon": self._epsilon,
-                "is_test": self._is_test,
-                "data_layout": self._data_layout,
-                "use_mkldnn": False,
-                "fuse_with_relu": self._fuse_with_relu,
-                "use_global_stats": self._use_global_stats,
-                "trainable_statistics": self._trainable_statistics,
-            }
-
-            inputs = {
-                "X": [input],
-                "Scale": [self.weight],
-                "Bias": [self.bias],
-                "Mean": [self._mean],
-                "Variance": [self._variance],
-            }
-
-            saved_mean = self._helper.create_variable_for_type_inference(
-                dtype=self._dtype, stop_gradient=True
-            )
-            saved_variance = self._helper.create_variable_for_type_inference(
-                dtype=self._dtype, stop_gradient=True
-            )
-            reserve_space = self._helper.create_variable_for_type_inference(
-                dtype=self._helper.input_dtype(input), stop_gradient=True
-            )
-
-            batch_norm_out = (
-                input
-                if self._in_place
-                else self._helper.create_variable_for_type_inference(
-                    self._dtype
-                )
-            )
-
-            outputs = {
-                "Y": [batch_norm_out],
-                "MeanOut": [mean_out],
-                "VarianceOut": [variance_out],
-                "SavedMean": [saved_mean],
-                "SavedVariance": [saved_variance],
-            }
-            if reserve_space is not None:
-                outputs["ReserveSpace"] = [reserve_space]
-
-            self._helper.append_op(
-                type="batch_norm", inputs=inputs, outputs=outputs, attrs=attrs
-            )
-
-            # Currently, we don't support inplace in dygraph mode
-            return self._helper.append_activation(batch_norm_out, self._act)
diff --git a/python/paddle/fluid/incubate/fleet/utils/fleet_util.py b/python/paddle/fluid/incubate/fleet/utils/fleet_util.py
index df198931199f59520368faee025a77b42b5bdcd7..4ec3c1d16e077ea00672c664bac3b1b4ea5e491c 100644
--- a/python/paddle/fluid/incubate/fleet/utils/fleet_util.py
+++ b/python/paddle/fluid/incubate/fleet/utils/fleet_util.py
@@ -185,7 +185,7 @@ class FleetUtil:
 
               # below is part of model
               emb = my_slot_net(slots, label) # emb can be fc layer of size 1
-              similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(\
+              similarity_norm = fluid.layers.sigmoid(paddle.clip(\
                   emb, min=-15.0, max=15.0), name="similarity_norm")\
               binary_predict = fluid.layers.concat(input=[\
                   paddle.subtract(\
@@ -1374,7 +1374,7 @@ class FleetUtil:
               label = fluid.layers.data(name="click", shape=[-1, 1],\
                   dtype="int64", lod_level=0, append_batch_size=False)
               emb = my_slot_net(slots, label) # emb can be fc layer of size 1
-              similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(\
+              similarity_norm = fluid.layers.sigmoid(paddle.clip(\
                   emb, min=-15.0, max=15.0), name="similarity_norm")\
               binary_predict = fluid.layers.concat(input=[\
                   paddle.subtract(\
@@ -1574,7 +1574,7 @@ class FleetUtil:
               label = fluid.layers.data(name="click", shape=[-1, 1],\
                   dtype="int64", lod_level=0, append_batch_size=False)
               emb = my_slot_net(slots, label) # emb can be fc layer of size 1
-              similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(\
+              similarity_norm = fluid.layers.sigmoid(paddle.clip(\
                   emb, min=-15.0, max=15.0), name="similarity_norm")\
               binary_predict = fluid.layers.concat(input=[\
                   paddle.subtract(\
diff --git a/python/paddle/fluid/install_check.py b/python/paddle/fluid/install_check.py
index 5756361f89e46f005072d2136b2e13de4762525b..bf1ad9b107f74694c80472f583287d617fdf0616 100644
--- a/python/paddle/fluid/install_check.py
+++ b/python/paddle/fluid/install_check.py
@@ -25,7 +25,7 @@ from .param_attr import ParamAttr
 from .initializer import Constant
 from . import layers
 from . import backward
-from .dygraph import Layer, nn
+from .dygraph import Layer
 from . import executor
 from . import optimizer
 from . import core
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 264c8ce6da94e8af7fb1d3b27c429880983f8bf8..c11a541df5326794a72390086442664aee26a142 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -63,10 +63,6 @@ __all__ = [
     'fc',
     'embedding',
     'autoincreased_step_counter',
-    'clip',
-    'clip_by_norm',
-    'merge_selected_rows',
-    'get_tensor_from_selected_rows',
 ]
 
 OP_NAMEMAPPING = {
@@ -997,199 +993,3 @@ def _logical_op(op_name, x, y, out=None, name=None, binary_op=True):
             )
 
         return out
-
-
-@templatedoc()
-def clip(x, min, max, name=None):
-    """
-        :old_api: paddle.fluid.layers.clip
-
-    ${comment}
-
-    Args:
-        x(${x_type}): ${x_comment}
-        min(float): ${min_comment}
-        max(float): ${max_comment}
-        name(str, optional): The default value is None.
-                             Normally there is no need for user to set this property.
-                             For more information, please refer to :ref:`api_guide_Name`
-
-    Returns:
-        ${out_comment}
-
-    Return Type:
-        ${out_type}
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            input = fluid.data(
-                name='data', shape=[1], dtype='float32')
-            reward = fluid.layers.clip(x=input, min=-1.0, max=1.0)
-    """
-
-    helper = LayerHelper("clip", **locals())
-    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'clip')
-
-    if name is None:
-        name = unique_name.generate_with_ignorable_key(
-            ".".join([helper.name, 'tmp'])
-        )
-
-    out = helper.create_variable(
-        type=x.type, name=name, dtype=x.dtype, persistable=False
-    )
-
-    helper.append_op(
-        type="clip",
-        inputs={"X": x},
-        attrs={"min": min, "max": max},
-        outputs={"Out": out},
-    )
-
-    return out
-
-
-@templatedoc()
-def clip_by_norm(x, max_norm, name=None):
-    """
-    ${comment}
-
-    Args:
-        x(${x_type}): ${x_comment}
-        max_norm(${max_norm_type}): ${max_norm_comment}
-        name(str, optional): For detailed information, please refer
-            to :ref:`api_guide_Name`. Usually name is no need to set and
-            None by default.
-
-    Returns:
-        Tensor:
-
-        out(${out_type}): ${out_comment}
-
-
-    Examples:
-        .. code-block:: python
-
-            import paddle
-            import paddle.fluid as fluid
-
-            input = paddle.to_tensor([[2.0, 2.0], [2.0, 2.0]], dtype='float32')
-            reward = fluid.layers.clip_by_norm(x=input, max_norm=1.0)
-            # [[0.5, 0.5], [0.5, 0.5]]
-    """
-
-    if in_dygraph_mode():
-        return _C_ops.clip_by_norm(x, max_norm)
-    else:
-        helper = LayerHelper("clip_by_norm", **locals())
-        check_variable_and_dtype(x, 'X', ['float32', 'float16'], 'clip_by_norm')
-        check_type(max_norm, 'max_norm', (float), 'clip_by_norm')
-
-        if name is None:
-            name = unique_name.generate_with_ignorable_key(
-                ".".join([helper.name, 'tmp'])
-            )
-
-        out = helper.create_variable(
-            type=x.type, name=name, dtype=x.dtype, persistable=False
-        )
-
-        helper.append_op(
-            type="clip_by_norm",
-            inputs={"X": x},
-            attrs={"max_norm": max_norm},
-            outputs={"Out": out},
-        )
-
-        return out
-
-
-@templatedoc()
-def merge_selected_rows(x, name=None):
-    """
-    ${comment}
-
-    Args:
-        x(${x_type}): ${x_comment}
-        name(basestring|None): Name of the output.
-
-    Returns:
-        out(${out_type}): ${out_comment}
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            b = fluid.default_main_program().global_block()
-            var = b.create_var(
-                name="X", dtype="float32", persistable=True,
-                type=fluid.core.VarDesc.VarType.SELECTED_ROWS)
-            y = fluid.layers.merge_selected_rows(var)
-    """
-    if in_dygraph_mode():
-        return _C_ops.merge_selected_rows(x)
-    else:
-        helper = LayerHelper("merge_selected_rows", **locals())
-        out = helper.create_variable_for_type_inference(dtype=x.dtype)
-        helper.append_op(
-            type="merge_selected_rows",
-            inputs={"X": x},
-            attrs={},
-            outputs={"Out": out},
-        )
-        return out
-
-
-@templatedoc()
-def get_tensor_from_selected_rows(x, name=None):
-    """
-    This operator gets tensor data from input with SelectedRows type, and outputs a LoDTensor.
-
-    .. code-block:: text
-
-        input x is SelectedRows:
-           x.rows = [0, 5, 5, 4, 19]
-           x.height = 20
-           x.value = [[1, 1] [2, 2] [2, 2] [3, 3] [6, 6]]
-
-        Output is LoDTensor:
-           out.shape = [5, 2]
-           out.data = [[1, 1],
-                       [2, 2],
-                       [2, 2],
-                       [3, 3],
-                       [6, 6]]
-
-    Args:
-        x(SelectedRows): Input with SelectedRows type. The data type is float32, float64, int32 or int64.
-        name(str, optional): The default value is None.  Normally there is no need for user to set this property.
-            For more information, please refer to :ref:`api_guide_Name` .
-
-    Returns:
-        Variable: LoDTensor transformed from SelectedRows. The data type is same with input.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            b = fluid.default_main_program().global_block()
-            input = b.create_var(name="X", dtype="float32", persistable=True, type=fluid.core.VarDesc.VarType.SELECTED_ROWS)
-            out = fluid.layers.get_tensor_from_selected_rows(input)
-    """
-
-    check_type(x, 'x', Variable, 'get_tensor_from_selected_rows')
-    if x.type != core.VarDesc.VarType.SELECTED_ROWS:
-        raise TypeError(
-            "The type of 'x' in get_tensor_from_selected_rows must be SELECTED_ROWS."
-        )
-    helper = LayerHelper('get_tensor_from_selected_rows', **locals())
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='get_tensor_from_selected_rows',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={},
-    )
-    return out
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 3e59ca2472ab4decea0dc6f93cbb5fa1492ec0c4..cbbe8dbadef12f6584371c7fcd500b2c3b0b7c5a 100755
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -38,13 +38,6 @@ from .backward import (
     _append_grad_suffix_,
     _get_no_grad_set_name,
 )
-from .clip import (
-    GradientClipBase,
-    GradientClipByNorm,
-    error_clip_callback,
-    append_gradient_clip_ops,
-    ClipGradByGlobalNorm,
-)
 from .framework import program_guard
 from .initializer import Constant
 from .layer_helper import LayerHelper
@@ -160,7 +153,7 @@ class Optimizer:
                 )
 
         if grad_clip is not None:
-            if not isinstance(grad_clip, GradientClipBase):
+            if not isinstance(grad_clip, paddle.nn.clip.GradientClipBase):
                 raise TypeError(
                     "'grad_clip' should be an instance of GradientClipBase's derived class"
                 )
@@ -1030,7 +1023,7 @@ class Optimizer:
                     params_grads.append((param, grad_var))
         else:
             if callbacks is None:
-                callbacks = [error_clip_callback]
+                callbacks = [paddle.nn.clip.error_clip_callback]
             else:
                 assert isinstance(callbacks, list)
             program = loss.block.program
@@ -1260,7 +1253,7 @@ class Optimizer:
         # NOTE(zhiqiu): currently, only support ClipGradByGlobalNorm and without regularization.
         if self._flatten_param_grads and self.regularization is None:
             if self._grad_clip is None or isinstance(
-                self._grad_clip, ClipGradByGlobalNorm
+                self._grad_clip, paddle.nn.ClipGradByGlobalNorm
             ):
                 params_grads = self.flatten_param_grads(params_grads)
 
@@ -1268,7 +1261,7 @@ class Optimizer:
         if self._grad_clip is not None:
             params_grads = self._grad_clip(params_grads)
         else:
-            params_grads = append_gradient_clip_ops(params_grads)
+            params_grads = paddle.nn.clip.append_gradient_clip_ops(params_grads)
 
         # Add regularization if any
         params_grads = self.append_regularization_ops(
diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_xpu_setup.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_xpu_setup.py
index a2be8260e39b69fe19b8f5932fb00e498459d30b..7b251e8063a05e7d4a09238feaf1efef04739fe4 100644
--- a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_xpu_setup.py
+++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_xpu_setup.py
@@ -150,26 +150,29 @@ def custom_relu_double_grad_dynamic(func, device, dtype, np_x, use_func=True):
     t = paddle.to_tensor(np_x, dtype=dtype, stop_gradient=False)
 
     out = func(t) if use_func else paddle.nn.functional.relu(t)
-    out.stop_gradient = False
-
     dx = paddle.grad(
-        outputs=[out], inputs=[t], create_graph=True, retain_graph=True
+        outputs=out,
+        inputs=t,
+        grad_outputs=paddle.ones_like(t),
+        create_graph=True,
+        retain_graph=True,
     )
 
-    dx[0].backward()
-
-    assert dx[0].grad is not None
-    return dx[0].numpy(), dx[0].grad.numpy()
+    ddout = paddle.grad(
+        outputs=dx[0],
+        inputs=out.grad,
+        grad_outputs=paddle.ones_like(t),
+        create_graph=False,
+    )
     fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
+    assert ddout[0].numpy() is not None
+    return dx[0].numpy(), ddout[0].numpy()
 
-class TestNewCustomOpSetUpInstall(unittest.TestCase):
+
+class TestNewCustomOpXpuSetUpInstall(unittest.TestCase):
     def setUp(self):
         cur_dir = os.path.dirname(os.path.abspath(__file__))
-        # compile, install the custom op egg into site-packages under background
-        # Currently custom XPU op does not support Windows
-        if os.name == 'nt':
-            return
         cmd = 'cd {} && {} custom_relu_xpu_setup.py install'.format(
             cur_dir, sys.executable
         )
@@ -192,7 +195,7 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase):
         self.custom_op = custom_relu_xpu_module_setup.custom_relu
 
         self.dtypes = ['float32', 'float64']
-        self.devices = ['xpu']
+        self.device = 'xpu'
 
         # config seed
         SEED = 2021
@@ -200,91 +203,90 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase):
         paddle.framework.random._manual_program_seed(SEED)
 
     def test_static(self):
-        for device in self.devices:
-            for dtype in self.dtypes:
-                x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
-                out = custom_relu_static(self.custom_op, device, dtype, x)
-                pd_out = custom_relu_static(
-                    self.custom_op, device, dtype, x, False
-                )
-                np.testing.assert_array_equal(
-                    out,
-                    pd_out,
-                    err_msg='custom op out: {},\n paddle api out: {}'.format(
-                        out, pd_out
-                    ),
-                )
+        for dtype in self.dtypes:
+            x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
+            out = custom_relu_static(self.custom_op, self.device, dtype, x)
+            pd_out = custom_relu_static(
+                self.custom_op, self.device, dtype, x, False
+            )
+            np.testing.assert_array_equal(
+                out,
+                pd_out,
+                err_msg='custom op out: {},\n paddle api out: {}'.format(
+                    out, pd_out
+                ),
+            )
 
     def test_static_pe(self):
-        for device in self.devices:
-            for dtype in self.dtypes:
-                x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
-                out = custom_relu_static_pe(self.custom_op, device, dtype, x)
-                pd_out = custom_relu_static_pe(
-                    self.custom_op, device, dtype, x, False
-                )
-                np.testing.assert_array_equal(
-                    out,
-                    pd_out,
-                    err_msg='custom op out: {},\n paddle api out: {}'.format(
-                        out, pd_out
-                    ),
-                )
+        for dtype in self.dtypes:
+            x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
+            out = custom_relu_static_pe(self.custom_op, self.device, dtype, x)
+            pd_out = custom_relu_static_pe(
+                self.custom_op, self.device, dtype, x, False
+            )
+            np.testing.assert_allclose(
+                out,
+                pd_out,
+                atol=1e-2,
+                err_msg='custom op out: {},\n paddle api out: {}'.format(
+                    out, pd_out
+                ),
+            )
 
     def test_dynamic(self):
-        for device in self.devices:
-            for dtype in self.dtypes:
-                x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
-                out, x_grad = custom_relu_dynamic(
-                    self.custom_op, device, dtype, x
-                )
-                pd_out, pd_x_grad = custom_relu_dynamic(
-                    self.custom_op, device, dtype, x, False
-                )
-                np.testing.assert_array_equal(
-                    out,
-                    pd_out,
-                    err_msg='custom op out: {},\n paddle api out: {}'.format(
-                        out, pd_out
-                    ),
-                )
-                np.testing.assert_array_equal(
-                    x_grad,
-                    pd_x_grad,
-                    err_msg='custom op x grad: {},\n paddle api x grad: {}'.format(
-                        x_grad, pd_x_grad
-                    ),
-                )
+        for dtype in self.dtypes:
+            x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
+            out, x_grad = custom_relu_dynamic(
+                self.custom_op, self.device, dtype, x
+            )
+            pd_out, pd_x_grad = custom_relu_dynamic(
+                self.custom_op, self.device, dtype, x, False
+            )
+            np.testing.assert_array_equal(
+                out,
+                pd_out,
+                err_msg='custom op out: {},\n paddle api out: {}'.format(
+                    out, pd_out
+                ),
+            )
+            np.testing.assert_array_equal(
+                x_grad,
+                pd_x_grad,
+                err_msg='custom op x grad: {},\n paddle api x grad: {}'.format(
+                    x_grad, pd_x_grad
+                ),
+            )
 
     def test_static_save_and_load_inference_model(self):
         paddle.enable_static()
         np_data = np.random.random((1, 1, 28, 28)).astype("float32")
         np_label = np.random.random((1, 1)).astype("int64")
         path_prefix = "self.custom_op_inference/custom_relu"
-        for device in self.devices:
-            predict = custom_relu_static_inference(
-                self.custom_op, device, np_data, np_label, path_prefix
+
+        predict = custom_relu_static_inference(
+            self.custom_op, self.device, np_data, np_label, path_prefix
+        )
+        # load inference model
+        with static.scope_guard(static.Scope()):
+            exe = static.Executor()
+            [
+                inference_program,
+                feed_target_names,
+                fetch_targets,
+            ] = static.load_inference_model(path_prefix, exe)
+            predict_infer = exe.run(
+                inference_program,
+                feed={feed_target_names[0]: np_data},
+                fetch_list=fetch_targets,
+            )
+            np.testing.assert_allclose(
+                predict,
+                predict_infer,
+                atol=1e-2,
+                err_msg='custom op predict: {},\n custom op infer predict: {}'.format(
+                    predict, predict_infer
+                ),
             )
-            # load inference model
-            with static.scope_guard(static.Scope()):
-                exe = static.Executor()
-                [
-                    inference_program,
-                    feed_target_names,
-                    fetch_targets,
-                ] = static.load_inference_model(path_prefix, exe)
-                predict_infer = exe.run(
-                    inference_program,
-                    feed={feed_target_names[0]: np_data},
-                    fetch_list=fetch_targets,
-                )
-                np.testing.assert_array_equal(
-                    predict,
-                    predict_infer,
-                    err_msg='custom op predict: {},\n custom op infer predict: {}'.format(
-                        predict, predict_infer
-                    ),
-                )
         paddle.disable_static()
 
     def test_static_save_and_run_inference_predictor(self):
@@ -294,92 +296,97 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase):
         path_prefix = "self.custom_op_inference/custom_relu"
         from paddle.inference import Config, create_predictor
 
-        for device in self.devices:
-            predict = custom_relu_static_inference(
-                self.custom_op, device, np_data, np_label, path_prefix
-            )
-            # load inference model
-            config = Config(
-                path_prefix + ".pdmodel", path_prefix + ".pdiparams"
+        predict = custom_relu_static_inference(
+            self.custom_op, self.device, np_data, np_label, path_prefix
+        )
+        # load inference model
+        config = Config(path_prefix + ".pdmodel", path_prefix + ".pdiparams")
+        predictor = create_predictor(config)
+        input_tensor = predictor.get_input_handle(
+            predictor.get_input_names()[0]
+        )
+        input_tensor.reshape(np_data.shape)
+        input_tensor.copy_from_cpu(np_data.copy())
+        predictor.run()
+        output_tensor = predictor.get_output_handle(
+            predictor.get_output_names()[0]
+        )
+        predict_infer = output_tensor.copy_to_cpu()
+        predict = np.array(predict).flatten()
+        predict_infer = np.array(predict_infer).flatten()
+        np.testing.assert_allclose(
+            predict,
+            predict_infer,
+            rtol=5e-5,
+            atol=1e-2,
+            err_msg="custom op predict: {},\n custom op infer predict: {}".format(
+                predict, predict_infer
+            ),
+        )
+        paddle.disable_static()
+
+    def test_func_double_grad_dynamic(self):
+        for dtype in self.dtypes:
+            x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
+            out, dx_grad = custom_relu_double_grad_dynamic(
+                self.custom_op, self.device, dtype, x
             )
-            predictor = create_predictor(config)
-            input_tensor = predictor.get_input_handle(
-                predictor.get_input_names()[0]
+            pd_out, pd_dx_grad = custom_relu_double_grad_dynamic(
+                self.custom_op, self.device, dtype, x, False
             )
-            input_tensor.reshape(np_data.shape)
-            input_tensor.copy_from_cpu(np_data.copy())
-            predictor.run()
-            output_tensor = predictor.get_output_handle(
-                predictor.get_output_names()[0]
+            np.testing.assert_array_equal(
+                out,
+                pd_out,
+                err_msg='custom op out: {},\n paddle api out: {}'.format(
+                    out, pd_out
+                ),
             )
-            predict_infer = output_tensor.copy_to_cpu()
-            self.assertTrue(
-                np.isclose(predict, predict_infer, rtol=5e-5).any(),
-                "custom op predict: {},\n custom op infer predict: {}".format(
-                    predict, predict_infer
+            np.testing.assert_array_equal(
+                dx_grad,
+                pd_dx_grad,
+                err_msg='custom op dx grad: {},\n paddle api dx grad: {}'.format(
+                    dx_grad, pd_dx_grad
                 ),
             )
-        paddle.disable_static()
-
-    def test_func_double_grad_dynamic(self):
-        for device in self.devices:
-            for dtype in self.dtypes:
-                x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
-                out, dx_grad = custom_relu_double_grad_dynamic(
-                    self.custom_op, device, dtype, x
-                )
-                pd_out, pd_dx_grad = custom_relu_double_grad_dynamic(
-                    self.custom_op, device, dtype, x, False
-                )
-                np.testing.assert_array_equal(
-                    out,
-                    pd_out,
-                    err_msg='custom op out: {},\n paddle api out: {}'.format(
-                        out, pd_out
-                    ),
-                )
-                np.testing.assert_array_equal(
-                    dx_grad,
-                    pd_dx_grad,
-                    err_msg='custom op dx grad: {},\n paddle api dx grad: {}'.format(
-                        dx_grad, pd_dx_grad
-                    ),
-                )
 
     def test_with_dataloader(self):
         paddle.disable_static()
-        for device in self.devices:
-            paddle.set_device(device)
-            # data loader
-            transform = Compose(
-                [Normalize(mean=[127.5], std=[127.5], data_format='CHW')]
-            )
-            train_dataset = paddle.vision.datasets.MNIST(
-                mode='train', transform=transform
-            )
-            train_loader = paddle.io.DataLoader(
-                train_dataset,
-                batch_size=64,
-                shuffle=True,
-                drop_last=True,
-                num_workers=0,
-            )
+        paddle.set_device(self.device)
+        # data loader
+        transform = Compose(
+            [Normalize(mean=[127.5], std=[127.5], data_format='CHW')]
+        )
+        train_dataset = paddle.vision.datasets.MNIST(
+            mode='train', transform=transform
+        )
+        train_loader = paddle.io.DataLoader(
+            train_dataset,
+            batch_size=64,
+            shuffle=True,
+            drop_last=True,
+            num_workers=0,
+        )
 
-            for batch_id, (image, _) in enumerate(train_loader()):
-                out = self.custom_op(image)
-                pd_out = paddle.nn.functional.relu(image)
-                np.testing.assert_array_equal(
-                    out,
-                    pd_out,
-                    err_msg='custom op out: {},\n paddle api out: {}'.format(
-                        out, pd_out
-                    ),
-                )
+        for batch_id, (image, _) in enumerate(train_loader()):
+            out = self.custom_op(image)
+            pd_out = paddle.nn.functional.relu(image)
+            np.testing.assert_allclose(
+                out,
+                pd_out,
+                atol=1e-2,
+                err_msg='custom op out: {},\n paddle api out: {}'.format(
+                    out, pd_out
+                ),
+            )
 
-                if batch_id == 5:
-                    break
+            if batch_id == 5:
+                break
         paddle.enable_static()
 
 
 if __name__ == '__main__':
+    # compile, install the custom op egg into site-packages under background
+    # Currently custom XPU op does not support Windows
+    if os.name == 'nt':
+        exit()
     unittest.main()
diff --git a/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt b/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt
index 1dd6ef6776750c01fa78b6e6a269fea0df63f33d..00eef2d5a77316dcb3918ff32dde55b4fe9a1c73 100644
--- a/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt
+++ b/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt
@@ -28,4 +28,5 @@ if(WITH_CUSTOM_DEVICE AND NOT WITH_GPU)
   set_tests_properties(test_custom_cpu_profiler_plugin PROPERTIES TIMEOUT 120)
   set_tests_properties(test_fleet_launch_custom_device PROPERTIES TIMEOUT 120)
   set_tests_properties(test_custom_cpu_to_static PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_custom_device_relu_setup PROPERTIES TIMEOUT 120)
 endif()
diff --git a/python/paddle/fluid/tests/custom_runtime/custom_relu_op.cc b/python/paddle/fluid/tests/custom_runtime/custom_relu_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..da0563ffeb10e3762dc874676ffc9402d0529bc7
--- /dev/null
+++ b/python/paddle/fluid/tests/custom_runtime/custom_relu_op.cc
@@ -0,0 +1,193 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <iostream>
+#include <vector>
+
+#include "paddle/extension.h"
+
+#define CHECK_CPU_INPUT(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.")
+#define CHECK_CUSTOM_INPUT(x) \
+  PD_CHECK(x.is_custom_device(), #x " must be a custom Tensor.")
+
+template <typename data_t>
+void relu_cpu_forward_kernel(const data_t* x_data,
+                             data_t* out_data,
+                             int64_t x_numel) {
+  PD_CHECK(x_data != nullptr, "x_data is nullptr.");
+  PD_CHECK(out_data != nullptr, "out_data is nullptr.");
+  for (int64_t i = 0; i < x_numel; ++i) {
+    out_data[i] = std::max(static_cast<data_t>(0.), x_data[i]);
+  }
+}
+
+template <typename data_t>
+void relu_cpu_backward_kernel(const data_t* grad_out_data,
+                              const data_t* out_data,
+                              data_t* grad_x_data,
+                              int64_t out_numel) {
+  for (int64_t i = 0; i < out_numel; ++i) {
+    grad_x_data[i] =
+        grad_out_data[i] * (out_data[i] > static_cast<data_t>(0) ? 1. : 0.);
+  }
+}
+
+template <typename data_t>
+void relu_cpu_double_backward_kernel(const data_t* out_data,
+                                     const data_t* ddx_data,
+                                     data_t* ddout_data,
+                                     int64_t ddout_numel) {
+  for (int64_t i = 0; i < ddout_numel; ++i) {
+    ddout_data[i] =
+        ddx_data[i] * (out_data[i] > static_cast<data_t>(0) ? 1. : 0.);
+  }
+}
+
+std::vector<paddle::Tensor> relu_cpu_forward(const paddle::Tensor& x) {
+  CHECK_CPU_INPUT(x);
+  auto out = paddle::empty_like(x);
+
+  PD_DISPATCH_FLOATING_TYPES(
+      x.type(), "relu_cpu_forward", ([&] {
+        relu_cpu_forward_kernel<data_t>(
+            x.data<data_t>(), out.data<data_t>(), x.numel());
+      }));
+
+  return {out};
+}
+
+std::vector<paddle::Tensor> relu_cpu_backward(const paddle::Tensor& x,
+                                              const paddle::Tensor& out,
+                                              const paddle::Tensor& grad_out) {
+  auto grad_x = paddle::empty_like(x);
+
+  PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_backward", ([&] {
+                               relu_cpu_backward_kernel<data_t>(
+                                   grad_out.data<data_t>(),
+                                   out.data<data_t>(),
+                                   grad_x.data<data_t>(),
+                                   out.size());
+                             }));
+
+  return {grad_x};
+}
+
+std::vector<paddle::Tensor> relu_cpu_double_backward(
+    const paddle::Tensor& out, const paddle::Tensor& ddx) {
+  CHECK_CPU_INPUT(out);
+  CHECK_CPU_INPUT(ddx);
+  auto ddout = paddle::empty(out.shape(), out.dtype(), out.place());
+
+  PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_double_backward", ([&] {
+                               relu_cpu_double_backward_kernel<data_t>(
+                                   out.data<data_t>(),
+                                   ddx.data<data_t>(),
+                                   ddout.mutable_data<data_t>(out.place()),
+                                   ddout.size());
+                             }));
+
+  return {ddout};
+}
+
+std::vector<paddle::Tensor> relu_custom_forward(const paddle::Tensor& x) {
+  CHECK_CUSTOM_INPUT(x);
+  auto out = paddle::relu(x);
+  return {out};
+}
+
+std::vector<paddle::Tensor> relu_custom_backward(
+    const paddle::Tensor& x,
+    const paddle::Tensor& out,
+    const paddle::Tensor& grad_out) {
+  CHECK_CUSTOM_INPUT(x);
+  CHECK_CUSTOM_INPUT(out);
+  auto grad_x = paddle::empty_like(x, x.dtype(), x.place());
+  auto ones = paddle::experimental::full_like(x, 1.0, x.dtype(), x.place());
+  auto zeros = paddle::experimental::full_like(x, 0.0, x.dtype(), x.place());
+  auto condition = paddle::experimental::greater_than(x, zeros);
+
+  grad_x = paddle::multiply(grad_out, paddle::where(condition, ones, zeros));
+
+  return {grad_x};
+}
+
+std::vector<paddle::Tensor> relu_custom_double_backward(
+    const paddle::Tensor& out, const paddle::Tensor& ddx) {
+  CHECK_CUSTOM_INPUT(out);
+  auto ddout = paddle::empty(out.shape(), out.dtype(), out.place());
+  auto ones =
+      paddle::experimental::full_like(out, 1.0, out.dtype(), out.place());
+  auto zeros =
+      paddle::experimental::full_like(out, 0.0, out.dtype(), out.place());
+  auto condition = paddle::experimental::greater_than(out, zeros);
+
+  ddout = paddle::multiply(ddx, paddle::where(condition, ones, zeros));
+
+  return {ddout};
+}
+
+std::vector<paddle::Tensor> ReluForward(const paddle::Tensor& x) {
+  if (x.is_cpu()) {
+    return relu_cpu_forward(x);
+  } else if (x.is_custom_device()) {
+    return relu_custom_forward(x);
+  } else {
+    PD_THROW("Not implemented.");
+  }
+}
+
+std::vector<paddle::Tensor> ReluBackward(const paddle::Tensor& x,
+                                         const paddle::Tensor& out,
+                                         const paddle::Tensor& grad_out) {
+  if (x.is_cpu()) {
+    return relu_cpu_backward(x, out, grad_out);
+  } else if (x.is_custom_device()) {
+    return relu_custom_backward(x, out, grad_out);
+  } else {
+    PD_THROW("Not implemented.");
+  }
+}
+
+std::vector<paddle::Tensor> ReluDoubleBackward(const paddle::Tensor& out,
+                                               const paddle::Tensor& ddx) {
+  if (out.is_cpu()) {
+    return relu_cpu_double_backward(out, ddx);
+  } else if (out.is_custom_device()) {
+    return relu_custom_double_backward(out, ddx);
+  } else {
+    PD_THROW("Not implemented.");
+  }
+}
+
+std::vector<std::vector<int64_t>> ReluDoubleBackwardInferShape(
+    const std::vector<int64_t>& out_shape,
+    const std::vector<int64_t>& ddx_shape) {
+  return {out_shape};
+}
+
+PD_BUILD_OP(custom_relu)
+    .Inputs({"X"})
+    .Outputs({"Out"})
+    .SetKernelFn(PD_KERNEL(ReluForward));
+
+PD_BUILD_GRAD_OP(custom_relu)
+    .Inputs({"X", "Out", paddle::Grad("Out")})
+    .Outputs({paddle::Grad("X")})
+    .SetKernelFn(PD_KERNEL(ReluBackward));
+
+PD_BUILD_DOUBLE_GRAD_OP(custom_relu)
+    .Inputs({"Out", paddle::Grad(paddle::Grad("X"))})
+    .Outputs({paddle::Grad(paddle::Grad("Out"))})
+    .SetKernelFn(PD_KERNEL(ReluDoubleBackward))
+    .SetInferShapeFn(PD_INFER_SHAPE(ReluDoubleBackwardInferShape));
diff --git a/python/paddle/fluid/tests/custom_runtime/test_custom_device_relu_setup.py b/python/paddle/fluid/tests/custom_runtime/test_custom_device_relu_setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..760ad56cc3380e4d5b53fd65e07638e14d5859f5
--- /dev/null
+++ b/python/paddle/fluid/tests/custom_runtime/test_custom_device_relu_setup.py
@@ -0,0 +1,325 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+import tempfile
+import unittest
+from site import getsitepackages
+
+import numpy as np
+
+
+def custom_relu_dynamic(func, device, dtype, np_x, use_func=True):
+    import paddle
+
+    paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
+    paddle.set_device(device)
+
+    t = paddle.to_tensor(np_x, dtype=dtype)
+    t.stop_gradient = False
+    sys.stdout.flush()
+
+    out = func(t) if use_func else paddle.nn.functional.relu(t)
+    out.stop_gradient = False
+
+    out.backward()
+
+    paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
+    if t.grad is None:
+        return out.numpy(), t.grad
+    else:
+        return out.numpy(), t.grad.numpy()
+
+
+def custom_relu_static(func, device, dtype, np_x, use_func=True):
+    import paddle
+    import paddle.static as static
+
+    paddle.enable_static()
+    paddle.set_device(device)
+
+    with static.scope_guard(static.Scope()):
+        with static.program_guard(static.Program()):
+            x = static.data(name="X", shape=[None, 8], dtype=dtype)
+            x.stop_gradient = False
+            out = func(x) if use_func else paddle.nn.functional.relu(x)
+            static.append_backward(out)
+
+            exe = static.Executor()
+            exe.run(static.default_startup_program())
+            # in static mode, x data has been covered by out
+            out_v = exe.run(
+                static.default_main_program(),
+                feed={"X": np_x},
+                fetch_list=[out.name],
+            )
+
+    paddle.disable_static()
+    return out_v
+
+
+def custom_relu_static_pe(func, device, dtype, np_x, use_func=True):
+    import paddle
+    import paddle.static as static
+
+    paddle.enable_static()
+    paddle.set_device(device)
+
+    places = paddle.CustomPlace("custom_cpu", 0)
+
+    with static.scope_guard(static.Scope()):
+        with static.program_guard(static.Program()):
+            x = static.data(name="X", shape=[None, 8], dtype=dtype)
+            x.stop_gradient = False
+            out = func(x) if use_func else paddle.nn.functional.relu(x)
+            static.append_backward(out)
+
+            exe = static.Executor()
+            exe.run(static.default_startup_program())
+
+            # in static mode, x data has been covered by out
+            compiled_prog = static.CompiledProgram(
+                static.default_main_program()
+            ).with_data_parallel(loss_name=out.name, places=places)
+            out_v = exe.run(
+                compiled_prog, feed={"X": np_x}, fetch_list=[out.name]
+            )
+
+    paddle.disable_static()
+    return out_v
+
+
+def custom_relu_double_grad_dynamic(func, device, dtype, np_x, use_func=True):
+    import paddle
+
+    paddle.set_device(device)
+    paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
+
+    t = paddle.to_tensor(np_x, dtype=dtype, stop_gradient=False)
+
+    out = func(t) if use_func else paddle.nn.functional.relu(t)
+    dx = paddle.grad(
+        outputs=out,
+        inputs=t,
+        grad_outputs=paddle.ones_like(t),
+        create_graph=True,
+        retain_graph=True,
+    )
+
+    ddout = paddle.grad(
+        outputs=dx[0],
+        inputs=out.grad,
+        grad_outputs=paddle.ones_like(t),
+        create_graph=False,
+    )
+
+    paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
+    assert ddout[0].numpy() is not None
+    return dx[0].numpy(), ddout[0].numpy()
+
+
+class TestNewCustomOpSetUpInstall(unittest.TestCase):
+    def setUp(self):
+        # compile so and set to current path
+        self.cur_dir = os.path.dirname(os.path.abspath(__file__))
+        self.temp_dir = tempfile.TemporaryDirectory()
+        cmd = 'cd {} \
+            && git clone {} \
+            && cd PaddleCustomDevice \
+            && git fetch origin \
+            && git checkout {} -b dev \
+            && cd backends/custom_cpu \
+            && mkdir build && cd build && cmake .. && make -j8 \
+            && cd {}'.format(
+            self.temp_dir.name,
+            os.getenv('PLUGIN_URL'),
+            os.getenv('PLUGIN_TAG'),
+            self.cur_dir,
+        )
+        os.system(cmd)
+
+        # set environment for loading and registering compiled custom kernels
+        # only valid in current process
+        os.environ['CUSTOM_DEVICE_ROOT'] = os.path.join(
+            self.cur_dir,
+            '{}/PaddleCustomDevice/backends/custom_cpu/build'.format(
+                self.temp_dir.name
+            ),
+        )
+
+        # `import paddle` loads custom_cpu.so, hence we must import paddle after finishing build PaddleCustomDevice
+        import paddle
+
+        # [Why specific paddle_includes directory?]
+        # Add paddle_includes to pass CI, for more details,
+        # please refer to the comments in `paddle/fluid/tests/custom_op/utils.py``
+        paddle_includes = []
+        for site_packages_path in getsitepackages():
+            paddle_includes.append(
+                os.path.join(site_packages_path, 'paddle', 'include')
+            )
+            paddle_includes.append(
+                os.path.join(
+                    site_packages_path, 'paddle', 'include', 'third_party'
+                )
+            )
+
+        custom_module = paddle.utils.cpp_extension.load(
+            name='custom_device_relu',
+            sources=['custom_relu_op.cc'],
+            extra_include_paths=paddle_includes,  # add for Coverage CI
+            extra_cxx_cflags=["-w", "-g"],  # test for cc flags
+            # build_directory=self.cur_dir,
+            verbose=True,
+        )
+        self.custom_op = custom_module.custom_relu
+
+        self.dtypes = ["float32", "float64"]
+        self.device = "custom_cpu"
+
+        # config seed
+        SEED = 2021
+        paddle.seed(SEED)
+        paddle.framework.random._manual_program_seed(SEED)
+
+    def tearDown(self):
+        self.temp_dir.cleanup()
+        del os.environ['CUSTOM_DEVICE_ROOT']
+
+    def test_custom_device(self):
+        self._test_static()
+        self._test_static_pe()
+        self._test_dynamic()
+        self._test_double_grad_dynamic()
+        self._test_with_dataloader()
+
+    def _test_static(self):
+        for dtype in self.dtypes:
+            x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
+            out = custom_relu_static(self.custom_op, self.device, dtype, x)
+            pd_out = custom_relu_static(
+                self.custom_op, self.device, dtype, x, False
+            )
+            np.testing.assert_array_equal(
+                out,
+                pd_out,
+                err_msg="custom op out: {},\n paddle api out: {}".format(
+                    out, pd_out
+                ),
+            )
+
+    def _test_static_pe(self):
+        for dtype in self.dtypes:
+            x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
+            out = custom_relu_static_pe(self.custom_op, self.device, dtype, x)
+            pd_out = custom_relu_static_pe(
+                self.custom_op, self.device, dtype, x, False
+            )
+            np.testing.assert_array_equal(
+                out,
+                pd_out,
+                err_msg="custom op out: {},\n paddle api out: {}".format(
+                    out, pd_out
+                ),
+            )
+
+    def _test_dynamic(self):
+        for dtype in self.dtypes:
+            x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
+            out, x_grad = custom_relu_dynamic(
+                self.custom_op, self.device, dtype, x
+            )
+            pd_out, pd_x_grad = custom_relu_dynamic(
+                self.custom_op, self.device, dtype, x, False
+            )
+            np.testing.assert_array_equal(
+                out,
+                pd_out,
+                err_msg="custom op out: {},\n paddle api out: {}".format(
+                    out, pd_out
+                ),
+            )
+            np.testing.assert_array_equal(
+                x_grad,
+                pd_x_grad,
+                err_msg="custom op x grad: {},\n paddle api x grad: {}".format(
+                    x_grad, pd_x_grad
+                ),
+            )
+
+    def _test_double_grad_dynamic(self):
+        for dtype in self.dtypes:
+            x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
+            out, dx_grad = custom_relu_double_grad_dynamic(
+                self.custom_op, self.device, dtype, x
+            )
+            pd_out, pd_dx_grad = custom_relu_double_grad_dynamic(
+                self.custom_op, self.device, dtype, x, False
+            )
+            np.testing.assert_array_equal(
+                out,
+                pd_out,
+                err_msg="custom op out: {},\n paddle api out: {}".format(
+                    out, pd_out
+                ),
+            )
+            np.testing.assert_array_equal(
+                dx_grad,
+                pd_dx_grad,
+                err_msg="custom op dx grad: {},\n paddle api dx grad: {}".format(
+                    dx_grad, pd_dx_grad
+                ),
+            )
+
+    def _test_with_dataloader(self):
+        import paddle
+        from paddle.vision.transforms import Compose, Normalize
+
+        paddle.set_device(self.device)
+        # data loader
+        transform = Compose(
+            [Normalize(mean=[127.5], std=[127.5], data_format="CHW")]
+        )
+        train_dataset = paddle.vision.datasets.MNIST(
+            mode="train", transform=transform
+        )
+        train_loader = paddle.io.DataLoader(
+            train_dataset,
+            batch_size=64,
+            shuffle=True,
+            drop_last=True,
+            num_workers=0,
+        )
+
+        for batch_id, (image, _) in enumerate(train_loader()):
+            out = self.custom_op(image)
+            pd_out = paddle.nn.functional.relu(image)
+            np.testing.assert_array_equal(
+                out,
+                pd_out,
+                err_msg="custom op out: {},\n paddle api out: {}".format(
+                    out, pd_out
+                ),
+            )
+
+            if batch_id == 5:
+                break
+
+
+if __name__ == "__main__":
+    if os.name == 'nt' or sys.platform.startswith('darwin'):
+        # only support Linux now
+        exit()
+    unittest.main()
diff --git a/python/paddle/fluid/tests/test_error_clip.py b/python/paddle/fluid/tests/test_error_clip.py
index dcfe477a76b3e24ec8df2e02d3fe07121f16d9cf..65483d1c6adf68dba55e43180e9993d712193811 100644
--- a/python/paddle/fluid/tests/test_error_clip.py
+++ b/python/paddle/fluid/tests/test_error_clip.py
@@ -38,13 +38,13 @@ with fluid.program_guard(main_program=prog):
 
 prog_clip = prog.clone()
 prog_clip.block(0).var(hidden1.name)._set_error_clip(
-    fluid.clip.ErrorClipByValue(max=CLIP_MAX, min=CLIP_MIN)
+    paddle.nn.clip.ErrorClipByValue(max=CLIP_MAX, min=CLIP_MIN)
 )
 
 avg_cost_clip = prog_clip.block(0).var(avg_cost.name)
 fluid.backward.append_backward(loss=avg_cost)
 fluid.backward.append_backward(
-    loss=avg_cost_clip, callbacks=[fluid.clip.error_clip_callback]
+    loss=avg_cost_clip, callbacks=[paddle.nn.clip.error_clip_callback]
 )
 
 hidden1_grad = prog.block(0).var(hidden1.name + "@GRAD")
diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist.py b/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist.py
index f974709ce87abec8679b3846746bbe087e495778..f97faed1d584fce94d8715323e525fea7ac57d49 100644
--- a/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist.py
+++ b/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist.py
@@ -122,7 +122,7 @@ class TestDistMnist2x2(TestDistRunnerBase):
 
         opt = paddle.optimizer.AdamW(
             learning_rate=lr_val,
-            grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0),
+            grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0),
         )
 
         acc_steps = 2  # accumulated steps for pipeline
diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_multi_device.py b/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_multi_device.py
index ecc71abe6252cd864f997a0059837efc73a66990..170243fc962839f063a0aafc39adef62fc0d4737 100644
--- a/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_multi_device.py
+++ b/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_multi_device.py
@@ -122,7 +122,7 @@ class TestDistMnist2x2(TestDistRunnerBase):
         opt = fluid.optimizer.Momentum(
             learning_rate=lr_val,
             momentum=0.9,
-            grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0),
+            grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0),
         )
 
         acc_steps = 2  # accumulated steps for pipeline
diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_dgc_optimizer.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_dgc_optimizer.py
index c7b44fe305d25aa2cac4fd5f4f8ffda56b479940..0d499393f12155aa1d0b73af9f45e2f98a0d2f56 100644
--- a/python/paddle/fluid/tests/unittests/collective/fleet/test_dgc_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_dgc_optimizer.py
@@ -15,10 +15,10 @@
 import unittest
 
 import paddle
-import paddle.fluid.clip as clip
 import paddle.fluid.framework as framework
 import paddle.fluid.optimizer as optimizer
 import paddle.fluid.regularizer as regularizer
+import paddle.nn.clip as clip
 
 paddle.enable_static()
 
@@ -76,7 +76,7 @@ class TestDGCMomentumOptimizer(unittest.TestCase):
             rampup_begin_step=0,
             num_trainers=2,
             regularization=regularization,
-            grad_clip=clip.GradientClipByNorm(1.0),
+            grad_clip=clip.ClipGradByNorm(1.0),
         )
 
         if use_recompute:
@@ -144,14 +144,14 @@ class TestDGCMomentumOptimizer(unittest.TestCase):
                 print("dgc regular_coeff=" + str(coeff))
 
     def test_tpyeError(self):
-        # the type of DGCMomentumOptimizer(grad_clip=) must be 'GradientClipByNorm'
+        # the type of DGCMomentumOptimizer(grad_clip=) must be 'ClipGradByNorm'
         with self.assertRaises(TypeError):
             dgc_momentum_optimizer = self.MockDGCMomentum(
                 learning_rate=0.01,
                 momentum=0.2,
                 rampup_begin_step=0,
                 num_trainers=2,
-                grad_clip=clip.GradientClipByGlobalNorm(1.0),
+                grad_clip=clip.ClipGradByGlobalNorm(1.0),
             )
 
     def test_momentum_without_dgc(self):
diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_hybrid_meta_optimizer.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_hybrid_meta_optimizer.py
index eee1235670805f8d66b8206bbdd954129adfba97..0982ab86117c9f1302bb604737ec143902963725 100755
--- a/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_hybrid_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_hybrid_meta_optimizer.py
@@ -354,7 +354,7 @@ class TestFleetHybridOptimizer(TestFleetMetaOptimizer):
         }
         strategy.fuse_all_reduce_ops = True
         strategy.fuse_grad_size_in_MB = 32
-        clip = paddle.fluid.clip.GradientClipByGlobalNorm(1.0)
+        clip = paddle.nn.ClipGradByGlobalNorm(1.0)
 
         self.optimizer(
             avg_cost, strategy, train_prog, startup_prog, grad_clip=clip
@@ -552,7 +552,7 @@ class TestFleetHybridOptimizer(TestFleetMetaOptimizer):
         strategy.fuse_all_reduce_ops = True
         strategy.fuse_grad_size_in_MB = 32
         strategy.fuse_grad_merge = True
-        clip = paddle.fluid.clip.GradientClipByGlobalNorm(1.0)
+        clip = paddle.nn.ClipGradByGlobalNorm(1.0)
 
         self.optimizer(
             avg_cost, strategy, train_prog, startup_prog, grad_clip=clip
@@ -940,7 +940,7 @@ class TestFleetHybridOptimizerBoundary(TestFleetMetaOptimizer):
         }
         strategy.fuse_all_reduce_ops = True
         strategy.fuse_grad_size_in_MB = 32
-        clip = paddle.fluid.clip.GradientClipByGlobalNorm(1.0)
+        clip = paddle.nn.ClipGradByGlobalNorm(1.0)
 
         self.optimizer(
             avg_cost, strategy, train_prog, startup_prog, grad_clip=clip
@@ -1044,7 +1044,7 @@ class TestFleetHybridOptimizerBoundary(TestFleetMetaOptimizer):
         }
         strategy.fuse_all_reduce_ops = True
         strategy.fuse_grad_size_in_MB = 32
-        clip = paddle.fluid.clip.GradientClipByGlobalNorm(1.0)
+        clip = paddle.nn.ClipGradByGlobalNorm(1.0)
 
         self.optimizer(
             avg_cost, strategy, train_prog, startup_prog, grad_clip=clip
diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_sharding_meta_optimizer.py
index d59c074c03f11dd5ce9acc635216a417e7437f07..46b5fe9ed4b6a641d21d42a0cf1d730314f5a964 100755
--- a/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_sharding_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_sharding_meta_optimizer.py
@@ -640,7 +640,7 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer):
         )
         avg_cost, strategy = self.net(train_prog, startup_prog)
         self.set_strategy(strategy, 'sharding')
-        clip = paddle.fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
+        clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
         self.optimizer(
             avg_cost, strategy, train_prog, startup_prog, grad_clip=clip
         )
@@ -1309,7 +1309,7 @@ class TestFleetShardingHybridOptimizer(TestFleetMetaOptimizer):
             "micro_batch_size": 2,
             "accumulate_steps": 4,
         }
-        clip = paddle.fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
+        clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
         self.optimizer(
             avg_cost, strategy, train_prog, startup_prog, grad_clip=clip
         )
@@ -1547,7 +1547,7 @@ class TestFleetShardingHybridOptimizer(TestFleetMetaOptimizer):
             "micro_batch_size": 2,
             "accumulate_steps": 4,
         }
-        clip = paddle.fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
+        clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
         self.optimizer(
             avg_cost,
             strategy,
diff --git a/python/paddle/fluid/tests/unittests/collective/process_group_nccl.py b/python/paddle/fluid/tests/unittests/collective/process_group_nccl.py
index 130510d90cb045fb77c11f703200df762a9232c9..3be3cfecf16d6ef3e19ef989b1065d592529eb90 100644
--- a/python/paddle/fluid/tests/unittests/collective/process_group_nccl.py
+++ b/python/paddle/fluid/tests/unittests/collective/process_group_nccl.py
@@ -488,6 +488,9 @@ class TestProcessGroupFp32(unittest.TestCase):
             task.wait()
 
         print("test reduce prod api ok")
+
+        test_reduce_with_zero_dim([], self.dtype, pg)
+
         # test Scatter
         # rank 0
         in_shape = list(self.shape)
@@ -601,5 +604,88 @@ class TestProcessGroupFp16(TestProcessGroupFp32):
         self.shape = (4, 20, 20)
 
 
+def test_reduce_with_zero_dim(shape, dtype, pg):
+    # test Reduce With Zero Dim
+    # rank 0
+    x = np.random.random(shape).astype(dtype)
+    y = np.random.random(shape).astype(dtype)
+    tensor_x = paddle.to_tensor(x)
+    tensor_y = paddle.to_tensor(y)
+    sum_result = tensor_x + tensor_y
+    if pg.rank() == 0:
+        task = dist.reduce(tensor_x, 0, sync_op=True)
+        paddle.device.cuda.synchronize()
+    # rank 1
+    else:
+        task = dist.reduce(tensor_y, 0, sync_op=False)
+        task.wait()
+        paddle.device.cuda.synchronize()
+    if pg.rank() == 0:
+        assert np.array_equal(tensor_x, sum_result) and len(tensor_x.shape) == 0
+    print("test reduce with zero dim sum api ok\n")
+
+    # test reduce with zero dim max
+    # rank 0
+    x = np.random.random(shape).astype(dtype)
+    tensor_x = paddle.to_tensor(x)
+    # rank 1
+    y = np.random.random(shape).astype(dtype)
+    tensor_y = paddle.to_tensor(y)
+
+    max_result = paddle.maximum(tensor_x, tensor_y)
+
+    if pg.rank() == 0:
+        task = dist.reduce(tensor_x, 0, dist.ReduceOp.MAX, sync_op=False)
+        task.wait()
+        assert np.array_equal(tensor_x, max_result) and len(tensor_x.shape) == 0
+    else:
+        task = dist.reduce(tensor_y, 0, dist.ReduceOp.MAX, sync_op=False)
+        task.wait()
+
+    print("test reduce with zero dim max api ok")
+
+    # test reduce with zero dim min
+    # rank 0
+    x = np.random.random(shape).astype(dtype)
+    tensor_x = paddle.to_tensor(x)
+    # rank 1
+    y = np.random.random(shape).astype(dtype)
+    tensor_y = paddle.to_tensor(y)
+
+    min_result = paddle.minimum(tensor_x, tensor_y)
+
+    if pg.rank() == 0:
+        task = dist.reduce(tensor_x, 0, dist.ReduceOp.MIN, sync_op=False)
+        task.wait()
+        assert np.array_equal(tensor_x, min_result) and len(tensor_x.shape) == 0
+    else:
+        task = dist.reduce(tensor_y, 0, dist.ReduceOp.MIN, sync_op=False)
+        task.wait()
+
+    print("test reduce with zero dim min api ok")
+
+    # test reduce with zero dim product
+    # rank 0
+    x = np.random.random(shape).astype(dtype)
+    tensor_x = paddle.to_tensor(x)
+    # rank 1
+    y = np.random.random(shape).astype(dtype)
+    tensor_y = paddle.to_tensor(y)
+
+    prod_result = np.multiply(x, y)
+
+    if pg.rank() == 0:
+        task = dist.reduce(tensor_x, 0, dist.ReduceOp.PROD, sync_op=False)
+        task.wait()
+        assert (
+            np.array_equal(tensor_x, prod_result) and len(tensor_x.shape) == 0
+        )
+    else:
+        task = dist.reduce(tensor_y, 0, dist.ReduceOp.PROD, sync_op=False)
+        task.wait()
+
+    print("test reduce with zero dim prod api ok")
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py b/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py
index ff9122b1191b64e36ddb40c93f9770d0d5135646..3fa9c12529272c495644508e947d63c6a3f973b2 100644
--- a/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py
+++ b/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py
@@ -22,8 +22,8 @@ import paddle
 import paddle.distributed.fleet as fleet
 import paddle.fluid.core as core
 from paddle.distributed.fleet.meta_optimizers.common import CollectiveHelper
-from paddle.fluid.clip import ClipGradBase, _clip_by_global_norm_using_mp_type
 from paddle.incubate import DistributedFusedLamb
+from paddle.nn.clip import ClipGradBase, _clip_by_global_norm_using_mp_type
 from paddle.vision.models import resnet18 as resnet
 
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_len.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_len.py
index de3508afcbe2bca43c4a5e762f4519a7e2e4c714..218e3ed4326ad5c0e9282b4dc0026464304ab363 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_len.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_len.py
@@ -19,6 +19,7 @@ import numpy as np
 import paddle
 import paddle.fluid as fluid
 from paddle.jit.dy2static import Call
+from paddle.nn import clip
 
 SEED = 2020
 np.random.seed(SEED)
@@ -89,11 +90,11 @@ def len_with_selected_rows(place):
         type=fluid.core.VarDesc.VarType.SELECTED_ROWS,
     )
     # y is Variable(SelectedRows)
-    y = fluid.layers.merge_selected_rows(var)
+    y = clip.merge_selected_rows(var)
     y_len = Call(len)(y)
 
     # z is inner tensor with shape [4, 2]
-    z = fluid.layers.get_tensor_from_selected_rows(y)
+    z = clip.get_tensor_from_selected_rows(y)
     z_len = Call(len)(z)
 
     # set data for selected_rows
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py
index fabfa8edc3c83aa85a2d9c60bcc6801b5c9a39bd..5c84da8e621be91b434f2926b236e17363f00b30 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py
@@ -441,5 +441,39 @@ class TestErrorInForLoop(TestTransformForLoop):
         self.dyfunc = for_loop_dyfunc_not_support
 
 
+class Net(paddle.nn.Layer):
+    def __init__(self):
+        super().__init__()
+
+        self.layer_dict = paddle.nn.LayerDict(
+            {
+                "conv1": paddle.nn.Conv2D(3, 3, 1),
+                "conv2": paddle.nn.Conv2D(3, 3, 1),
+                "conv3": paddle.nn.Conv2D(3, 3, 1),
+            }
+        )
+
+    def forward(self, x):
+        out = 0
+        for layer_name in self.layer_dict:
+            out += self.layer_dict[layer_name](x)
+        return out
+
+
+class TestForLoopMeetDict(unittest.TestCase):
+    def test_start(self):
+
+        net = Net()
+        model = paddle.jit.to_static(
+            net,
+            input_spec=[
+                paddle.static.InputSpec(
+                    shape=[None, 3, 224, 224], dtype='float32'
+                )
+            ],
+        )
+        paddle.jit.save(model, "./inference/inference")
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_seq2seq.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_seq2seq.py
index a0a45ddbde2bea07d67b7d40299e695b7ce11ff5..64d0d816ba0a5bf0a2e54d5096aeafb2f900f999 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_seq2seq.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_seq2seq.py
@@ -22,8 +22,8 @@ from seq2seq_dygraph_model import AttentionModel, BaseModel
 from seq2seq_utils import Seq2SeqModelHyperParams, get_data_iter
 
 import paddle.fluid as fluid
-from paddle.fluid.clip import GradientClipByGlobalNorm
 from paddle.jit import ProgramTranslator
+from paddle.nn import ClipGradByGlobalNorm
 
 place = (
     fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace()
@@ -71,7 +71,7 @@ def train(args, attn_model=False):
                 dropout=args.dropout,
             )
 
-        gloabl_norm_clip = GradientClipByGlobalNorm(args.max_grad_norm)
+        gloabl_norm_clip = ClipGradByGlobalNorm(args.max_grad_norm)
         optimizer = fluid.optimizer.SGD(
             args.learning_rate,
             parameter_list=model.parameters(),
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt
index 05d26dde6eddfafa703268cd8904a857487cd71d..cfc83bbcb52047bad575bd0a9911f274d68cadb2 100755
--- a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt
@@ -94,6 +94,19 @@ if(WITH_MKLDNN)
   endforeach()
 endif()
 
+# below are cutlass unitests
+file(
+  GLOB TEST_CUTLASS
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "test_cutlass_*.py")
+string(REPLACE ".py" "" TEST_CUTLASS "${TEST_CUTLASS}")
+list(REMOVE_ITEM TEST_INFERENCE_IR_PASSES ${TEST_CUTLASS})
+if(WITH_CUTLASS)
+  foreach(target ${TEST_CUTLASS})
+    py_test_modules(${target} MODULES ${target})
+  endforeach()
+endif()
+
 if(WITH_MKLDNN
    AND TENSORRT_FOUND
    AND WITH_GPU)
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py b/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py
index b561822f1af92f652f7c8a9851b2d2eee34330df..99450cae46f516ef5af647b667b77789cabd899d 100755
--- a/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py
@@ -74,6 +74,8 @@ class IgnoreReasons(enum.Enum):
     PASS_ACCURACY_ERROR = 2
     # Accuracy is abnormal after enabling mkldnn.
     MKLDNN_ACCURACY_ERROR = 3
+    # Accuracy is abnormal after enabling cutlass.
+    CUTLASS_ACCURACY_ERROR = 3
 
 
 # TODO(wilber): just for backward compatible
@@ -877,3 +879,96 @@ class TrtLayerAutoScanTest(AutoScanTest):
         note: str,
     ):
         self.ignore_cases.append((teller, reason, note))
+
+
+class CutlassAutoScanTest(AutoScanTest):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def run_test(self, quant=False, *args, **kwargs):
+        status = True
+
+        for prog_config in self.sample_program_configs(*args, **kwargs):
+            # if program is invalid, we should skip that cases.
+            if not self.is_program_valid(prog_config):
+                continue
+
+            model, params = create_fake_model(prog_config)
+            feed_data = {}
+            for name, tensor_config in prog_config.inputs.items():
+                feed_data[name] = {
+                    'data': tensor_config.data,
+                    'lod': tensor_config.lod,
+                }
+            results: List[Dict[str, np.ndarray]] = []
+
+            # baseline: gpu no ir_optim run
+            base_config = self.create_inference_config(
+                ir_optim=False, use_gpu=True
+            )
+            logging.info('RUN program_config: ' + str(prog_config))
+            results.append(
+                self.run_test_config(
+                    model, params, prog_config, base_config, feed_data
+                )
+            )
+            self.success_log('RUN_GPU_BASELINE done')
+
+            for pred_config, (atol, rtol) in self.sample_predictor_configs(
+                prog_config
+            ):
+                # skip info
+                ignore_flag = False
+                for ignore_info in self.ignore_cases:
+                    if ignore_info[0](prog_config, pred_config):
+                        ignore_flag = True
+                        if (
+                            ignore_info[1]
+                            == IgnoreReasons.CUTLASS_ACCURACY_ERROR
+                        ):
+                            self.ignore_log(
+                                "[CUTLASS_ACCURACY_ERROR] "
+                                + ignore_info[2]
+                                + ' '
+                                + ' vs '
+                                + self.inference_config_str(pred_config)
+                            )
+                        else:
+                            raise NotImplementedError
+                        break
+
+                if os.path.exists(self.cache_dir):
+                    shutil.rmtree(self.cache_dir)
+                if not os.path.exists(self.cache_dir):
+                    os.mkdir(self.cache_dir)
+
+                try:
+                    results.append(
+                        self.run_test_config(
+                            model, params, prog_config, pred_config, feed_data
+                        )
+                    )
+                    self.assert_tensors_near(
+                        atol, rtol, results[-1], results[0]
+                    )
+                except Exception as e:
+                    self.fail_log(
+                        self.inference_config_str(pred_config)
+                        + '\033[1;31m \nERROR INFO: {}\033[0m'.format(str(e))
+                    )
+                    if not ignore_flag:
+                        status = False
+                    continue
+                self.success_log(
+                    'RUN predictor_config '
+                    + self.inference_config_str(pred_config)
+                    + ' done'
+                )
+
+        self.assertTrue(status)
+
+    def inference_config_str(self, config) -> str:
+        dic = {}
+        enable_gpu = config.use_gpu()
+        dic['use_gpu'] = enable_gpu
+        return str(dic)
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_cutlass_conv2d_fusion_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_cutlass_conv2d_fusion_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..8adeff0f73ddf96ee78ff3d0631547e7259491c8
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_cutlass_conv2d_fusion_op.py
@@ -0,0 +1,306 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from functools import partial
+from itertools import product
+
+import numpy as np
+from auto_scan_test import CutlassAutoScanTest
+from program_config import ProgramConfig, TensorConfig
+
+import paddle.inference as paddle_infer
+
+
+# cba pattern
+class TestCutlassConv2dFusionOp1(CutlassAutoScanTest):
+    def sample_program_configs(self, *args, **kwargs):
+        def generate_input1(input_shape):
+            return np.random.random(input_shape).astype(np.float32)
+
+        def generate_weight(weight_shape):
+            return np.random.random(weight_shape).astype(np.float32)
+
+        def generate_bias(bias_shape):
+            return np.random.random(bias_shape).astype(np.float32)
+
+        input_shape_options = [[1, 16, 112, 112], [1, 8, 64, 64]]
+        weight_shape_options = [[24, -1, 3, 3]]
+        strides_options = [[1, 1], [2, 2]]
+        paddings_options = [[1, 1], [1, 0, 1, 2]]
+        groups_options = [1]
+        padding_algorithm_options = ['EXPLICIT']
+        dilations_options = [[2, 2], [1, 1]]
+        data_format_options = ['NCHW']
+        act_options = ['relu', 'leaky_relu', 'swish']
+
+        configurations = [
+            input_shape_options,
+            weight_shape_options,
+            strides_options,
+            paddings_options,
+            groups_options,
+            padding_algorithm_options,
+            dilations_options,
+            data_format_options,
+            act_options,
+        ]
+
+        for (
+            input_shape,
+            weight_shape,
+            strides,
+            paddings,
+            groups,
+            padding_algorithm,
+            dilations,
+            data_format,
+            act,
+        ) in product(*configurations):
+
+            weight_shape[1] = input_shape[1]
+            attrs = [
+                {
+                    "strides": strides,
+                    "paddings": paddings,
+                    "groups": groups,
+                    "padding_algorithm": padding_algorithm,
+                    "dilations": dilations,
+                    "data_format": data_format,
+                },
+                {"axis": 1},
+            ]
+
+            ops_config = [
+                {
+                    "op_type": "conv2d",
+                    "op_inputs": {
+                        "Input": ["input_data"],
+                        "Filter": ["conv2d_weight"],
+                    },
+                    "op_outputs": {"Output": ["conv_output_data"]},
+                    "op_attrs": attrs[0],
+                },
+                {
+                    "op_type": "elementwise_add",
+                    "op_inputs": {
+                        "X": ["conv_output_data"],
+                        "Y": ["elementwise_weight"],
+                    },
+                    "op_outputs": {"Out": ["output_data0"]},
+                    "op_attrs": attrs[1],
+                },
+                {
+                    "op_type": act,
+                    "op_inputs": {"X": ["output_data0"]},
+                    "op_outputs": {"Out": ["output_data1"]},
+                    "op_attrs": {},
+                },
+            ]
+
+            ops = self.generate_op_config(ops_config)
+
+            program_config = ProgramConfig(
+                ops=ops,
+                weights={
+                    "conv2d_weight": TensorConfig(
+                        data_gen=partial(generate_weight, weight_shape)
+                    ),
+                    "elementwise_weight": TensorConfig(
+                        data_gen=partial(generate_bias, [weight_shape[0]])
+                    ),
+                },
+                inputs={
+                    "input_data": TensorConfig(
+                        data_gen=partial(generate_input1, input_shape)
+                    )
+                },
+                outputs=["output_data1"],
+            )
+
+            yield program_config
+
+    def sample_predictor_configs(self, program_config):
+        config = self.create_inference_config(use_gpu=True)
+        config.enable_use_gpu(256, 0, paddle_infer.PrecisionType.Half)
+        config.exp_enable_use_cutlass()
+        yield config, (1e-2, 1e-2)
+
+    def test(self, *args, **kwargs):
+        self.run_test(quant=False, *args, **kwargs)
+
+
+# cbaa pattern
+class TestCutlassConv2dFusionOp2(CutlassAutoScanTest):
+    def sample_program_configs(self, *args, **kwargs):
+        def generate_input(input_shape):
+            return (np.random.random(input_shape) * 2 - 1).astype(np.float32)
+
+        def generate_weight(weight_shape):
+            return (np.random.random(weight_shape) * 2 - 1).astype(np.float32)
+
+        def generate_bias(bias_shape):
+            return np.random.random(bias_shape).astype(np.float32)
+
+        input_shape_options = [[1, 16, 112, 112], [1, 24, 64, 64]]
+        weight_shape_options = [[24, -1, 3, 3]]
+        strides_options = [[2, 2], [1, 1]]
+        paddings_options = [[1, 1]]
+        groups_options = [1]
+        padding_algorithm_options = ['EXPLICIT']
+        dilations_options = [[1, 1]]
+        data_format_options = ['NCHW']
+        act_options = ['relu']
+
+        configurations = [
+            input_shape_options,
+            weight_shape_options,
+            strides_options,
+            paddings_options,
+            groups_options,
+            padding_algorithm_options,
+            dilations_options,
+            data_format_options,
+            act_options,
+        ]
+
+        for (
+            input_shape,
+            weight_shape,
+            strides,
+            paddings,
+            groups,
+            padding_algorithm,
+            dilations,
+            data_format,
+            act,
+        ) in product(*configurations):
+            weight_shape[1] = input_shape[1]
+            residual_shape = list(input_shape)
+            residual_shape[1] = weight_shape[0]
+
+            ih = input_shape[2]
+            iw = input_shape[3]
+            pad_h0 = 0
+            pad_h1 = 0
+            pad_w0 = 0
+            pad_w1 = 0
+            if len(paddings) == 2:
+                pad_h0 = paddings[0]
+                pad_h1 = paddings[0]
+                pad_w0 = paddings[1]
+                pad_w1 = paddings[1]
+            elif len(paddings) == 4:
+                pad_h0 = paddings[0]
+                pad_h1 = paddings[1]
+                pad_w0 = paddings[2]
+                pad_w1 = paddings[3]
+            dilation_h = dilations[0]
+            dilation_w = dilations[1]
+            kh = weight_shape[2]
+            kw = weight_shape[3]
+            stride_h = strides[0]
+            stride_w = strides[1]
+            residual_shape[2] = (int)(
+                (ih + pad_h0 + pad_h1 - dilation_h * (kh - 1) - 1) / stride_h
+            ) + 1
+            residual_shape[3] = (int)(
+                (iw + pad_w0 + pad_w1 - dilation_w * (kw - 1) - 1) / stride_w
+            ) + 1
+
+            attrs = [
+                {
+                    "strides": strides,
+                    "paddings": paddings,
+                    "groups": groups,
+                    "padding_algorithm": padding_algorithm,
+                    "dilations": dilations,
+                    "data_format": data_format,
+                },
+                {"axis": 1},
+            ]
+
+            ops_config = [
+                {
+                    "op_type": "conv2d",
+                    "op_inputs": {
+                        "Input": ["input_data"],
+                        "Filter": ["conv2d_weight"],
+                    },
+                    "op_outputs": {"Output": ["conv_output_data"]},
+                    "op_attrs": attrs[0],
+                },
+                {
+                    "op_type": "elementwise_add",
+                    "op_inputs": {
+                        "X": ["conv_output_data"],
+                        "Y": ["elementwise_weight"],
+                    },
+                    "op_outputs": {"Out": ["output_data0"]},
+                    "op_attrs": attrs[1],
+                },
+                {
+                    "op_type": "elementwise_add",
+                    "op_inputs": {
+                        "X": ["residual_data"],
+                        "Y": ["output_data0"],
+                    },
+                    "op_outputs": {"Out": ["output_data1"]},
+                    "op_attrs": {},
+                },
+                {
+                    "op_type": act,
+                    "op_inputs": {"X": ["output_data1"]},
+                    "op_outputs": {"Out": ["output_data2"]},
+                    "op_attrs": {},
+                },
+            ]
+
+            ops = self.generate_op_config(ops_config)
+
+            program_config = ProgramConfig(
+                ops=ops,
+                weights={
+                    "conv2d_weight": TensorConfig(
+                        data_gen=partial(generate_weight, weight_shape)
+                    ),
+                    "elementwise_weight": TensorConfig(
+                        data_gen=partial(generate_bias, [weight_shape[0]])
+                    ),
+                },
+                inputs={
+                    "input_data": TensorConfig(
+                        data_gen=partial(generate_input, input_shape)
+                    ),
+                    "residual_data": TensorConfig(
+                        data_gen=partial(generate_input, residual_shape)
+                    ),
+                },
+                outputs=["output_data2"],
+            )
+
+            yield program_config
+
+    def sample_predictor_configs(self, program_config):
+        config = self.create_inference_config(use_gpu=True)
+        config.enable_use_gpu(256, 0, paddle_infer.PrecisionType.Half)
+        config.exp_enable_use_cutlass()
+        yield config, (1e-2, 1e-2)
+
+    def test(self, *args, **kwargs):
+        self.run_test(quant=False, *args, **kwargs)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py
index b1890ea95ab9782187c66d58027422e7481b0602..24a63751cfec431d4335baa793543da3ba48d83d 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py
@@ -127,7 +127,7 @@ class ElementwiseActivationMkldnnFusePassTest_Add_Clip(
 ):
     def set_params(self):
         self.operand = paddle.add
-        self.act = fluid.layers.clip
+        self.act = paddle.clip
         self.act_alpha = 0.0
         self.act_beta = 10.0
 
@@ -219,7 +219,7 @@ class ElementwiseActivationMkldnnFusePassTest_Sub_Clip(
 ):
     def set_params(self):
         self.operand = paddle.subtract
-        self.act = fluid.layers.clip
+        self.act = paddle.clip
         self.act_alpha = 0.0
         self.act_beta = 10.0
 
@@ -319,7 +319,7 @@ class ElementwiseActivationMkldnnFusePassTest_Mul_Clip(
 ):
     def set_params(self):
         self.operand = paddle.multiply
-        self.act = fluid.layers.clip
+        self.act = paddle.clip
         self.act_alpha = 0.0
         self.act_beta = 10.0
 
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_scale_matmul_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_scale_matmul_fuse_pass.py
index d696fa44f5aaa081b0587c2049c68c90d55637d3..5c1a11625611ba67b82c3d462dcc87d1d0998708 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_scale_matmul_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_scale_matmul_fuse_pass.py
@@ -37,74 +37,55 @@ class TestScaleMatmulMkldnnFusePass(PassAutoScanTest):
         input_dim = draw(st.sampled_from([1, 32, 64]))
 
         def generate_input(attrs, type):
-            if attrs[1]['transpose_X'] and attrs[1]['transpose_Y']:
-                shape_x = [
-                    attrs[2]['batch_size'],
-                    attrs[2]['channel'],
-                    attrs[2]['input_dim'],
-                    32,
-                ]
-                shape_y = [
-                    attrs[2]['batch_size'],
-                    attrs[2]['channel'],
-                    64,
-                    attrs[2]['input_dim'],
-                ]
-            elif attrs[1]['transpose_X']:
-                shape_x = [
-                    attrs[2]['batch_size'],
-                    attrs[2]['channel'],
-                    attrs[2]['input_dim'],
-                    32,
-                ]
-                shape_y = [
-                    attrs[2]['batch_size'],
-                    attrs[2]['channel'],
-                    attrs[2]['input_dim'],
-                    64,
-                ]
-            elif attrs[1]['transpose_Y']:
-                shape_x = [
-                    attrs[2]['batch_size'],
-                    attrs[2]['channel'],
-                    32,
-                    attrs[2]['input_dim'],
-                ]
-                shape_y = [
-                    attrs[2]['batch_size'],
-                    attrs[2]['channel'],
-                    8,
-                    attrs[2]['input_dim'],
-                ]
+            is_transpose_X = attrs[1]['transpose_X']
+            is_transpose_Y = attrs[1]['transpose_Y']
+
+            if is_transpose_X:
+                shape_x_3 = attrs[2]['input_dim']
+                shape_x_4 = 32
             else:
-                shape_x = [
-                    attrs[2]['batch_size'],
-                    attrs[2]['channel'],
-                    32,
-                    attrs[2]['input_dim'],
-                ]
-                shape_y = [
-                    attrs[2]['batch_size'],
-                    attrs[2]['channel'],
-                    attrs[2]['input_dim'],
-                    16,
-                ]
-
-            if type == "x":
-                return np.random.random(shape_x).astype(np.float32)
+                shape_x_3 = 32
+                shape_x_4 = attrs[2]['input_dim']
+
+            if is_transpose_X and is_transpose_Y:
+                shape_y_3 = 64
+                shape_y_4 = attrs[2]['input_dim']
+            elif is_transpose_X:
+                shape_y_3 = attrs[2]['input_dim']
+                shape_y_4 = 64
+            elif is_transpose_Y:
+                shape_y_3 = 8
+                shape_y_4 = attrs[2]['input_dim']
             else:
-                return np.random.random(shape_y).astype(np.float32)
+                shape_y_3 = attrs[2]['input_dim']
+                shape_y_4 = 16
+
+            shape_x = [
+                attrs[2]['batch_size'],
+                attrs[2]['channel'],
+                shape_x_3,
+                shape_x_4,
+            ]
+            shape_y = [
+                attrs[2]['batch_size'],
+                attrs[2]['channel'],
+                shape_y_3,
+                shape_y_4,
+            ]
+
+            shape = shape_x if type == 'x' else shape_y
+            return np.random.random(shape).astype(np.float32)
 
         attrs = [
             {
-                "scale": scale,
-                "bias": bias,
-                "bias_after_scale": bias_after_scale,
+                'scale': scale,
+                'bias': bias,
+                'bias_after_scale': bias_after_scale,
             },
             {
-                "transpose_X": transpose_X,
-                "transpose_Y": transpose_Y,
-                "alpha": alpha,
+                'transpose_X': transpose_X,
+                'transpose_Y': transpose_Y,
+                'alpha': alpha,
             },
             {
                 'batch_size': batch_size,
@@ -115,29 +96,29 @@ class TestScaleMatmulMkldnnFusePass(PassAutoScanTest):
 
         ops_config = [
             {
-                "op_type": "scale",
-                "op_inputs": {"X": ["input_data1"]},
-                "op_outputs": {"Out": ["scale_output"]},
-                "op_attrs": {
-                    "scale": attrs[0]['scale'],
-                    "bias": attrs[0]['bias'],
-                    "bias_after_scale": attrs[0]['bias_after_scale'],
+                'op_type': 'scale',
+                'op_inputs': {'X': ['input_data1']},
+                'op_outputs': {'Out': ['scale_output']},
+                'op_attrs': {
+                    'scale': attrs[0]['scale'],
+                    'bias': attrs[0]['bias'],
+                    'bias_after_scale': attrs[0]['bias_after_scale'],
                 },
             },
             {
-                "op_type": "matmul",
-                "op_inputs": {"X": ["scale_output"], "Y": ["input_data2"]},
-                "op_outputs": {"Out": ["matmul_output"]},
-                "op_attrs": {
+                'op_type': 'matmul',
+                'op_inputs': {'X': ['scale_output'], 'Y': ['input_data2']},
+                'op_outputs': {'Out': ['matmul_output']},
+                'op_attrs': {
                     'transpose_X': attrs[1]['transpose_X'],
                     'transpose_Y': attrs[1]['transpose_Y'],
                     'alpha': attrs[1]['alpha'],
-                    "fused_reshape_X": [],
-                    "fused_reshape_Y": [],
-                    "fused_transpose_X": [],
-                    "fused_transpose_Y": [],
-                    "fused_reshape_Out": [],
-                    "fused_transpose_Out": [],
+                    'fused_reshape_X': [],
+                    'fused_reshape_Y': [],
+                    'fused_transpose_X': [],
+                    'fused_transpose_Y': [],
+                    'fused_reshape_Out': [],
+                    'fused_transpose_Out': [],
                 },
             },
         ]
@@ -148,25 +129,27 @@ class TestScaleMatmulMkldnnFusePass(PassAutoScanTest):
             ops=ops,
             weights={},
             inputs={
-                "input_data1": TensorConfig(
-                    data_gen=partial(generate_input, attrs, "x")
+                'input_data1': TensorConfig(
+                    data_gen=partial(generate_input, attrs, 'x')
                 ),
-                "input_data2": TensorConfig(
-                    data_gen=partial(generate_input, attrs, "y")
+                'input_data2': TensorConfig(
+                    data_gen=partial(generate_input, attrs, 'y')
                 ),
             },
-            outputs=["matmul_output"],
+            outputs=['matmul_output'],
         )
 
         return program_config
 
     def sample_predictor_configs(self, program_config):
-        config = self.create_inference_config(use_mkldnn=True)
+        config = self.create_inference_config(
+            use_mkldnn=True, passes=['scale_matmul_fuse_pass']
+        )
         yield config, ['matmul'], (1e-5, 1e-5)
 
     def test(self):
-        self.run_and_statis(quant=False, passes=["scale_matmul_fuse_pass"])
+        self.run_and_statis(quant=False, passes=['scale_matmul_fuse_pass'])
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py
index 3a39c84141ced2c0f0538350b3d70c7d9bcaf9c3..0c205fbee7c87079035221e457663c24b0234ced 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py
@@ -106,7 +106,7 @@ class TensorRTSubgraphPassHardSwishPluginTest(
 
 class TensorRTSubgraphPassClipTest(TensorRTSubgraphPassActivationTest):
     def append_act(self, x):
-        return fluid.layers.clip(x, 0, 1)
+        return paddle.clip(x, 0, 1)
 
 
 class TensorRTSubgraphPassTanhTest(TensorRTSubgraphPassActivationTest):
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_equal.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_equal.py
index 35780b491cc7633e84427b457456202f1bc245e4..ce46c79cbbd3dd5ee600c47874f5d1e1c3ee1bee 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_equal.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_equal.py
@@ -39,45 +39,46 @@ class TrtConvertElementwiseTest_one_input_corner_case(TrtLayerAutoScanTest):
         def generate_input(shape):
             return np.random.random(shape).astype(np.float32)
 
-        for batch in [1, 2, 4]:
-            for shape in [[batch, 1], [batch, 1, 32], [batch, 1, 16, 32]]:
-                for axis in [-1 if len(shape) == 1 else 1]:
-                    self.dims = len(shape)
-                    dics = [{"axis": axis}, {"in_dtype": 0, "out_dtype": 5}]
-                    ops_config = [
-                        {
-                            "op_type": "equal",
-                            "op_inputs": {
-                                "X": ["input_data1"],
-                                "Y": ["input_data2"],
+        for op_type in ["equal", "not_equal"]:
+            for batch in [1, 2, 4]:
+                for shape in [[batch, 1], [batch, 1, 32], [batch, 1, 16, 32]]:
+                    for axis in [-1 if len(shape) == 1 else 1]:
+                        self.dims = len(shape)
+                        dics = [{"axis": axis}, {"in_dtype": 0, "out_dtype": 5}]
+                        ops_config = [
+                            {
+                                "op_type": op_type,
+                                "op_inputs": {
+                                    "X": ["input_data1"],
+                                    "Y": ["input_data2"],
+                                },
+                                "op_outputs": {"Out": ["compare_output_data"]},
+                                "op_attrs": dics[0],
                             },
-                            "op_outputs": {"Out": ["compare_output_data"]},
-                            "op_attrs": dics[0],
-                        },
-                        {
-                            "op_type": "cast",
-                            "op_inputs": {"X": ["compare_output_data"]},
-                            "op_outputs": {"Out": ["output_data"]},
-                            "op_attrs": dics[1],
-                        },
-                    ]
-                    ops = self.generate_op_config(ops_config)
-
-                    program_config = ProgramConfig(
-                        ops=ops,
-                        weights={},
-                        inputs={
-                            "input_data1": TensorConfig(
-                                data_gen=partial(generate_input, shape)
-                            ),
-                            "input_data2": TensorConfig(
-                                data_gen=partial(generate_input, shape)
-                            ),
-                        },
-                        outputs=["output_data"],
-                    )
-
-                    yield program_config
+                            {
+                                "op_type": "cast",
+                                "op_inputs": {"X": ["compare_output_data"]},
+                                "op_outputs": {"Out": ["output_data"]},
+                                "op_attrs": dics[1],
+                            },
+                        ]
+                        ops = self.generate_op_config(ops_config)
+
+                        program_config = ProgramConfig(
+                            ops=ops,
+                            weights={},
+                            inputs={
+                                "input_data1": TensorConfig(
+                                    data_gen=partial(generate_input, shape)
+                                ),
+                                "input_data2": TensorConfig(
+                                    data_gen=partial(generate_input, shape)
+                                ),
+                            },
+                            outputs=["output_data"],
+                        )
+
+                        yield program_config
 
     def sample_predictor_configs(
         self, program_config
diff --git a/python/paddle/fluid/tests/unittests/npu/test_clip_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_clip_op_npu.py
index 8b13546d9a2852009dfba4744b5bdfaaac07d3d0..122429a7f8454cd687b90b3e503b531727d478f0 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_clip_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_clip_op_npu.py
@@ -117,13 +117,13 @@ class TestClipOpError(unittest.TestCase):
             input_data = np.random.random((2, 4)).astype("float32")
 
             def test_Variable():
-                fluid.layers.clip(x=input_data, min=-1.0, max=1.0)
+                paddle.clip(x=input_data, min=-1.0, max=1.0)
 
             self.assertRaises(TypeError, test_Variable)
 
             def test_dtype():
                 x2 = fluid.layers.data(name='x2', shape=[1], dtype='int32')
-                fluid.layers.clip(x=x2, min=-1.0, max=1.0)
+                paddle.clip(x=x2, min=-1.0, max=1.0)
 
             self.assertRaises(TypeError, test_dtype)
         paddle.disable_static()
diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py
index 912f52969d712e1a03da97b9a9d119ab99161b22..d0e6c98e25a422c8eeeccb1feb1544b144152316 100644
--- a/python/paddle/fluid/tests/unittests/test_adam_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adam_op.py
@@ -686,7 +686,7 @@ class TestAdamOpV2(unittest.TestCase):
         value = np.arange(26).reshape(2, 13).astype("float32")
         a = fluid.dygraph.to_variable(value)
         linear = paddle.nn.Linear(13, 5)
-        clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
+        clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
         adam = paddle.optimizer.Adam(
             0.1, parameters=linear.parameters(), grad_clip=clip
         )
diff --git a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py
index a9d79f81bf310b9a1d94202c655571c948857909..ce3dd7509ce1d8cfddfc06af95a7f2d2358c8b5c 100644
--- a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py
@@ -20,12 +20,13 @@ from op_test import OpTest
 import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
+from paddle.nn import clip
 
 
 class TestClipByNormOp(OpTest):
     def setUp(self):
         self.max_relative_error = 0.006
-        self.python_api = fluid.layers.clip_by_norm
+        self.python_api = clip.clip_by_norm
         self.init_dtype()
         self.initTestCase()
         input = np.random.random(self.shape).astype(self.dtype)
diff --git a/python/paddle/fluid/tests/unittests/test_clip_op.py b/python/paddle/fluid/tests/unittests/test_clip_op.py
index a25edccb97a4edca00c4f24e4cd020c11062c449..359220a7a601f131f89e68c6da8b424d20070c3d 100644
--- a/python/paddle/fluid/tests/unittests/test_clip_op.py
+++ b/python/paddle/fluid/tests/unittests/test_clip_op.py
@@ -128,15 +128,9 @@ class TestClipOpError(unittest.TestCase):
             input_data = np.random.random((2, 4)).astype("float32")
 
             def test_Variable():
-                fluid.layers.clip(x=input_data, min=-1.0, max=1.0)
+                paddle.clip(x=input_data, min=-1.0, max=1.0)
 
             self.assertRaises(TypeError, test_Variable)
-
-            def test_dtype():
-                x2 = fluid.layers.data(name='x2', shape=[1], dtype='int32')
-                fluid.layers.clip(x=x2, min=-1.0, max=1.0)
-
-            self.assertRaises(TypeError, test_dtype)
         paddle.disable_static()
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
index d5ad18fc434cbe9075604b9bef0798afeaa0c8a6..c6bdd59d496634744da2673d7f2ca8b103346376 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@@ -584,7 +584,7 @@ class TestL2Decay(TranspilerTest):
         def filter(param):
             return param.name == "fc_w"
 
-        clip = fluid.clip.GradientClipByValue(0.1, need_clip=filter)
+        clip = paddle.nn.ClipGradByValue(0.1, need_clip=filter)
         sgd_optimizer.minimize(avg_cost, grad_clip=clip)
 
     def transpiler_test_impl(self):
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py
index d0256b5dfb8994c3ee27fb0c2c29ab3bd136d4ac..80bc977f091bac9e57c5e4774e5236a96115c22c 100644
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py
@@ -504,8 +504,8 @@ class PaddingRNNTestBase(unittest.TestCase):
                     self.feed_order,
                 ) = res_vars
 
-                fluid.clip.set_gradient_clip(
-                    clip=fluid.clip.GradientClipByGlobalNorm(
+                paddle.nn.clip.set_gradient_clip(
+                    clip=paddle.nn.ClipGradByGlobalNorm(
                         clip_norm=config.max_grad_norm
                     )
                 )
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_executor.py b/python/paddle/fluid/tests/unittests/test_fleet_executor.py
index e81fc34ea2ca0fb8eea864a51791bf7b13a5abc0..400009f820de3c59cafb87582ca43c77dc7ae176 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_executor.py
@@ -64,7 +64,7 @@ class TestFleetExecutor(unittest.TestCase):
             )
             opt = paddle.optimizer.AdamW(
                 learning_rate=lr_val,
-                grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0),
+                grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0),
             )
             opt.minimize(loss)
         # TODO: section_program will be removed in the future
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_executor_origin_scheduler.py b/python/paddle/fluid/tests/unittests/test_fleet_executor_origin_scheduler.py
index 0de28e9839efa344244eeba0e60ad93afdca0291..d24348b7d77b58234f2dbc1ef9d7ae7d563a19d3 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_executor_origin_scheduler.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_executor_origin_scheduler.py
@@ -64,7 +64,7 @@ class TestFleetExecutor(unittest.TestCase):
             )
             opt = paddle.optimizer.AdamW(
                 learning_rate=lr_val,
-                grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0),
+                grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0),
             )
             opt.minimize(loss)
         # TODO: section_program will be removed in the future
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_executor_with_task_nodes.py b/python/paddle/fluid/tests/unittests/test_fleet_executor_with_task_nodes.py
index 36a85e2d74fc7af46061dc3ccef0e1255cdaa056..46eb0dc6f0bf8428ca0b5b6989fb6444ca5b2495 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_executor_with_task_nodes.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_executor_with_task_nodes.py
@@ -47,7 +47,7 @@ class TestFleetExecutor(unittest.TestCase):
             )
             opt = paddle.optimizer.AdamW(
                 learning_rate=lr_val,
-                grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0),
+                grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0),
             )
             opt.minimize(loss)
         # TODO: section_program will be removed in the future
diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv1d_transpose.py b/python/paddle/fluid/tests/unittests/test_functional_conv1d_transpose.py
index a1e144da146a57ae277d40b381682bc458cffab7..1d4e079f9f84ad71bb366b7d11516570c9832a98 100644
--- a/python/paddle/fluid/tests/unittests/test_functional_conv1d_transpose.py
+++ b/python/paddle/fluid/tests/unittests/test_functional_conv1d_transpose.py
@@ -70,5 +70,17 @@ class TestFunctionalConv1DErrorCase1(TestFunctionalConv1DError):
         self.data_format = "NCL"
 
 
+class TestFunctionalConv1DErrorCase2(TestFunctionalConv1DError):
+    def setUp(self):
+        self.input = np.random.randn(1, 3, 3)
+        self.filter = np.random.randn(3)
+        self.bias = None
+        self.padding = 0
+        self.stride = 1
+        self.dilation = 1
+        self.groups = 1
+        self.data_format = "NCL"
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py b/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py
index ab5b9096dcc8ad1c3df4485e3c805abbb3a05eec..d1e3e6df335b002a64d4dc33e5de001dab8c5546 100644
--- a/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py
+++ b/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py
@@ -20,6 +20,7 @@ import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid import Program, program_guard
 from paddle.fluid.op import Operator
+from paddle.nn import clip
 
 
 class TestGetTensorFromSelectedRowsError(unittest.TestCase):
@@ -31,12 +32,12 @@ class TestGetTensorFromSelectedRowsError(unittest.TestCase):
             x_data = np.random.random((2, 4)).astype("float32")
 
             def test_Variable():
-                fluid.layers.get_tensor_from_selected_rows(x=x_data)
+                clip.get_tensor_from_selected_rows(x=x_data)
 
             self.assertRaises(TypeError, test_Variable)
 
             def test_SELECTED_ROWS():
-                fluid.layers.get_tensor_from_selected_rows(x=x_var)
+                clip.get_tensor_from_selected_rows(x=x_var)
 
             self.assertRaises(TypeError, test_SELECTED_ROWS)
 
diff --git a/python/paddle/fluid/tests/unittests/test_grad_clip_minimize.py b/python/paddle/fluid/tests/unittests/test_grad_clip_minimize.py
index db34123d3bdd8f3faf27f5a8ba51ddb881fcbe87..4cb4b5d773b48ded81187c29993ec9912cb56457 100644
--- a/python/paddle/fluid/tests/unittests/test_grad_clip_minimize.py
+++ b/python/paddle/fluid/tests/unittests/test_grad_clip_minimize.py
@@ -17,12 +17,8 @@ import unittest
 import numpy as np
 
 import paddle.fluid as fluid
-from paddle.fluid.clip import (
-    GradientClipByGlobalNorm,
-    GradientClipByNorm,
-    GradientClipByValue,
-)
 from paddle.fluid.dygraph.base import to_variable
+from paddle.nn import ClipGradByGlobalNorm, ClipGradByNorm, ClipGradByValue
 
 
 class TestGradClipByGlobalNorm(unittest.TestCase):
@@ -67,7 +63,7 @@ class TestGradClipByGlobalNorm(unittest.TestCase):
     def get_dygrap_global_norm_result(self):
         with fluid.dygraph.guard():
 
-            gloabl_norm_clip = GradientClipByGlobalNorm(self.max_global_norm)
+            gloabl_norm_clip = ClipGradByGlobalNorm(self.max_global_norm)
             p_g_var = []
             for p, g in self.para_and_grad:
                 new_p = to_variable(p)
@@ -142,7 +138,7 @@ class TestGradClipByNorm(unittest.TestCase):
     def get_dygrap_norm_result(self):
         with fluid.dygraph.guard():
 
-            norm_clip = GradientClipByNorm(self.max_norm)
+            norm_clip = ClipGradByNorm(self.max_norm)
             p_g_var = []
             for p, g in self.para_and_grad:
                 new_p = to_variable(p)
@@ -212,9 +208,7 @@ class TestGradClipByValue(unittest.TestCase):
 
     def get_dygrap_clip_result(self):
         with fluid.dygraph.guard():
-            value_clip = GradientClipByValue(
-                max=self.max_value, min=self.min_value
-            )
+            value_clip = ClipGradByValue(max=self.max_value, min=self.min_value)
             p_g_var = []
             for p, g in self.para_and_grad:
                 new_p = to_variable(p)
diff --git a/python/paddle/fluid/tests/unittests/test_gradient_clip.py b/python/paddle/fluid/tests/unittests/test_gradient_clip.py
index 2243ae8c45602a694e1ce79e72cbc033abaf1636..b5b0b20c6f48bc841bd0dfb5f9a61449cadc93bf 100644
--- a/python/paddle/fluid/tests/unittests/test_gradient_clip.py
+++ b/python/paddle/fluid/tests/unittests/test_gradient_clip.py
@@ -20,7 +20,7 @@ from fake_reader import fake_imdb_reader
 import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
-from paddle.fluid.clip import _allow_pure_fp16_global_norm_clip
+from paddle.nn.clip import _allow_pure_fp16_global_norm_clip
 
 paddle.enable_static()
 
@@ -173,9 +173,9 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
     # test whether the output is right when use 'set_gradient_clip'
     def test_old_gradient_clip(self):
         def func(params_grads):
-            clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm)
-            fluid.clip.set_gradient_clip(clip)
-            return fluid.clip.append_gradient_clip_ops(params_grads)
+            clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=self.clip_norm)
+            paddle.nn.clip.set_gradient_clip(clip)
+            return paddle.nn.clip.append_gradient_clip_ops(params_grads)
 
         self.clip_gradient = func
         self.check_gradient_clip(fluid.CPUPlace())
@@ -183,7 +183,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
     # test whether the output is right when use grad_clip
     def test_new_gradient_clip(self):
         def func(params_grads):
-            clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm)
+            clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=self.clip_norm)
             return clip(params_grads)
 
         self.clip_gradient = func
@@ -192,7 +192,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
     # test whether the output is right when use grad_clip under float64
     def test_new_gradient_clip_fp64(self):
         def func(params_grads):
-            clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm)
+            clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=self.clip_norm)
             return clip(params_grads)
 
         self.clip_gradient = func
@@ -201,15 +201,15 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
     # invoke 'set_gradient_clip' in a wrong order
     def test_wrong_API_order(self):
         def backward_func(cost):
-            clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0)
-            fluid.clip.set_gradient_clip(clip)
+            clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=5.0)
+            paddle.nn.clip.set_gradient_clip(clip)
             sgd_optimizer = fluid.optimizer.SGD(
                 learning_rate=0.01, grad_clip=clip
             )
             # if 'set_gradient_clip' and 'optimize(grad_clip)' together, 'set_gradient_clip' will be ineffective
             sgd_optimizer.minimize(cost)
             # 'set_gradient_clip' must before 'minimize', otherwise, 'set_gradient_clip' will be ineffective
-            fluid.clip.set_gradient_clip(clip)
+            paddle.nn.clip.set_gradient_clip(clip)
 
         self.backward_and_optimize = backward_func
         for place in self.get_places():
@@ -269,7 +269,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
         with fluid.program_guard(
             main_program=prog, startup_program=startup_program
         ):
-            clip = fluid.clip.GradientClipByGlobalNorm(self.clip_norm)
+            clip = paddle.nn.ClipGradByGlobalNorm(self.clip_norm)
             x = (
                 fluid.default_main_program()
                 .global_block()
@@ -313,7 +313,7 @@ class TestGradientClipByNorm(TestGradientClip):
     # test whether the output is right when use grad_clip
     def test_gradient_clip(self):
         def func(params_grads):
-            clip = fluid.clip.GradientClipByNorm(clip_norm=self.clip_norm)
+            clip = paddle.nn.ClipGradByNorm(clip_norm=self.clip_norm)
             return clip(params_grads)
 
         self.clip_gradient = func
@@ -321,7 +321,7 @@ class TestGradientClipByNorm(TestGradientClip):
 
     # if grad is None or not need clip
     def test_none_grad(self):
-        clip = fluid.clip.GradientClipByNorm(self.clip_norm)
+        clip = paddle.nn.ClipGradByNorm(self.clip_norm)
         x = (
             fluid.default_main_program()
             .global_block()
@@ -371,7 +371,7 @@ class TestGradientClipByValue(TestGradientClip):
     # test whether the output is right when use grad_clip
     def test_gradient_clip(self):
         def func(params_grads):
-            clip = fluid.clip.GradientClipByValue(max=self.max, min=self.min)
+            clip = paddle.nn.ClipGradByValue(max=self.max, min=self.min)
             return clip(params_grads)
 
         self.clip_gradient = func
@@ -379,7 +379,7 @@ class TestGradientClipByValue(TestGradientClip):
 
     # if grad is None or not need clip
     def test_none_grad(self):
-        clip = fluid.clip.GradientClipByValue(self.max, self.min)
+        clip = paddle.nn.ClipGradByValue(self.max, self.min)
         x = (
             fluid.default_main_program()
             .global_block()
@@ -419,7 +419,7 @@ class TestDygraphGradientClip(unittest.TestCase):
             sgd_optimizer = fluid.optimizer.SGD(
                 learning_rate=0.0,
                 parameter_list=linear.parameters(),
-                grad_clip=fluid.clip.GradientClipByGlobalNorm(0.1),
+                grad_clip=paddle.nn.ClipGradByGlobalNorm(0.1),
             )
             self.check_clip_result(loss, sgd_optimizer)
 
@@ -430,12 +430,8 @@ class TestDygraphGradientClip(unittest.TestCase):
 class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip):
     def setUp(self):
         self.clip_norm = 0.8
-        self.clip1 = fluid.clip.GradientClipByGlobalNorm(
-            clip_norm=self.clip_norm
-        )
-        self.clip2 = fluid.clip.GradientClipByGlobalNorm(
-            clip_norm=self.clip_norm
-        )
+        self.clip1 = paddle.nn.ClipGradByGlobalNorm(clip_norm=self.clip_norm)
+        self.clip2 = paddle.nn.ClipGradByGlobalNorm(clip_norm=self.clip_norm)
 
     def check_clip_result(self, loss, optimizer):
         # if grad is None
@@ -476,7 +472,7 @@ class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip):
 class TestDygraphGradientClipByNorm(TestDygraphGradientClip):
     def setUp(self):
         self.clip_norm = 0.8
-        self.clip = fluid.clip.GradientClipByNorm(clip_norm=self.clip_norm)
+        self.clip = paddle.nn.ClipGradByNorm(clip_norm=self.clip_norm)
 
     def check_clip_result(self, loss, optimizer):
         # if grad is None
@@ -506,7 +502,7 @@ class TestDygraphGradientClipByValue(TestDygraphGradientClip):
     def setUp(self):
         self.max = 0.2
         self.min = 0.1
-        self.clip = fluid.clip.GradientClipByValue(max=self.max, min=self.min)
+        self.clip = paddle.nn.ClipGradByValue(max=self.max, min=self.min)
 
     def check_clip_result(self, loss, optimizer):
         # if grad is None
@@ -572,7 +568,7 @@ class TestDygraphGradientClipFP16(unittest.TestCase):
                         params_grads.append((param, param._grad_ivar()))
                 _, grads = zip(*params_grads)
                 # clip grads
-                clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=0.8)
+                clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=0.8)
                 params_grads = clip(params_grads)
                 _, grads_clip = zip(*params_grads)
                 # param update
@@ -616,7 +612,7 @@ class TestDygraphGradientClipFP64(unittest.TestCase):
                     params_grads.append((param, param._grad_ivar()))
             _, grads = zip(*params_grads)
             # clip grads
-            clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=0.1)
+            clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=0.1)
             params_grads = clip(params_grads)
             _, grads_clip = zip(*params_grads)
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py
index ecb35e8eaf950cb3f88bea4fecf70c42d1f45363..54cba6eb800295e6a69c9e64be53d7798743383a 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py
@@ -361,7 +361,7 @@ class TestImperativeAutoPrune(unittest.TestCase):
         place = fluid.CPUPlace()
         with fluid.dygraph.guard(place):
             model = MyLayer(size, vocab_size, size)
-            grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001)
+            grad_clip = paddle.nn.ClipGradByGlobalNorm(0.001)
             optimizer = fluid.optimizer.AdamOptimizer(
                 0.001, parameter_list=model.parameters(), grad_clip=grad_clip
             )
@@ -380,7 +380,7 @@ class TestImperativeAutoPrune(unittest.TestCase):
 
         with fluid.dygraph.guard(place):
             model = MyLayer2(size, vocab_size, size)
-            grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001)
+            grad_clip = paddle.nn.ClipGradByGlobalNorm(0.001)
             optimizer = fluid.optimizer.AdamOptimizer(
                 0.001, parameter_list=model.parameters(), grad_clip=grad_clip
             )
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py
index cea97398d17159aa0756d5e985b77de0db772ddc..5cc7f63eb7883b1dc260445dcd4f9f1a98c28b99 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py
@@ -52,7 +52,7 @@ class TestSimpleNet(unittest.TestCase):
                     fluid.set_flags(
                         {'FLAGS_sort_sum_gradient': sort_sum_gradient}
                     )
-                    # grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0)
+                    # grad_clip = paddle.nn.ClipGradByGlobalNorm(5.0)
 
                     input_word = np.array([[1, 2], [2, 1]]).astype('int64')
                     input = paddle.to_tensor(input_word)
@@ -91,7 +91,7 @@ class TestSimpleNet(unittest.TestCase):
                     fluid.set_flags(
                         {'FLAGS_sort_sum_gradient': sort_sum_gradient}
                     )
-                    grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0)
+                    grad_clip = paddle.nn.ClipGradByGlobalNorm(5.0)
 
                     input_word = np.array([[1, 2], [2, 1]]).astype('int64')
                     input = to_variable(input_word)
diff --git a/python/paddle/fluid/tests/unittests/test_pad_op.py b/python/paddle/fluid/tests/unittests/test_pad_op.py
index 735f62c646b16af1a3033b00a7712591ca8b2503..ee42ce1625feccc80d8ce72a862395b1cdc6f756 100644
--- a/python/paddle/fluid/tests/unittests/test_pad_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pad_op.py
@@ -187,9 +187,14 @@ class TestPaddingValueTensor3(unittest.TestCase):
             x = paddle.assign(np_x).astype('float32')
             pad_value = paddle.assign([0.0]).astype('float64')
             y = paddle.nn.functional.pad(x, [0, 1, 2, 3], value=pad_value)
+            loss = y.sum()
+            optimize_ops, params_grads = paddle.optimizer.SGD(0.01).minimize(
+                loss
+            )
 
         exe = paddle.static.Executor(paddle.CPUPlace())
-        [pd_out] = exe.run(main_prog, fetch_list=[y])
+        res = exe.run(main_prog, fetch_list=[y] + [g for p, g in params_grads])
+        pd_out = res[0]
         np_out = np.pad(np_x, [(0, 1), (2, 3)], constant_values=0.0)
         np.testing.assert_allclose(pd_out, np_out)
 
diff --git a/python/paddle/fluid/tests/unittests/test_profiler.py b/python/paddle/fluid/tests/unittests/test_profiler.py
index c1ff35222d8ae6dceed96fc31701b37c87070db3..e39648285daba775614a67c8de6ff920f89cb4f8 100644
--- a/python/paddle/fluid/tests/unittests/test_profiler.py
+++ b/python/paddle/fluid/tests/unittests/test_profiler.py
@@ -293,9 +293,6 @@ class TestFLOPSAPI(unittest.TestCase):
             )
             == 3 * 12 * 12 * 12 * 2 * 8
         )
-        self.assertTrue(
-            flops('relu', {'X': [[12, 12, 12]]}, {}) == 12 * 12 * 12
-        )
         self.assertTrue(
             flops('softmax', {'X': [[12, 12, 12]]}, {}) == 3 * 12 * 12 * 12
         )
@@ -303,6 +300,56 @@ class TestFLOPSAPI(unittest.TestCase):
             flops('c_embedding', {'Ids': [[12, 12]], 'W': [[12, 12, 3]]}, {})
             == 0
         )
+        self.assertTrue(
+            flops(
+                'elu',
+                {
+                    'X': [[12, 12]],
+                },
+                {},
+            )
+            == 144
+        )
+        self.assertTrue(
+            flops(
+                'leaky_relu',
+                {
+                    'X': [[12, 12]],
+                },
+                {},
+            )
+            == 144
+        )
+        self.assertTrue(
+            flops(
+                'prelu',
+                {
+                    'X': [[12, 12]],
+                },
+                {},
+            )
+            == 144
+        )
+        self.assertTrue(
+            flops(
+                'relu6',
+                {
+                    'X': [[12, 12]],
+                },
+                {},
+            )
+            == 144
+        )
+        self.assertTrue(
+            flops(
+                'silu',
+                {
+                    'X': [[12, 12]],
+                },
+                {},
+            )
+            == 144
+        )
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py
index a31749d744aead800a038300b3eeafad51b175c7..887ce9ff3f7411bb01115e8d648b53be0ec7de31 100755
--- a/python/paddle/fluid/tests/unittests/test_reshape_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py
@@ -49,20 +49,20 @@ class TestReshapeOp(OpTest):
 class TestReshapeOp_ZeroDim1(OpTest):
     def init_data(self):
         self.ori_shape = ()
-        self.new_shape = 1
-        self.infered_shape = 1
+        self.new_shape = (1,)
+        self.infered_shape = (1,)
 
 
 class TestReshapeOp_ZeroDim2(OpTest):
     def init_data(self):
         self.ori_shape = ()
-        self.new_shape = -1
-        self.infered_shape = 1
+        self.new_shape = (-1,)
+        self.infered_shape = (1,)
 
 
 class TestReshapeOp_ZeroDim3(OpTest):
     def init_data(self):
-        self.ori_shape = 1
+        self.ori_shape = (1,)
         self.new_shape = ()
         self.infered_shape = ()
 
diff --git a/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py b/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py
index c99aabbf9b5a1492453f9c17bd548cd6a84ed508..19a4711fc5b680cc5a86d4d9729cea233dcd43e3 100644
--- a/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py
@@ -454,6 +454,15 @@ class TestSundryAPI(unittest.TestCase):
         paddle.disable_static()
         self.x = paddle.rand([])
 
+    def test_flip(self):
+        x = paddle.rand([])
+        x.stop_gradient = False
+        out = paddle.flip(x, axis=[])
+        out.backward()
+        self.assertEqual(x.shape, [])
+        self.assertEqual(out.shape, [])
+        self.assertEqual(out.grad.shape, [])
+
     def test_linear(self):
         x = paddle.randn([3, 2])
         w = paddle.full(shape=[2, 4], fill_value=0.5)
@@ -747,6 +756,105 @@ class TestSundryAPI(unittest.TestCase):
         np.testing.assert_array_equal(out3_1.numpy(), out3_2.numpy())
         np.testing.assert_array_equal(out3_2.numpy(), np.asarray(1))
 
+    def test_reshape_list(self):
+        x = paddle.rand([])
+        x.stop_gradient = False
+
+        out = paddle.reshape(x, [])
+        out.backward()
+        self.assertEqual(x.grad.shape, [])
+        self.assertEqual(out.shape, [])
+        self.assertEqual(out.grad.shape, [])
+
+        out = paddle.reshape(x, [1])
+        out.backward()
+        self.assertEqual(x.grad.shape, [])
+        self.assertEqual(out.shape, [1])
+        self.assertEqual(out.grad.shape, [1])
+
+        out = paddle.reshape(x, [-1])
+        out.backward()
+        self.assertEqual(x.grad.shape, [])
+        self.assertEqual(out.shape, [1])
+        self.assertEqual(out.grad.shape, [1])
+
+        out = paddle.reshape(x, [-1, 1])
+        out.backward()
+        self.assertEqual(x.grad.shape, [])
+        self.assertEqual(out.shape, [1, 1])
+        self.assertEqual(out.grad.shape, [1, 1])
+
+    def test_reshape_tensor(self):
+        x = paddle.rand([1, 1])
+        x.stop_gradient = False
+
+        out = paddle.reshape(x, [])
+        out.backward()
+        self.assertEqual(x.grad.shape, [1, 1])
+        self.assertEqual(out.shape, [])
+        self.assertEqual(out.grad.shape, [])
+
+        new_shape = paddle.full([1], 1, "int32")
+        out = paddle.reshape(x, new_shape)
+        out.backward()
+        self.assertEqual(x.grad.shape, [1, 1])
+        self.assertEqual(out.shape, [1])
+        self.assertEqual(out.grad.shape, [1])
+
+        new_shape = paddle.full([1], -1, "int32")
+        out = paddle.reshape(x, new_shape)
+        out.backward()
+        self.assertEqual(x.grad.shape, [1, 1])
+        self.assertEqual(out.shape, [1])
+        self.assertEqual(out.grad.shape, [1])
+
+        new_shape = [paddle.full([], -1, "int32"), paddle.full([], 1, "int32")]
+        out = paddle.reshape(x, new_shape)
+        out.backward()
+        self.assertEqual(x.grad.shape, [1, 1])
+        self.assertEqual(out.shape, [1, 1])
+        self.assertEqual(out.grad.shape, [1, 1])
+
+    def test_reshape__list(self):
+        x = paddle.rand([])
+        out = paddle.reshape_(x, [])
+        self.assertEqual(out.shape, [])
+
+        out = paddle.reshape_(x, [1])
+        self.assertEqual(out.shape, [1])
+
+        out = paddle.reshape_(x, [-1])
+        self.assertEqual(out.shape, [1])
+
+        out = paddle.reshape_(x, [-1, 1])
+        self.assertEqual(out.shape, [1, 1])
+
+    def test_reshape__tensor(self):
+        x = paddle.rand([1, 1])
+        out = paddle.reshape_(x, [])
+        self.assertEqual(out.shape, [])
+
+        new_shape = paddle.full([1], 1, "int32")
+        out = paddle.reshape_(x, new_shape)
+        self.assertEqual(out.shape, [1])
+
+        new_shape = paddle.full([1], -1, "int32")
+        out = paddle.reshape_(x, new_shape)
+        self.assertEqual(out.shape, [1])
+
+        new_shape = [paddle.full([], -1, "int32"), paddle.full([], 1, "int32")]
+        out = paddle.reshape_(x, new_shape)
+        self.assertEqual(out.shape, [1, 1])
+
+    def test_reverse(self):
+        x = paddle.rand([])
+        x.stop_gradient = False
+        out = paddle.reverse(x, axis=[])
+        out.backward()
+        self.assertEqual(x.shape, [])
+        self.assertEqual(out.shape, [])
+        self.assertEqual(out.grad.shape, [])
+
     def test_sort(self):
         x1 = paddle.rand([])
         x2 = paddle.rand([])
@@ -789,6 +897,18 @@ class TestSundryAPIStatic(unittest.TestCase):
         paddle.enable_static()
         self.exe = paddle.static.Executor()
 
+    @prog_scope()
+    def test_flip(self):
+        x = paddle.rand([])
+        x.stop_gradient = False
+        out = paddle.flip(x, axis=[])
+        paddle.static.append_backward(out)
+
+        program = paddle.static.default_main_program()
+        res1, res2 = self.exe.run(program, fetch_list=[x, out])
+        self.assertEqual(res1.shape, ())
+        self.assertEqual(res2.shape, ())
+
     @prog_scope()
     def test_pow_factor(self):
         x = paddle.rand([])
@@ -1027,6 +1147,7 @@ class TestSundryAPIStatic(unittest.TestCase):
         np.testing.assert_array_equal(out3_2, np.asarray(1))
 
     @prog_scope()
+<<<<<<< HEAD
     def test_sort(self):
         x1 = paddle.rand([])
         x1.stop_gradient = False
@@ -1061,6 +1182,78 @@ class TestSundryAPIStatic(unittest.TestCase):
 
         self.assertEqual(res[0].shape, ())
         self.assertEqual(res[1].shape, ())
+=======
+    def test_reshape_list(self):
+        x1 = paddle.rand([])
+        x2 = paddle.rand([])
+        x3 = paddle.rand([])
+        x4 = paddle.rand([])
+        x1.stop_gradient = False
+        x2.stop_gradient = False
+        x3.stop_gradient = False
+        x4.stop_gradient = False
+
+        out1 = paddle.reshape(x1, [])
+        paddle.static.append_backward(out1)
+
+        out2 = paddle.reshape(x2, [1])
+        paddle.static.append_backward(out2)
+
+        out3 = paddle.reshape(x3, [-1])
+        paddle.static.append_backward(out3)
+
+        out4 = paddle.reshape(x4, [-1, 1])
+        paddle.static.append_backward(out4)
+
+        program = paddle.static.default_main_program()
+        res1, res2, res3, res4 = self.exe.run(
+            program, fetch_list=[out1, out2, out3, out4]
+        )
+        self.assertEqual(res1.shape, ())
+        self.assertEqual(res2.shape, (1,))
+        self.assertEqual(res3.shape, (1,))
+        self.assertEqual(res4.shape, (1, 1))
+
+    @prog_scope()
+    def test_reshape_tensor(self):
+        x1 = paddle.rand([])
+        x2 = paddle.rand([])
+        x3 = paddle.rand([])
+        x1.stop_gradient = False
+        x2.stop_gradient = False
+        x3.stop_gradient = False
+
+        new_shape = paddle.full([1], 1, "int32")
+        out1 = paddle.reshape(x1, new_shape)
+        paddle.static.append_backward(out1)
+
+        new_shape = paddle.full([1], -1, "int32")
+        out2 = paddle.reshape(x2, new_shape)
+        paddle.static.append_backward(out2)
+
+        new_shape = [paddle.full([], -1, "int32"), paddle.full([], 1, "int32")]
+        out3 = paddle.reshape(x3, new_shape)
+        paddle.static.append_backward(out3)
+
+        program = paddle.static.default_main_program()
+        res1, res2, res3 = self.exe.run(program, fetch_list=[out1, out2, out3])
+        self.assertEqual(res1.shape, (1,))
+        self.assertEqual(res2.shape, (1,))
+        self.assertEqual(res3.shape, (1, 1))
+
+    @prog_scope()
+    def test_reverse(self):
+        x = paddle.rand([])
+        x.stop_gradient = False
+
+        out = paddle.reverse(x, axis=[])
+        paddle.static.append_backward(out)
+
+        program = paddle.static.default_main_program()
+        res1, res2 = self.exe.run(program, fetch_list=[x, out])
+        self.assertEqual(res1.shape, ())
+        self.assertEqual(res2.shape, ())
+>>>>>>> c123dd1e4032efdbfff0bf0c35a58155f2d6e1d9
 
 
 # Use to test API whose zero-dim input tensors don't have grad and not need to test backward in OpTest.
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_clip_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_clip_op_xpu.py
index 9efb334ac7dd5e0618491c98aee1ae0e2c5a83e7..e4982c42e4e100a3008c9431621c505a042d237e 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_clip_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_clip_op_xpu.py
@@ -131,13 +131,13 @@ class TestClipOpError(unittest.TestCase):
             input_data = np.random.random((2, 4)).astype("float32")
 
             def test_Variable():
-                fluid.layers.clip(x=input_data, min=-1.0, max=1.0)
+                paddle.clip(x=input_data, min=-1.0, max=1.0)
 
             self.assertRaises(TypeError, test_Variable)
 
             def test_dtype():
                 x2 = fluid.layers.data(name='x2', shape=[1], dtype='int32')
-                fluid.layers.clip(x=x2, min=-1.0, max=1.0)
+                paddle.clip(x=x2, min=-1.0, max=1.0)
 
             self.assertRaises(TypeError, test_dtype)
         paddle.disable_static()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py
index ebd4354593dba2a66980eb9f6dc30ed5f78fce16..8cb27ecf0992ad534b590946a1734d4947340e5e 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py
@@ -556,6 +556,96 @@ class TestSundryAPI(unittest.TestCase):
         np.testing.assert_array_equal(out3_1.numpy(), out3_2.numpy())
         np.testing.assert_array_equal(out3_2.numpy(), np.asarray(1))
 
+    def test_reshape_list(self):
+        x = paddle.rand([])
+        x.stop_gradient = False
+
+        out = paddle.reshape(x, [])
+        out.backward()
+        self.assertEqual(x.grad.shape, [])
+        self.assertEqual(out.shape, [])
+        self.assertEqual(out.grad.shape, [])
+
+        out = paddle.reshape(x, [1])
+        out.backward()
+        self.assertEqual(x.grad.shape, [])
+        self.assertEqual(out.shape, [1])
+        self.assertEqual(out.grad.shape, [1])
+
+        out = paddle.reshape(x, [-1])
+        out.backward()
+        self.assertEqual(x.grad.shape, [])
+        self.assertEqual(out.shape, [1])
+        self.assertEqual(out.grad.shape, [1])
+
+        out = paddle.reshape(x, [-1, 1])
+        out.backward()
+        self.assertEqual(x.grad.shape, [])
+        self.assertEqual(out.shape, [1, 1])
+        self.assertEqual(out.grad.shape, [1, 1])
+
+    def test_reshape_tensor(self):
+        x = paddle.rand([1, 1])
+        x.stop_gradient = False
+
+        out = paddle.reshape(x, [])
+        out.backward()
+        self.assertEqual(x.grad.shape, [1, 1])
+        self.assertEqual(out.shape, [])
+        self.assertEqual(out.grad.shape, [])
+
+        new_shape = paddle.full([], 1, "int32")
+        out = paddle.reshape(x, new_shape)
+        out.backward()
+        self.assertEqual(x.grad.shape, [1, 1])
+        self.assertEqual(out.shape, [1])
+        self.assertEqual(out.grad.shape, [1])
+
+        new_shape = paddle.full([], -1, "int32")
+        out = paddle.reshape(x, new_shape)
+        out.backward()
+        self.assertEqual(x.grad.shape, [1, 1])
+        self.assertEqual(out.shape, [1])
+        self.assertEqual(out.grad.shape, [1])
+
+        new_shape = [paddle.full([], -1, "int32"), paddle.full([], 1, "int32")]
+        out = paddle.reshape(x, new_shape)
+        out.backward()
+        self.assertEqual(x.grad.shape, [1, 1])
+        self.assertEqual(out.shape, [1, 1])
+        self.assertEqual(out.grad.shape, [1, 1])
+
+    def test_reshape__list(self):
+        x = paddle.rand([])
+        out = paddle.reshape_(x, [])
+        self.assertEqual(out.shape, [])
+
+        out = paddle.reshape_(x, [1])
+        self.assertEqual(out.shape, [1])
+
+        out = paddle.reshape_(x, [-1])
+        self.assertEqual(out.shape, [1])
+
+        out = paddle.reshape_(x, [-1, 1])
+        self.assertEqual(out.shape, [1, 1])
+
+    def test_reshape__tensor(self):
+        x = paddle.rand([1, 1])
+        out = paddle.reshape_(x, [])
+        self.assertEqual(out.shape, [])
+
+        new_shape = paddle.full([1], 1, "int32")
+        out = paddle.reshape_(x, new_shape)
+        self.assertEqual(out.shape, [1])
+
+        new_shape = paddle.full([1], -1, "int32")
+        out = paddle.reshape_(x, new_shape)
+        self.assertEqual(out.shape, [1])
+
+        new_shape = [paddle.full([], -1, "int32"), paddle.full([], 1, "int32")]
+        out = paddle.reshape_(x, new_shape)
+        self.assertEqual(out.shape, [1, 1])
+
     def test_sort(self):
         x1 = paddle.rand([])
         x2 = paddle.rand([])
diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py
index 86cab526398dd4412227a1b99dd562bd4fcb1cbb..52a0f8b4b3c4f4790008ed3224a3696da1f41cda 100644
--- a/python/paddle/hapi/model.py
+++ b/python/paddle/hapi/model.py
@@ -1535,7 +1535,7 @@ class Model:
                 assert isinstance(
                     self._optimizer._grad_clip,
                     (paddle.nn.ClipGradByGlobalNorm, paddle.nn.ClipGradByNorm),
-                ), "Only GradientClipByNorm and GradientClipByGlobalNorm are supported in amp training with level=O2 currently."
+                ), "Only ClipGradByNorm and ClipGradByGlobalNorm are supported in amp training with level=O2 currently."
 
         self._adapter._amp_custom_lists = {}
         self._adapter._amp_configs = {}
diff --git a/python/paddle/incubate/distributed/models/moe/grad_clip.py b/python/paddle/incubate/distributed/models/moe/grad_clip.py
index ca4922700b8f49f4b8a3a9222ce0afcdb9228b1f..6bee79b871cd5e721be31545c4037afa6a5668ea 100644
--- a/python/paddle/incubate/distributed/models/moe/grad_clip.py
+++ b/python/paddle/incubate/distributed/models/moe/grad_clip.py
@@ -15,13 +15,14 @@
 import paddle
 import paddle.distributed as dist
 from paddle.fluid import core, layers
-from paddle.fluid.clip import ClipGradBase, _squared_l2_norm
 from paddle.fluid.dygraph import base as imperative_base
+from paddle.nn import clip
+from paddle.nn.clip import ClipGradBase, _squared_l2_norm
 
 
 class ClipGradForMOEByGlobalNorm(ClipGradBase):
     r"""
-    The Algrithm is the same as paddle.fluid.clip.ClipGradByGlobalNorm
+    The Algrithm is the same as paddle.nn.ClipGradByGlobalNorm
     Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in
     :math:`t\_list` , and limit it to ``clip_norm`` .
 
@@ -113,8 +114,8 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
                 continue
             merge_grad = g
             if g.type == core.VarDesc.VarType.SELECTED_ROWS:
-                merge_grad = layers.merge_selected_rows(g)
-                merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
+                merge_grad = clip.merge_selected_rows(g)
+                merge_grad = clip.get_tensor_from_selected_rows(merge_grad)
             sum_square = _squared_l2_norm(merge_grad)
             if sum_square.dtype == core.VarDesc.VarType.FP16:
                 sum_square_list_fp16.append(sum_square)
diff --git a/python/paddle/incubate/optimizer/distributed_fused_lamb.py b/python/paddle/incubate/optimizer/distributed_fused_lamb.py
index bc2837fa2fe58f8b2e5dcaddf59e806471823b29..9aa51cd8122e68114e610714672980ba132f9629 100644
--- a/python/paddle/incubate/optimizer/distributed_fused_lamb.py
+++ b/python/paddle/incubate/optimizer/distributed_fused_lamb.py
@@ -16,11 +16,11 @@ import os
 
 import paddle
 from paddle.fluid import core, framework, unique_name
-from paddle.fluid.clip import ClipGradByGlobalNorm
 from paddle.fluid.executor import global_scope
 from paddle.fluid.framework import Variable, name_scope
 from paddle.fluid.layer_helper import LayerHelper
 from paddle.fluid.optimizer import Optimizer
+from paddle.nn import ClipGradByGlobalNorm
 
 
 def init_communicator(block, rank, ranks, ring_id):
diff --git a/python/paddle/jit/dy2static/convert_operators.py b/python/paddle/jit/dy2static/convert_operators.py
index 3ec3dba88df2bcbf479a74c785cfbcbe970b7a4e..328b879c5aab62905fac59e752281a4c05cefc44 100644
--- a/python/paddle/jit/dy2static/convert_operators.py
+++ b/python/paddle/jit/dy2static/convert_operators.py
@@ -42,10 +42,12 @@ def convert_attr(x, attr):
 def indexable(x, code=None):
     if isinstance(x, Variable):
         return x
-    if hasattr(x, '__len__') and hasattr(x, '__getitem__'):
-        return x
-    if hasattr(x, '__iter__'):
+    elif hasattr(x, '__iter__'):
         return [i for i in x]
+    elif hasattr(x, '__len__') and hasattr(
+        x, '__getitem__'
+    ):  # used for customed type and non-iterable type.
+        return x
     else:
         raise RuntimeError("X can't be convert into indexable.")
 
diff --git a/python/paddle/nn/clip.py b/python/paddle/nn/clip.py
index 61143175fd4af5070ab72036de7c0cc47778aa43..10eeb6319063c1468b20bc2b03c0528e82b77bf6 100644
--- a/python/paddle/nn/clip.py
+++ b/python/paddle/nn/clip.py
@@ -12,9 +12,1074 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# TODO: define the functions to clip gradient of parameter
-from ..fluid.clip import ClipGradByGlobalNorm  # noqa: F401
-from ..fluid.clip import ClipGradByNorm  # noqa: F401
-from ..fluid.clip import ClipGradByValue  # noqa: F401
+import copy
+import warnings
+
+import paddle
+import paddle.autograd as imperative_base
+from paddle import _C_ops, _legacy_C_ops
+from paddle.common_ops_import import Variable, check_type, default_main_program
+from paddle.fluid import core, framework, layers, unique_name
+from paddle.fluid.data_feeder import check_variable_and_dtype
+from paddle.framework import LayerHelper, _non_static_mode, in_dygraph_mode
+from paddle.tensor.layer_function_generator import templatedoc
 
 __all__ = []
+
+
+@templatedoc()
+def clip_by_norm(x, max_norm, name=None):
+    """
+    ${comment}
+
+    Args:
+        x(${x_type}): ${x_comment}
+        max_norm(${max_norm_type}): ${max_norm_comment}
+        name(str, optional): For detailed information, please refer
+            to :ref:`api_guide_Name`. Usually name is no need to set and
+            None by default.
+
+    Returns:
+        Tensor:
+
+        out(${out_type}): ${out_comment}
+
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            from paddle.nn import clip
+
+            input = paddle.to_tensor([[2.0, 2.0], [2.0, 2.0]], dtype='float32')
+            reward = clip.clip_by_norm(x=input, max_norm=1.0)
+            # [[0.5, 0.5], [0.5, 0.5]]
+    """
+
+    if in_dygraph_mode():
+        return _C_ops.clip_by_norm(x, max_norm)
+    if _non_static_mode():
+        return _legacy_C_ops.clip_by_norm(x, 'max_norm', max_norm)
+
+    helper = LayerHelper("clip_by_norm", **locals())
+    check_variable_and_dtype(x, 'X', ['float32', 'float16'], 'clip_by_norm')
+    check_type(max_norm, 'max_norm', (float), 'clip_by_norm')
+
+    if name is None:
+        name = unique_name.generate_with_ignorable_key(
+            ".".join([helper.name, 'tmp'])
+        )
+
+    out = helper.create_variable(
+        type=x.type, name=name, dtype=x.dtype, persistable=False
+    )
+
+    helper.append_op(
+        type="clip_by_norm",
+        inputs={"X": x},
+        attrs={"max_norm": max_norm},
+        outputs={"Out": out},
+    )
+
+    return out
+
+
+@templatedoc()
+def merge_selected_rows(x, name=None):
+    """
+    ${comment}
+
+    Args:
+        x(${x_type}): ${x_comment}
+        name(basestring|None): Name of the output.
+
+    Returns:
+        out(${out_type}): ${out_comment}
+
+    Examples:
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+            b = fluid.default_main_program().global_block()
+            var = b.create_var(
+                name="X", dtype="float32", persistable=True,
+                type=fluid.core.VarDesc.VarType.SELECTED_ROWS)
+            y = nn.merge_selected_rows(var)
+    """
+    if in_dygraph_mode():
+        return _C_ops.merge_selected_rows(x)
+
+    if _non_static_mode():
+        return _legacy_C_ops.merge_selected_rows(x)
+
+    helper = LayerHelper("merge_selected_rows", **locals())
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+    helper.append_op(
+        type="merge_selected_rows",
+        inputs={"X": x},
+        attrs={},
+        outputs={"Out": out},
+    )
+    return out
+
+
+@templatedoc()
+def get_tensor_from_selected_rows(x, name=None):
+    """
+    Get tensor data from input with SelectedRows type, and outputs a Tensor.
+
+    .. code-block:: text
+
+        input x is SelectedRows:
+           x.rows = [0, 5, 5, 4, 19]
+           x.height = 20
+           x.value = [[1, 1] [2, 2] [2, 2] [3, 3] [6, 6]]
+
+        Output is LoDTensor:
+           out.shape = [5, 2]
+           out.data = [[1, 1],
+                       [2, 2],
+                       [2, 2],
+                       [3, 3],
+                       [6, 6]]
+
+    Args:
+        x(SelectedRows): Input with SelectedRows type. The data type is float32, float64, int32 or int64.
+        name(str, optional): The default value is None.  Normally there is no need for user to set this property.
+            For more information, please refer to :ref:`api_guide_Name` .
+
+    Returns:
+        Variable: LoDTensor transformed from SelectedRows. The data type is same with input.
+
+    Examples:
+        .. code-block:: python
+
+            from paddle import nnp.py
+            b = fluid.default_main_program().global_block()
+            input = b.create_var(name="X", dtype="float32", persistable=True, type=fluid.core.VarDesc.VarType.SELECTED_ROWS)
+            out = nn.get_tensor_from_selected_rows(input)
+    """
+
+    check_type(x, 'x', Variable, 'get_tensor_from_selected_rows')
+    if x.type != core.VarDesc.VarType.SELECTED_ROWS:
+        raise TypeError(
+            "The type of 'x' in get_tensor_from_selected_rows must be SELECTED_ROWS."
+        )
+    helper = LayerHelper('get_tensor_from_selected_rows', **locals())
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+    helper.append_op(
+        type='get_tensor_from_selected_rows',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={},
+    )
+    return out
+
+
+_clip_by_global_norm_using_mp_type_flag = False
+
+
+def _clip_by_global_norm_using_mp_type(*args):
+    global _clip_by_global_norm_using_mp_type_flag
+    assert len(args) <= 1
+    if len(args) == 1:
+        assert isinstance(args[0], bool)
+        old_value = _clip_by_global_norm_using_mp_type_flag
+        _clip_by_global_norm_using_mp_type_flag = args[0]
+        return old_value
+    else:
+        return _clip_by_global_norm_using_mp_type_flag
+
+
+def _cast_to_mp_type_if_enabled(x):
+    if (
+        x.dtype == core.VarDesc.VarType.FP16
+        or x.dtype == core.VarDesc.VarType.BF16
+    ) and _clip_by_global_norm_using_mp_type():
+        return x.astype(core.VarDesc.VarType.FP32)
+    else:
+        return x
+
+
+def _squared_l2_norm(x):
+    r"""
+    Return the squared L2 norm of a tensor.
+    """
+
+    x = _cast_to_mp_type_if_enabled(x)
+    if (
+        core.is_compiled_with_xpu()
+        or x.dtype == core.VarDesc.VarType.FP16
+        or x.dtype == core.VarDesc.VarType.BF16
+    ):
+        square = paddle.square(x)
+        sum_square = paddle.sum(square)
+        return sum_square
+
+    if in_dygraph_mode():
+        return _C_ops.squared_l2_norm(x)
+
+    op_type = 'squared_l2_norm'
+    check_variable_and_dtype(x, 'x', ['float32', 'float64'], op_type)
+    helper = LayerHelper(op_type, **locals())
+    out = helper.create_variable_for_type_inference(x.dtype)
+
+    inputs = {"X": x}
+    outputs = {'Out': out}
+    helper.append_op(type=op_type, inputs=inputs, outputs=outputs)
+    return out
+
+
+class BaseErrorClipAttr:
+    def __str__(self):
+        raise NotImplementedError()
+
+    def _append_clip_op(self, block, grad_name):
+        raise NotImplementedError()
+
+
+class ErrorClipByValue(BaseErrorClipAttr):
+    r"""
+    Clip tensor values to the range [min, max].
+
+    Given a tensor ``t`` (see Examples below), this operation clips its value \
+    to ``min`` and ``max`` inplace.
+
+    - Any values less than min are set to min.
+    - Any values greater than max are set to max.
+
+    Args:
+        max (float): The maximum value to clip by.
+        min (float, optional): The minimum value to clip by. if not set by user, \
+        will be set to ``-max`` by framework.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+            import paddle
+            paddle.enable_static()
+            BATCH_SIZE = 128
+            CLIP_MAX = 2e-6
+            CLIP_MIN = -1e-6
+            prog = fluid.framework.Program()
+            with fluid.program_guard(main_program=prog):
+                image = fluid.layers.data(
+                    name='x', shape=[784], dtype='float32')
+                hidden1 = fluid.layers.fc(input=image, size=128, act='relu')
+                hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu')
+                predict = fluid.layers.fc(
+                    input=hidden2, size=10, act='softmax')
+                label = fluid.layers.data(name='y', shape=[1], dtype='int64')
+                cost = paddle.nn.functional.cross_entropy(input=predict, label=label)
+                avg_cost = paddle.mean(cost)
+            prog_clip = prog.clone()
+            prog_clip.block(0).var(hidden1.name)._set_error_clip(
+                paddle.nn.clip.ErrorClipByValue(
+                    max=CLIP_MAX, min=CLIP_MIN)
+                    )
+    """
+
+    def __init__(self, max, min=None):
+        max = float(max)
+        if min is None:
+            min = -max
+        else:
+            min = float(min)
+        self.max = max
+        self.min = min
+
+    def __str__(self):
+        return "ByValue, min=%f, max=%f" % (self.min, self.max)
+
+    def _append_clip_op(self, block, grad_name):
+        clip_op_desc = block.desc.append_op()
+        clip_op_desc.set_type("clip")
+        clip_op_desc.set_input("X", [grad_name])
+        clip_op_desc.set_output("Out", [grad_name])
+        clip_op_desc._set_attr("min", self.min)
+        clip_op_desc._set_attr("max", self.max)
+
+
+def error_clip_callback(block, context):
+    # the context is a grad_to_var map
+    grad_to_var = context
+    op_desc = block.desc.op(block.desc.op_size() - 1)
+    for grad_n in [n for n in op_desc.output_arg_names() if n in grad_to_var]:
+        fwd_var = block._var_recursive(grad_to_var[grad_n])
+        error_clip = getattr(fwd_var, "error_clip", None)
+        if not (
+            error_clip is None or isinstance(error_clip, BaseErrorClipAttr)
+        ):
+            raise TypeError(
+                "Variable's error_clip should be an instance of BaseErrorClipAttr or None."
+            )
+        if error_clip is not None:
+            error_clip._append_clip_op(block, grad_n)
+
+
+class ClipGradBase:
+    def __init__(self):
+        super().__init__()
+
+    def __str__(self):
+        raise NotImplementedError()
+
+    @imperative_base.no_grad()
+    def _dygraph_clip(self, params_grads):
+        raise NotImplementedError
+
+    def _static_clip(self, params_grads):
+        raise NotImplementedError
+
+    def __call__(self, params_grads):
+        if _non_static_mode():
+            return self._dygraph_clip(params_grads)
+        else:
+            for p, g in params_grads:
+                if getattr(p, 'gradient_clip_attr', None) is not None:
+                    warnings.warn(
+                        "'set_gradient_clip' will be ineffective, because you have "
+                        "set 'need_clip' in 'ParamAttr'. So, 'set_gradient_clip' "
+                        "is redundant and you can remove it."
+                    )
+                    break
+            return self._static_clip(params_grads)
+
+    def _process_context(self, context, param, grad):
+        raise NotImplementedError()
+
+    def _create_operators(self, param, grad):
+        raise NotImplementedError()
+
+
+class ClipGradByValue(ClipGradBase):
+    """
+    Limit the value of multi-dimensional Tensor :math:`X` to the range [min, max].
+
+    - Any values less than min are set to ``min``.
+
+    - Any values greater than max are set to ``max``.
+
+    The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
+    If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.
+
+    Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer``
+    (for example: :ref:`api_paddle_optimizer_SGD`).
+
+    Note:
+        ``need_clip`` of ``ClipGradByValue`` HAS BEEN DEPRECATED since 2.0.
+        Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope.
+
+    Args:
+        max (float): The maximum value to clip by.
+        min (float, optional): The minimum value to clip by. if not set by user, it will be set to ``-max``
+            automatically. In this case, ``max`` must be greater than 0.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+
+            x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
+            linear = paddle.nn.Linear(in_features=10, out_features=10,
+                                      weight_attr=paddle.ParamAttr(need_clip=True),
+                                      bias_attr=paddle.ParamAttr(need_clip=False))
+            out = linear(x)
+            loss = paddle.mean(out)
+            loss.backward()
+
+            clip = paddle.nn.ClipGradByValue(min=-1, max=1)
+            sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
+            sdg.step()
+    """
+
+    def __init__(self, max, min=None):
+        super().__init__()
+        if min is None:
+            assert max > 0.0
+            min = -max
+        self.max = float(max)
+        self.min = float(min)
+
+    def __str__(self):
+        return "Clip Gradient By Value, min = %f, max=%f" % (self.min, self.max)
+
+    @imperative_base.no_grad()
+    def _dygraph_clip(self, params_grads):
+        params_and_grads = []
+        for p, g in params_grads:
+            if g is None:
+                continue
+            if getattr(p, 'need_clip', True) is False:
+                params_and_grads.append((p, g))
+                continue
+            new_grad = paddle.clip(x=g, min=self.min, max=self.max)
+            params_and_grads.append((p, new_grad))
+        return params_and_grads
+
+    def _static_clip(self, params_grads):
+        params_and_grads = []
+        param_new_grad_name_dict = dict()
+        with framework.name_scope('gradient_clip'):
+            for p, g in params_grads:
+                if g is None:
+                    continue
+                if getattr(p, 'need_clip', True) is False:
+                    params_and_grads.append((p, g))
+                    continue
+
+                with p.block.program._optimized_guard([p, g]):
+                    new_grad = paddle.clip(x=g, min=self.min, max=self.max)
+                params_and_grads.append((p, new_grad))
+                param_new_grad_name_dict[p.name] = new_grad.name
+        _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict)
+        return params_and_grads
+
+    def _process_context(self, context, param, grad):
+        pass
+
+    def _create_operators(self, param, grad):
+        new_grad = paddle.clip(x=grad, min=self.min, max=self.max)
+        return param, new_grad
+
+
+class ClipGradByNorm(ClipGradBase):
+    r"""
+    Limit the l2 norm of multi-dimensional Tensor :math:`X` to ``clip_norm`` .
+
+    - If the l2 norm of :math:`X` is greater than ``clip_norm`` , :math:`X` will be compressed by a ratio.
+
+    - If the l2 norm of :math:`X` is less than or equal to ``clip_norm`` , nothing will be done.
+
+    The multidimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
+    If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.
+
+    Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer``
+    (for example: :ref:`api_paddle_optimizer_SGD`).
+
+    The clipping formula is:
+
+    .. math::
+        Out =
+        \left\{
+            \begin{array}{ccl}
+                X & & if (norm(X) \leq clip\_norm) \\
+                \frac{clip\_norm*X}{norm(X)} & & if (norm(X) > clip\_norm) \\
+        \end{array}
+        \right.
+
+
+    where :math:`norm(X)` represents the L2 norm of :math:`X`.
+
+    .. math::
+        norm(X) = ( \sum_{i=1}^{n}|x\_i|^2)^{ \frac{1}{2}}
+
+    Note:
+        ``need_clip`` of ``ClipGradByNorm`` HAS BEEN DEPRECATED since 2.0.
+        Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope.
+
+    Args:
+        clip_norm(float): The maximum norm value.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+
+            x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
+            linear = paddle.nn.Linear(in_features=10, out_features=10,
+                                      weight_attr=paddle.ParamAttr(need_clip=True),
+                                      bias_attr=paddle.ParamAttr(need_clip=False))
+            out = linear(x)
+            loss = paddle.mean(out)
+            loss.backward()
+
+            clip = paddle.nn.ClipGradByNorm(clip_norm=1.0)
+            sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
+            sdg.step()
+    """
+
+    def __init__(self, clip_norm):
+        super().__init__()
+        self.clip_norm = float(clip_norm)
+
+    def __str__(self):
+        return "Gradient Clip By Norm, clip_norm=%f" % self.clip_norm
+
+    @imperative_base.no_grad()
+    def _dygraph_clip(self, params_grads):
+        params_and_grads = []
+        for p, g in params_grads:
+            if g is None:
+                continue
+            if getattr(p, 'need_clip', True) is False:
+                params_and_grads.append((p, g))
+                continue
+            new_grad = clip_by_norm(x=g, max_norm=self.clip_norm)
+            params_and_grads.append((p, new_grad))
+        return params_and_grads
+
+    def _static_clip(self, params_grads):
+        params_and_grads = []
+        with framework.name_scope('gradient_clip'):
+            param_new_grad_name_dict = dict()
+            for p, g in params_grads:
+                if g is None:
+                    continue
+                if getattr(p, 'need_clip', True) is False:
+                    params_and_grads.append((p, g))
+                    continue
+
+                with p.block.program._optimized_guard([p, g]):
+                    new_grad = clip_by_norm(x=g, max_norm=self.clip_norm)
+                param_new_grad_name_dict[p.name] = new_grad.name
+                params_and_grads.append((p, new_grad))
+        _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict)
+        return params_and_grads
+
+    def _process_context(self, context, param, grad):
+        pass
+
+    def _create_operators(self, param, grad):
+        new_grad = clip_by_norm(x=grad, max_norm=self.clip_norm)
+        return param, new_grad
+
+
+_allow_pure_fp16_global_norm_clip_flag = False
+
+
+def _allow_pure_fp16_global_norm_clip(*args):
+    global _allow_pure_fp16_global_norm_clip_flag
+    if len(args) == 0:
+        return _allow_pure_fp16_global_norm_clip_flag
+    else:
+        assert len(args) == 1 and isinstance(args[0], bool)
+        old_value = _allow_pure_fp16_global_norm_clip_flag
+        _allow_pure_fp16_global_norm_clip_flag = args[0]
+        return old_value
+
+
+class ClipGradByGlobalNorm(ClipGradBase):
+    r"""
+    Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in
+    :math:`t\_list` , and limit it to ``clip_norm`` .
+
+    - If the global norm is greater than ``clip_norm`` , all elements of :math:`t\_list` will be compressed by a ratio.
+
+    - If the global norm is less than or equal to ``clip_norm`` , nothing will be done.
+
+    The list of Tensor :math:`t\_list` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
+    If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.
+
+    Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer``
+    (for example: :ref:`api_paddle_optimizer_SGD`).
+
+    The clipping formula is:
+
+    .. math::
+
+        t\_list[i] = t\_list[i] * \frac{clip\_norm}{\max(global\_norm, clip\_norm)}
+
+    where:
+
+    .. math::
+
+        global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2}
+
+    Note:
+        ``need_clip`` of ``ClipGradyGlobalNorm`` HAS BEEN DEPRECATED since 2.0.
+        Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope.
+
+    Args:
+        clip_norm (float): The maximum norm value.
+        group_name (str, optional): The group name for this clip. Default value is ``default_group``.
+        auto_skip_clip (bool, optional): skip clipping gradient. Default value is ``False``.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+
+            x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
+            linear = paddle.nn.Linear(in_features=10, out_features=10,
+                                      weight_attr=paddle.ParamAttr(need_clip=True),
+                                      bias_attr=paddle.ParamAttr(need_clip=False))
+            out = linear(x)
+            loss = paddle.mean(out)
+            loss.backward()
+
+            clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
+            sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
+            sdg.step()
+    """
+
+    def __init__(
+        self, clip_norm, group_name="default_group", auto_skip_clip=False
+    ):
+        super().__init__()
+        self.clip_norm = float(clip_norm)
+        self.group_name = group_name
+        assert isinstance(auto_skip_clip, bool)
+        self.auto_skip_clip = auto_skip_clip
+
+    def __str__(self):
+        return "Gradient Clip By GlobalNorm, global_norm=%f" % (self.clip_norm)
+
+    @imperative_base.no_grad()
+    def _dygraph_clip(self, params_grads):
+        params_and_grads = []
+        sum_square_list = []
+        sum_square_list_fp16 = []
+        sum_square_list_fp32 = []
+        for p, g in params_grads:
+            if g is None:
+                continue
+            if getattr(p, 'need_clip', True) is False:
+                continue
+            merge_grad = g
+
+            if in_dygraph_mode() and g.is_selected_rows():
+                merge_grad = merge_selected_rows(g)
+                merge_grad = merge_grad._get_tensor_from_selected_rows()
+
+            elif g.type == core.VarDesc.VarType.SELECTED_ROWS:
+                merge_grad = merge_selected_rows(g)
+                merge_grad = get_tensor_from_selected_rows(merge_grad)
+
+            sum_square = _squared_l2_norm(merge_grad)
+            if (
+                sum_square.dtype == core.VarDesc.VarType.FP16
+                or sum_square.dtype == core.VarDesc.VarType.BF16
+            ):
+                sum_square_list_fp16.append(sum_square)
+            elif sum_square.dtype == core.VarDesc.VarType.FP32:
+                sum_square_list_fp32.append(sum_square)
+            else:
+                sum_square_list.append(sum_square)
+
+        # all parameters have been filterd out
+        if (
+            len(sum_square_list)
+            + len(sum_square_list_fp16)
+            + len(sum_square_list_fp32)
+            == 0
+        ):
+            return params_grads
+
+        sum_dtype = 'float64' if len(sum_square_list) > 0 else "float32"
+        global_norm_var = []
+        if len(sum_square_list_fp16) > 0:
+            global_norm_var_fp16 = paddle.add_n(sum_square_list_fp16)
+            global_norm_var.append(global_norm_var_fp16.astype(sum_dtype))
+        if len(sum_square_list_fp32) > 0:
+            global_norm_var_fp32 = paddle.add_n(sum_square_list_fp32)
+            if sum_dtype == 'float32':
+                global_norm_var.append(global_norm_var_fp32)
+            else:
+                global_norm_var.append(global_norm_var_fp32.astype(sum_dtype))
+        if len(sum_square_list) > 0:
+            global_norm_var_fp64 = paddle.add_n(sum_square_list)
+            global_norm_var.append(global_norm_var_fp64)
+        global_norm_var = paddle.add_n(global_norm_var)
+        global_norm_var = paddle.sqrt(global_norm_var)
+        max_global_norm = paddle.full(
+            shape=[1], dtype=global_norm_var.dtype, fill_value=self.clip_norm
+        )
+
+        need_clip = False
+        if not self.auto_skip_clip:  # always apply clip
+            need_clip = True
+            clip_var = paddle.divide(
+                x=max_global_norm,
+                y=paddle.maximum(x=global_norm_var, y=max_global_norm),
+            )
+        elif global_norm_var > max_global_norm:
+            # only when global_norm_var > max_global_norm, grad need clip
+            need_clip = True
+            clip_var = paddle.divide(x=max_global_norm, y=global_norm_var)
+
+        for p, g in params_grads:
+            if g is None:
+                continue
+            if getattr(p, 'need_clip', True) is False:
+                params_and_grads.append((p, g))
+                continue
+            # TODO(wangxi): use inplace elementwise_mul
+            if need_clip:
+                clip_input = (
+                    clip_var.astype(g.dtype)
+                    if clip_var.dtype != g.dtype
+                    else clip_var
+                )
+                new_grad = paddle.multiply(g, clip_input)
+                params_and_grads.append((p, new_grad))
+            else:
+                params_and_grads.append((p, g))
+
+        return params_and_grads
+
+    def _static_clip(self, params_grads):
+        params_and_grads = []
+        sum_square_list = []
+        sum_square_list_fp16 = []
+        sum_square_list_fp32 = []
+        with framework.name_scope('gradient_clip'):
+            for p, g in params_grads:
+                if g is None:
+                    continue
+                if getattr(p, 'need_clip', True) is False:
+                    continue
+                merge_grad = g
+                with p.block.program._optimized_guard([p, g]):
+                    if g.type == core.VarDesc.VarType.SELECTED_ROWS:
+                        merge_grad = merge_selected_rows(g)
+                        merge_grad = get_tensor_from_selected_rows(merge_grad)
+                    sum_square = _squared_l2_norm(merge_grad)
+                    if sum_square.dtype == core.VarDesc.VarType.FP16:
+                        sum_square_list_fp16.append(sum_square)
+                    elif sum_square.dtype == core.VarDesc.VarType.FP32:
+                        sum_square_list_fp32.append(sum_square)
+                    else:
+                        sum_square_list.append(sum_square)
+
+            # all parameters have been filterd out
+            if (
+                len(sum_square_list)
+                + len(sum_square_list_fp16)
+                + len(sum_square_list_fp32)
+                == 0
+            ):
+                return params_grads
+
+            with p.block.program._optimized_guard([p, g]):
+                sum_dtype = 'float64' if len(sum_square_list) > 0 else "float32"
+
+                global_norm_var = []
+                if len(sum_square_list_fp16) > 0:
+                    global_norm_var_fp16 = layers.sums(sum_square_list_fp16)
+                    if (
+                        sum_square_list_fp32
+                        or sum_square_list
+                        or not _allow_pure_fp16_global_norm_clip()
+                    ):
+                        global_norm_var.append(
+                            global_norm_var_fp16.astype(sum_dtype)
+                        )
+                    else:
+                        global_norm_var.append(global_norm_var_fp16)
+                if len(sum_square_list_fp32) > 0:
+                    global_norm_var_fp32 = layers.sums(sum_square_list_fp32)
+                    if sum_dtype == 'float32':
+                        global_norm_var.append(global_norm_var_fp32)
+                    else:
+                        global_norm_var.append(
+                            global_norm_var_fp32.astype(sum_dtype)
+                        )
+                if len(sum_square_list) > 0:
+                    # fp64
+                    global_norm_var_other_dtype = layers.sums(sum_square_list)
+                    global_norm_var.append(global_norm_var_other_dtype)
+
+                global_norm_var = (
+                    layers.sums(global_norm_var)
+                    if len(global_norm_var) > 1
+                    else global_norm_var[0]
+                )
+                global_norm_var = paddle.sqrt(x=global_norm_var)
+                max_global_norm = paddle.full(
+                    shape=[1],
+                    dtype=global_norm_var.dtype,
+                    fill_value=self.clip_norm,
+                )
+                scale_var = paddle.divide(
+                    x=max_global_norm,
+                    y=paddle.maximum(x=max_global_norm, y=global_norm_var),
+                )
+            param_new_grad_name_dict = dict()
+            for p, g in params_grads:
+                if g is None:
+                    continue
+                if getattr(p, 'need_clip', True) is False:
+                    params_and_grads.append((p, g))
+                    continue
+
+                with p.block.program._optimized_guard([p, g]):
+                    new_g = _cast_to_mp_type_if_enabled(g)
+                    # inplace
+                    scale_input = (
+                        scale_var.astype('float16')
+                        if new_g.dtype == core.VarDesc.VarType.FP16
+                        and scale_var.dtype != core.VarDesc.VarType.FP16
+                        else scale_var
+                    )
+                    # NOTE(Yuang Liu): For pure dp with gradient merge, the p and g
+                    # will be in different blocks with the gradient clip related ops.
+                    # We need to handle the correct block, otherwise will encounter
+                    # a 'NotFoundError' during compile time.
+                    block = default_main_program().current_block()
+                    block.append_op(
+                        type='elementwise_mul',
+                        inputs={'X': new_g, 'Y': scale_input},
+                        outputs={'Out': new_g},
+                    )
+                    if new_g is not g:
+                        block.append_op(
+                            type='cast',
+                            inputs={'X': new_g},
+                            outputs={'Out': g},
+                            attrs={
+                                'in_dtype': new_g.dtype,
+                                'out_dtype': g.dtype,
+                            },
+                        )
+
+                param_new_grad_name_dict[p.name] = g.name
+                params_and_grads.append((p, g))
+
+        _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict)
+        return params_and_grads
+
+    def _process_context(self, context, param, grad):
+        if self.group_name not in context:
+            context[self.group_name] = []
+            context[self.group_name + "_clip_value"] = self.clip_norm
+            context[self.group_name + "_clip"] = paddle.full(
+                shape=[1], dtype=grad.dtype, fill_value=self.clip_norm
+            )
+        else:
+            if not self.clip_norm == context[self.group_name + "_clip_value"]:
+                raise ValueError(
+                    "All parameters' 'clip_norm' of a same group should be the same"
+                )
+
+        merge_grad = grad
+        if grad.type == core.VarDesc.VarType.SELECTED_ROWS:
+            merge_grad = merge_selected_rows(grad)
+            merge_grad = get_tensor_from_selected_rows(merge_grad)
+
+        local_norm_var = _squared_l2_norm(merge_grad)
+        context[self.group_name].append(local_norm_var)
+
+        self.context = context
+
+    def _create_operators(self, param, grad):
+        group_scale_name = self.group_name + "_scale"
+        if group_scale_name not in self.context:
+            group_norm_var = layers.sums(input=self.context[self.group_name])
+            group_norm_var = paddle.sqrt(x=group_norm_var)
+            clip_var = self.context[self.group_name + "_clip"]
+            group_scale_var = paddle.divide(
+                x=clip_var,
+                y=paddle.maximum(x=clip_var, y=group_norm_var),
+            )
+            assert group_scale_var.shape == (1,)
+            self.context[group_scale_name] = group_scale_var
+
+        # inplace
+        param.block.append_op(
+            type='elementwise_mul',
+            inputs={'X': grad, 'Y': self.context[group_scale_name]},
+            outputs={'Out': grad},
+        )
+
+        return param, grad
+
+
+@framework.dygraph_not_support
+def set_gradient_clip(clip, param_list=None, program=None):
+    """
+    Warning:
+
+        This API must be used after building network, and before ``minimize`` ,
+        and it may be removed in future releases, so it is not recommended.
+        It is recommended to set ``grad_clip`` when initializing the ``optimizer`` ,
+        this is a better method to clip gradient. There are three clipping strategies:
+         :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
+         :ref:`api_fluid_clip_GradientClipByValue` .
+
+    To specify parameters that require gradient clip.
+
+    Args:
+        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
+            some derived class of ``GradientClipBase`` . There are three cliping strategies
+            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
+            :ref:`api_fluid_clip_GradientClipByValue` ). Default value: None, and there is no
+            gradient clipping.
+        param_list (list(Variable), optional): Parameters that require gradient clip.
+                It can be a list of parameter or a list of parameter's name.
+                Default None, meaning that all parameters in the program will be included.
+        program (Program, optional): The program where parameters are located.
+                Default None, meaning that using :ref:`api_fluid_default_main_program` .
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.fluid as fluid
+
+            paddle.enable_static()
+
+            def network():
+                image = fluid.data(name='image', shape=[
+                                   None, 28], dtype='float32')
+                param_attr1 = fluid.ParamAttr("fc1_param")
+                fc1 = fluid.layers.fc(image, size=10, param_attr=param_attr1)
+                param_attr2 = fluid.ParamAttr("fc2_param")
+                fc2 = fluid.layers.fc(fc1, size=10, param_attr=param_attr2)
+                loss = paddle.mean(fc2)
+                return loss
+
+
+            # network 1: clip all parameter gradient
+            with fluid.program_guard(fluid.Program(), fluid.Program()):
+                loss = network()
+                paddle.nn.clip.set_gradient_clip(
+                    paddle.nn.ClipGradByGlobalNorm(clip_norm=2.0))
+                sgd = fluid.optimizer.SGD(learning_rate=1e-3)
+                sgd.minimize(loss)
+
+            # network 2: clip parameter gradient by name
+            with fluid.program_guard(fluid.Program(), fluid.Program()):
+                loss = network()
+                paddle.nn.clip.set_gradient_clip(
+                    paddle.nn.ClipGradByValue(min=-1.0, max=1.0),
+                    param_list=["fc1_param", "fc2_param"])
+                sgd = fluid.optimizer.SGD(learning_rate=1e-3)
+                sgd.minimize(loss)
+
+            # network 3: clip parameter gradient by value
+            with fluid.program_guard(fluid.Program(), fluid.Program()):
+                loss = network()
+                param_var1 = fluid.default_main_program().global_block().var("fc1_param")
+                param_var2 = fluid.default_main_program().global_block().var("fc2_param")
+                paddle.nn.clip.set_gradient_clip(
+                    paddle.nn.ClipGradByValue(min=-1.0, max=1.0),
+                    param_list=[param_var1, param_var2])
+                sgd = fluid.optimizer.SGD(learning_rate=1e-3)
+                sgd.minimize(loss)
+
+            # network 4: use 'set_gradient_clip' and 'optimize(grad_clip=clip)' together
+            with fluid.program_guard(fluid.Program(), fluid.Program()):
+                loss = network()
+                clip1 = paddle.nn.ClipGradByValue(min=-1.0, max=1.0)
+                clip2 = paddle.nn.ClipGradByNorm(clip_norm=1.0)
+                # Set the gradient clipping strategy: clip1
+                paddle.nn.clip.set_gradient_clip(clip1)
+                # Set the gradient clipping strategy: clip2
+                sgd = fluid.optimizer.SGD(learning_rate=1e-3, grad_clip=clip2)
+                sgd.minimize(loss)
+                # 'set_gradient_clip' will not take effect when setting has a conflict,
+                # and the gradient clipping strategy will be 'clip2'
+
+
+    """
+    warnings.warn(
+        "Caution! 'set_gradient_clip' is not recommended "
+        "and may be deprecated in future! "
+        "We recommend a new strategy: set 'grad_clip' "
+        "when initializing the 'optimizer'. "
+        "This method can reduce the mistakes, please "
+        "refer to documention of 'optimizer'."
+    )
+
+    if not isinstance(clip, ClipGradBase):
+        raise TypeError(
+            "'clip' should be an instance of ClipGradBase's derived class"
+        )
+    if program is None:
+        program = framework.default_main_program()
+
+    for op in program.block(0).ops:
+        if 'op_namescope' in op.all_attrs() and "optimizer" in op.attr(
+            "op_namescope"
+        ):
+            warnings.warn(
+                "'minimize' has been invoked before, this will make 'set_gradient_clip' "
+                "be ineffective! Please invoke 'set_gradient_clip' before 'minimize'."
+            )
+            break
+
+    if param_list is None:
+        param_list = program.block(0).all_parameters()
+    if all(isinstance(elem, str) for elem in param_list):
+        param_list = [program.block(0).var(elem) for elem in param_list]
+    if not all(isinstance(elem, framework.Parameter) for elem in param_list):
+        raise TypeError(
+            "'param_list' should be a list of Parameter or basestring(parameter's name)."
+        )
+
+    for param in param_list:
+        param.gradient_clip_attr = copy.deepcopy(clip)
+
+
+def append_gradient_clip_ops(param_grads):
+    context = dict()
+    for p, g in param_grads:
+        if g is None:
+            continue
+        with p.block.program._optimized_guard([p, g]), framework.name_scope(
+            'gradient_clip'
+        ):
+            clip_attr = getattr(p, 'gradient_clip_attr', None)
+            if clip_attr is None:
+                return param_grads
+            if not isinstance(clip_attr, ClipGradBase):
+                raise TypeError(
+                    "clip attribute should be an instance of GradientClipBase"
+                )
+
+            clip_attr._process_context(context=context, param=p, grad=g)
+
+    res = []
+    param_new_grad_name_dict = dict()
+    for p, g in param_grads:
+        if g is None:
+            continue
+        with p.block.program._optimized_guard([p, g]), framework.name_scope(
+            'gradient_clip'
+        ):
+            param, new_grad = clip_attr._create_operators(param=p, grad=g)
+            param_new_grad_name_dict[param.name] = new_grad.name
+            res.append([param, new_grad])
+
+    _correct_clip_op_role_var(res, param_new_grad_name_dict)
+    return res
+
+
+# change wrong mapping relation between param & grad in clip op
+# Note: This function is sensitive to the time cost of the network with gradient clipping
+# and should not be changed easily. If you must change, please test the time cost.
+def _correct_clip_op_role_var(params_grads, param_new_grad_name_dict):
+    block_id_list = []
+    if len(param_new_grad_name_dict) == 0:
+        return
+    for param, grad in params_grads:
+        if grad is None:
+            continue
+        block_id = param.block.idx
+        if block_id in block_id_list:
+            continue
+        block_id_list.append(block_id)
+        for op in param.block.program.global_block().ops:
+            if (
+                op.has_attr("op_namescope")
+                and "gradient_clip" in op.attr("op_namescope")
+                and op.attr('op_role_var')
+            ):
+                param_name = op.attr('op_role_var')[0]
+                if param_name in param_new_grad_name_dict:
+                    correct_p_g = [
+                        param_name,
+                        param_new_grad_name_dict[param_name],
+                    ]
+                    op._set_attr('op_role_var', correct_p_g)
+
+
+GradientClipBase = ClipGradBase
+GradientClipByValue = ClipGradByValue
+GradientClipByNorm = ClipGradByNorm
+GradientClipByGlobalNorm = ClipGradByGlobalNorm
diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py
index ceaa6e5e4a8dbc1514177390af76fc1a5ba213f0..74a97e25938ed300620dcb997205985176f74ca8 100644
--- a/python/paddle/nn/functional/conv.py
+++ b/python/paddle/nn/functional/conv.py
@@ -984,6 +984,13 @@ def conv1d_transpose(
             )
         )
 
+    if len(weight.shape) != 3:
+        raise ValueError(
+            'Input weight should be 3D tensor, but received weight with the shape of {}'.format(
+                weight.shape
+            )
+        )
+
     op_type = 'conv2d_transpose'
     num_filters = weight.shape[1]
     if (
diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py
index f03b3af2df97084e1bc2e5bd9d67b1442a19d3ee..a4d304b451e7b3cad3fdab97bf05e7854146a260 100644
--- a/python/paddle/optimizer/adamw.py
+++ b/python/paddle/optimizer/adamw.py
@@ -20,10 +20,10 @@ import paddle
 
 from .. import _C_ops
 from ..fluid import core, framework, unique_name
-from ..fluid.clip import GradientClipBase
 from ..fluid.dygraph import base as imperative_base
 from ..fluid.framework import Parameter, Variable
 from ..fluid.layer_helper import LayerHelper
+from ..nn.clip import GradientClipBase
 from .lr import LRScheduler
 from .optimizer import Optimizer
 
diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py
index d5f18130a4c63e0883638773cf015872d2b22288..1799461254ced546eb35ac119d0cf893169c854e 100644
--- a/python/paddle/optimizer/optimizer.py
+++ b/python/paddle/optimizer/optimizer.py
@@ -18,6 +18,7 @@ from collections import defaultdict
 import numpy as np
 
 import paddle
+import paddle.autograd as imperative_base
 from paddle import _C_ops
 from paddle.fluid import core
 from paddle.fluid.framework import (
@@ -32,12 +33,6 @@ from paddle.fluid.framework import (
 
 from ..fluid import framework, unique_name
 from ..fluid.backward import _get_no_grad_set_name, append_backward
-from ..fluid.clip import (
-    GradientClipBase,
-    append_gradient_clip_ops,
-    error_clip_callback,
-)
-from ..fluid.dygraph import base as imperative_base
 from ..fluid.framework import Parameter, program_guard
 from ..fluid.initializer import Constant
 from ..fluid.layer_helper import LayerHelper
@@ -168,7 +163,7 @@ class Optimizer:
 
     """
 
-    @imperative_base.no_grad
+    @imperative_base.no_grad()
     def __init__(
         self,
         learning_rate,
@@ -225,7 +220,7 @@ class Optimizer:
                 % type(learning_rate)
             )
         if grad_clip is not None:
-            if not isinstance(grad_clip, GradientClipBase):
+            if not isinstance(grad_clip, paddle.nn.clip.GradientClipBase):
                 raise TypeError(
                     "'grad_clip' should be an instance of GradientClipBase's derived class"
                 )
@@ -1042,7 +1037,7 @@ class Optimizer:
                     params_grads.append((parameter_list[index], grad))
         else:
             if callbacks is None:
-                callbacks = [error_clip_callback]
+                callbacks = [paddle.nn.clip.error_clip_callback]
             else:
                 assert isinstance(callbacks, list)
             program = loss.block.program
@@ -1103,7 +1098,7 @@ class Optimizer:
             params_grads = self._grad_clip(params_grads)
         else:
 
-            params_grads = append_gradient_clip_ops(params_grads)
+            params_grads = paddle.nn.clip.append_gradient_clip_ops(params_grads)
 
         # Add regularization if any
         params_grads = self.append_regularization_ops(
@@ -1317,7 +1312,7 @@ class Optimizer:
         else:
             core.clear_gradients(param_list, set_to_zero)
 
-    @imperative_base.no_grad
+    @imperative_base.no_grad()
     def minimize(
         self, loss, startup_program=None, parameters=None, no_grad_set=None
     ):
@@ -1380,7 +1375,7 @@ class Optimizer:
 
         return optimize_ops, params_grads
 
-    @imperative_base.no_grad
+    @imperative_base.no_grad()
     @framework.dygraph_only
     def step(self):
         """
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index 9b6d0fecf617247de4cbb2237db85923c23f1b8f..842deaac991a9c0ef006d21175e86e6c7b5767a4 100644
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -3450,7 +3450,7 @@ def reshape(x, shape, name=None):
     Args:
         x (Tensor): An N-D Tensor. The data type is ``float32``, ``float64``, ``int32``, ``int64`` or ``bool``
         shape (list|tuple|Tensor): Define the target shape. At most one dimension of the target shape can be -1.
-                        The data type is ``int32`` . If ``shape`` is a list or tuple, the elements of it should be integers or Tensors with shape [1].
+                        The data type is ``int32`` . If ``shape`` is a list or tuple, the elements of it should be integers or Tensors with shape [].
                         If ``shape`` is an Tensor, it should be an 1-D Tensor .
         name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
 
@@ -3574,10 +3574,6 @@ def reshape(x, shape, name=None):
             shape.stop_gradient = True
             inputs["Shape"] = shape
         elif isinstance(shape, (list, tuple)):
-            assert len(shape) > 0, (
-                "The size of 'shape' in reshape can't be zero, "
-                "but received %s." % len(shape)
-            )
             attrs["shape"] = get_attr_shape(shape)
             if utils._contain_var(shape):
                 inputs['ShapeTensor'] = utils._convert_to_tensor_list(shape)
diff --git a/python/paddle/utils/flops.py b/python/paddle/utils/flops.py
index a930e0ef5488d8a492b76a69cd02c633c695f0ed..71f54ee29cbe9f29ef35a3ee836a875feb83d73e 100644
--- a/python/paddle/utils/flops.py
+++ b/python/paddle/utils/flops.py
@@ -73,7 +73,7 @@ def _c_embedding_flops(input_shapes, attrs):
 def _dropout_flops(input_shapes, attrs):
     """FLOPs computation for dropout op.
     For dropout(input):
-    equation: flops = 0
+        equation: flops = 0
     """
     return 0
 
@@ -191,7 +191,7 @@ def _matmul_v2_flops(input_shapes, attrs):
     """FLOPs computation for matmul_v2 op.
     For matmul_v2(input,other):
         input_shapes = [shape_of_input, shape_of_ohther]
-        shape_of_input =                  [dim1, dim2 ...dim_n_1, dim_n] length:n
+        shape_of_input =                   [dim1, dim2 ...dim_n_1, dim_n] length:n
         shape_of_other = [odim1, odim2 ... odim(n-m) ... odim_m_1, dim_m] length:m
         suppose n > m and dim_n = odim_m_1:
         shape_of_output = [dim1, dim2 ... max(dim(n-m), odim(n-m)), max(dim(n-m+1), odim(n-m+1))...dim_n_1, dim_m]
@@ -216,13 +216,43 @@ def _matmul_v2_flops(input_shapes, attrs):
     return 2 * macs
 
 
-@register_flops("relu")
-def _relu_flops(input_shapes, attrs):
-    """FLOPs computation for relu op.
-    For relu(input):
+def _relu_class_flops(input_shapes, attrs):
+    """FLOPs computation for relu_like ops.
+    For elu/leaky_relu/prelu/relu/relu6/silu (input):
         equation: flops = (numel)total number of elements in the input tensor.
     """
-    return prod(input_shapes.get('X')[0])
+    input = input_shapes.get('X')[0]
+    return prod(input)
+
+
+@register_flops("elu")
+def _elu_flops(input_shapes, attrs):
+    return _relu_class_flops(input_shapes, attrs)
+
+
+@register_flops("leaky_relu")
+def _leaky_relu_flops(input_shapes, attrs):
+    return _relu_class_flops(input_shapes, attrs)
+
+
+@register_flops("prelu")
+def _prelu_flops(input_shapes, attrs):
+    return _relu_class_flops(input_shapes, attrs)
+
+
+@register_flops("relu")
+def _relu_flops(input_shapes, attrs):
+    return _relu_class_flops(input_shapes, attrs)
+
+
+@register_flops("relu6")
+def _relu6_flops(input_shapes, attrs):
+    return _relu_class_flops(input_shapes, attrs)
+
+
+@register_flops("silu")
+def _silu_flops(input_shapes, attrs):
+    return _relu_class_flops(input_shapes, attrs)
 
 
 @register_flops("reshape2")
diff --git a/tools/get_single_test_cov.py b/tools/get_single_test_cov.py
index 8cdf32f348f090b14b7d33284b400f708f611d1c..b3b1df8afe29a5ceecf57ee5d43d3cdba522cc96 100644
--- a/tools/get_single_test_cov.py
+++ b/tools/get_single_test_cov.py
@@ -12,12 +12,25 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import json
 import os
 import re
+import subprocess
 import sys
 
 
 def getFNDAFile(rootPath, test):
+    # load base fnda
+    fnda_base_dict = {}
+    find_file_cmd = os.popen("find %s -name %s.cc" % (rootPath, test))
+    if find_file_cmd.read() != "":
+        print("%s is a c++ unittest" % test)
+        with open(
+            "%s/build/ut_map/simple_precision_test/base_fnda.json" % rootPath,
+            'r',
+        ) as load_f:
+            fnda_base_dict = json.load(load_f)
+    # analyse fnda
     filename = '%s/build/ut_map/%s/coverage.info.tmp' % (rootPath, test)
     fn_filename = '%s/build/ut_map/%s/fnda.tmp' % (rootPath, test)
     os.system('touch %s' % fn_filename)
@@ -27,15 +40,28 @@ def getFNDAFile(rootPath, test):
     except FileNotFoundError:
         print("%s is not found." % filename)
         return
-    lines = f.readlines()
-    for line in lines:
-        line = line.replace('\n', '')
-        if line.startswith(('SF:')):
-            os.system('echo %s >> %s' % (line, fn_filename))
-        elif line.startswith(('FNDA:')):
-            hit = int(line.split('FNDA:')[1].split(',')[0])
-            if hit != 0:
-                os.system('echo %s >> %s' % (line, fn_filename))
+    all_data = f.read().split('TN:')
+    del all_data[0]
+    for gcov_data in all_data:
+        message_list = gcov_data.split('\n')
+        os.system('echo %s >> %s' % (message_list[1], fn_filename))
+        if 'FNH:0' not in gcov_data:
+            for message in message_list:
+                if message.startswith(('FNDA:')) and (
+                    not message.startswith(('FNDA:0,'))
+                ):
+                    tmp_data = message.split('FNDA:')[1].split(',')
+                    hit = int(tmp_data[0])
+                    symbol = tmp_data[1]
+                    if symbol in fnda_base_dict:
+                        if (hit - fnda_base_dict[symbol]) > 0:
+                            fnda_str = 'FNDA:%s,%s' % (
+                                str(hit - fnda_base_dict[symbol]),
+                                symbol,
+                            )
+                            os.system('echo %s >> %s' % (fnda_str, fn_filename))
+                    else:
+                        os.system('echo %s >> %s' % (message, fn_filename))
     f.close()
 
 
@@ -112,10 +138,55 @@ def analysisFNDAFile(rootPath, test):
     f.close()
 
 
+def getBaseFnda(rootPath, test):
+    filename = '%s/build/ut_map/%s/coverage.info.tmp' % (rootPath, test)
+    try:
+        f = open(filename)
+        print("oepn %s succesfully" % filename)
+    except FileNotFoundError:
+        print("%s is not found." % filename)
+    symbol_fnda = {}
+    all_data = f.read().split('TN:')
+    del all_data[0]
+    for gcov_data in all_data:
+        message_list = gcov_data.split('\n')
+        # only for cc file
+        if ".cc" in message_list[1]:
+            for message in message_list:
+                if message.startswith(('FNDA:')) and (
+                    not message.startswith(('FNDA:0,'))
+                ):
+                    tmp_data = message.split('FNDA:')[1].split(',')
+                    symbol_fnda[tmp_data[1]] = int(tmp_data[0])
+    f.close()
+
+    with open("%s/build/ut_map/%s/base_fnda.json" % (rootPath, test), "w") as f:
+        json.dump(symbol_fnda, f, indent=4)
+
+
 def getCovinfo(rootPath, test):
     ut_map_path = '%s/build/ut_map/%s' % (rootPath, test)
+    print("start get fluid ===>")
+    cmd_fluid = 'lcov --capture -d ./paddle/fluid/ -o ./paddle/fluid/coverage_fluid.info --rc lcov_branch_coverage=0'
+    p_fluid = subprocess.Popen(cmd_fluid, shell=True, stdout=subprocess.DEVNULL)
+
+    print("start get phi ===>")
+    cmd_phi = 'lcov --capture -d ./paddle/phi -o ./paddle/phi/coverage_phi.info --rc lcov_branch_coverage=0'
+    p_phi = subprocess.Popen(cmd_phi, shell=True, stdout=subprocess.DEVNULL)
+
+    print("start get utils ===>")
+    cmd_utils = 'lcov --capture -d ./paddle/utils -o ./paddle/utils/coverage_utils.info --rc lcov_branch_coverage=0'
+    p_utils = subprocess.Popen(cmd_utils, shell=True, stdout=subprocess.DEVNULL)
+
+    print("start wiat fluid ===>")
+    p_fluid.wait()
+    print("start wiat phi ===>")
+    p_phi.wait()
+    print("start wiat utils ===>")
+    p_utils.wait()
+    print("end wait...")
     os.system(
-        'cd %s && lcov --capture -d . -o coverage.info --rc lcov_branch_coverage=0 > /dev/null 2>&1'
+        'cd %s && lcov -a paddle/fluid/coverage_fluid.info -a paddle/phi/coverage_phi.info -a paddle/utils/coverage_utils.info -o coverage.info --rc lcov_branch_coverage=0 > /dev/null 2>&1'
         % ut_map_path
     )
     coverage_info_path = ut_map_path + '/coverage.info'
@@ -139,8 +210,11 @@ def getCovinfo(rootPath, test):
 
     os.system('rm -rf %s/paddle' % ut_map_path)
     os.system('rm -rf %s/coverage.info' % ut_map_path)
-    getFNDAFile(rootPath, test)
-    analysisFNDAFile(rootPath, test)
+    if test == "simple_precision_test":
+        getBaseFnda(rootPath, test)
+    else:
+        getFNDAFile(rootPath, test)
+        analysisFNDAFile(rootPath, test)
     os.system('rm -rf %s/coverage.info.tmp' % ut_map_path)
 
 
diff --git a/tools/handle_h_cu_file.py b/tools/handle_h_cu_file.py
index 1a096fa894e4630749d8773447b18226a5e607df..a33c1cd66811919b5276446af858f68c39d46ffa 100644
--- a/tools/handle_h_cu_file.py
+++ b/tools/handle_h_cu_file.py
@@ -72,6 +72,31 @@ def insert_pile_to_h_file(rootPath):
         os.system('echo "\n#endif" >> %s' % line)
 
 
+def add_simple_cxx_test(rootPath):
+    variant_test_path = '%s/paddle/utils/variant_test.cc' % rootPath
+    variant_test_cmakeflie_path = '%s/paddle/utils/CMakeLists.txt' % rootPath
+    if os.path.exists(variant_test_path) and os.path.exists(
+        variant_test_cmakeflie_path
+    ):
+        simple_test_path = '%s/paddle/utils/simple_precision_test.cc' % rootPath
+        os.system('touch %s' % simple_test_path)
+        os.system(
+            "echo '#include \"gtest/gtest.h\"\n' >> %s" % simple_test_path
+        )
+        os.system(
+            'echo "TEST(interface_test, type) { }\n" >> %s' % simple_test_path
+        )
+        os.system('echo "cc_test(" >> %s' % variant_test_cmakeflie_path)
+        os.system(
+            'echo "  simple_precision_test" >> %s' % variant_test_cmakeflie_path
+        )
+        os.system(
+            'echo "  SRCS simple_precision_test.cc" >> %s'
+            % variant_test_cmakeflie_path
+        )
+        os.system('echo "  DEPS gtest)\n" >> %s' % variant_test_cmakeflie_path)
+
+
 def remove_pile_from_h_file(rootPath):
     h_cu_files = '%s/tools/h_cu_files.log' % rootPath
     f = open(h_cu_files)
@@ -130,6 +155,7 @@ if __name__ == "__main__":
     elif func == 'insert_pile_to_h_file':
         rootPath = sys.argv[2]
         insert_pile_to_h_file(rootPath)
+        add_simple_cxx_test(rootPath)
     elif func == 'analy_h_cu_file':
         dir_path = sys.argv[2]
         rootPath = sys.argv[3]
diff --git a/tools/nvcc_lazy.sh b/tools/nvcc_lazy.sh
index 011ac564cf91ba41b4c851ce7c52187658c359b8..3c6f0e140f250adab7e4f5c6ca1c1ba714473657 100755
--- a/tools/nvcc_lazy.sh
+++ b/tools/nvcc_lazy.sh
@@ -65,12 +65,14 @@ echo "sed -i -e '/LIBRARIES=/{s/\s//g;s/\"\"/ /g}' \${BUILDSH}.pre" >> $1
 echo -e >> $1
 echo "/usr/bin/env bash \${BUILDSH}.pre" >> $1
 echo "STUBF=\$(find \$BUILDDIR -name *.cudafe1.stub.c)" >> $1
-echo "CUFILE=\$(basename -s '.cudafe1.stub.c' \$STUBF)" >> $1
-echo "sed -i -e '/__sti____cudaRegisterAll.*__attribute__/a static void __try____cudaRegisterAll(int);' \$STUBF" >> $1
-echo "sed -i -e 's/__sti____cudaRegisterAll\(.*{\)/__do____cudaRegisterAll\1/' \$STUBF" >> $1
-echo "# sed -i -e \"/__do____cudaRegisterAll\(.*{\)/a static void __try____cudaRegisterAll(int l){static int _ls = 0; if (_ls) return; const char* lm = getenv(\\\"CUDA_MODULE_LOADING\\\"); if (lm&&(lm[0]=='L')&&(lm[1]=='A')&&(lm[2]=='Z')&&(lm[3]=='Y')&&(l!=1)) return; _ls = 1; fprintf(stderr,\\\"===> \${CUFILE} lazy-load? %d\\\\\\\\n\\\", l); __do____cudaRegisterAll();}\" \$STUBF" >> $1
-echo "sed -i -e \"/__do____cudaRegisterAll\(.*{\)/a static void __try____cudaRegisterAll(int l){static int _ls = 0; if (_ls) return; const char* lm = getenv(\\\"CUDA_MODULE_LOADING\\\"); if (lm&&(lm[0]=='L')&&(lm[1]=='A')&&(lm[2]=='Z')&&(lm[3]=='Y')&&(l!=1)) return; _ls = 1; __do____cudaRegisterAll();}\" \$STUBF" >> $1
-echo "sed -i -e '/__try____cudaRegisterAll\(.*{\)/a static void __sti____cudaRegisterAll(void){__try____cudaRegisterAll(0);}' \$STUBF" >> $1
-echo "sed -i -e 's/{\(__device_stub__\)/{__try____cudaRegisterAll(1);\1/' \$STUBF" >> $1
+echo "if [ ! -z \"\$STUBF\" ]; then" >> $1
+echo "  CUFILE=\$(basename -s '.cudafe1.stub.c' \$STUBF)" >> $1
+echo "  sed -i -e '/__sti____cudaRegisterAll.*__attribute__/a static void __try____cudaRegisterAll(int);' \$STUBF" >> $1
+echo "  sed -i -e 's/__sti____cudaRegisterAll\(.*{\)/__do____cudaRegisterAll\1/' \$STUBF" >> $1
+echo "  # sed -i -e \"/__do____cudaRegisterAll\(.*{\)/a static void __try____cudaRegisterAll(int l){static int _ls = 0; if (_ls) return; const char* lm = getenv(\\\"CUDA_MODULE_LOADING\\\"); if (lm&&(lm[0]=='L')&&(lm[1]=='A')&&(lm[2]=='Z')&&(lm[3]=='Y')&&(l!=1)) return; _ls = 1; fprintf(stderr,\\\"===> \${CUFILE} lazy-load? %d\\\\\\\\n\\\", l); __do____cudaRegisterAll();}\" \$STUBF" >> $1
+echo "  sed -i -e \"/__do____cudaRegisterAll\(.*{\)/a static void __try____cudaRegisterAll(int l){static int _ls = 0; if (_ls) return; const char* lm = getenv(\\\"CUDA_MODULE_LOADING\\\"); if (lm&&(lm[0]=='L')&&(lm[1]=='A')&&(lm[2]=='Z')&&(lm[3]=='Y')&&(l!=1)) return; _ls = 1; __do____cudaRegisterAll();}\" \$STUBF" >> $1
+echo "  sed -i -e '/__try____cudaRegisterAll\(.*{\)/a static void __sti____cudaRegisterAll(void){__try____cudaRegisterAll(0);}' \$STUBF" >> $1
+echo "  sed -i -e 's/{\(__device_stub__\)/{__try____cudaRegisterAll(1);\1/' \$STUBF" >> $1
+echo "fi" >> $1
 echo "/usr/bin/env bash \${BUILDSH}.post" >> $1
 echo "rm -rf \$BUILDDIR" >> $1