diff --git a/paddle/fluid/distributed/fleet_executor/carrier.h b/paddle/fluid/distributed/fleet_executor/carrier.h index 0c54201c94034f4ceaca1ba720dce22a81fe417d..f9411aa73fad49daa11cf0573c44664b6b01933b 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.h +++ b/paddle/fluid/distributed/fleet_executor/carrier.h @@ -75,6 +75,11 @@ class Carrier final { bool IsInit() const; + // NOTE: This mutex will be used in interceptor's RunOps function. + // This mutex is used for avoiding forward ops and backward ops run + // simultaneously, which will lead to a random hang for some sync ops. + std::mutex run; + DISABLE_COPY_AND_ASSIGN(Carrier); private: diff --git a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc index 35905125a0a430e4f30d32ac367cb268722f9c85..98583de84e7ea9c0fef595b097e7fe9116142c81 100644 --- a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/distributed/fleet_executor/compute_interceptor.h" +#include "paddle/fluid/distributed/fleet_executor/carrier.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" #include "paddle/fluid/framework/executor_gc_helper.h" @@ -169,6 +170,8 @@ void ComputeInterceptor::ReplyCompletedToUpStream() { } void ComputeInterceptor::RunOps() { + Carrier& carrier_instance = Carrier::Instance(); + std::unique_lock lock(carrier_instance.run); VLOG(3) << "ComputeInterceptor " << interceptor_id_ << " running ops for the " << step_ + 1 << " time."; for (auto op : node_->ops()) { diff --git a/paddle/fluid/eager/accumulation/gradient_accumulation.cc b/paddle/fluid/eager/accumulation/gradient_accumulation.cc index 9d475d96e56ce0d06768568f159a4c7630b5bea4..723bf5387c60a9552f74aecc378b0cafed957d96 100644 --- a/paddle/fluid/eager/accumulation/gradient_accumulation.cc +++ b/paddle/fluid/eager/accumulation/gradient_accumulation.cc @@ -116,6 +116,22 @@ class TensorAddFunctor : public boost::static_visitor<> { } #endif +#ifdef PADDLE_WITH_IPU + void operator()(const paddle::platform::IPUPlace& place) { + PADDLE_THROW(paddle::platform::errors::PermissionDenied( + "Gradient accumulation on place (%s) " + "is not supported in imperative mode", + place)); + } +#else + void operator()(const paddle::platform::IPUPlace& place) { + PADDLE_THROW(paddle::platform::errors::PermissionDenied( + "Gradient accumulation on place (%s) " + "is not supported in imperative mode", + place)); + } +#endif + void operator()(const paddle::platform::NPUPinnedPlace& place) { PADDLE_THROW(paddle::platform::errors::PermissionDenied( "Gradient accumulation on place (%s) " diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index 71b53b8a51882fbb3a130737e5b80a5460bad2cb..5e450234c405cd9a9ade2e89978ce9566e4d8d67 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -81,6 +81,11 @@ struct DLDeviceVisitor : public boost::static_visitor<::DLDevice> { return device; } + inline ::DLDevice operator()(const platform::IPUPlace &place) const { + PADDLE_THROW( + platform::errors::Unimplemented("platform::IPUPlace is not supported")); + } + inline ::DLDevice operator()(const platform::XPUPlace &place) const { PADDLE_THROW( platform::errors::Unimplemented("platform::XPUPlace is not supported")); diff --git a/paddle/fluid/framework/executor.cc 
b/paddle/fluid/framework/executor.cc index 93f4f8952fc675022ce6e142ac10b194958bd238..9e572614779916bba54dd354c6039c7741dab8bc 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -463,6 +463,14 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, #else PADDLE_THROW( platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle")); +#endif + } else if (platform::is_ipu_place(place_)) { +#ifdef PADDLE_WITH_IPU + gc.reset(new IPUGarbageCollector( + BOOST_GET_CONST(platform::IPUPlace, place_), max_memory_size)); +#else + PADDLE_THROW( + platform::errors::Unimplemented("No IPU gc found in CPU/IPU paddle")); #endif } else if (platform::is_npu_place(place_)) { #ifdef PADDLE_WITH_ASCEND_CL diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index b98a22886826603bae41b7690992d0cae215f28e..08055cd9a5407bd58989619da7cd729c6283b5ca 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -156,7 +156,7 @@ cc_test(test_seqpool_cvm_concat_fuse_pass SRCS seqpool_cvm_concat_fuse_pass_test cc_test(test_repeated_fc_relu_fuse_pass_cc SRCS repeated_fc_relu_fuse_pass_tester.cc DEPS repeated_fc_relu_fuse_pass framework_proto) cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass) cc_test(test_simplify_with_basic_ops_pass SRCS simplify_with_basic_ops_pass_tester.cc DEPS simplify_with_basic_ops_pass) -cc_test(test_fc_elementwise_layernorm_fuse_pass SRCS fc_elementwise_layernorm_fuse_pass_tester.cc DEPS fc_elementwise_layernorm_fuse_pass) +cc_test(test_fc_elementwise_layernorm_fuse_pass_cc SRCS fc_elementwise_layernorm_fuse_pass_tester.cc DEPS fc_elementwise_layernorm_fuse_pass) cc_test(test_skip_layernorm_fuse_pass SRCS skip_layernorm_fuse_pass_tester.cc DEPS skip_layernorm_fuse_pass) cc_test(test_multihead_matmul_fuse_pass SRCS multihead_matmul_fuse_pass_tester.cc DEPS multihead_matmul_fuse_pass) cc_test(test_conv_bn_fuse_pass_cc SRCS conv_bn_fuse_pass_tester.cc DEPS conv_bn_fuse_pass) diff --git a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc index d3cf3319adfc5eaed5ce285bef86b81991d7350a..0bf30c29f32793ed5cb226276874b18e6a76b3d5 100644 --- a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace framework { @@ -338,3 +339,9 @@ void FCElementwiseLayerNormFusePass::ApplyImpl(ir::Graph *graph) const { REGISTER_PASS(fc_elementwise_layernorm_fuse_pass, paddle::framework::ir::FCElementwiseLayerNormFusePass); +REGISTER_PASS_CAPABILITY(fc_elementwise_layernorm_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("fc", 0) + .LE("elementwise_add", 1) + .EQ("layer_norm", 0)); diff --git a/paddle/fluid/framework/ir/ipu/avg_shard_pass.cc b/paddle/fluid/framework/ir/ipu/avg_shard_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..9dcbbb9c9856e5e40d4d79bebc3cdac42cf407c8 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/avg_shard_pass.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/ipu/avg_shard_pass.h" + +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" + +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/pass_tester_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +void AvgShardPass::ApplyImpl(ir::Graph* graph) const { + VLOG(10) << "enter AvgShardPass::ApplyImpl"; + + std::shared_ptr ipu_backend = + platform::ipu::IpuBackend::GetInstance(); + + if (ipu_backend->GetIpuStrategy()->need_avg_shard) { + VLOG(10) << "start AvgShardPass"; + auto nodes = ir::TopologySortOperations(*graph); + auto num_ipus = ipu_backend->GetIpuStrategy()->num_ipus; + + int shard_position = nodes.size() / num_ipus; + int index_and_stage = -1; + for (int i = 0; i < nodes.size(); i++) { + if ((i % shard_position) == 0 && index_and_stage < num_ipus - 1) { + index_and_stage++; + } + nodes[i]->Op()->SetAttr("ipu_index", index_and_stage); + nodes[i]->Op()->SetAttr("ipu_stage", index_and_stage); + } + VLOG(10) << "end AvgShardPass"; + } + + VLOG(10) << "leave AvgShardPass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(avg_shard_pass, paddle::framework::ir::AvgShardPass); diff --git a/paddle/fluid/framework/ir/ipu/avg_shard_pass.h b/paddle/fluid/framework/ir/ipu/avg_shard_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..b13acbd198dd524d7e8c14eb03ca1dbc367b257a --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/avg_shard_pass.h @@ -0,0 +1,30 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class AvgShardPass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.cc b/paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..5dcfddf6187f2b6e79d9f478489110a484cb0575 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.cc @@ -0,0 +1,133 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.h" + +#include "paddle/fluid/framework/ir/pass_tester_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +void ForwardGraphExtractPass::ApplyImpl(ir::Graph* graph) const { + VLOG(10) << "enter ForwardGraphExtractPass::ApplyImpl"; + + std::unordered_map<OpRole, std::unordered_set<ir::Node*>> all_ops{ + {OpRole::kForward, {}}, {OpRole::kBackward, {}}, + {OpRole::kOptimize, {}}, {OpRole::kRPC, {}}, + {OpRole::kDist, {}}, {OpRole::kLRSched, {}}, + {OpRole::kLoss, {}}, {OpRole::kNotSpecified, {}}}; + for (auto* node : graph->Nodes()) { + if (!node->IsOp()) { + continue; + } + auto op_role = BOOST_GET_MUTABLE(int, node->Op()->GetAttr("op_role")); + if (op_role == static_cast<int>(OpRole::kForward)) { + all_ops[OpRole::kForward].insert(node); + } else if (op_role == static_cast<int>(OpRole::kBackward)) { + all_ops[OpRole::kBackward].insert(node); + } else if (op_role == static_cast<int>(OpRole::kOptimize)) { + all_ops[OpRole::kOptimize].insert(node); + } else if (op_role == static_cast<int>(OpRole::kRPC)) { + } else if (op_role == static_cast<int>(OpRole::kDist)) { + } else if (op_role == static_cast<int>(OpRole::kLRSched)) { + } else if (op_role == static_cast<int>(OpRole::kLoss)) { + all_ops[OpRole::kLoss].insert(node); + } else if (op_role == static_cast<int>(OpRole::kNotSpecified)) { + LOG(WARNING) << "Op: " << node->Name() << " OpRole is NotSpecified "; + } + } + + std::unordered_set<ir::Node*> forward_vars; + std::unordered_set<ir::Node*> backward_vars; + std::unordered_set<ir::Node*> control_vars; + // forward_vars + for (auto& nodes : std::array<std::unordered_set<ir::Node*>, 2>{ + all_ops[OpRole::kForward], all_ops[OpRole::kLoss]}) { + for (auto* node : nodes) { + for (auto* in_node : node->inputs) { + forward_vars.insert(in_node); + } + for (auto* out_node : node->outputs) { + forward_vars.insert(out_node); + } + } + } + // control_vars & backward_vars + for (auto* node : graph->Nodes()) { + if (!node->IsVar()) { + continue; + } + if (node->IsCtrlVar()) { + control_vars.insert(node); + } + for (auto* in_node : node->inputs) { + if (all_ops[OpRole::kOptimize].count(in_node)) { + backward_vars.insert(node); + } + } + } + // all nodes to be removed + std::unordered_set<ir::Node*> rm_nodes; + for (auto* node : graph->Nodes()) { + if (backward_vars.count(node)) { + rm_nodes.insert(node); + } else if (control_vars.count(node)) { + rm_nodes.insert(node); + } else if (all_ops[OpRole::kBackward].count(node)) { + rm_nodes.insert(node); + } else if (all_ops[OpRole::kForward].count(node) == 0 && + all_ops[OpRole::kLoss].count(node) == 0 && + forward_vars.count(node) == 0) { + rm_nodes.insert(node); + } else if (node->Name() == "feed" || node->Name() == "fetch") { + rm_nodes.insert(node); + } + } + + VLOG(10) << "Remove Node: "; + for (auto* node : rm_nodes) { + // remove node relations + for (auto* node_in : node->inputs) { + for (size_t i = 0; i < node_in->outputs.size(); ++i) { + if (node_in->outputs[i] == node) { + node_in->outputs.erase(node_in->outputs.begin() + i); + break; + } + } + } + for (auto*
node_out : node->outputs) { + for (size_t i = 0; i < node_out->inputs.size(); ++i) { + if (node_out->inputs[i] == node) { + node_out->inputs.erase(node_out->inputs.begin() + i); + break; + } + } + } + VLOG(10) << "\t" << node->Name(); + graph->RemoveNode(node); + } + + VLOG(10) << "Post Graph: "; + VLOG(10) << DebugString(graph); + + VLOG(10) << "leave ForwardGraphExtractPass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(forward_graph_extract_pass, + paddle::framework::ir::ForwardGraphExtractPass); diff --git a/paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.h b/paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..afa9f1c15f2ab8c3c27dda9aa3485320725f0ea4 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.h @@ -0,0 +1,31 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class ForwardGraphExtractPass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/infer_shape_pass.cc b/paddle/fluid/framework/ir/ipu/infer_shape_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..ceef27ac1ce3c0a8ecd15f86a2dbae098059e0a8 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/infer_shape_pass.cc @@ -0,0 +1,108 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/ir/ipu/infer_shape_pass.h" + +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" + +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/variable_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +void InferShapePass::ApplyImpl(ir::Graph* graph) const { + VLOG(10) << "enter InferShapePass::ApplyImpl"; + VLOG(10) << "Raw Graph: "; + VLOG(10) << DebugString(graph); + + std::shared_ptr<platform::ipu::IpuBackend> ipu_backend = + platform::ipu::IpuBackend::GetInstance(); + auto batch_size = ipu_backend->GetIpuStrategy()->batch_size; + + auto feed_list = Get<std::vector<std::string>>("feed_list"); + for (auto node : graph->Nodes()) { + if (!node->IsVar()) { + continue; + } + bool is_feed = std::find(feed_list.begin(), feed_list.end(), + node->Name()) != feed_list.end(); + if (is_feed) { + auto input_shape = node->Var()->GetShape(); + if (input_shape[0] <= -1) { + input_shape[0] = batch_size; + node->Var()->SetShape(input_shape); + } + // int64->int32 + if (node->Var()->GetDataType() == proto::VarType::INT64) { + node->Var()->SetDataType(proto::VarType::INT32); + } + } + } + + // temp scope for shape inference + std::shared_ptr<paddle::framework::Scope> scope( + new paddle::framework::Scope()); + for (auto node : graph->Nodes()) { + if (!node->IsVar()) { + continue; + } + auto var_desc = node->Var(); + auto* ptr = scope->Var(var_desc->Name()); + paddle::framework::InitializeVariable(ptr, var_desc->GetType()); + + auto tensor = ptr->GetMutable<paddle::framework::LoDTensor>(); + tensor->Resize(paddle::framework::make_ddim(var_desc->GetShape())); + } + + // infer shape + auto nodes = ir::TopologySortOperations(*graph); + for (auto node : nodes) { + auto op_desc = node->Op(); + auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); + paddle::framework::RuntimeContext ctx(op->Inputs(), op->Outputs(), *scope); + op->RuntimeInferShape(*scope, paddle::platform::CPUPlace(), ctx); + + for (auto it = ctx.outputs.begin(); it != ctx.outputs.end(); it++) { + for (int i = 0; i < it->second.size(); i++) { + auto output_name = op_desc->Output(it->first)[i]; + auto dim = + it->second[i]->GetMutable<paddle::framework::LoDTensor>()->dims(); + auto new_shape = paddle::framework::vectorize(dim); + for (auto output_node : node->outputs) { + if (output_node->Name() == output_name) { + output_node->Var()->SetShape(new_shape); + } + } + } + } + } + // release the temp scope + scope.reset(); + + VLOG(10) << "Post Graph: "; + VLOG(10) << DebugString(graph); + VLOG(10) << "leave InferShapePass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(infer_shape_pass, paddle::framework::ir::InferShapePass) + .RequirePassAttr("feed_list"); diff --git a/paddle/fluid/framework/ir/ipu/infer_shape_pass.h b/paddle/fluid/framework/ir/ipu/infer_shape_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..3e8148b7f066d9e6f3d4f2bd34b1d561bf4293a8 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/infer_shape_pass.h @@ -0,0 +1,30 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class InferShapePass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/inference_postprocess_pass.cc b/paddle/fluid/framework/ir/ipu/inference_postprocess_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..616139a52ac06c35b252bd723730ab4ac96f4dfc --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/inference_postprocess_pass.cc @@ -0,0 +1,89 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/ipu/inference_postprocess_pass.h" + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" +#include "paddle/fluid/platform/device/ipu/ipu_strategy.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +void InferencePostprocessPass::ApplyImpl(ir::Graph *graph) const { + VLOG(10) << "enter InferencePostprocessPass::ApplyImpl"; + + std::vector<std::string> feed_list; + feed_list = Get<std::vector<std::string>>("feed_list"); + std::vector<std::string> fetch_list; + fetch_list = Get<std::vector<std::string>>("fetch_list"); + + auto *feed_var = new paddle::framework::VarDesc("feed"); + feed_var->SetType(proto::VarType::FEED_MINIBATCH); + auto *feed_var_node = graph->CreateVarNode(feed_var); + + auto *fetch_var = new paddle::framework::VarDesc("fetch"); + fetch_var->SetType(proto::VarType::FETCH_LIST); + auto *fetch_var_node = graph->CreateVarNode(fetch_var); + + for (int i = 0; i < feed_list.size(); i++) { + for (auto node : graph->Nodes()) { + if (node->Name() == feed_list[i]) { + auto *op = new paddle::framework::OpDesc(); + op->SetType("feed"); + op->SetInput("X", {"feed"}); + op->SetOutput("Out", {node->Name()}); + op->SetAttr("col", i); + auto *op_node = graph->CreateOpNode(op); + node->inputs.push_back(op_node); + op_node->outputs.push_back(node); + feed_var_node->outputs.push_back(op_node); + op_node->inputs.push_back(feed_var_node); + break; + } + } + } + + for (int i = 0; i < fetch_list.size(); i++) { + for (auto node : graph->Nodes()) { + if (node->Name() == fetch_list[i]) { + auto *op = new paddle::framework::OpDesc(); + op->SetType("fetch"); + op->SetInput("X", {node->Name()}); + op->SetOutput("Out", {"fetch"}); + op->SetAttr("col", i); + auto *op_node =
graph->CreateOpNode(op); + node->outputs.push_back(op_node); + op_node->inputs.push_back(node); + fetch_var_node->inputs.push_back(op_node); + op_node->outputs.push_back(fetch_var_node); + break; + } + } + } + + VLOG(10) << "leave InferencePostprocessPass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(inference_postprocess_pass, + paddle::framework::ir::InferencePostprocessPass) + .RequirePassAttr("feed_list") + .RequirePassAttr("fetch_list"); diff --git a/paddle/fluid/framework/ir/ipu/inference_postprocess_pass.h b/paddle/fluid/framework/ir/ipu/inference_postprocess_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..e80e1905d4ad79c1df1caa4f9e8bd463dfa6654e --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/inference_postprocess_pass.h @@ -0,0 +1,30 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class InferencePostprocessPass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/inference_process_pass.cc b/paddle/fluid/framework/ir/ipu/inference_process_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..d02dcce0cc62c8d96cab1ace860c7fbe913a6e10 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/inference_process_pass.cc @@ -0,0 +1,129 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/ir/ipu/inference_process_pass.h" + +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" +#include "paddle/fluid/platform/device/ipu/ipu_strategy.h" + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +void InferenceProcessPass::ApplyImpl(ir::Graph* graph) const { + VLOG(10) << "enter InferenceProcessPass::ApplyImpl"; + + // Get a new instance of ipu_backend + std::shared_ptr<platform::ipu::IpuBackend> ipu_backend = + platform::ipu::IpuBackend::GetNewInstance(); + + // Set scope + auto& scope = graph->Get<Scope>(kParamScopeAttr); + ipu_backend->SetScope(scope); + + // Set ipu_strategy + static std::shared_ptr<platform::ipu::IpuStrategy> ipu_strategy_instance_( + new platform::ipu::IpuStrategy()); + ipu_strategy_instance_->is_training = false; + auto num_ipus = graph->Get<int>("num_ipus"); + ipu_strategy_instance_->num_ipus = num_ipus; + if (num_ipus > 1) { + ipu_strategy_instance_->popart_options_.virtualGraphMode = + platform::ipu::VirtualGraphMode::Manual; + } else { + ipu_strategy_instance_->popart_options_.virtualGraphMode = + platform::ipu::VirtualGraphMode::Off; + } + + auto enable_pipelining = graph->Get<bool>("enable_pipelining"); + ipu_strategy_instance_->popart_options_.enablePipelining = enable_pipelining; + if (enable_pipelining) { + auto batches_per_step = graph->Get<int>("batches_per_step"); + PADDLE_ENFORCE_GE( + batches_per_step, num_ipus, + platform::errors::InvalidArgument("Batches per step should be equal to or " "greater than the number of IPUs")); + ipu_strategy_instance_->batches_per_step = batches_per_step; + } + ipu_strategy_instance_->batch_size = graph->Get<int>("batch_size"); + ipu_strategy_instance_->need_avg_shard = graph->Get<bool>("need_avg_shard"); + + ipu_backend->SetIpuStrategy(*(ipu_strategy_instance_.get())); + + // Get feed_list and fetch_list + std::vector<std::string> feed_list = {}; + std::vector<std::string> fetch_list = {}; + for (auto node : graph->Nodes()) { + if (node->Name() == "feed") { + if (node->IsOp()) { + feed_list.push_back(""); + } + } else if (node->Name() == "fetch") { + if (node->IsOp()) { + fetch_list.push_back(""); + } + } + } + for (auto node : graph->Nodes()) { + if (node->Name() == "feed") { + if (node->IsOp()) { + feed_list[BOOST_GET_CONST(int, node->Op()->GetAttr("col"))] = + node->outputs[0]->Name(); + } + } else if (node->Name() == "fetch") { + if (node->IsOp()) { + fetch_list[BOOST_GET_CONST(int, node->Op()->GetAttr("col"))] = + node->inputs[0]->Name(); + } + } + } + + // Run passes + std::vector<std::string> graph_pass = {"forward_graph_extract_pass", + "infer_shape_pass", "avg_shard_pass", + "popart_canonicalization_pass"}; + std::vector<std::string> compile_pass = { + "ipu_inplace_pass", "ipu_graph_builder_pass", "ipu_runtime_replacer_pass", + "inference_postprocess_pass"}; + for (auto pass_name : graph_pass) { + auto pass = PassRegistry::Instance().Get(pass_name); + if (pass_name == "infer_shape_pass") { + pass->Set("feed_list", new std::vector<std::string>(feed_list.begin(), + feed_list.end())); + } + pass->Apply(graph); + } + + for (auto pass_name : compile_pass) { + auto pass = PassRegistry::Instance().Get(pass_name); + pass->Set("feed_list", + new std::vector<std::string>(feed_list.begin(), feed_list.end())); + pass->Set("fetch_list", new std::vector<std::string>(fetch_list.begin(), + fetch_list.end())); + pass->Apply(graph); + } + + VLOG(10) << "leave InferenceProcessPass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle +
+REGISTER_PASS(inference_process_pass, + paddle::framework::ir::InferenceProcessPass); diff --git a/paddle/fluid/framework/ir/ipu/inference_process_pass.h b/paddle/fluid/framework/ir/ipu/inference_process_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..bac0e88377f7c6d8ef124817256c5059345f714e --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/inference_process_pass.h @@ -0,0 +1,30 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class InferenceProcessPass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.cc b/paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..5a53466089bc88fe25b3aaed54736f8e78994513 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.h" + +#include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" + +namespace paddle { +namespace framework { +namespace ir { + +void IpuGraphBuilderPass::ApplyImpl(ir::Graph* graph) const { + VLOG(10) << "enter IpuGraphBuilderPass::ApplyImpl"; + VLOG(10) << "Raw Graph: "; + VLOG(10) << DebugString(graph); + + std::vector<std::string> feed_list; + feed_list = Get<std::vector<std::string>>("feed_list"); + + std::vector<std::string> fetch_list; + fetch_list = Get<std::vector<std::string>>("fetch_list"); + + std::shared_ptr<platform::ipu::IpuBackend> ipu_backend = + platform::ipu::IpuBackend::GetInstance(); + + ipu_backend->Compile(graph, feed_list, fetch_list); + + VLOG(10) << "Post Graph: "; + VLOG(10) << DebugString(graph); + VLOG(10) << "leave IpuGraphBuilderPass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(ipu_graph_builder_pass, + paddle::framework::ir::IpuGraphBuilderPass) + .RequirePassAttr("feed_list") + .RequirePassAttr("fetch_list"); diff --git a/paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.h b/paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..6237df36480335e3971aaffe120ca2374966fcb3 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.h @@ -0,0 +1,31 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class IpuGraphBuilderPass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/ipu_inplace_pass.cc b/paddle/fluid/framework/ir/ipu/ipu_inplace_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..d3f1f1633ffc947385c2924a9213de289cb3e3d8 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/ipu_inplace_pass.cc @@ -0,0 +1,85 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +#include "paddle/fluid/framework/ir/ipu/ipu_inplace_pass.h" + +#include "paddle/fluid/framework/ir/pass_tester_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +std::string GenerateVarName(Node *node) { + return node->Name() + "_" + std::to_string(node->id()); +} + +void IpuInplacePass::ApplyImpl(ir::Graph *graph) const { + // use this pass after forward_graph_extract_pass + // raise an error if the inplaced var is in both feed_list & fetch_list + VLOG(10) << "enter IpuInplacePass::ApplyImpl"; + VLOG(10) << "Raw Graph: "; + VLOG(10) << DebugString(graph); + + std::vector<std::string> feed_list; + feed_list = Get<std::vector<std::string>>("feed_list"); + std::vector<std::string> fetch_list; + fetch_list = Get<std::vector<std::string>>("fetch_list"); + + std::map<std::string, int> var_name; + for (auto *node : graph->Nodes()) { + if (node->IsVar()) { + if (var_name.find(node->Name()) == var_name.end()) { + var_name.emplace(node->Name(), 1); + } else { + var_name[node->Name()]++; + } + } + } + + for (auto *node : graph->Nodes()) { + if (node->IsVar()) { + if (var_name[node->Name()] > 1) { + auto is_feed = (std::find(feed_list.begin(), feed_list.end(), + node->Name()) != feed_list.end()) && + (node->inputs.size() == 0); + auto is_fetch = (std::find(fetch_list.begin(), fetch_list.end(), + node->Name()) != fetch_list.end()) && + (node->outputs.size() == 0); + if (!is_feed && !is_fetch && !node->Var()->Persistable()) { + auto old_name = node->Name(); + auto new_name = GenerateVarName(node); + node->RenameVar(new_name); + for (auto *op_in : node->inputs) { + op_in->Op()->RenameOutput(old_name, new_name); + } + for (auto *op_out : node->outputs) { + op_out->Op()->RenameInput(old_name, new_name); + } + } + } + } + } + + VLOG(10) << "Post Graph: "; + VLOG(10) << DebugString(graph); + VLOG(10) << "leave IpuInplacePass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(ipu_inplace_pass, paddle::framework::ir::IpuInplacePass) + .RequirePassAttr("feed_list") + .RequirePassAttr("fetch_list"); diff --git a/paddle/fluid/framework/ir/ipu/ipu_inplace_pass.h b/paddle/fluid/framework/ir/ipu/ipu_inplace_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..86756276c8c3dcf6c010ad05b9a39bfa3bda71f5 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/ipu_inplace_pass.h @@ -0,0 +1,30 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +#pragma once + +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class IpuInplacePass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/ipu_pass_base.cc b/paddle/fluid/framework/ir/ipu/ipu_pass_base.cc new file mode 100644 index 0000000000000000000000000000000000000000..ba9233eeb8cb9512740b6ebdf90d07d8fee2d42a --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/ipu_pass_base.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +void IPUPassBase::Init(const std::string& repr, Graph* graph) const { + repr_ = repr; + graph_ = graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/ipu_pass_base.h b/paddle/fluid/framework/ir/ipu/ipu_pass_base.h new file mode 100644 index 0000000000000000000000000000000000000000..b56d3e4c65b1c0cf466824ae97c351c8f68c1380 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/ipu_pass_base.h @@ -0,0 +1,37 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { +namespace ir { + +class IPUPassBase : public Pass { + public: + void Init(const std::string& repr, Graph* graph) const; + virtual ~IPUPassBase() {} + + protected: + mutable Graph* graph_; + mutable std::string repr_; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.cc b/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..a3e020714e1db90154b90b3785ea99e9eabd3256 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.cc @@ -0,0 +1,97 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.h" + +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass_tester_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +void IpuRuntimeReplacerPass::ApplyImpl(ir::Graph* graph) const { + VLOG(10) << "enter IpuRuntimeReplacerPass::ApplyImpl"; + VLOG(10) << "Raw Graph: "; + VLOG(10) << DebugString(graph); + + std::vector<std::string> feed_list; + feed_list = Get<std::vector<std::string>>("feed_list"); + + std::vector<std::string> fetch_list; + fetch_list = Get<std::vector<std::string>>("fetch_list"); + + framework::OpDesc ipu_rt_op_desc; + ipu_rt_op_desc.SetType("ipu_runtime"); + ipu_rt_op_desc.SetInput("FeedList", feed_list); + ipu_rt_op_desc.SetOutput("FetchList", fetch_list); + ipu_rt_op_desc.Flush(); + + // Create a new node for the ipu_runtime_op. + auto* ipu_rt_node = graph->CreateOpNode(&ipu_rt_op_desc); + + for (auto* node : graph->Nodes()) { + if (node->IsVar()) { + for (auto feed : feed_list) { + if (node->Name() == feed) { + IR_NODE_LINK_TO(node, ipu_rt_node); + } + } + for (auto fetch : fetch_list) { + if (node->Name() == fetch) { + IR_NODE_LINK_TO(ipu_rt_node, node); + } + } + } + } + + // set ipu_runtime_op dtype attr + if (fetch_list.size() == 1) { + for (auto* node : graph->Nodes()) { + if (node->IsVar()) { + for (auto fetch : fetch_list) { + if (node->Name() == fetch) { + ipu_rt_node->Op()->SetAttr("dtype", node->Var()->GetDataType()); + } + } + } + } + } + + // Remove unneeded nodes. + std::unordered_set<const Node*> marked_nodes; + for (auto* node : graph->Nodes()) { + if (node->IsOp()) { + auto* op_desc = node->Op(); + if (op_desc->Type() != "ipu_runtime") { + marked_nodes.insert(node); + } + } + } + + GraphSafeRemoveNodes(graph, marked_nodes); + + VLOG(10) << "Post Graph: "; + VLOG(10) << DebugString(graph); + VLOG(10) << "leave IpuRuntimeReplacerPass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(ipu_runtime_replacer_pass, + paddle::framework::ir::IpuRuntimeReplacerPass) + .RequirePassAttr("feed_list") + .RequirePassAttr("fetch_list"); diff --git a/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.h b/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..ba2cc8702fa4731a388986bee8436e6bff31c586 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.h @@ -0,0 +1,31 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +#pragma once + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class IpuRuntimeReplacerPass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..c6be2c775bd2116e11e24d17da225d2c76679399 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc @@ -0,0 +1,91 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/ipu/optimizer_extract_pass.h" + +#include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" + +namespace paddle { +namespace framework { +namespace ir { + +void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const { + VLOG(10) << "enter IpuOptimizerExtractPass::ApplyImpl"; + VLOG(10) << "Raw Graph: "; + VLOG(10) << DebugString(graph); + + auto ipu_backend = paddle::platform::ipu::IpuBackend::GetInstance(); + + for (auto* node : graph->Nodes()) { + if (node->IsOp() && node->Op()) { + int op_role = BOOST_GET_CONST( + int, node->Op()->GetAttr( + framework::OpProtoAndCheckerMaker::OpRoleAttrName())); + + // graph usually have multiple optimizer node for different parameter, + // and these node have the same type and attr value usually + if ((op_role == static_cast(framework::OpRole::kOptimize))) { + ipu_backend->GetExecutor().SetOptimizerType(node->Op()->Type()); + VLOG(10) << "found optimizer type: " << node->Op()->Type(); + + for (const std::string& attr_name : node->Op()->AttrNames()) { + auto attr_type = node->Op()->GetAttrType(attr_name); + // with adam, attr are float + if (attr_type == proto::AttrType::FLOAT) { + auto attr_value = + BOOST_GET_CONST(float, node->Op()->GetAttr(attr_name)); + ipu_backend->GetExecutor().SetOptimizerAttr(attr_name, attr_value); + } else { + VLOG(10) << "Skip " << attr_type; + } + } + + auto lr_var_name = node->Op()->Input("LearningRate"); + PADDLE_ENFORCE_EQ(lr_var_name.size(), 1u, + platform::errors::InvalidArgument( + "In op(%s), find input(LearningRate) failed.", + node->Op()->Type())); + + ipu_backend->GetExecutor().SetLRVarName(lr_var_name[0]); + } + + if ((op_role == static_cast(framework::OpRole::kLoss))) { + VLOG(10) << "found loss op type: " << node->Op()->Type(); + auto outputs = node->Op()->Outputs(); + PADDLE_ENFORCE_EQ( + outputs.size(), 1, + platform::errors::InvalidArgument("Can only support one loss key")); + + auto losses_name = outputs.begin()->second; + PADDLE_ENFORCE_EQ(losses_name.size(), 1, + platform::errors::InvalidArgument( + "Can only support one loss name")); + + ipu_backend->GetExecutor().SetLoss(losses_name[0]); + } 
+ } + } + + VLOG(10) << "Post Graph: "; + VLOG(10) << DebugString(graph); + VLOG(10) << "leave IpuOptimizerExtractPass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(optimizer_extract_pass, + paddle::framework::ir::IpuOptimizerExtractPass); diff --git a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.h b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..fd274ded8f5bd1641bd1eb4e6999a6fa38dca090 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.h @@ -0,0 +1,31 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class IpuOptimizerExtractPass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.cc b/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..c23bfdcb154f16a933c389d9d9032364995bd58c --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.cc @@ -0,0 +1,79 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.h" +#include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/platform/device/ipu/common.h" +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" + +namespace paddle { +namespace framework { +namespace ir { + +using paddle::platform::ipu::IpuBackend; +using framework::ir::Graph; +using framework::ir::Node; + +void IpuOptimizerStateAlignPass::ApplyImpl(ir::Graph* graph) const { + VLOG(10) << "enter IpuOptimizerStateAlignPass::ApplyImpl"; + VLOG(10) << "Raw Graph: "; + VLOG(10) << DebugString(graph); + + auto ipu_backend = IpuBackend::GetInstance(); + const auto* scope_ = ipu_backend->GetScope(); + + for (auto* node : graph->Nodes()) { + if (node->IsOp() && node->Op()) { + int op_role = BOOST_GET_CONST( + int, node->Op()->GetAttr( + framework::OpProtoAndCheckerMaker::OpRoleAttrName())); + + if ((op_role == static_cast<int>(framework::OpRole::kOptimize))) { + auto inputs = node->Op()->Inputs(); + if (inputs.count(platform::ipu::sBeta1Pow)) { + auto var = scope_->GetVar(inputs.at(platform::ipu::sBeta1Pow)[0]); + auto data = var->GetMutable<framework::LoDTensor>()->data<float>(); + auto beta = BOOST_GET_CONST( + float, node->Op()->GetAttr(platform::ipu::sBeta1)); + + // ensure current save with beta1pow, rather than step. + // beta1pow = beta1 ^ (step + 1). Just set beta1pow because popart + // support single Step__ + bool save_with_beta1pow = (data[0] < 1.0f) && (data[0] > 0.0f); + float step = 0; + float beta_acc = beta; + while (beta_acc > data[0] && save_with_beta1pow) { + beta_acc *= beta; + step += 1; + } + + if (save_with_beta1pow) { + data[0] = step; + } + } + } + } + } + + VLOG(10) << "Post Graph: "; + VLOG(10) << DebugString(graph); + VLOG(10) << "leave IpuOptimizerStateAlignPass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(optimizer_state_align_pass, + paddle::framework::ir::IpuOptimizerStateAlignPass); diff --git a/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.h b/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..21a1017d88452aa950949c44a862b78c11bc5793 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.h @@ -0,0 +1,36 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * This pass should only affect optimizers that need bias correction, + * including Adam/Lamb.
+ */ + +class IpuOptimizerStateAlignPass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc b/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..c97b7fd5bcb0cb7b6a4a49725d15e23832a9308f --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc @@ -0,0 +1,68 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.h" + +#include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h" +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/post_canonicalization.h" + +namespace paddle { +namespace framework { +namespace ir { + +using framework::ir::Graph; +using framework::ir::Node; +using platform::ipu::SymbolHandler; + +void PopartCanonicalizationPass::ApplyImpl(ir::Graph* graph) const { + VLOG(10) << "enter PopartCanonicalizationPass::ApplyImpl"; + VLOG(10) << "Raw Graph: "; + VLOG(10) << DebugString(graph); + + auto nodes = graph->Nodes(); + for (auto* node : nodes) { + if (!node->IsOp()) { + continue; + } + auto* op = node->Op(); + auto op_type = op->Type(); + + ir::Node* new_node = nullptr; + SymbolHandler handler = platform::ipu::GetHandler(op_type); + if (handler) { + VLOG(11) << "Raw Paddle Node:"; + VLOG(11) << node->Op()->Proto()->DebugString(); + new_node = handler(graph, node); + VLOG(11) << "Post Popart Node:"; + VLOG(11) << new_node->Op()->Proto()->DebugString(); + + platform::ipu::ClearNode(node); + graph->RemoveNode(node); + } else { + LOG(ERROR) << "Can not find OpHandler for op_type: " << op_type; + } + } + + VLOG(10) << "Post Graph: "; + VLOG(10) << DebugString(graph); + VLOG(10) << "leave PopartCanonicalizationPass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(popart_canonicalization_pass, + paddle::framework::ir::PopartCanonicalizationPass); diff --git a/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.h b/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..6690873f2a9ac2532f02b59b241a2858dc978beb --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.h @@ -0,0 +1,30 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class PopartCanonicalizationPass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc index 523c2161326466eac21e89d9b5442c16138e967a..f3d96c385065643e4571fb53e7ae3ffad8a3241f 100644 --- a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc +++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc @@ -84,13 +84,16 @@ void TransposeFlattenConcatFusePass::RunTransposeFlattenConcatFuse( LOG(WARNING) << "Pass in op compat failed."; return; } + const int kNumFields = 5; const int kTransOffset = 1; const int kTransOutOffset = 2; const int kFlattenOffset = 3; const int kFlattenOutOffset = 4; - std::vector nodes; + std::vector nodes; + std::vector trans_axis0; + int flatten_axis0; for (int i = 0; i < times; i++) { PADDLE_ENFORCE_NOT_NULL( subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i))), @@ -112,6 +115,33 @@ void TransposeFlattenConcatFusePass::RunTransposeFlattenConcatFuse( platform::errors::NotFound("Can not find %s in subgraph.", input_nodes[i]->name())); + if (i == 0) { + trans_axis0 = BOOST_GET_CONST( + std::vector, + subgraph.at(pattern.GetPDNode("transpose" + std::to_string(0))) + ->Op() + ->GetAttr("axis")); + flatten_axis0 = BOOST_GET_CONST( + int, subgraph.at(pattern.GetPDNode("flatten" + std::to_string(0))) + ->Op() + ->GetAttr("axis")); + } else { + std::vector trans_axis = BOOST_GET_CONST( + std::vector, + subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i))) + ->Op() + ->GetAttr("axis")); + // All axis of transpose should be the same + if (trans_axis0 != trans_axis) return; + + int flatten_axis = BOOST_GET_CONST( + int, subgraph.at(pattern.GetPDNode("flatten" + std::to_string(0))) + ->Op() + ->GetAttr("axis")); + // All axis of flatten should be the same + if (flatten_axis0 != flatten_axis) return; + } + nodes.push_back(subgraph.at(input_nodes[i])); nodes.push_back( subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i)))); diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 348ca5b952bfeab364a5b01ec99e4d0381ab4e84..39496cb26776e765f9675e5041cd13104d374c6a 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -327,6 +327,9 @@ struct OpKernelRegistrarFunctorEx& input_tensors, - const std::string& arch_str) { + const std::string& arch_str, GraphHashStrategy graph_hash) + : graph_hash_(graph_hash) { this->SetKey(graph, input_tensors, arch_str); } CinnCacheKey::CinnCacheKey(const ir::Graph& graph, const std::map& input_shapes, - const std::string& arch_str) { + const std::string& arch_str, + GraphHashStrategy graph_hash) + : graph_hash_(graph_hash) { this->SetKey(graph, input_shapes, arch_str); } -size_t CinnCacheKey::HashGraph(const ir::Graph& 
graph) { - // using Dot to unqiue graph - inference::analysis::Dot dot; - std::unordered_map node2dot; - int id = 0; - // Create nodes - // graph.Nodes() return unordered_set, the same graph may - // return different result? - for (const ir::Node* n : graph.Nodes()) { - std::string node_id = std::to_string(id++); - dot.AddNode(node_id, {}, n->Name(), true); - node2dot[n] = node_id; - } - - // Create edges - for (const ir::Node* n : graph.Nodes()) { - const auto& src_id = node2dot.at(n); - for (auto* out : n->outputs) { - const auto& dest_id = node2dot.at(out); - dot.AddEdge(src_id, dest_id, {}); - } - } - - const std::string& viz_graph = dot.Build(); - VLOG(1) << "The hash graph:\n" << viz_graph; - - size_t hash_val = std::hash()(viz_graph); - VLOG(4) << "The graph's hash value is: " << hash_val; - return hash_val; -} - void CinnCacheKey::SetKey( const ir::Graph& graph, const std::map& input_tensors, const std::string& arch_str) { - graph_serialize_str_ = std::to_string(HashGraph(graph)); + graph_hash_val_ = graph_hash_(graph); for (const auto& name_tensor : input_tensors) { input_shapes_[name_tensor.first] = name_tensor.second->dims(); } @@ -87,7 +64,7 @@ void CinnCacheKey::SetKey( void CinnCacheKey::SetKey(const ir::Graph& graph, const std::map& input_shapes, const std::string& arch_str) { - graph_serialize_str_ = std::to_string(HashGraph(graph)); + graph_hash_val_ = graph_hash_(graph); input_shapes_ = input_shapes; arch_str_ = arch_str; } @@ -97,7 +74,7 @@ bool CinnCacheKey::operator!=(const CinnCacheKey& other) const { } bool CinnCacheKey::operator==(const CinnCacheKey& other) const { - return graph_serialize_str_ == other.graph_serialize_str_ && + return graph_hash_val_ == other.graph_hash_val_ && input_shapes_ == other.input_shapes_ && arch_str_ == other.arch_str_; } @@ -114,11 +91,48 @@ size_t CinnCacheKey::Hash::operator()(const CinnCacheKey& key) const { ret = hash_combine(ret, string_hasher(name_shape.second.to_str())); } - ret = hash_combine(ret, string_hasher(key.graph_serialize_str_)); + ret = hash_combine(ret, key.graph_hash_val_); ret = hash_combine(ret, string_hasher(key.arch_str_)); return ret; } +size_t CinnCacheKeyByStructure::HashGraph(const ir::Graph& graph) { + // sort grad node by name and id. + auto compare = [](ir::Node* n1, ir::Node* n2) { + return (n1->Name() == n2->Name()) ? 
(n1->id() < n2->id()) + : (n1->Name() < n2->Name()); + }; + + // graph.Nodes() return unordered_set, here using set to avoid the same graph + // may return different result + std::set node_set(compare), + output_set(compare); + node_set.insert(graph.Nodes().begin(), graph.Nodes().end()); + + std::string hash_str; + for (ir::Node* n : node_set) { + hash_str.append(n->Name()); + + output_set.clear(); + output_set.insert(n->outputs.begin(), n->outputs.end()); + for (auto* out : output_set) { + hash_str.append(out->Name()); + } + } + + VLOG(1) << "The hash graph:\n" << hash_str; + + size_t hash_val = std::hash()(hash_str); + VLOG(4) << "The graph's hash value by graph structure is: " << hash_val; + return hash_val; +} + +size_t CinnCacheKeyByAddress::HashGraph(const ir::Graph& graph) { + size_t hash_val = reinterpret_cast(&graph); + VLOG(4) << "The graph's hash value by graph address is: " << hash_val; + return hash_val; +} + } // namespace paddle2cinn } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.h b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.h index 941f8e0cdecc188dde30cda12a6cb59ed48c097d..67325297c4772479781477668d130623a1165afe 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.h +++ b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include "paddle/fluid/framework/ddim.h" @@ -33,14 +34,18 @@ namespace paddle2cinn { // shapes. class CinnCacheKey { public: + using GraphHashStrategy = std::function; + + explicit CinnCacheKey(GraphHashStrategy graph_hash); + CinnCacheKey(const ir::Graph& graph, const std::map& input_tensors, - const std::string& arch_str); + const std::string& arch_str, GraphHashStrategy graph_hash); CinnCacheKey(const ir::Graph& graph, const std::map& input_shapes, - const std::string& arch_str); + const std::string& arch_str, GraphHashStrategy graph_hash); - ~CinnCacheKey() {} + ~CinnCacheKey() = default; void SetKey(const ir::Graph& graph, const std::map& input_tensors, @@ -58,13 +63,38 @@ class CinnCacheKey { }; private: - size_t HashGraph(const ir::Graph& graph); - - std::string graph_serialize_str_; + GraphHashStrategy graph_hash_; + size_t graph_hash_val_; std::map input_shapes_; std::string arch_str_; }; +#define CINN_CACHE_KEY_CREATE(NAME) \ + class NAME : public CinnCacheKey { \ + public: \ + NAME() : CinnCacheKey(HashGraph) {} \ + \ + NAME(const ir::Graph& graph, \ + const std::map& input_tensors, \ + const std::string& arch_str) \ + : CinnCacheKey(graph, input_tensors, arch_str, HashGraph) {} \ + \ + NAME(const ir::Graph& graph, \ + const std::map& input_shapes, \ + const std::string& arch_str) \ + : CinnCacheKey(graph, input_shapes, arch_str, HashGraph) {} \ + \ + private: \ + static size_t HashGraph(const ir::Graph& graph); \ + }; + +// Class to store the keys by graph address for compiling CINN. +CINN_CACHE_KEY_CREATE(CinnCacheKeyByAddress) +// Class to store the keys by graph structure for compiling CINN. 
+CINN_CACHE_KEY_CREATE(CinnCacheKeyByStructure) + +#undef CINN_CACHE_KEY_CREATE + } // namespace paddle2cinn } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc index f13f44998211f4d2a45088766c175e79f09853fa..f9b48ef4b5ec0cd4ca42635845ea49450727d966 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc @@ -26,8 +26,8 @@ namespace paddle { namespace framework { namespace paddle2cinn { -TEST(CinnCacheKeyTest, TestAsUnorderedKey) { - std::unordered_set test_set; +TEST(CinnCacheKeyTest, TestAsUnorderedKeyByStructure) { + std::unordered_set test_set; ProgramDesc empty_program; ir::Graph empty_graph(empty_program); @@ -47,19 +47,20 @@ TEST(CinnCacheKeyTest, TestAsUnorderedKey) { DDim ddim = paddle::framework::make_ddim({1, 2, 3}); std::map feed_shapes = {{"X", ddim}}; - CinnCacheKey cache_key0(empty_graph, feed_tensors, "x86"); - CinnCacheKey cache_key1(empty_graph, feed_shapes, "x86"); + CinnCacheKeyByStructure cache_key0(empty_graph, feed_tensors, "x86"); + CinnCacheKeyByStructure cache_key1(empty_graph, feed_shapes, "x86"); EXPECT_EQ(cache_key0, cache_key1); - CinnCacheKey cache_key2(graph, feed_shapes, "x86"); - CinnCacheKey cache_key3(graph, feed_shapes, "nvgpu"); - CinnCacheKey cache_key4(graph, feed_tensors, "nvgpu"); + CinnCacheKeyByStructure cache_key2(graph, feed_shapes, "x86"); + CinnCacheKeyByStructure cache_key3(graph, feed_shapes, "nvgpu"); + CinnCacheKeyByStructure cache_key4(graph, feed_tensors, "nvgpu"); EXPECT_NE(cache_key2, cache_key3); EXPECT_EQ(cache_key3, cache_key4); - CinnCacheKey cache_key5(empty_graph, - std::map(), "unk"); - CinnCacheKey cache_key6(empty_graph, std::map(), "unk"); + CinnCacheKeyByStructure cache_key5( + empty_graph, std::map(), "unk"); + CinnCacheKeyByStructure cache_key6(empty_graph, std::map(), + "unk"); EXPECT_EQ(cache_key5, cache_key6); EXPECT_NE(cache_key1, cache_key3); @@ -98,6 +99,107 @@ TEST(CinnCacheKeyTest, TestAsUnorderedKey) { EXPECT_EQ(test_set.find(cache_key6), test_set.end()); } +TEST(CinnCacheKeyTest, TestAsUnorderedKeyByAddress) { + std::unordered_set test_set; + + ProgramDesc empty_program; + ir::Graph empty_graph(empty_program); + + ProgramDesc program; + auto *global_block = program.MutableBlock(0); + auto *x = global_block->Var("X"); + x->SetType(proto::VarType::LOD_TENSOR); + ir::Graph graph(program); + + LoDTensor tensor; + tensor.Resize({1, 2, 3}); + const LoDTensor *tensor_pointer = &tensor; + std::map feed_tensors = { + {"X", tensor_pointer}}; + + DDim ddim = paddle::framework::make_ddim({1, 2, 3}); + std::map feed_shapes = {{"X", ddim}}; + + CinnCacheKeyByAddress cache_key0(empty_graph, feed_tensors, "x86"); + CinnCacheKeyByAddress cache_key1(empty_graph, feed_shapes, "x86"); + EXPECT_EQ(cache_key0, cache_key1); + + CinnCacheKeyByAddress cache_key2(graph, feed_shapes, "x86"); + CinnCacheKeyByAddress cache_key3(graph, feed_shapes, "nvgpu"); + CinnCacheKeyByAddress cache_key4(graph, feed_tensors, "nvgpu"); + EXPECT_NE(cache_key2, cache_key3); + EXPECT_EQ(cache_key3, cache_key4); + + CinnCacheKeyByAddress cache_key5( + empty_graph, std::map(), "unk"); + CinnCacheKeyByAddress cache_key6(empty_graph, std::map(), + "unk"); + EXPECT_EQ(cache_key5, cache_key6); + + EXPECT_NE(cache_key1, cache_key3); + EXPECT_NE(cache_key4, cache_key2); + + EXPECT_NE(cache_key3, cache_key5); + EXPECT_NE(cache_key6, cache_key4); + + 
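+  // Note: only three distinct keys are built above (key0 == key1, key3 == key4,
+  // key5 == key6), which is what the set-size expectations below rely on.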
EXPECT_NE(cache_key5, cache_key1); + EXPECT_NE(cache_key2, cache_key6); + + test_set.insert(cache_key0); + test_set.insert(cache_key1); + test_set.insert(cache_key3); + test_set.insert(cache_key4); + test_set.insert(cache_key5); + test_set.insert(cache_key6); + EXPECT_EQ(test_set.size(), 3U); + + auto iter = test_set.find(cache_key0); + EXPECT_NE(iter, test_set.end()); + test_set.erase(iter); + EXPECT_EQ(test_set.size(), 2U); + EXPECT_EQ(test_set.find(cache_key1), test_set.end()); + + iter = test_set.find(cache_key3); + EXPECT_NE(iter, test_set.end()); + test_set.erase(iter); + EXPECT_EQ(test_set.size(), 1U); + EXPECT_EQ(test_set.find(cache_key4), test_set.end()); + + iter = test_set.find(cache_key5); + EXPECT_NE(iter, test_set.end()); + test_set.erase(iter); + EXPECT_EQ(test_set.size(), 0U); + EXPECT_EQ(test_set.find(cache_key6), test_set.end()); +} + +TEST(CinnCacheKeyTest, TestSameGraph) { + ProgramDesc program1; + auto *global_block1 = program1.MutableBlock(0); + auto *x1 = global_block1->Var("X"); + x1->SetType(proto::VarType::LOD_TENSOR); + ir::Graph graph1(program1); + + ProgramDesc program2; + auto *global_block2 = program2.MutableBlock(0); + auto *x2 = global_block2->Var("X"); + x2->SetType(proto::VarType::LOD_TENSOR); + ir::Graph graph2(program2); + + LoDTensor tensor; + tensor.Resize({1, 2, 3}); + const LoDTensor *tensor_pointer = &tensor; + std::map feed_tensors = { + {"X", tensor_pointer}}; + + CinnCacheKeyByAddress cache_key_by_address1(graph1, feed_tensors, "x86"); + CinnCacheKeyByAddress cache_key_by_address2(graph2, feed_tensors, "x86"); + EXPECT_NE(cache_key_by_address1, cache_key_by_address2); + + CinnCacheKeyByStructure cache_key_by_struct1(graph1, feed_tensors, "x86"); + CinnCacheKeyByStructure cache_key_by_struct2(graph2, feed_tensors, "x86"); + EXPECT_EQ(cache_key_by_struct1, cache_key_by_struct2); +} + } // namespace paddle2cinn } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc index 7fc8eff3d31c9e8cceabb7f29f63373711f498d2..54167d95899d6fab81a8657b167012b47bf950ea 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc @@ -41,6 +41,7 @@ #include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/inference/analysis/dot.h" +#include "paddle/fluid/operators/cinn/cinn_launch_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/string/string_helper.h" @@ -68,23 +69,41 @@ const CinnCompiledObject& CinnCompiler::Compile( const std::map& input_tensors, const Target& target, void* stream) { VLOG(1) << "-- The graph to be compiled is:\n" << VizGraph(graph); - CinnCacheKey cur_key(graph, input_tensors, target.arch_str()); + CinnCacheKeyByAddress cur_key_by_address(graph, input_tensors, + target.arch_str()); + CinnCacheKeyByStructure cur_key_by_struct; + bool exist = false; { AutoRDLock r_guard{&rwlock_}; - exist = cache_.count(cur_key) != 0; + exist = cache_by_address_.count(cur_key_by_address) != 0; + // if cannot find graph by address, checkout whether the graph structure + // have been stored in cache. + if (!exist) { + // generate the structure cache key + cur_key_by_struct.SetKey(graph, input_tensors, target.arch_str()); + + // if the graph structure can be found, storing the graph address in + // cache for next query. 
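      // (In effect this is a two-level lookup: the address key is cheap and catches
      // repeated compilation requests for the very same ir::Graph object, while the
      // structural key lets an equivalent graph rebuilt at a different address reuse
      // the already compiled object instead of compiling again.)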
+ if (cache_by_struct_.count(cur_key_by_struct) != 0) { + exist = true; + cache_by_address_[cur_key_by_address] = + cache_by_struct_.at(cur_key_by_struct).get(); + } + } } if (!exist) { std::int64_t compiled_num = real_compiled_num_.fetch_add(1); auto compiled_res = CompileGraph(graph, input_tensors, target, compiled_num, stream); AutoWRLock w_guard{&rwlock_}; - if (!cache_.count(cur_key)) { - cache_[cur_key] = std::move(compiled_res); + if (!cache_by_struct_.count(cur_key_by_struct)) { + cache_by_address_[cur_key_by_address] = compiled_res.get(); + cache_by_struct_[cur_key_by_struct] = std::move(compiled_res); } } AutoRDLock guard{&rwlock_}; - const auto& cached_boj = *cache_[cur_key]; + const auto& cached_boj = *cache_by_address_[cur_key_by_address]; return cached_boj; } @@ -181,7 +200,8 @@ void CinnCompiler::Clear() { { AutoWRLock guard{&rwlock_}; graphs_.clear(); - cache_.clear(); + cache_by_address_.clear(); + cache_by_struct_.clear(); } real_compiled_num_.store(0); } @@ -217,6 +237,9 @@ std::unique_ptr CinnCompiler::CompileGraph( *compiled_obj = {std::move(graph_compiler), std::move(compiled_res.runtime_program), scope, symbol.var_model_to_program_map()}; + compiled_obj->launch_context = + std::make_unique( + compiled_obj->paddle2cinn_varmap, compiled_obj->scope); return compiled_obj; } diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h index 71119acf1fb49ec464b752a69a51b999233aea20..5070eb5ce5674dfc5803c61a1eb38117432fb4c1 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h @@ -31,6 +31,13 @@ #include "paddle/fluid/platform/macros.h" namespace paddle { + +namespace operators { +namespace details { +class CinnLaunchContext; +} // namespace details +} // namespace operators + namespace framework { namespace paddle2cinn { @@ -39,6 +46,7 @@ struct CinnCompiledObject { std::unique_ptr<::cinn::hlir::framework::Program> runtime_program; std::shared_ptr<::cinn::hlir::framework::Scope> scope; std::unordered_map paddle2cinn_varmap; + std::unique_ptr launch_context; }; // Entrance to use CINN. @@ -87,9 +95,12 @@ class CinnCompiler { void* stream = nullptr) const; std::unordered_map> graphs_; - std::unordered_map, + std::unordered_map - cache_; + cache_by_address_; + std::unordered_map, CinnCacheKey::Hash> + cache_by_struct_; std::atomic_int64_t real_compiled_num_{0}; mutable RWLock rwlock_; diff --git a/paddle/fluid/framework/pten_utils.cc b/paddle/fluid/framework/pten_utils.cc index 55254c65fad59848225af98d9dd327b155f189bb..9521df651f9de92f283a544bb1a86e421f1c3207 100644 --- a/paddle/fluid/framework/pten_utils.cc +++ b/paddle/fluid/framework/pten_utils.cc @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/pten_utils.h" +#include "paddle/pten/core/convert_utils.h" #include "paddle/pten/core/kernel_factory.h" #include "paddle/fluid/framework/lod_tensor.h" @@ -190,8 +191,9 @@ KernelArgsNameMakerByOpProto::GetAttrsArgsNames() { } KernelSignature KernelArgsNameMakerByOpProto::GetKernelSignature() { - return KernelSignature(op_proto_->type(), GetInputArgsNames(), - GetAttrsArgsNames(), GetOutputArgsNames()); + return KernelSignature(pten::TransToPtenKernelName(op_proto_->type()), + GetInputArgsNames(), GetAttrsArgsNames(), + GetOutputArgsNames()); } std::string KernelSignatureToString(const KernelSignature& signature) { diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 2e5d2260d12d486c8cda4fa91ebe56a21fa89f2c..3c28260ed278b9119cfc6f9014c9e97e554e5927 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -76,6 +76,22 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size); } +#ifdef PADDLE_WITH_IPU + else if (platform::is_ipu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { + memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::IPUPlace, src_place), src_ptr, size); + } else if (platform::is_cpu_place(src_place) && + platform::is_ipu_place(dst_place)) { + memory::Copy(BOOST_GET_CONST(platform::IPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size); + } else if (platform::is_ipu_place(src_place) && + platform::is_ipu_place(dst_place)) { + memory::Copy(BOOST_GET_CONST(platform::IPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::IPUPlace, src_place), src_ptr, size); + } +#endif + #ifdef PADDLE_WITH_XPU else if (platform::is_xpu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { @@ -386,17 +402,33 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size); } +#ifdef PADDLE_WITH_IPU + else if (platform::is_ipu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { + memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::IPUPlace, src_place), src_ptr, size); + } else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_ipu_place(dst_place)) { + memory::Copy(BOOST_GET_CONST(platform::IPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size); + } else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "Copy from %s to %s is not supported.", src_place, dst_place)); + } +#endif #ifdef PADDLE_WITH_XPU else if (platform::is_xpu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size); - } else if (platform::is_cpu_place(src_place) && // NOLINT - platform::is_xpu_place(dst_place)) { + } + else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_xpu_place(dst_place)) { memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr, BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size); - } else if (platform::is_xpu_place(src_place) && // NOLINT - 
platform::is_xpu_place(dst_place)) { + } + else if (platform::is_xpu_place(src_place) && // NOLINT + platform::is_xpu_place(dst_place)) { if (src_ptr == dst_ptr) { VLOG(3) << "Skip copy the same data async from " << src_place << " to " << dst_place; @@ -404,7 +436,8 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, } memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr, BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size); - } else { // NOLINT + } + else { // NOLINT PADDLE_THROW(platform::errors::Unimplemented( "Copy from %s to %s is not supported.", src_place, dst_place)); } @@ -571,6 +604,11 @@ class AnyVisitor : public boost::static_visitor { platform::errors::Unimplemented("Not supported on place (%s) ", npu)); // return GetResultHelper(out, npu); } + bool GetResult(const framework::Tensor& out, + const platform::IPUPlace& ipu) const { + PADDLE_THROW( + platform::errors::Unimplemented("Not supported on place (%s) ", ipu)); + } bool GetResult(const framework::Tensor& out, const platform::NPUPinnedPlace& cpu) const { @@ -762,6 +800,9 @@ struct BothFalseVisitor : public boost::static_visitor<> { void VisitorImpl(const platform::XPUPlace& xpu) const { PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported")); } + void VisitorImpl(const platform::IPUPlace& ipu) const { + PADDLE_THROW(platform::errors::Unimplemented("IPUPlace is not supported")); + } void VisitorImpl(const platform::CUDAPlace& gpu) const { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index d95c78c5db8a722e42ac63443f24b887afefcb31..6aad54fba86e481937f0462aef4cbbc35932f023 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -155,6 +155,13 @@ class TensorAddFunctor : public boost::static_visitor<> { "is not supported in imperative mode", place)); } + // there is NO support in IPUPlace + void operator()(const platform::IPUPlace& place) { + PADDLE_THROW(platform::errors::PermissionDenied( + "Gradient accumulation on place (%s) " + "is not supported in imperative mode", + place)); + } private: int64_t numel_; diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 8875ef74bce14eea9d78434f2a0aa174e8bfa092..54f46e49c4f730904056aa546a0c1d6ce51c7791 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -487,6 +487,14 @@ static void PreparedOpRunImpl( op.Type(), outs, dev_ctx->GetPlace()); } + if (FLAGS_benchmark) { + dev_ctx->Wait(); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); + VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error"; +#endif + } + /** * [ Why need handle complex gradient to real gradient? ] * diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 068de4f0435bbec8fb83aa9ee8b0cefdd71be06b..9014871229b39bd33771c1a31823923f7dfe49c6 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -211,70 +211,6 @@ void SplitTensorsWithType( } #endif -// NOTE(liubo48): Only implement operators::math::SplitFunctor for npu now. -// If later the operators::StridedMemcpyWithAxis0 is supported, -// then this specific SplitTensorsForAllReduce can be removed. 
-#ifdef PADDLE_WITH_ASCEND_CL -template <> -void SplitTensorsForAllReduce( - const platform::NPUDeviceContext &context, - framework::Variable *p_dense_contents, - std::vector *p_dense_tensors) { - auto *in = p_dense_contents->GetMutable(); - std::vector outs; - std::vector shape_refer; - - outs.reserve(p_dense_tensors->size()); - shape_refer.reserve(p_dense_tensors->size()); - - for (auto &tensor : *p_dense_tensors) { - outs.emplace_back(&tensor); - shape_refer.emplace_back(&tensor); - } - operators::math::SplitFunctor - split_functor_; - split_functor_(context, *in, shape_refer, 0, &outs); -} - -template <> -void ConcatTensorsWithType( - const platform::NPUDeviceContext &context, - const std::vector &dense_tensors_, - framework::Variable *p_dense_contents, - framework::proto::VarType::Type type) { - switch (type) { - case framework::proto::VarType::FP32: - ConcatTensorsForAllReduce( - context, dense_tensors_, p_dense_contents); - break; - default: - PADDLE_THROW(platform::errors::Unimplemented( - "Data type (%s) is not supported when it concats tensors for " - "allreduce.", - framework::DataTypeToString(type))); - } -} - -template <> -void SplitTensorsWithType( - const platform::NPUDeviceContext &context, - framework::Variable *p_dense_contents, - std::vector *p_dense_tensors, - framework::proto::VarType::Type type) { - switch (type) { - case framework::proto::VarType::FP32: - SplitTensorsForAllReduce( - context, p_dense_contents, p_dense_tensors); - break; - default: - PADDLE_THROW(platform::errors::Unimplemented( - "Data type (%s) is not supported when it splits tensors for " - "allreduce.", - framework::DataTypeToString(type))); - } -} -#endif - void Group::ConcatTensors(const platform::DeviceContext &context) { auto place = context.GetPlace(); if (platform::is_gpu_place(place)) { diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 2aed7ec001d2a9e7738dad038f146ee076772da9..c836593f3f409fcef5876c87112cbb119564e51c 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -348,13 +348,14 @@ class AllocatorFacadePrivate { const AllocatorMap& GetAllocatorMap() { #ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) { auto id = platform::CUDAGraph::CapturingID(); auto iter = cuda_graph_allocator_map_.find(id); PADDLE_ENFORCE_NE( iter, cuda_graph_allocator_map_.end(), platform::errors::PermissionDenied( "No memory pool is prepared for CUDA Graph capturing.")); + VLOG(10) << "Choose CUDA Graph memory pool to allocate memory"; return iter->second->allocators_; } else { return allocators_; @@ -405,7 +406,7 @@ class AllocatorFacadePrivate { #if defined(PADDLE_WITH_HIP) auto cuda_allocator = std::make_shared(p); cuda_allocators_[p][stream] = std::make_shared( - cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk_); + cuda_allocator, platform::GpuMinChunkSize(), 0, allow_free_idle_chunk_); #endif #if defined(PADDLE_WITH_CUDA) diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 6de32335c62b220592d8aebc2e9f6051e741fa4c..41dcf277d7a11e6fa1b90e103b148abffa28704a 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -116,6 +116,34 @@ size_t Used(const platform::CPUPlace &place) { return 
GetCPUBuddyAllocator()->Used(); } +// For Graphcore IPU +template <> +void *Alloc(const platform::IPUPlace &place, size_t size) { + VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); + VLOG(10) << "IPUPlace, Allocate on cpu."; + + void *p = GetCPUBuddyAllocator()->Alloc(size); + if (FLAGS_init_allocated_mem) { + memset(p, 0xEF, size); + } + VLOG(10) << " pointer=" << p; + return p; +} +template <> +void Free(const platform::IPUPlace &place, void *p, + size_t size) { + VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); + GetCPUBuddyAllocator()->Free(p); +} +template <> +uint64_t Release(const platform::IPUPlace &place) { + return GetCPUBuddyAllocator()->Release(); +} +template <> +size_t Used(const platform::IPUPlace &place) { + return GetCPUBuddyAllocator()->Used(); +} + // For kunlun XPU template <> void *Alloc(const platform::XPUPlace &place, size_t size) { diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.cc b/paddle/fluid/operators/cinn/cinn_launch_context.cc index 90a4ca73399cf55f29398591e9f65aab70dffbda..fa93bf00f2ac0dcd0c3dcb778357dda7d9ce3518 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_context.cc @@ -32,9 +32,34 @@ CinnLaunchContext::CinnLaunchContext( [](const auto& name_view) { return std::string(name_view.data()); }); } -bool CinnLaunchContext::IsVariableUsed(const std::string& paddle_name) { - return paddle2cinn_varmap_.count(paddle_name) > 0 && - cinn_variable_names_.count(paddle2cinn_varmap_.at(paddle_name)) > 0; +void CinnLaunchContext::UpdateCapturedEnv(const framework::Scope& scope, + const platform::Place& place) { + if (std::addressof(scope) == cached_scope_ && + std::addressof(place) == cached_place_) { + VLOG(4) << "Captured scope:" << cached_scope_ << ", place:" << cached_place_ + << " are not changed"; + return; + } + cached_scope_ = std::addressof(scope); + cached_place_ = std::addressof(place); + cached_temp_scope_ = scope.NewTmpScope(); + VLOG(4) << "Captured env is update, scope:" << cached_scope_ << "->" + << std::addressof(scope) << ", place:" << cached_place_ << "->" + << std::addressof(place); +} + +bool CinnLaunchContext::IsArgumentsInitialized() const { + if (hold_buffers_.empty() || name2argument_.empty()) { + return false; + } + return true; +} + +bool CinnLaunchContext::IsVariableUsed( + const std::string& paddle_var_name) const { + return paddle2cinn_varmap_.count(paddle_var_name) > 0 && + cinn_variable_names_.count(paddle2cinn_varmap_.at(paddle_var_name)) > + 0; } CinnTensor CinnLaunchContext::GetCinnTensor(const std::string& var_name) { @@ -53,99 +78,101 @@ std::unordered_set CinnLaunchContext::GetInternalVariableNames() { return all_parameters; } -void CinnLaunchContext::CheckTensorEquivalent(const std::string& paddle_name, - const LoDTensor& paddle_tensor, - const CinnTensor& cinn_tensor) { +void CinnLaunchContext::CheckTensorEquivalent( + const std::string& paddle_var_name, const LoDTensor& paddle_tensor, + const CinnTensor& cinn_tensor) { // check dimension auto cinn_dims = framework::make_ddim(cinn_tensor->shape().data()); PADDLE_ENFORCE_EQ(paddle_tensor.dims(), cinn_dims, platform::errors::PreconditionNotMet( "Tensors' shape in variable(%s) are not equivalent, " "paddle's shape = [%s], but cinn's shape = [%s].", - paddle_name, paddle_tensor.dims(), cinn_dims)); + paddle_var_name, paddle_tensor.dims(), cinn_dims)); // TODO(CtfGo): check the underlying data type after CINN ready } -void 
CinnLaunchContext::AssignExternalVariable(const std::string& paddle_name, - const platform::Place& place, - LoDTensor* paddle_tensor) { - PADDLE_ENFORCE_EQ(IsVariableUsed(paddle_name), true, - platform::errors::InvalidArgument( - "Paddle variable(%s) not used by cinn", paddle_name)); - - const auto& cinn_name = paddle2cinn_varmap_.at(paddle_name); - CinnTensor cinn_tensor = GetCinnTensor(cinn_name); - if (!paddle_tensor->IsInitialized()) { - paddle_tensor->Resize(framework::make_ddim(cinn_tensor->shape().data())); +void CinnLaunchContext::AssignExternalVariable( + const std::string& paddle_var_name) { + PADDLE_ENFORCE_EQ( + IsVariableUsed(paddle_var_name), true, + platform::errors::InvalidArgument("Paddle variable(%s) not used by cinn", + paddle_var_name)); + + const auto& cinn_var_name = paddle2cinn_varmap_.at(paddle_var_name); + const auto& paddle_tensor = + cached_scope_->GetVar(paddle_var_name)->Get(); + CinnTensor cinn_tensor = GetCinnTensor(cinn_var_name); + if (paddle_tensor.IsInitialized()) { + CheckTensorEquivalent(paddle_var_name, paddle_tensor, cinn_tensor); } - CheckTensorEquivalent(paddle_name, *paddle_tensor, cinn_tensor); - return SetArgument(cinn_name, place, /* free_mem_callback = */ false, - paddle_tensor); -} -void CinnLaunchContext::AssignInternalVariable(const std::string& cinn_name, - const platform::Place& place, - LoDTensor* paddle_tensor) { - PADDLE_ENFORCE_GT(cinn_variable_names_.count(cinn_name), 0, - platform::errors::InvalidArgument( - "Variable(%s) not found in cinn socpe.", cinn_name)); - CinnTensor cinn_tensor = GetCinnTensor(cinn_name); - if (!paddle_tensor->IsInitialized()) { - paddle_tensor->Resize(framework::make_ddim(cinn_tensor->shape().data())); - } - CheckTensorEquivalent(cinn_name, *paddle_tensor, cinn_tensor); - return SetArgument(cinn_name, place, /* free_mem_callback = */ true, - paddle_tensor); -} + auto cinn_buffer = std::make_unique(); + // assign dimensions and alloc/free callback of cinn_buffer_t + cinn_buffer->resize(cinn_tensor->shape().data().data(), + cinn_tensor->shape().data().size()); + cinn_buffer->external_malloc = new std::function( + [this, paddle_var_name](void* ctx, cinn_buffer_t* buffer) { + auto* tensor = + cached_scope_->GetVar(paddle_var_name)->GetMutable(); + tensor->Resize(framework::DDim(buffer->dims, buffer->dimensions)); + buffer->memory = reinterpret_cast( + tensor->mutable_data(*cached_place_)); + return 0; + }); -std::unique_ptr CinnLaunchContext::ShareTensorWithCinnBuffer( - const platform::Place& place, bool free_mem_callback, LoDTensor* tensor) { - // convert paddle dimensions array to cinn format - std::vector cinn_dims(tensor->dims().size()); - for (auto i = 0; i < tensor->dims().size(); ++i) { - cinn_dims[i] = static_cast(tensor->dims().at(i)); - } + // external variables will be recycled by global gc, so do nothing here + cinn_buffer->external_free = new std::function( + [](void* ctx, cinn_buffer_t* buffer) { + // Do nothing + return 0; + }); + return SetArgument(cinn_var_name, std::move(cinn_buffer)); +} + +void CinnLaunchContext::AssignInternalVariable( + const std::string& cinn_var_name) { + PADDLE_ENFORCE_GT( + cinn_variable_names_.count(cinn_var_name), 0, + platform::errors::InvalidArgument("Variable(%s) not found in cinn socpe.", + cinn_var_name)); + CinnTensor cinn_tensor = GetCinnTensor(cinn_var_name); auto cinn_buffer = std::make_unique(); - // assign size and memory - cinn_buffer->resize(cinn_dims.data(), cinn_dims.size()); + // assign dimensions and alloc/free callback of cinn_buffer_t + 
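  // (No memory is allocated at this point: the CINN runtime is expected to invoke
  // external_malloc when the compiled instructions first need this buffer, which
  // resizes a tensor held in the temporary scope captured by UpdateCapturedEnv, and
  // external_free clears that tensor again so internal buffers do not outlive their
  // last use.)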
cinn_buffer->resize(cinn_tensor->shape().data().data(), + cinn_tensor->shape().data().size()); cinn_buffer->external_malloc = new std::function( - [place, tensor](void* ctx, cinn_buffer_t* buffer) { - buffer->memory = - reinterpret_cast(tensor->mutable_data(place)); + [this, cinn_var_name](void* ctx, cinn_buffer_t* buffer) { + auto* tensor = + cached_temp_scope_->Var(cinn_var_name)->GetMutable(); + tensor->Resize(framework::DDim(buffer->dims, buffer->dimensions)); + buffer->memory = reinterpret_cast( + tensor->mutable_data(*cached_place_)); return 0; }); - if (free_mem_callback) { - cinn_buffer->external_free = new std::function( - [tensor](void* ctx, cinn_buffer_t* buffer) { - tensor->clear(); - return 0; - }); - return cinn_buffer; - } - + // internal variables should release its buffer immediately + // if no instruction use it cinn_buffer->external_free = new std::function( - [](void* ctx, cinn_buffer_t* buffer) { - // Do nothing + [this, cinn_var_name](void* ctx, cinn_buffer_t* buffer) { + auto* tensor = + cached_temp_scope_->GetVar(cinn_var_name)->GetMutable(); + tensor->clear(); return 0; }); - return cinn_buffer; + return SetArgument(cinn_var_name, std::move(cinn_buffer)); } -void CinnLaunchContext::SetArgument(const std::string& cinn_name, - const platform::Place& place, - bool free_mem_callback, - LoDTensor* paddle_tensor) { - auto buffer = - ShareTensorWithCinnBuffer(place, free_mem_callback, paddle_tensor); - name2argument_.emplace(cinn_name, buffer.get()); +void CinnLaunchContext::SetArgument(const std::string& cinn_var_name, + std::unique_ptr&& buffer) { + VLOG(4) << "SetArgument-" << name2argument_.size() << ": name(" + << cinn_var_name << "), dims(" + << framework::DDim(buffer->dims, buffer->dimensions) << ")."; + + name2argument_.emplace(cinn_var_name, buffer.get()); hold_buffers_.emplace_back(std::move(buffer)); - VLOG(4) << "SetArgument-" << name2argument_.size() << ": " - << "name(" << cinn_name << "), dims(" << paddle_tensor->dims() - << ")."; } const std::map& diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.h b/paddle/fluid/operators/cinn/cinn_launch_context.h index c990255d68253d048d8c5b3806dbabf80efe746d..7b71d77d8b8860264872b88d86a5cfe7ae82be96 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.h +++ b/paddle/fluid/operators/cinn/cinn_launch_context.h @@ -24,7 +24,7 @@ #include "cinn/runtime/cinn_runtime.h" #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/framework/scope.h" namespace paddle { namespace operators { @@ -40,16 +40,22 @@ class CinnLaunchContext { const std::unordered_map& paddle2cinn_varmap, const std::shared_ptr& cinn_scope); + // explicitly update several environment variables captured + // by callback of execution arguments + void UpdateCapturedEnv(const framework::Scope& scope, + const platform::Place& place); + + // Return whether execution arguments has been initialized + bool IsArgumentsInitialized() const; + // Return whether a Paddle variable used on compiled kernels - bool IsVariableUsed(const std::string& var_name); + bool IsVariableUsed(const std::string& paddle_var_name) const; // Assign tensor buffer to input or output variables - void AssignExternalVariable(const std::string& var_name, - const platform::Place& place, LoDTensor* tensor); + void AssignExternalVariable(const std::string& paddle_var_name); // Assign tensor buffer to internal variables - void AssignInternalVariable(const std::string& var_name, - const 
platform::Place& place, LoDTensor* tensor); + void AssignInternalVariable(const std::string& cinn_var_name); // Extract internal variable names from CinnScope // by excluding used input and output variables @@ -58,10 +64,6 @@ class CinnLaunchContext { // Finalize all execution arguments and return them const std::map& FinalizeArguments() const; - std::vector> HandoverBuffers() { - return std::move(hold_buffers_); - } - private: // Get CinnTensor with CINN variable name CinnTensor GetCinnTensor(const std::string& var_name); @@ -72,16 +74,15 @@ class CinnLaunchContext { const LoDTensor& paddle_tensor, const CinnTensor& cinn_tensor); - // Share the buffer of a Paddle tensor to CINN by delivering memory address - // to a cinn_buffer_t object - std::unique_ptr ShareTensorWithCinnBuffer( - const platform::Place& place, bool free_mem_callback, LoDTensor* tensor); - - // Set an argument with (cinn name)->(paddle tensor) pair - void SetArgument(const std::string& cinn_name, const platform::Place& place, - bool free_mem_callback, LoDTensor* paddle_tensor); + // Set an argument with (cinn name)->(cinn_buffer_t) pair + void SetArgument(const std::string& cinn_var_name, + std::unique_ptr&& buffer); private: + const framework::Scope* cached_scope_ = nullptr; + const platform::Place* cached_place_ = nullptr; + std::unique_ptr cached_temp_scope_ = nullptr; + // a variable name map from paddle to cinn const std::unordered_map& paddle2cinn_varmap_; // the variable scope of cinn diff --git a/paddle/fluid/operators/cinn/cinn_launch_context_test.cc b/paddle/fluid/operators/cinn/cinn_launch_context_test.cc index d922e8355b44c5e15214160f15dec22cf70de719..da7640c3c0f682e7b5364543f7c06d1c6e463e94 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context_test.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_context_test.cc @@ -45,81 +45,86 @@ std::unique_ptr CreateDefaultLaunchContext() { return std::make_unique(paddle2cinn_varmap, cinn_scope); } -TEST(CinnLaunchContextTest, TestIsVariableUsed) { +TEST(CinnLaunchContextTest, TestBasic) { auto launch_context = CreateDefaultLaunchContext(); - + // test IsVariableUsed ASSERT_EQ(launch_context->IsVariableUsed("var1"), true); ASSERT_EQ(launch_context->IsVariableUsed("var4"), false); -} - -TEST(CinnLaunchContextTest, TestGetInternalVariableNames) { - auto launch_context = CreateDefaultLaunchContext(); - auto internal_variable_names = launch_context->GetInternalVariableNames(); - ASSERT_EQ(internal_variable_names.size(), 3); - EXPECT_NE(internal_variable_names.find("cinn_var2"), - internal_variable_names.end()); + // test UpdateCapturedEnv + platform::CPUPlace place; + framework::Scope scope; + ASSERT_NO_THROW(launch_context->UpdateCapturedEnv(scope, place)); + // test IsArgumentsInitialized + ASSERT_FALSE(launch_context->IsArgumentsInitialized()); } TEST(CinnLaunchContextTest, TestCheckTensorEquivalent) { - auto launch_context = CreateDefaultLaunchContext(); platform::CPUPlace place; framework::Scope scope; + auto launch_context = CreateDefaultLaunchContext(); + launch_context->UpdateCapturedEnv(scope, place); auto* tensor1 = scope.Var("var1")->GetMutable(); // CheckTensorEquivalent: tensor dimension not equivalent tensor1->mutable_data(framework::make_ddim({3, 5}), place); - ASSERT_THROW(launch_context->AssignExternalVariable("var1", place, tensor1), + ASSERT_THROW(launch_context->AssignExternalVariable("var1"), paddle::platform::EnforceNotMet); } TEST(CinnLaunchContextTest, TestAssignVariablePreCondition) { - auto launch_context = CreateDefaultLaunchContext(); 
platform::CPUPlace place; framework::Scope scope; + auto launch_context = CreateDefaultLaunchContext(); + launch_context->UpdateCapturedEnv(scope, place); auto* tensor4 = scope.Var("var4")->GetMutable(); // not used - ASSERT_THROW(launch_context->AssignExternalVariable("var4", place, tensor4), + ASSERT_THROW(launch_context->AssignExternalVariable("var4"), paddle::platform::EnforceNotMet); // not found - ASSERT_THROW( - launch_context->AssignExternalVariable("cinn_var4", place, tensor4), - paddle::platform::EnforceNotMet); + ASSERT_THROW(launch_context->AssignInternalVariable("cinn_var4"), + paddle::platform::EnforceNotMet); } TEST(CinnLaunchContextTest, TestSetArgument) { + platform::CPUPlace cpu_place; + platform::Place place(cpu_place); + framework::Scope scope; auto launch_context = CreateDefaultLaunchContext(); + launch_context->UpdateCapturedEnv(scope, place); - platform::CPUPlace place; - framework::Scope scope; + // assign external variables auto* tensor1 = scope.Var("var1")->GetMutable(); float* data1 = tensor1->mutable_data(framework::make_ddim({3, 4}), place); data1[0] = 9.99f; data1[10] = 19.99f; + ASSERT_NO_THROW(launch_context->AssignExternalVariable("var1")); - // assign external variable - ASSERT_NO_THROW( - launch_context->AssignExternalVariable("var1", place, tensor1)); - auto* tensor2 = scope.Var("var2")->GetMutable(); - tensor2->mutable_data(framework::make_ddim({6, 7, 8}), place); - ASSERT_NO_THROW( - launch_context->AssignInternalVariable("cinn_var2", place, tensor2)); - // FinalizeArguments not missed check - ASSERT_THROW(launch_context->FinalizeArguments(), - paddle::platform::EnforceNotMet); auto* tensor3 = scope.Var("var3")->GetMutable(); tensor3->mutable_data(framework::make_ddim({10, 16}), place); - ASSERT_NO_THROW( - launch_context->AssignExternalVariable("var3", place, tensor3)); + ASSERT_NO_THROW(launch_context->AssignExternalVariable("var3")); + + // FinalizeArguments missed check + ASSERT_THROW(launch_context->FinalizeArguments(), + paddle::platform::EnforceNotMet); + // test get internal variables + auto internal_variable_names = launch_context->GetInternalVariableNames(); + ASSERT_EQ(internal_variable_names.size(), 1); + EXPECT_EQ(*internal_variable_names.begin(), "cinn_var2"); + auto* tensor2 = scope.Var("var2")->GetMutable(); + tensor2->mutable_data(framework::make_ddim({6, 7, 8}), place); + ASSERT_NO_THROW(launch_context->AssignInternalVariable("cinn_var2")); + + // check argument is set correctly and alloc/free callbacks work well auto name2argument = launch_context->FinalizeArguments(); ASSERT_EQ(name2argument.size(), 3); ASSERT_EQ(name2argument.count("cinn_var1"), 1); - // check ShareTensorWithCinnBuffer + ASSERT_TRUE(launch_context->IsArgumentsInitialized()); + auto* cinn_buffer = static_cast(name2argument.at("cinn_var1")); - ASSERT_EQ(cinn_buffer->memory, nullptr); cinn_buffer->external_malloc->operator()(nullptr, cinn_buffer); ASSERT_NE(cinn_buffer->memory, nullptr); diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc b/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc index 813e7b1152f87eeed18380aaa98cd507d28121b5..ea36a19202ef0696918792210f20dd2c2818e700 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc @@ -31,26 +31,6 @@ namespace operators { namespace details { #ifdef PADDLE_WITH_CUDA -void CUDART_CB ReleaseScope(void* data) { - auto* temp_scope = static_cast(data); - delete temp_scope; -} - -void CUDART_CB ReleaseBuffers(void* data) { - auto* buffers = - 
static_cast>*>(data); - delete buffers; -} - -template <> -void ReleaseResource( - const std::vector& resources, void* stream) { - PADDLE_ENFORCE_GPU_SUCCESS(cudaLaunchHostFunc( - static_cast(stream), ReleaseScope, resources[0])); - PADDLE_ENFORCE_GPU_SUCCESS(cudaLaunchHostFunc( - static_cast(stream), ReleaseBuffers, resources[1])); -} - template <> void* GetStream( const framework::ExecutionContext& ctx) { diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.h b/paddle/fluid/operators/cinn/cinn_launch_op.h index 3a272916332beace3b163414d717eb50c79d3a85..170546ed23041237095474830b4d6fd2d11d8783 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.h +++ b/paddle/fluid/operators/cinn/cinn_launch_op.h @@ -56,25 +56,12 @@ void LaunchCinnExecution(const CinnCompiledObject& compiled_obj, // Set cinn FLAGS (such as FLAGS_cinn_cudnn_deterministic) with paddle's FLAGS. void SetCinnRuntimeFlags(); -template -void ReleaseResource(const std::vector& resources, void* stream) { - auto* temp_scope = static_cast(resources[0]); - auto* buffers = - static_cast>*>(resources[1]); - delete temp_scope; - delete buffers; -} - template void* GetStream(const framework::ExecutionContext& ctx) { return nullptr; } #ifdef PADDLE_WITH_CUDA -template <> -void ReleaseResource( - const std::vector& resources, void* stream); - template <> void* GetStream( const framework::ExecutionContext& ctx); @@ -116,56 +103,54 @@ class CinnLaunchOpKernel : public framework::OpKernel { compilation_key, inputs_name2tensor, target, stream); details::DebugCinnCompiledResult(cinn_compiled_object); - auto launch_context = std::make_unique( - cinn_compiled_object.paddle2cinn_varmap, cinn_compiled_object.scope); - + auto* launch_context = cinn_compiled_object.launch_context.get(); // Step 3. Prepare arguments needed for the compiled executable program. - VLOG(4) << "CinnLaunchOp prepare arguments"; - - // 3.1 Prepare input variables: tensors of input variables have - // been initialized before graph compiled, just check the - // equiality between tensors of paddle and cinn. - for (const auto& var_name : input_variable_names) { - if (!launch_context->IsVariableUsed(var_name)) { - // some input variables don't need for cinn because they are - // eliminated by optimized passes or some cinn operators use - // less variables - VLOG(4) << "Input variable(" << var_name << ") not used by cinn"; - continue; + launch_context->UpdateCapturedEnv(scope, place); + if (!launch_context->IsArgumentsInitialized()) { + VLOG(4) << "CinnLaunchOp prepare arguments"; + + // 3.1 Prepare input variables: tensors of input variables have + // been initialized before graph compiled, just check the + // equiality between tensors of paddle and cinn. + for (const auto& var_name : input_variable_names) { + if (!launch_context->IsVariableUsed(var_name)) { + // some input variables don't need for cinn because they are + // eliminated by optimized passes or some cinn operators use + // less variables + VLOG(4) << "Input variable(" << var_name << ") not used by cinn"; + continue; + } + + launch_context->AssignExternalVariable(var_name); } - launch_context->AssignExternalVariable( - var_name, place, scope.GetVar(var_name)->GetMutable()); - } - - // 3.2 Prepare output variables: all output variables should - // be initialized and allocated buffer before - // the runtime program start execution, the compilation result - // includes details of their buffer assginment and we use that to - // allocate space in Paddle. 
For those variables allocated yet, - // like persistable parameters, just check the equiality between - // Paddle allocation and CINN buffer assginment. - auto output_variable_names = ctx.OutputNames(kOutputs); - for (const auto var_name : output_variable_names) { - PADDLE_ENFORCE_EQ(launch_context->IsVariableUsed(var_name), true, - platform::errors::InvalidArgument( - "Output variable(%s) not used by cinn", var_name)); - - auto* tensor = scope.GetVar(var_name)->GetMutable(); - launch_context->AssignExternalVariable(var_name, place, tensor); - } + // 3.2 Prepare output variables: all output variables should + // be initialized and allocated buffer before + // the runtime program start execution, the compilation result + // includes details of their buffer assginment and we use that to + // allocate space in Paddle. For those variables allocated yet, + // like persistable parameters, just check the equiality between + // Paddle allocation and CINN buffer assginment. + auto output_variable_names = ctx.OutputNames(kOutputs); + for (const auto var_name : output_variable_names) { + PADDLE_ENFORCE_EQ( + launch_context->IsVariableUsed(var_name), true, + platform::errors::InvalidArgument( + "Output variable(%s) not used by cinn", var_name)); + + launch_context->AssignExternalVariable(var_name); + } - // 3.3 Prepare internal or temporary variables: Create a temporary - // scope to keep internal variables within graph or temporary - // variables needed by the compiled runtime program in addition. - // Here we directly use the names from CinnScope as Paddle variable - // names, because they will not be used outside the graph - // and should be destructed after computation finished. - auto internal_variable_names = launch_context->GetInternalVariableNames(); - framework::Scope* temp_scope = scope.NewTmpScope().release(); - for (const auto& var_name : internal_variable_names) { - auto* tensor = temp_scope->Var(var_name)->GetMutable(); - launch_context->AssignInternalVariable(var_name, place, tensor); + // 3.3 Prepare internal or temporary variables: Create a temporary + // scope to keep internal variables within graph or temporary + // variables needed by the compiled runtime program in addition. + // Here we directly use the names from CinnScope as Paddle variable + // names, because they will not be used outside the graph + // and should be destructed after computation finished. + auto internal_variable_names = launch_context->GetInternalVariableNames(); + for (const auto& var_name : internal_variable_names) { + launch_context->AssignInternalVariable(var_name); + } } // Step 4. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic. @@ -175,12 +160,6 @@ class CinnLaunchOpKernel : public framework::OpKernel { VLOG(4) << "Run Cinn compiled executable program with stream: " << stream; details::LaunchCinnExecution(cinn_compiled_object, *launch_context, stream); VLOG(4) << "CinnLaunchOp launch execution done."; - - // Step 6. Release some resources, such as `temp_scope` and cinn_buffers. 
- auto* buffers_holder = new std::vector>{ - launch_context->HandoverBuffers()}; - details::ReleaseResource({temp_scope, buffers_holder}, - stream); } }; diff --git a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc index 02373c38184fca3722312dc23e92ac51be86d1ea..e10fdf522ff7c54c39a117e05280694f1e92cd5a 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc @@ -130,8 +130,9 @@ TEST(CinnLaunchOpTest, TestElementwiseAddPass) { scope.Var(test_out_name)->GetMutable(); scope.Var(expected_out_name)->GetMutable(); - cinn_launch_op->Run(scope, place); - elementwise_add_op->Run(scope, place); + platform::Place run_place(place); + cinn_launch_op->Run(scope, run_place); + elementwise_add_op->Run(scope, run_place); LoDTensor test_out, expected_out; TensorCopySync(scope.Var(test_out_name)->Get(), diff --git a/paddle/fluid/operators/complex_view_op.cc b/paddle/fluid/operators/complex_view_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..2fb21ca4ea75392180539ce8e76db353405ed97a --- /dev/null +++ b/paddle/fluid/operators/complex_view_op.cc @@ -0,0 +1,163 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/complex_view_op.h" + +#include +#include +#include +#include +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { + +class AsComplexOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "as_complex"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "as_complex"); + + auto in_dims = ctx->GetInputDim("X"); + const int input_rank = in_dims.size(); + PADDLE_ENFORCE_GE( + input_rank, 1, + platform::errors::InvalidArgument( + "The rank of input(X) is less than 1. " + "Expected the rank of input(X) to be equal to or greater than 1." + "But received rank of input(X) = %d", + input_rank)); + const int last_dim_size = in_dims[input_rank - 1]; + PADDLE_ENFORCE_EQ( + last_dim_size, 2, + platform::errors::InvalidArgument( + "The size of the last dimension of input(X)" + "does not equals 2." + "Expected the size of last dimension of input(X) to be 2." + "But received %d", + last_dim_size)); + + const framework::DDim out_dims(in_dims.Get(), input_rank - 1); + ctx->SetOutputDim("Out", out_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class AsComplexOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of view_as_complex op."); + AddOutput("Out", "(Tensor), The output tensor of view_as_complex op."); + AddComment(R"DOC( +As_complex Operator. 
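For example, a float32 input X of shape [N, 2] is viewed as a complex64 output of shape [N], where X[..., 0] supplies the real part and X[..., 1] the imaginary part of each element.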
+ +This operator is used to return a complex tensor represented +by an old-fashioned real tensor. The size of the last dimension of +the input tensor should be 2, which corresponds to 'real' and +'complex', respectively. + +)DOC"); + } +}; + +template +class AsComplexGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + void Apply(GradOpPtr retv) const override { + retv->SetType("as_real"); + retv->SetInput("X", this->OutputGrad("Out")); + retv->SetAttrMap(this->Attrs()); + retv->SetOutput("Out", this->InputGrad("X")); + } +}; + +class AsRealOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "as_real"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "as_real"); + + auto out_dims_v = framework::vectorize(ctx->GetInputDim("X")); + out_dims_v.push_back(2); + const framework::DDim out_dims = framework::make_ddim(out_dims_v); + ctx->SetOutputDim("Out", out_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto input_data_type = + framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); + return framework::OpKernelType(framework::ToRealType(input_data_type), + ctx.GetPlace()); + } +}; + +class AsRealOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of as_real op."); + AddOutput("Out", "(Tensor), The output tensor of as_real op."); + AddComment(R"DOC( +AsReal Operator. + +This operator is used to return an old-fashioned real tensor from a +complex tensor. The size of the last dimension of the output tensor is 2, +which corresponds to 'real' and 'complex', respectively. + +)DOC"); + } +}; + +template +class AsRealGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + void Apply(GradOpPtr retv) const override { + retv->SetType("as_complex"); + retv->SetInput("X", this->OutputGrad("Out")); + retv->SetAttrMap(this->Attrs()); + retv->SetOutput("Out", this->InputGrad("X")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(as_complex, ops::AsComplexOp, ops::AsComplexOpMaker, + ops::AsComplexGradMaker, + ops::AsComplexGradMaker); + +REGISTER_OPERATOR(as_real, ops::AsRealOp, ops::AsRealOpMaker, + ops::AsRealGradMaker, + ops::AsRealGradMaker); + +REGISTER_OP_CPU_KERNEL( + as_complex, ops::AsComplexKernel, + ops::AsComplexKernel); + +REGISTER_OP_CPU_KERNEL( + as_real, ops::AsRealKernel, + ops::AsRealKernel); diff --git a/paddle/fluid/operators/complex_view_op.cu b/paddle/fluid/operators/complex_view_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..261881cb8d256dcfd5afc465a55b6ba642656e38 --- /dev/null +++ b/paddle/fluid/operators/complex_view_op.cu @@ -0,0 +1,29 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
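The as_complex / as_real pair registered above is a pure view-style conversion: a real tensor whose last dimension has size 2 (the real and imaginary parts) is reinterpreted as a complex tensor of one lower rank, and as_real goes the other way; the kernels only copy bytes and then fix up shape and dtype. A minimal standalone sketch of the same reinterpretation on a plain buffer, using std::complex<float> as a stand-in for the framework's complex type:

#include <complex>
#include <cstring>
#include <iostream>
#include <vector>

int main() {
  // A [3, 2] float buffer: each row holds (real, imag).
  std::vector<float> real_view = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};

  // "as_complex": same bytes, logical shape becomes [3].
  std::vector<std::complex<float>> complex_view(real_view.size() / 2);
  std::memcpy(complex_view.data(), real_view.data(),
              real_view.size() * sizeof(float));
  for (const auto &c : complex_view) std::cout << c << " ";  // (1,2) (3,4) (5,6)
  std::cout << "\n";

  // "as_real": copy back, logical shape becomes [3, 2] again.
  std::vector<float> round_trip(real_view.size());
  std::memcpy(round_trip.data(), complex_view.data(),
              round_trip.size() * sizeof(float));
  std::cout << round_trip[4] << "\n";  // 5
  return 0;
}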
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/complex_view_op.h" + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/platform/enforce.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + as_complex, + ops::AsComplexKernel, + ops::AsComplexKernel); + +REGISTER_OP_CUDA_KERNEL( + as_real, ops::AsRealKernel, + ops::AsRealKernel); diff --git a/paddle/fluid/operators/complex_view_op.h b/paddle/fluid/operators/complex_view_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9a8d89db4020828cc9fdba90dd99ab7e5395864b --- /dev/null +++ b/paddle/fluid/operators/complex_view_op.h @@ -0,0 +1,60 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/math/complex_functors.h" +#include "paddle/fluid/platform/complex.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +template +class AsComplexKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const auto* x = context.Input("X"); + auto* out = context.Output("Out"); + out->mutable_data>(context.GetPlace()); + + // TensorCopy also changes output's shape & dtype + const framework::DDim out_dims_original = out->dims(); + framework::TensorCopy(*x, context.GetPlace(), out); + out->Resize(out_dims_original); // restored the shape + out->mutable_data>( + context.GetPlace()); // restore the dtype + } +}; + +template +class AsRealKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const auto* x = context.Input("X"); + auto* out = context.Output("Out"); + + out->mutable_data(context.GetPlace()); + const framework::DDim out_dims_original = out->dims(); + framework::TensorCopy(*x, context.GetPlace(), out); + out->Resize(out_dims_original); // restored the shape + out->mutable_data(context.GetPlace()); // restore the dtype + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/ipu_runtime_op.cc b/paddle/fluid/operators/ipu_runtime_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..4b473da00f3318135f194dd90151fbfb39315fee --- /dev/null +++ b/paddle/fluid/operators/ipu_runtime_op.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/ipu_runtime_op.h" + +namespace paddle { +namespace operators { + +class IpuRuntimeOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override {} + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::proto::VarType::Type(ctx.Attr("dtype")), + ctx.device_context()); + } +}; + +class IpuRuntimeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("FeedList", "FeedList of Graph").AsDuplicable(); + AddOutput("FetchList", "FetchList of Graph").AsDuplicable(); + AddAttr("dtype", + "(int, default 5 (FP32)) " + "Output data type") + .SetDefault(framework::proto::VarType::FP32); + AddComment(R"DOC( +Run graph by PopART runtime. + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(ipu_runtime, ops::IpuRuntimeOp, ops::IpuRuntimeOpMaker); + +REGISTER_OP_IPU_KERNEL(ipu_runtime, ops::IpuRuntimeKernel, + ops::IpuRuntimeKernel, + ops::IpuRuntimeKernel, + ops::IpuRuntimeKernel, + ops::IpuRuntimeKernel, + ops::IpuRuntimeKernel, + ops::IpuRuntimeKernel); diff --git a/paddle/fluid/operators/ipu_runtime_op.h b/paddle/fluid/operators/ipu_runtime_op.h new file mode 100644 index 0000000000000000000000000000000000000000..b6fc9ae98895d40d2e2d1c9eb02a63d200b0b1f8 --- /dev/null +++ b/paddle/fluid/operators/ipu_runtime_op.h @@ -0,0 +1,69 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
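ipu_runtime_op.h below compiles the same kernel with and without IPU support: the real body is only built under PADDLE_WITH_IPU, and the fallback path raises a PreconditionNotMet error at run time. A minimal sketch of this conditional-compilation fallback pattern, with a hypothetical WITH_FOO flag and a plain exception standing in for PADDLE_THROW:

#include <iostream>
#include <stdexcept>

void RunDeviceKernel() {
#ifdef WITH_FOO
  // The device-specific implementation would live here and reference
  // headers that only exist in a WITH_FOO build.
  RunOnFoo();  // hypothetical device call
#else
  throw std::runtime_error(
      "Please compile with WITH_FOO to enable this kernel");
#endif
}

int main() {
  try {
    RunDeviceKernel();
  } catch (const std::exception &e) {
    std::cout << e.what() << "\n";
  }
  return 0;
}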
+ +#pragma once +#include +#include + +#include "paddle/fluid/framework/op_registry.h" +#ifdef PADDLE_WITH_IPU +#include "paddle/fluid/framework/ipu/ipu_backend.h" +#include "paddle/fluid/framework/tensor.h" +#endif + +namespace paddle { +namespace operators { + +template +class IpuRuntimeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#ifdef PADDLE_WITH_IPU + auto ipu_backend = framework::ipu::IpuBackend::GetInstance(); + if (!ipu_backend->DeviceIsAttached()) { + const platform::IPUDeviceContext& ipu_ctx = + reinterpret_cast( + ctx.device_context()); + ipu_backend->AttachDevice(ipu_ctx.DeviceId()); + } + + auto inputs = ctx.MultiInput("FeedList"); + auto outputs = ctx.MultiOutput("FetchList"); + auto output_names = ctx.OutputNames("FetchList"); + VLOG(4) << "IpuRuntime Kernel, begin to run graph"; + ipu_backend->Run(inputs, outputs, ctx); + + // post-run + // resize tensor when tensor.dims() is empty + for (size_t i = 0; i < outputs.size(); ++i) { + auto* out = outputs[i]; + if (out->dims().size() == 0) { + auto tensor_dtype = out->type(); + auto sizeof_dtype = framework::SizeOfType(tensor_dtype); + int64_t dim = out->memory_size() / sizeof_dtype; + out->Resize({dim}); + VLOG(10) << "set ipu_runtime_op output: " << output_names[i] + << " dims from () to: " + << "(" << dim << ")"; + } + } +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "Please compile WITH_IPU option to enable ipu_runtime op")); +#endif + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index 4201af18ca7547172017c04d660d7bcfa5b668b1..daa4efa02ac5081e6ddcd0ed45f6e98c826557ba 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -173,6 +173,13 @@ void set_constant_with_place( platform::errors::Unimplemented("NPUPinnedPlace is not supported")); } +template <> +void set_constant_with_place( + const platform::DeviceContext& context, framework::Tensor* tensor, + float value) { + PADDLE_THROW(platform::errors::Unimplemented("IPUPlace is not supported")); +} + template <> void set_constant_with_place( const platform::DeviceContext& context, framework::Tensor* tensor, diff --git a/paddle/fluid/operators/py_layer_op.h b/paddle/fluid/operators/py_layer_op.h index d80faab90b223622ef18b6244325206bb12156bf..6625a4a1a753c9ab5c2a1f735e96d8e2ae99e1dc 100644 --- a/paddle/fluid/operators/py_layer_op.h +++ b/paddle/fluid/operators/py_layer_op.h @@ -54,7 +54,7 @@ class PyLayerOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + auto data_type = paddle::framework::proto::VarType::Type::VarType_Type_FP32; return framework::OpKernelType(data_type, ctx.device_context()); } diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index ed06fac298a8fd2965f93993e796ef1ed1ba0c6e..155eb1ebbe3db3416aa6f14e1e7f6e083bfb54e6 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -555,10 +555,10 @@ class Reshape2Op : public ReshapeOp { const framework::ExecutionContext &ctx) const override { auto multi_inputs = ctx.MultiInput("ShapeTensor"); if (multi_inputs.size() > 0) { - return framework::KernelSignature("reshape.mulhost", {"X", "ShapeTensor"}, + return 
framework::KernelSignature("reshape_mulhost", {"X", "ShapeTensor"}, {}, {"Out"}); } else if (ctx.HasInput("Shape")) { - return framework::KernelSignature("reshape.host", {"X", "Shape"}, {}, + return framework::KernelSignature("reshape_host", {"X", "Shape"}, {}, {"Out"}); } else { return framework::KernelSignature("reshape", {"X"}, {"shape"}, {"Out"}); diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc b/paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc index 3970acf82d3ea3f4b3465e6f561160ab03fcac7c..8ee3b118c32f20c5facef51bffa81964981cf33d 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc @@ -18,6 +18,7 @@ namespace paddle { namespace platform { std::unique_ptr CUDAGraph::capturing_graph_{nullptr}; +paddle::optional CUDAGraph::capturing_thread_id_{paddle::none}; void CUDAGraph::Reset() { if (is_reset_) return; @@ -58,6 +59,13 @@ void CUDAGraph::BeginSegmentCapture() { IsCapturing(), true, errors::PermissionDenied("BeginSegmentCapture should be called when CUDA " "Graph is capturing.")); + if (IsThreadLocalCapturing()) { + PADDLE_ENFORCE_EQ(IsThisThreadCapturing(), true, + platform::errors::PermissionDenied( + "When capturing CUDA Graph in the thread local mode, " + "you cannot begin segmented capturing in the thread " + "which is not the one that starts the capturing.")); + } PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamBeginCapture( capturing_graph_->stream_, capturing_graph_->capture_mode_)); PADDLE_ENFORCE_EQ(IsValidCapturing(), true, @@ -82,6 +90,11 @@ void CUDAGraph::BeginCapture(platform::CUDAPlace place, cudaStream_t stream, capturing_graph_->place_ = place; capturing_graph_->stream_ = stream; capturing_graph_->capture_mode_ = mode; + if (mode == cudaStreamCaptureModeThreadLocal) { + capturing_thread_id_ = std::this_thread::get_id(); + VLOG(10) << "Capturing CUDA Graph in thread local mode, thread id: " + << capturing_thread_id_; + } BeginSegmentCapture(); #endif } @@ -115,6 +128,7 @@ void CUDAGraph::EndSegmentCapture() { std::unique_ptr CUDAGraph::EndCapture() { EndSegmentCapture(); + capturing_thread_id_ = paddle::none; return std::move(capturing_graph_); } diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_graph.h b/paddle/fluid/platform/device/gpu/cuda/cuda_graph.h index 0856e0fad1900a8f5a7c586f083acd3ec25e480f..ca1e7abb375cb65110fec3b73b6609519052a6bb 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_graph.h +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_graph.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include "cuda.h" // NOLINT #include "cuda_runtime.h" // NOLINT @@ -26,6 +27,7 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/place.h" +#include "paddle/utils/optional.h" namespace paddle { namespace platform { @@ -99,6 +101,25 @@ class CUDAGraph { // supported during capturing CUDA Graph. static bool IsValidCapturing(); + static bool IsThreadLocalCapturing() { +#if CUDA_VERSION >= 10010 + return IsCapturing() && + capturing_graph_->capture_mode_ == cudaStreamCaptureModeThreadLocal; +#else + return false; +#endif + } + + static bool IsThisThreadCapturing() { + if (UNLIKELY(IsCapturing())) { + return IsThreadLocalCapturing() + ? 
capturing_thread_id_.get() == std::this_thread::get_id() + : true; + } else { + return false; + } + } + private: static CUDAGraphID UniqueID() { static std::atomic id; @@ -118,6 +139,7 @@ class CUDAGraph { bool is_reset_{false}; std::mutex mtx_; + static paddle::optional capturing_thread_id_; static std::unique_ptr capturing_graph_; }; diff --git a/paddle/fluid/platform/device/gpu/gpu_primitives.h b/paddle/fluid/platform/device/gpu/gpu_primitives.h index d443e78ed874f3783c747d45bef60c78b4b31f07..3e070da546b2ae85c40bb0e9cae05cc30d6d22c1 100644 --- a/paddle/fluid/platform/device/gpu/gpu_primitives.h +++ b/paddle/fluid/platform/device/gpu/gpu_primitives.h @@ -101,6 +101,20 @@ inline static __device__ uint32_t add_to_high_half(uint32_t val, float x) { return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); } +#if CUDA_VERSION >= 10000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +static __device__ __forceinline__ float16 CUDAFP16ToPDFP16(__half x) { + return *reinterpret_cast(&x); +} + +static __device__ __forceinline__ __half PDFP16ToCUDAFP16(float16 x) { + return *reinterpret_cast<__half *>(&x); +} + +CUDA_ATOMIC_WRAPPER(Add, float16) { + return CUDAFP16ToPDFP16( + atomicAdd(reinterpret_cast<__half *>(address), PDFP16ToCUDAFP16(val))); +} +#else CUDA_ATOMIC_WRAPPER(Add, float16) { // concrete packed float16 value may exsits in lower or higher 16bits // of the 32bits address. @@ -133,6 +147,7 @@ CUDA_ATOMIC_WRAPPER(Add, float16) { } } #endif +#endif CUDA_ATOMIC_WRAPPER(Add, complex) { float *real = reinterpret_cast(address); diff --git a/paddle/fluid/platform/device/ipu/CMakeLists.txt b/paddle/fluid/platform/device/ipu/CMakeLists.txt index 25629ba74d91523000b673a60e17cc7644ed752d..9be12cbf6d43769d19cf7f0b77d14d5ad02139ee 100644 --- a/paddle/fluid/platform/device/ipu/CMakeLists.txt +++ b/paddle/fluid/platform/device/ipu/CMakeLists.txt @@ -1,5 +1,5 @@ -# IPU IF(WITH_IPU) + FILE(GLOB POPART_CANONICALIZATION_SRC ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/device/ipu/popart_canonicalization/*.cc) cc_library(ipu_device SRCS device.cc DEPS enforce popart) cc_library(ipu_utils SRCS ipu_utils.cc DEPS memory framework_proto popart) cc_library(ipu_strategy SRCS ipu_strategy.cc DEPS popart graph framework_proto enforce) diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc new file mode 100644 index 0000000000000000000000000000000000000000..5793c4c0e3ca69ec6eb9b7161dd62f95d0ba314a --- /dev/null +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
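The CUDA Graph changes above record which thread started a thread-local capture, so that BeginSegmentCapture can refuse to run on any other thread. A minimal standalone sketch of that bookkeeping, using std::optional in place of paddle::optional and free functions in place of the CUDAGraph statics:

#include <iostream>
#include <optional>
#include <thread>

static bool is_capturing = false;
static std::optional<std::thread::id> capturing_thread_id;

void BeginCaptureThreadLocal() {
  is_capturing = true;
  capturing_thread_id = std::this_thread::get_id();  // remember the owner
}

bool IsThisThreadCapturing() {
  return is_capturing && capturing_thread_id.has_value() &&
         *capturing_thread_id == std::this_thread::get_id();
}

void EndCapture() {
  is_capturing = false;
  capturing_thread_id.reset();
}

int main() {
  BeginCaptureThreadLocal();
  std::cout << IsThisThreadCapturing() << "\n";  // 1: owning thread
  std::thread([] {
    std::cout << IsThisThreadCapturing() << "\n";  // 0: a different thread
  }).join();
  EndCapture();
  return 0;
}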
+ +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h" +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { +namespace ipu { +namespace { + +Node *activation_op_handler(Graph *graph, Node *node, const std::string &type) { + auto new_node = CreateBaseOp(graph, node, type, {GetInputVarNode("X", node)}, + node->outputs); + return new_node; +} + +Node *relu_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_relu"); +} + +Node *tanh_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_tanh"); +} + +Node *log_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_log"); +} + +Node *sigmoid_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_sigmoid"); +} + +Node *sqrt_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_sqrt"); +} + +Node *gelu_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_gelu_v2"); +} + +Node *log_softmax_handler(Graph *graph, Node *node) { + auto axis = BOOST_GET_CONST(int, node->Op()->GetAttr("axis")); + auto new_softmax = CreateSoftmaxOpset11(graph, node, node->inputs, {}, axis); + return CreateBaseOp(graph, node, "popart_log", new_softmax->outputs, + node->outputs); +} + +REGISTER_HANDLER(relu, relu_handler); +REGISTER_HANDLER(tanh, tanh_handler); +REGISTER_HANDLER(log, log_handler); +REGISTER_HANDLER(sigmoid, sigmoid_handler); +REGISTER_HANDLER(sqrt, sqrt_handler); +REGISTER_HANDLER(gelu, gelu_handler); +REGISTER_HANDLER(log_softmax, log_softmax_handler); + +} // namespace +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.cc new file mode 100644 index 0000000000000000000000000000000000000000..d46fc55ec6ce0de12e2a610be1937f9e3a948c02 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.cc @@ -0,0 +1,185 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h" + +namespace paddle { +namespace platform { +namespace ipu { + +// This avoids the static initialisation order fiasco, +std::unordered_map &SymbolHandlers() { + static std::unordered_map symbol_handlers; + return symbol_handlers; +} + +bool RegisterHandler(const std::string &symbol, const SymbolHandler &handler) { + if (SymbolHandlers().count(symbol) != 0) { + LOG(WARNING) << "Trying to register popart handler twice for operator: " + << symbol; + return false; + } + bool new_handler = SymbolHandlers().emplace(symbol, handler).second; + return new_handler; +} + +// Return a pointer to a handler if one is registered for this kind of node or +// an empty std::function otherwise. +SymbolHandler GetHandler(const std::string &kind) { + auto it = SymbolHandlers().find(kind); + if (it != SymbolHandlers().end()) { + return it->second; + } + return {}; +} + +void ConnectNodes(Node *first_node, Node *next_node) { + first_node->outputs.push_back(next_node); + next_node->inputs.push_back(first_node); +} + +void DisConnectNodes(Node *first_node, Node *next_node) { + auto rm_by_value = [&](std::vector &vec, Node *n) { + vec.erase(std::remove(vec.begin(), vec.end(), n), vec.end()); + }; + rm_by_value(first_node->outputs, next_node); + rm_by_value(next_node->inputs, first_node); + rm_by_value(first_node->inputs, next_node); + rm_by_value(next_node->outputs, first_node); +} + +void ClearNode(Node *node) { + auto rm_by_value = [&](std::vector &vec, Node *n) { + vec.erase(std::remove(vec.begin(), vec.end(), n), vec.end()); + }; + for (auto *node_in : node->inputs) { + rm_by_value(node_in->outputs, node); + } + for (auto *node_out : node->outputs) { + rm_by_value(node_out->inputs, node); + } +} + +void CopyOpAttr(const std::string &attr_name, OpDesc *op, OpDesc *new_op, + bool override) { + if (new_op->HasAttr(attr_name) && !override) { + return; + } + if (op->HasAttr(attr_name)) { + VLOG(10) << "Copying attr: " << attr_name << " from " << op->Type() + << " to " << new_op->Type(); + new_op->SetAttr(attr_name, op->GetAttr(attr_name)); + new_op->Flush(); + } +} + +const int VarType2OnnxDtype(const int type) { + auto dtype = static_cast(type); + switch (dtype) { + case framework::proto::VarType::BOOL: + return static_cast(ONNXDataType::BOOL); + case framework::proto::VarType::INT16: + return static_cast(ONNXDataType::INT16); + case framework::proto::VarType::INT32: + return static_cast(ONNXDataType::INT32); + case framework::proto::VarType::INT64: + return static_cast(ONNXDataType::INT64); + case framework::proto::VarType::FP16: + return static_cast(ONNXDataType::FLOAT16); + case framework::proto::VarType::FP32: + return static_cast(ONNXDataType::FLOAT); + case framework::proto::VarType::FP64: + return static_cast(ONNXDataType::DOUBLE); + case framework::proto::VarType::UINT8: + return static_cast(ONNXDataType::UINT8); + case framework::proto::VarType::INT8: + return static_cast(ONNXDataType::INT8); + case framework::proto::VarType::BF16: + return static_cast(ONNXDataType::BFLOAT16); + case framework::proto::VarType::COMPLEX64: + return static_cast(ONNXDataType::COMPLEX64); + case framework::proto::VarType::COMPLEX128: + return static_cast(ONNXDataType::COMPLEX128); + default: + PADDLE_THROW( + platform::errors::Unimplemented("Unsupported data type: %d.", dtype)); + } +} + +const std::string VarType2PopStr(const int type) { + auto dtype = static_cast(type); + switch (dtype) { + case framework::proto::VarType::UINT8: + 
return "UINT8"; + case framework::proto::VarType::INT8: + return "INT8"; + case framework::proto::VarType::INT16: + return "INT16"; + case framework::proto::VarType::INT32: + return "INT32"; + case framework::proto::VarType::INT64: + return "INT64"; + case framework::proto::VarType::BOOL: + return "BOOL"; + case framework::proto::VarType::FP64: + return "DOUBLE"; + case framework::proto::VarType::FP32: + return "FLOAT"; + case framework::proto::VarType::FP16: + return "FLOAT16"; + default: + PADDLE_THROW( + paddle::platform::errors::Unavailable("Unsupported data type.")); + } +} + +Node *GetInputVarNode(const std::string &input_name, const Node *op_node, + const int id) { + auto var_name = op_node->Op()->Input(input_name).at(id); + return GetInputVarNodeByVarName(var_name, op_node); +} + +Node *GetOutputVarNode(const std::string &output_name, const Node *op_node, + const int id) { + auto var_name = op_node->Op()->Output(output_name).at(id); + return GetOutputVarNodeByVarName(var_name, op_node); +} + +Node *GetInputVarNodeByVarName(const std::string &var_name, + const Node *op_node) { + for (auto *var : op_node->inputs) { + if (var->Name() == var_name) { + return var; + } + } + return nullptr; +} + +Node *GetOutputVarNodeByVarName(const std::string &var_name, + const Node *op_node) { + for (auto *var : op_node->outputs) { + if (var->Name() == var_name) { + return var; + } + } + return nullptr; +} + +const bool is_float_equal(float a, float b, float eps) { + return std::fabs(a - b) <= eps; +} + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h b/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..c1b2bd0c8b5fd454642be8eef733234c89c4d32a --- /dev/null +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h @@ -0,0 +1,64 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/platform/device/ipu/ipu_utils.h" + +namespace paddle { +namespace platform { +namespace ipu { + +using framework::ir::Graph; +using framework::ir::Node; +using framework::OpDesc; + +#define REGISTER_HANDLER(name, func) \ + static bool __UNUSED_##name = \ + paddle::platform::ipu::RegisterHandler(#name, func) + +using SymbolHandler = std::function; + +std::unordered_map &SymbolHandlers(); + +bool RegisterHandler(const std::string &, const SymbolHandler &); + +SymbolHandler GetHandler(const std::string &); + +void ConnectNodes(Node *first_node, Node *next_node); +void DisConnectNodes(Node *first_node, Node *next_node); +void ClearNode(Node *node); +void CopyOpAttr(const std::string &attr_name, OpDesc *op, OpDesc *new_op, + bool override = false); + +const int VarType2OnnxDtype(const int type); +const std::string VarType2PopStr(const int type); + +Node *GetInputVarNode(const std::string &input_name, const Node *op_node, + const int id = 0); +Node *GetOutputVarNode(const std::string &output_name, const Node *op_node, + const int id = 0); +Node *GetInputVarNodeByVarName(const std::string &var_name, + const Node *op_node); +Node *GetOutputVarNodeByVarName(const std::string &var_name, + const Node *op_node); + +const bool is_float_equal(float a, float b, float eps = 1e-8); + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/elementwise_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/elementwise_ops.cc new file mode 100644 index 0000000000000000000000000000000000000000..f0c19cac3a6c3f5db76ab70cabb7d49449e030a1 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/elementwise_ops.cc @@ -0,0 +1,108 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
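The elementwise handlers in the next file cannot leave broadcasting implicit when the axis attribute points away from the trailing dimensions: Y is first reshaped to X's rank by placing its dimensions at positions axis .. axis + y_rank - 1 and padding the rest with 1s. A standalone sketch of that shape computation (with -1 treated as trailing alignment, which is what the attribute's default means):

#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int64_t> BroadcastShapeForY(const std::vector<int64_t> &x_shape,
                                        const std::vector<int64_t> &y_shape,
                                        int axis) {
  const int x_rank = static_cast<int>(x_shape.size());
  const int y_rank = static_cast<int>(y_shape.size());
  if (axis == -1) axis = x_rank - y_rank;  // -1 aligns Y to X's trailing dims
  std::vector<int64_t> y_new_shape(x_rank, 1);
  for (int i = 0; i < y_rank; ++i) {
    y_new_shape[axis + i] = y_shape[i];
  }
  return y_new_shape;
}

int main() {
  // X: [2, 3, 4, 5], Y: [3, 4], axis = 1  ->  Y reshaped to [1, 3, 4, 1]
  for (int64_t d : BroadcastShapeForY({2, 3, 4, 5}, {3, 4}, 1)) {
    std::cout << d << " ";
  }
  std::cout << "\n";
  return 0;
}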
+ +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h" +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { +namespace ipu { +namespace { + +Node *elementwise_op_handler(Graph *graph, Node *node, + const std::string &type) { + auto *op = node->Op(); + auto x_shape = GetInputVarNode("X", node)->Var()->GetShape(); + int64_t x_rank = x_shape.size(); + auto y_shape = GetInputVarNode("Y", node)->Var()->GetShape(); + int64_t y_rank = y_shape.size(); + + auto axis = BOOST_GET_CONST(int, op->GetAttr("axis")); + if (axis == -1 || axis == x_rank - 1 || x_rank == y_rank) { + auto new_node = + CreateBaseOp(graph, node, type, + {GetInputVarNode("X", node), GetInputVarNode("Y", node)}, + node->outputs); + return new_node; + } else { + auto y_new_shape = std::vector(x_rank, 1); + for (int i = axis; i < axis + y_rank; ++i) { + y_new_shape[i] = y_shape[i - axis]; + } + auto attrs = AttributeMap{ + {"value", y_new_shape}, + {"dims", std::vector{x_rank}}, + {"dtype", ONNXDataType::INT64}, + }; + // constant + auto new_node_const = CreateConst(graph, node, {}, {}, attrs); + // reshape + auto new_node_reshape = CreateBaseOp( + graph, node, "popart_reshape", + {GetInputVarNode("Y", node), new_node_const->outputs[0]}, {}); + // elementwise_op + auto new_node = + CreateBaseOp(graph, node, type, + {GetInputVarNode("X", node), new_node_reshape->outputs[0]}, + node->outputs); + return new_node; + } +} + +Node *elementwise_add_handler(Graph *graph, Node *node) { + return elementwise_op_handler(graph, node, "popart_add"); +} + +Node *elementwise_sub_handler(Graph *graph, Node *node) { + return elementwise_op_handler(graph, node, "popart_sub"); +} + +Node *elementwise_div_handler(Graph *graph, Node *node) { + return elementwise_op_handler(graph, node, "popart_div"); +} + +Node *elementwise_mul_handler(Graph *graph, Node *node) { + return elementwise_op_handler(graph, node, "popart_mul"); +} + +Node *elementwise_min_handler(Graph *graph, Node *node) { + return elementwise_op_handler(graph, node, "popart_min"); +} + +Node *elementwise_max_handler(Graph *graph, Node *node) { + return elementwise_op_handler(graph, node, "popart_max"); +} + +Node *elementwise_pow_handler(Graph *graph, Node *node) { + return elementwise_op_handler(graph, node, "popart_pow"); +} + +Node *elementwise_mod_handler(Graph *graph, Node *node) { + return elementwise_op_handler(graph, node, "popart_mod"); +} + +REGISTER_HANDLER(elementwise_add, elementwise_add_handler); +REGISTER_HANDLER(elementwise_sub, elementwise_sub_handler); +REGISTER_HANDLER(elementwise_div, elementwise_div_handler); +REGISTER_HANDLER(elementwise_mul, elementwise_mul_handler); +REGISTER_HANDLER(elementwise_min, elementwise_min_handler); +REGISTER_HANDLER(elementwise_max, elementwise_max_handler); +REGISTER_HANDLER(elementwise_pow, elementwise_pow_handler); +REGISTER_HANDLER(elementwise_mod, elementwise_mod_handler); + +} // namespace +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc new file mode 100644 index 0000000000000000000000000000000000000000..92362ebf5be7d5426e4149e987c537691caedba4 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc @@ -0,0 +1,36 @@ +// Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h" +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { +namespace ipu { +namespace { + +Node *equal_handler(Graph *graph, Node *node) { + auto new_node = CreateBaseOp( + graph, node, "popart_equal", + {GetInputVarNode("X", node), GetInputVarNode("Y", node)}, node->outputs); + return new_node; +} + +REGISTER_HANDLER(equal, equal_handler); + +} // namespace +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc new file mode 100644 index 0000000000000000000000000000000000000000..af7e4d0c7dbe9de77357c40f6298d1908495ac1a --- /dev/null +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc @@ -0,0 +1,259 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
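Among the math handlers that follow, mul is the legacy fully-connected matmul: X is flattened to 2-D at x_num_col_dims, Y at y_num_col_dims, the two matrices are multiplied, and the product is reshaped to X's leading dimensions followed by Y's trailing ones. A standalone sketch of the shape bookkeeping behind that flatten / matmul / reshape sequence:

#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

int64_t Product(const std::vector<int64_t> &v, size_t begin, size_t end) {
  return std::accumulate(v.begin() + begin, v.begin() + end, int64_t{1},
                         std::multiplies<int64_t>());
}

int main() {
  const std::vector<int64_t> x_shape = {2, 3, 4};  // x_num_col_dims = 2
  const std::vector<int64_t> y_shape = {4, 5};     // y_num_col_dims = 1
  const int x_num_col_dims = 2, y_num_col_dims = 1;

  // Shapes of the two flattened matrices fed to popart_matmul.
  const int64_t m = Product(x_shape, 0, x_num_col_dims);               // 6
  const int64_t k = Product(x_shape, x_num_col_dims, x_shape.size());  // 4
  const int64_t n = Product(y_shape, y_num_col_dims, y_shape.size());  // 5
  std::cout << "[" << m << "," << k << "] x [" << k << "," << n << "]\n";

  // Shape restored by the final popart_reshape: X's leading dims, Y's trailing dims.
  std::vector<int64_t> out_shape(x_shape.begin(),
                                 x_shape.begin() + x_num_col_dims);
  out_shape.insert(out_shape.end(), y_shape.begin() + y_num_col_dims,
                   y_shape.end());
  for (int64_t d : out_shape) std::cout << d << " ";  // 2 3 5
  std::cout << "\n";
  return 0;
}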
+ +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h" +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { +namespace ipu { +namespace { + +Node *mean_handler(Graph *graph, Node *node) { + return CreateBaseOp(graph, node, "popart_reducemean", + {GetInputVarNode("X", node)}, + {GetOutputVarNode("Out", node)}, + { + {"keepdims", int64_t{0}}, + }); +} + +Node *pow_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + if (op->HasInput("FactorTensor") && !op->Input("FactorTensor").empty()) { + return CreateBaseOp( + graph, node, "popart_pow", + {GetInputVarNode("X", node), GetInputVarNode("FactorTensor", node)}, + node->outputs); + } else { + // Op(pow) -> Op(Constant)->Var(const_out)->Op(Pow) + auto value_ = BOOST_GET_CONST(float, op->GetAttr("factor")); + auto attrs = + MakeConstAttrMapFromValue(value_, {1}, ONNXDataType::FLOAT); + auto new_node_const = CreateConst(graph, node, {}, {}, attrs); + return CreateBaseOp(graph, node, "popart_pow", {GetInputVarNode("X", node), + new_node_const->outputs[0]}, + node->outputs); + } +} + +Node *mul_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto x_num_col_dims = BOOST_GET_CONST(int, op->GetAttr("x_num_col_dims")); + auto y_num_col_dims = BOOST_GET_CONST(int, op->GetAttr("y_num_col_dims")); + auto x_shape_ = GetInputVarNode("X", node)->Var()->GetShape(); + auto y_shape_ = GetInputVarNode("Y", node)->Var()->GetShape(); + + // build the shape for reshape + std::vector reshape_shape_{}; + for (int left = 0; left < x_num_col_dims; left++) { + reshape_shape_.push_back(int64_t(x_shape_[left])); + } + for (int right = y_num_col_dims; right < y_shape_.size(); right++) { + reshape_shape_.push_back(int64_t(y_shape_[right])); + } + auto x_flatten = + CreateBaseOp(graph, node, "popart_flatten", {GetInputVarNode("X", node)}, + {}, {{"axis", int64_t(x_num_col_dims)}}); + auto y_flatten = + CreateBaseOp(graph, node, "popart_flatten", {GetInputVarNode("Y", node)}, + {}, {{"axis", int64_t(y_num_col_dims)}}); + auto matmul = + CreateBaseOp(graph, node, "popart_matmul", + {x_flatten->outputs[0], y_flatten->outputs[0]}, {}, {}); + + auto reshape_const = CreateConst( + graph, node, {}, {}, + {{"value", reshape_shape_}, + {"dims", std::vector{int64_t(reshape_shape_.size())}}, + {"dtype", ONNXDataType::INT64}}); + return CreateBaseOp(graph, node, "popart_reshape", + {matmul->outputs[0], reshape_const->outputs[0]}, + node->outputs, {}); +} + +Node *matmul_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto transpose_x = BOOST_GET_CONST(bool, op->GetAttr("transpose_X")); + auto transpose_y = BOOST_GET_CONST(bool, op->GetAttr("transpose_Y")); + auto alpha = BOOST_GET_CONST(float, op->GetAttr("alpha")); + auto x_shape = GetInputVarNode("X", node)->Var()->GetShape(); + auto y_shape = GetInputVarNode("Y", node)->Var()->GetShape(); + + int x_rank = x_shape.size(); + std::vector perm; + if (x_rank == 1) { + perm = std::vector{0}; + } else if (x_rank == 2) { + return CreateGemm(graph, node, + {GetInputVarNode("X", node), GetInputVarNode("Y", node)}, + node->outputs, transpose_x, transpose_y, alpha); + } else if (x_rank == 3) { + perm = std::vector{0, 2, 1}; + } else if (x_rank == 4) { + perm = std::vector{0, 1, 3, 2}; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "op matmul with input rank == %d", x_rank)); + } + + Node *x_node = GetInputVarNode("X", node); + Node 
*y_node = GetInputVarNode("Y", node); + if (transpose_x) { + x_node = CreateBaseOp(graph, node, "popart_transpose", + {GetInputVarNode("X", node)}, {}, {{"perm", perm}}); + x_node = x_node->outputs[0]; + } + if (transpose_y) { + y_node = CreateBaseOp(graph, node, "popart_transpose", + {GetInputVarNode("Y", node)}, {}, {{"perm", perm}}); + y_node = y_node->outputs[0]; + } + if (is_float_equal(alpha, 1.0)) { + auto o_node = + CreateBaseOp(graph, node, "popart_matmul", {x_node, y_node}, {}); + auto attr = MakeConstAttrMapFromValue(alpha, {1}, ONNXDataType::FLOAT); + auto const_node = CreateConst(graph, node, {}, {}, attr); + return CreateBaseOp(graph, node, "popart_mul", + {o_node->outputs[0], const_node->outputs[0]}, + node->outputs); + } else { + return CreateBaseOp(graph, node, "popart_matmul", {x_node, y_node}, + node->outputs); + } +} + +Node *sum_handler(Graph *graph, Node *node) { + return CreateBaseOp(graph, node, "popart_sum", node->inputs, node->outputs); +} + +Node *softmax_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto axis = BOOST_GET_CONST(int, op->GetAttr("axis")); + return CreateSoftmaxOpset11(graph, node, node->inputs, node->outputs, axis); +} + +Node *scale_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto scale_ = BOOST_GET_CONST(float, op->GetAttr("scale")); + auto bias_ = BOOST_GET_CONST(float, op->GetAttr("bias")); + auto bias_after_scale_ = + BOOST_GET_CONST(bool, op->GetAttr("bias_after_scale")); + auto data_type_ = GetInputVarNode("X", node)->Var()->GetDataType(); + + auto new_node_bias_var = + CreateConst(graph, node, {}, {}, {{"value", std::vector{bias_}}, + {"dims", std::vector{1}}, + {"dtype", ONNXDataType::FLOAT}}); + new_node_bias_var = new_node_bias_var->outputs[0]; + + Node *new_node_scale_var = nullptr; + if (op->HasInput("ScaleTensor") && !op->Input("ScaleTensor").empty()) { + new_node_scale_var = GetInputVarNode("ScaleTensor", node); + } else { + new_node_scale_var = + CreateConst(graph, node, {}, {}, {{"value", std::vector{scale_}}, + {"dims", std::vector{1}}, + {"dtype", ONNXDataType::FLOAT}}); + new_node_scale_var = new_node_scale_var->outputs[0]; + } + + // convert to float32 + auto new_node_cast = + CreateCast(graph, node, {GetInputVarNode("X", node)}, {}, + static_cast(framework::proto::VarType::FP32)); + Node *result = nullptr; + if (bias_after_scale_) { + auto new_node_mul = + CreateBaseOp(graph, node, "popart_mul", + {new_node_cast->outputs[0], new_node_scale_var}, {}, {}); + result = + CreateBaseOp(graph, node, "popart_add", + {new_node_mul->outputs[0], new_node_bias_var}, {}, {}); + } else { + auto new_node_add = + CreateBaseOp(graph, node, "popart_add", + {new_node_cast->outputs[0], new_node_bias_var}, {}, {}); + result = + CreateBaseOp(graph, node, "popart_mul", + {new_node_add->outputs[0], new_node_scale_var}, {}, {}); + } + auto result_after_cast = + CreateCast(graph, node, result->outputs, node->outputs, + static_cast(data_type_)); + return result_after_cast; +} + +Node *cross_entropy2_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto ignoreIndex = BOOST_GET_CONST(int, op->GetAttr("ignore_index")); + auto new_cast = CreateCast(graph, node, {GetInputVarNode("Label", node)}, {}, + framework::proto::VarType::INT32); + auto label_shape_ = GetInputVarNode("Label", node)->Var()->GetShape(); + if (label_shape_.size() == 1) { + return CreateBaseOp(graph, node, "popart_nllloss", + {GetInputVarNode("X", node), new_cast->outputs[0]}, + {GetOutputVarNode("Y", node)}, + { + {"ignoreIndex", 
ignoreIndex}, + }); + } else { + std::vector new_shape_{label_shape_[0]}; + auto const_before_loss = CreateBaseOp( + graph, node, "popart_constant", {}, {}, + {{"value", new_shape_}, + {"dims", + std::vector{static_cast(new_shape_.size())}}, + {"dtype", ONNXDataType::INT64}}); + + auto reshape_before_loss = CreateBaseOp( + graph, node, "popart_reshape", + {new_cast->outputs[0], const_before_loss->outputs[0]}, {}, {}); + + auto nllloss = CreateBaseOp( + graph, node, "popart_nllloss", + {GetInputVarNode("X", node), reshape_before_loss->outputs[0]}, {}, + { + {"ignoreIndex", ignoreIndex}, + }); + + auto const_after_loss = CreateBaseOp( + graph, node, "popart_constant", {}, {}, + {{"value", label_shape_}, + {"dims", + std::vector{static_cast(label_shape_.size())}}, + {"dtype", ONNXDataType::INT64}}); + + auto reshape_after_loss = + CreateBaseOp(graph, node, "popart_reshape", + {nllloss->outputs[0], const_after_loss->outputs[0]}, + {GetOutputVarNode("Y", node)}, {}); + return reshape_after_loss; + } +} + +REGISTER_HANDLER(mean, mean_handler); +REGISTER_HANDLER(pow, pow_handler); +REGISTER_HANDLER(mul, mul_handler); +REGISTER_HANDLER(matmul, matmul_handler); +REGISTER_HANDLER(sum, sum_handler); +REGISTER_HANDLER(softmax, softmax_handler); +REGISTER_HANDLER(scale, scale_handler); +REGISTER_HANDLER(cross_entropy2, cross_entropy2_handler); + +} // namespace +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc new file mode 100644 index 0000000000000000000000000000000000000000..58f3e42b7387a778a553cc1187f912b1138d59b3 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc @@ -0,0 +1,301 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
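In the nn handlers that follow, layer_norm has no direct popart counterpart, so the input is collapsed to a 2-D shape [outer, inner] at begin_norm_axis and normalized with popart_groupnormalization_v2 using a single group, then reshaped back. A standalone sketch of that collapse:

#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int64_t> CollapseForLayerNorm(const std::vector<int64_t> &shape,
                                          int begin_norm_axis) {
  std::vector<int64_t> norm_shape{1, 1};
  for (size_t i = 0; i < shape.size(); ++i) {
    if (static_cast<int>(i) < begin_norm_axis) {
      norm_shape[0] *= shape[i];  // batch-like dims stay separate
    } else {
      norm_shape[1] *= shape[i];  // dims normalized together
    }
  }
  return norm_shape;
}

int main() {
  // [8, 16, 32] with begin_norm_axis = 1  ->  [8, 512]
  const auto collapsed = CollapseForLayerNorm({8, 16, 32}, 1);
  std::cout << collapsed[0] << " " << collapsed[1] << "\n";
  return 0;
}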
+ +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h" +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { +namespace ipu { +namespace { + +Node *conv2d_handler(Graph *graph, Node *node) { + OpDesc *op = node->Op(); + auto dilations_ = BOOST_GET_CONST(std::vector, op->GetAttr("dilations")); + auto dilations = std::vector{dilations_.begin(), dilations_.end()}; + auto group_ = BOOST_GET_CONST(int, op->GetAttr("groups")); + auto pads_ = BOOST_GET_CONST(std::vector, op->GetAttr("paddings")); + if (pads_.size() == 2) { + pads_.push_back(pads_[0]); + pads_.push_back(pads_[1]); + } + auto pads = std::vector{pads_.begin(), pads_.end()}; + auto stride_ = BOOST_GET_CONST(std::vector, op->GetAttr("strides")); + auto stride = std::vector{stride_.begin(), stride_.end()}; + if (op->HasInput("Bias") && !op->Input("Bias").empty()) { + return CreateConv( + graph, node, + { + GetInputVarNode("Input", node), GetInputVarNode("Filter", node), + GetInputVarNode("Bias", node), + }, + node->outputs, dilations, group_, {}, pads, stride); + } else { + return CreateConv( + graph, node, + { + GetInputVarNode("Input", node), GetInputVarNode("Filter", node), + }, + node->outputs, dilations, group_, {}, pads, stride); + } +} + +Node *batch_norm_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + std::vector inputs; + inputs.push_back(GetInputVarNode("X", node)); + inputs.push_back(GetInputVarNode("Scale", node)); + inputs.push_back(GetInputVarNode("Bias", node)); + inputs.push_back(GetInputVarNode("Mean", node)); + inputs.push_back(GetInputVarNode("Variance", node)); + int64_t num_outputs = 1; + std::vector outputs; + auto is_test_type = op->GetAttrType("is_test"); + bool is_test; + if (is_test_type == 0) { + // int + is_test = BOOST_GET_CONST(int, op->GetAttr("is_test")); + } else { + // bool + is_test = BOOST_GET_CONST(bool, op->GetAttr("is_test")); + } + outputs.push_back(GetOutputVarNode("Y", node)); + if (!is_test) { + outputs.push_back(GetOutputVarNode("MeanOut", node)); + outputs.push_back(GetOutputVarNode("VarianceOut", node)); + outputs.push_back(GetOutputVarNode("SavedMean", node)); + outputs.push_back(GetOutputVarNode("SavedVariance", node)); + num_outputs = 5; + } + // outputs.push_back(GetOutputVarNode("ReserveSpace", node)); + auto momentum = BOOST_GET_CONST(float, op->GetAttr("momentum")); + auto epsilon = BOOST_GET_CONST(float, op->GetAttr("epsilon")); + // data_layout + return CreateBaseOp(graph, node, "popart_batchnormalization", inputs, outputs, + { + {"momentum", momentum}, + {"epsilon", epsilon}, + {"num_outputs", num_outputs}, + }); +} + +Node *pool2d_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto pooling_type = BOOST_GET_CONST(std::string, op->GetAttr("pooling_type")); + auto global_pooling = BOOST_GET_CONST(bool, op->GetAttr("global_pooling")); + if (global_pooling) { + if (pooling_type == "max") { + return CreateBaseOp(graph, node, "popart_globalmaxpool", node->inputs, + node->outputs); + } else if (pooling_type == "avg") { + return CreateBaseOp(graph, node, "popart_globalaveragepool", node->inputs, + node->outputs); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "op pool2d with unkonwn pooling_type: %s", pooling_type)); + } + } + if (op->HasAttr("padding_algorithm")) { + auto padding_algorithm = + BOOST_GET_CONST(std::string, op->GetAttr("padding_algorithm")); + if (padding_algorithm != 
"EXPLICIT") { + PADDLE_THROW(platform::errors::InvalidArgument( + "op pool2d with unkonwn padding_algorithm: %s", padding_algorithm)); + } + } + + auto ksize = BOOST_GET_CONST(std::vector, op->GetAttr("ksize")); + auto kernel_shape = std::vector{ksize.begin(), ksize.end()}; + auto ceil_mode_ = BOOST_GET_CONST(bool, op->GetAttr("ceil_mode")); + auto ceil_mode = int64_t(ceil_mode_ ? 1 : 0); + auto paddings = BOOST_GET_CONST(std::vector, op->GetAttr("paddings")); + auto pads = std::vector{paddings.begin(), paddings.end()}; + if (pads.size() == 2) { + pads.push_back(paddings[0]); + pads.push_back(paddings[1]); + } + auto strides_ = BOOST_GET_CONST(std::vector, op->GetAttr("strides")); + auto strides = std::vector{strides_.begin(), strides_.end()}; + if (pooling_type == "max") { + int64_t num_outputs = 1; + auto dilations = std::vector{}; + int64_t storage_order = 0; + return CreateBaseOp(graph, node, "popart_maxpool", node->inputs, + node->outputs, { + {"num_outputs", num_outputs}, + {"kernel_shape", kernel_shape}, + {"ceil_mode", ceil_mode}, + {"dilations", dilations}, + {"pads", pads}, + {"storage_order", storage_order}, + {"strides", strides}, + }); + } else if (pooling_type == "avg") { + int64_t count_include_pad = 0; + return CreateBaseOp(graph, node, "popart_averagepool", node->inputs, + node->outputs, + { + {"kernel_shape", kernel_shape}, + {"ceil_mode", ceil_mode}, + {"count_include_pad", count_include_pad}, + {"pads", pads}, + {"strides", strides}, + }); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "op pool2d with unkonwn pooling_type: %s", pooling_type)); + } +} + +Node *group_norm_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto epsilon_ = BOOST_GET_CONST(float, op->GetAttr("epsilon")); + auto groups_ = BOOST_GET_CONST(int, op->GetAttr("groups")); + auto groups = int64_t{groups_}; + auto attrs_ = AttributeMap{{"epsilon", epsilon_}, {"num_groups", groups}}; + + std::vector inputs_ = {GetInputVarNode("X", node), + GetInputVarNode("Scale", node), + GetInputVarNode("Bias", node)}; + std::vector outputs_ = {GetOutputVarNode("Y", node), + GetOutputVarNode("Mean", node), + GetOutputVarNode("Variance", node)}; + return CreateBaseOp(graph, node, "popart_groupnormalization_v2", inputs_, + outputs_, attrs_); +} + +Node *instance_norm_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto epsilon_ = BOOST_GET_CONST(float, op->GetAttr("epsilon")); + auto attrs_ = AttributeMap{{"epsilon", epsilon_}}; + + std::vector inputs_ = {GetInputVarNode("X", node), + GetInputVarNode("Scale", node), + GetInputVarNode("Bias", node)}; + std::vector outputs_ = {GetOutputVarNode("Y", node)}; + return CreateBaseOp(graph, node, "popart_instancenormalization", inputs_, + outputs_, attrs_); +} + +Node *layer_norm_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto begin_norm_axis_ = BOOST_GET_CONST(int, op->GetAttr("begin_norm_axis")); + auto input_shape_ = GetInputVarNode("X", node)->Var()->GetShape(); + + std::vector norm_shape_{1, 1}; + for (int i = 0; i < input_shape_.size(); i++) { + if (i < begin_norm_axis_) { + norm_shape_[0] *= input_shape_[i]; + } else { + norm_shape_[1] *= input_shape_[i]; + } + } + + auto attrs1 = AttributeMap{ + {"value", norm_shape_}, + {"dims", std::vector{static_cast(norm_shape_.size())}}, + {"dtype", ONNXDataType::INT64}}; + auto reshape1_const = + CreateBaseOp(graph, node, "popart_constant", {}, {}, attrs1); + auto new_node_reshape1 = CreateBaseOp( + graph, node, "popart_reshape", + {GetInputVarNode("X", node), 
reshape1_const->outputs[0]}, {}, {}); + + auto epsilon_ = BOOST_GET_CONST(float, op->GetAttr("epsilon")); + int64_t groups_ = 1; + auto groupnorm_attrs_ = + AttributeMap{{"epsilon", epsilon_}, {"num_groups", groups_}}; + auto out_Y_ = MakeVarNode(graph, node); + CreateBaseOp(graph, node, "popart_groupnormalization_v2", + {new_node_reshape1->outputs[0], GetInputVarNode("Scale", node), + GetInputVarNode("Bias", node)}, + {out_Y_, GetOutputVarNode("Mean", node), + GetOutputVarNode("Variance", node)}, + groupnorm_attrs_); + + auto attrs2 = AttributeMap{ + {"value", input_shape_}, + {"dims", std::vector{static_cast(input_shape_.size())}}, + {"dtype", ONNXDataType::INT64}}; + auto reshape2_const = + CreateBaseOp(graph, node, "popart_constant", {}, {}, attrs2); + auto new_node_reshape2 = CreateBaseOp(graph, node, "popart_reshape", + {out_Y_, reshape2_const->outputs[0]}, + {GetOutputVarNode("Y", node)}, {}); + return new_node_reshape2; +} + +Node *dropout_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto dropout_prob_ = BOOST_GET_CONST(float, op->GetAttr("dropout_prob")); + auto dropout_implementation_ = + BOOST_GET_CONST(std::string, op->GetAttr("dropout_implementation")); + auto is_test_type_ = op->GetAttrType("is_test"); + bool is_test_; + if (is_test_type_ == 0) { + // int + is_test_ = BOOST_GET_CONST(int, op->GetAttr("is_test")); + } else { + // bool + is_test_ = BOOST_GET_CONST(bool, op->GetAttr("is_test")); + } + + if (is_test_) { + if (dropout_implementation_ == "upscale_in_train") { + return CreateBaseOp(graph, node, "popart_identity", + {GetInputVarNode("X", node)}, + {GetOutputVarNode("Out", node)}, {}); + } else if (dropout_implementation_ == "downgrade_in_infer") { + auto scale = + CreateConst(graph, node, {}, {}, + {{"value", std::vector{1 - dropout_prob_}}, + {"dims", std::vector{1}}, + {"dtype", ONNXDataType::FLOAT}}); + return CreateBaseOp(graph, node, "popart_mul", + {GetInputVarNode("X", node), scale->outputs[0]}, + {GetOutputVarNode("Out", node)}, {}); + } else { + PADDLE_THROW( + platform::errors::InvalidArgument("Invalid dropout_implementation")); + } + } else { + if (dropout_implementation_ == "upscale_in_train") { + auto attrs_ = + AttributeMap{{"num_outputs", (int64_t)1}, {"ratio", dropout_prob_}}; + return CreateBaseOp(graph, node, "popart_dropout", + {GetInputVarNode("X", node)}, + {GetOutputVarNode("Out", node)}, attrs_); + } else if (dropout_implementation_ == "downgrade_in_infer") { + PADDLE_THROW(platform::errors::InvalidArgument( + "Do not support downgrade_in_infer with training")); + } else { + PADDLE_THROW( + platform::errors::InvalidArgument("Invalid dropout_implementation")); + } + } +} + +REGISTER_HANDLER(pool2d, pool2d_handler); +REGISTER_HANDLER(batch_norm, batch_norm_handler); +REGISTER_HANDLER(group_norm, group_norm_handler); +REGISTER_HANDLER(instance_norm, instance_norm_handler); +REGISTER_HANDLER(layer_norm, layer_norm_handler); +REGISTER_HANDLER(conv2d, conv2d_handler); +REGISTER_HANDLER(dropout, dropout_handler); + +} // namespace +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc new file mode 100644 index 0000000000000000000000000000000000000000..b7a3a8ca7c60f59f45dec5a1630fe3fd48a6f868 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc @@ -0,0 +1,195 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
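op_builder.cc, which follows, gives every generated node a traceable name: the op type is joined with its output variable names and a generated suffix, and that string becomes the op name in the ONNX model exported from popart. A standalone sketch of the naming scheme:

#include <iostream>
#include <string>
#include <vector>

static int op_count = 0;

// Format: op_type|out_var0|out_var1|...|_gen_op_<counter>
std::string CreateOpIdentifyId(const std::string &op_type,
                               const std::vector<std::string> &out_vars) {
  std::string id = op_type;
  for (const auto &name : out_vars) {
    id += "|" + name;
  }
  return id + "|_gen_op_" + std::to_string(op_count++);
}

int main() {
  std::cout << CreateOpIdentifyId("matmul", {"fc_0.tmp_0"}) << "\n";
  // matmul|fc_0.tmp_0|_gen_op_0
  return 0;
}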
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h" + +namespace paddle { +namespace platform { +namespace ipu { + +// singleton +static int var_count = 0; +static int op_count = 0; + +const std::string GenerateVarName() { + return std::string("_gen_var_") + std::to_string(var_count++); +} + +const std::string GenerateOpName() { + return std::string("_gen_op_") + std::to_string(op_count++); +} + +const std::string CreateOpIdentifyId(Node *node) { + // format: op_type|out_var0|out_var1|...|_gen_* + // this name will be used as op name when exporting onnx model from popart + auto op_type = node->Name(); + std::string op_out = ""; + for (auto *out_node : node->outputs) { + op_out += "|"; + op_out += out_node->Name(); + } + return {op_type + op_out + "|" + GenerateOpName()}; +} + +Node *MakeVarNode(Graph *graph, Node *node) { + auto var_name = GenerateVarName(); + auto var_desc = std::make_unique(var_name); + + auto var = graph->CreateVarNode(var_desc.get()); + return var; +} + +Node *MakeOpNode(Graph *graph, Node *node, const std::string &type, + const std::vector &inputs, + const std::vector &outputs) { + auto op_desc = std::make_unique(); + op_desc->SetType(type); + auto op = graph->CreateOpNode(op_desc.get()); + + for (auto *in : inputs) { + ConnectNodes(in, op); + } + if (outputs.empty()) { + auto var = MakeVarNode(graph, node); + ConnectNodes(op, var); + } else { + for (auto *out : outputs) { + ConnectNodes(op, out); + } + } + + // i/o + std::vector input_names; + for (auto node : op->inputs) { + input_names.push_back(node->Name()); + } + op->Op()->SetInput("__inputs__", input_names); + std::vector output_names; + for (auto node : op->outputs) { + output_names.push_back(node->Name()); + } + op->Op()->SetOutput("__outputs__", output_names); + op->Op()->Flush(); + + return op; +} + +Node *CreateBaseOp(Graph *graph, Node *node, const std::string &type, + const std::vector &inputs, + const std::vector &outputs, + const AttributeMap &attrs) { + auto new_node = MakeOpNode(graph, node, type, inputs, outputs); + if (!attrs.empty()) { + new_node->Op()->SetAttrMap(attrs); + } + // deal special attr + if (!new_node->Op()->HasAttr(sIpuIndexAttr)) { + CopyOpAttr(sIpuIndexAttr, node->Op(), new_node->Op()); + } + if (!new_node->Op()->HasAttr(sIpuStageAttr)) { + CopyOpAttr(sIpuStageAttr, node->Op(), new_node->Op()); + } + { + new_node->Op()->SetAttr(sOpIdentifyIdAttr, CreateOpIdentifyId(node)); + new_node->Op()->Flush(); + } + + return new_node; +} + +Node *CreateConst(Graph *graph, Node *node, const std::vector &inputs, + const std::vector &outputs, + const AttributeMap &attrs) { + return CreateBaseOp(graph, node, "popart_constant", inputs, outputs, attrs); +} + +Node *CreateCast(Graph *graph, Node *node, const std::vector &inputs, + const std::vector &outputs, const int otype) { + auto to = VarType2PopStr(otype); + return CreateBaseOp(graph, node, "popart_cast", inputs, outputs, + {{"to", to}}); +} + +Node 
*CreateGemm(Graph *graph, Node *node, const std::vector &inputs, + const std::vector &outputs, int64_t transA, + int64_t transB, float alpha, float beta) { + return CreateBaseOp(graph, node, "popart_gemm", inputs, outputs, + { + {"alpha", alpha}, + {"beta", beta}, + {"transA", transA}, + {"transB", transB}, + }); +} + +Node *CreateReshape(Graph *graph, Node *node, const std::vector &inputs, + const std::vector &outputs, + const std::vector &oshape) { + auto attr = AttributeMap{ + {"value", oshape}, + {"dims", std::vector{static_cast(oshape.size())}}, + {"dtype", ONNXDataType::INT64}}; + auto new_node_const = + CreateBaseOp(graph, node, "popart_constant", {}, {}, attr); + auto new_node_reshape = + CreateBaseOp(graph, node, "popart_reshape", + {inputs[0], new_node_const->outputs[0]}, outputs); + return new_node_reshape; +} + +Node *CreateConv(Graph *graph, Node *node, const std::vector &inputs, + const std::vector &outputs, + const std::vector &dilations, int64_t group, + const std::vector &kernel_shape, + const std::vector &pads, + const std::vector &strides) { + auto attrs = AttributeMap{ + {"dilations", dilations}, {"group", group}, + {"kernel_shape", kernel_shape}, {"pads", pads}, + {"strides", strides}, + }; + return CreateBaseOp(graph, node, "popart_conv", inputs, outputs, attrs); +} + +Node *CreateSoftmaxOpset11(Graph *graph, Node *node, + const std::vector &inputs, + const std::vector &outputs, int64_t axis) { + PADDLE_ENFORCE_EQ(inputs.size(), 1, platform::errors::InvalidArgument( + "Softmax op only support one input")); + auto x_shape = inputs[0]->Var()->GetShape(); + int x_rank = x_shape.size(); + if (axis < 0) { + axis = axis + x_rank; + } + if (axis == x_rank - 1) { + return CreateBaseOp(graph, node, "popart_softmax", inputs, outputs, + {{"axis", int64_t{-1}}}); + } else { + auto perm = std::vector(x_rank); + std::iota(perm.begin(), perm.end(), 0); + perm[x_rank - 1] = axis; + perm[axis] = x_rank - 1; + auto new_transpose_pre = CreateBaseOp(graph, node, "popart_transpose", + inputs, {}, {{"perm", perm}}); + auto new_softmax = + CreateBaseOp(graph, node, "popart_softmax", new_transpose_pre->outputs, + {}, {{"axis", int64_t{-1}}}); + return CreateBaseOp(graph, node, "popart_transpose", new_softmax->outputs, + outputs, {{"perm", perm}}); + } +} + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h new file mode 100644 index 0000000000000000000000000000000000000000..7e70e56ef9166cc50a5018b0984994391856ab02 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h @@ -0,0 +1,85 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/platform/device/ipu/common.h" +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h" + +namespace paddle { +namespace platform { +namespace ipu { + +using paddle::framework::AttributeMap; + +template +AttributeMap MakeConstAttrMap(std::vector value, std::vector dims, + int dtype) { + return AttributeMap{{"value", value}, {"dims", dims}, {"dtype", dtype}}; +} + +template +AttributeMap MakeConstAttrMapFromValue(T v, std::vector dims, + int dtype) { + size_t size = 1; + for (auto &dim : dims) { + size *= dim; + } + return MakeConstAttrMap(std::vector(size, v), dims, dtype); +} + +const std::string GenerateVarName(); +const std::string CreateOpIdentifyId(Node *node); + +Node *MakeVarNode(Graph *graph, Node *node); +Node *MakeOpNode(Graph *graph, Node *node, const std::string &type, + const std::vector &inputs, + const std::vector &outputs); + +Node *CreateBaseOp(Graph *graph, Node *node, const std::string &type, + const std::vector &inputs, + const std::vector &outputs, + const AttributeMap &attrs = {}); + +Node *CreateConst(Graph *graph, Node *node, const std::vector &inputs, + const std::vector &outputs, + const AttributeMap &attrs); + +// otype is proto::VarType::Type +Node *CreateCast(Graph *graph, Node *node, const std::vector &inputs, + const std::vector &outputs, const int otype); + +Node *CreateGemm(Graph *graph, Node *node, const std::vector &inputs, + const std::vector &outputs, int64_t transA = 0, + int64_t transB = 0, float alpha = 1.0f, float beta = 1.0f); + +Node *CreateReshape(Graph *graph, Node *node, const std::vector &inputs, + const std::vector &outputs, + const std::vector &oshape); + +Node *CreateConv(Graph *graph, Node *node, const std::vector &inputs, + const std::vector &outputs, + const std::vector &dilations = {1, 1}, + int64_t group = 1, + const std::vector &kernel_shape = {}, + const std::vector &pads = {0, 0, 0, 0}, + const std::vector &strides = {1, 1}); + +Node *CreateSoftmaxOpset11(Graph *graph, Node *node, + const std::vector &inputs, + const std::vector &outputs, int64_t axis); + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/supported_ops_autogen.h b/paddle/fluid/platform/device/ipu/supported_ops_autogen.h index 4cd7f928f6e22b2aa483c4448f73cd792cd154c6..763c5a46abe287e76a3d7fe32ce9d4c8c0cd2c62 100644 --- a/paddle/fluid/platform/device/ipu/supported_ops_autogen.h +++ b/paddle/fluid/platform/device/ipu/supported_ops_autogen.h @@ -195,3 +195,5 @@ OP_DECL(popart_sqrt, aiOnnxOpset.sqrt, NONE) // NOLINT OP_DECL(popart_tanh, aiOnnxOpset.tanh, NONE) // NOLINT OP_DECL(popart_tile, aiOnnxOpset.tile, NONE) // NOLINT OP_DECL(popart_transpose, aiOnnxOpset.transpose, ARG(INT_VEC,perm) ) // NOLINT + +// clang-format on diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index a0c9ff09460aff63d4b41142b7b37635ac17def5..206bef12aac95e0a111e15afbd1a0533e913e7e9 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -16,6 +16,9 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h" #include "paddle/fluid/platform/cuda_device_guard.h" #endif +#ifdef PADDLE_WITH_IPU +#include "paddle/fluid/platform/ipu/ipu_backend.h" +#endif #include "glog/logging.h" #include "paddle/fluid/platform/profiler.h" @@ -96,8 +99,9 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) { if (it == device_contexts_.end()) { PADDLE_THROW(platform::errors::Unimplemented( "Place %s is not supported. Please check that your paddle compiles " - "with WITH_GPU, WITH_XPU or WITH_ASCEND_CL option or check that " - "your train process set the correct device id if you use Executor.", + "with WITH_GPU, WITH_XPU, WITH_IPU or WITH_ASCEND_CL option or check " + "that your train process set the correct device id if you use " + "Executor.", place)); } return it->second.get().get(); @@ -158,6 +162,14 @@ DeviceContextPool::DeviceContextPool( PADDLE_THROW( platform::errors::Unimplemented("XPUPlace is not supported. Please " "re-compile with WITH_XPU option.")); +#endif + } else if (platform::is_ipu_place(p)) { +#ifdef PADDLE_WITH_IPU + EmplaceDeviceContext(&device_contexts_, p); +#else + PADDLE_THROW( + platform::errors::Unimplemented("IPUPlace is not supported. Please " + "re-compile with WITH_IPU option.")); #endif } else if (platform::is_npu_place(p)) { #ifdef PADDLE_WITH_ASCEND_CL @@ -195,6 +207,22 @@ Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const { Place CPUDeviceContext::GetPlace() const { return place_; } +#ifdef PADDLE_WITH_IPU +IPUDeviceContext::IPUDeviceContext(IPUPlace place) : place_(place) { + int id = place.GetDeviceId(); + std::shared_ptr ipu_backend = + platform::ipu::IpuBackend::GetInstance(); + device_ = ipu_backend->GetDevice(id); +} + +Place IPUDeviceContext::GetPlace() const { return place_; } +void IPUDeviceContext::Wait() const { + /*! \brief Wait for all operations completion in the stream. 
*/ +} + +IPUDeviceContext::~IPUDeviceContext() {} + +#endif #ifdef PADDLE_WITH_XPU XPUDeviceContext::XPUDeviceContext() { context_ = xpu::create_context(); diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc index 415babc9cb85e6c93c17dd9fdbf7ef61fc424d4c..ec49134b654e93199c3ef522e5afda6337ff3db8 100644 --- a/paddle/fluid/platform/place.cc +++ b/paddle/fluid/platform/place.cc @@ -36,6 +36,7 @@ class PlacePrinter : public boost::static_visitor<> { void operator()(const XPUPlace &p) { os_ << "XPUPlace(" << p.device << ")"; } void operator()(const NPUPlace &p) { os_ << "NPUPlace(" << p.device << ")"; } void operator()(const NPUPinnedPlace &p) { os_ << "NPUPinnedPlace"; } + void operator()(const IPUPlace &p) { os_ << "IPUPlace(" << p.device << ")"; } void operator()(const CUDAPinnedPlace &p) { os_ << "CUDAPinnedPlace"; } private: @@ -56,6 +57,10 @@ bool is_npu_place(const Place &p) { return boost::apply_visitor(IsNPUPlace(), p); } +bool is_ipu_place(const Place &p) { + return boost::apply_visitor(IsIPUPlace(), p); +} + bool is_cpu_place(const Place &p) { return boost::apply_visitor(IsCPUPlace(), p); } @@ -80,6 +85,8 @@ bool is_same_place(const Place &p1, const Place &p2) { return BOOST_GET_CONST(XPUPlace, p1) == BOOST_GET_CONST(XPUPlace, p2); } else if (is_npu_place(p1)) { return BOOST_GET_CONST(NPUPlace, p1) == BOOST_GET_CONST(NPUPlace, p2); + } else if (is_ipu_place(p1)) { + return BOOST_GET_CONST(IPUPlace, p1) == BOOST_GET_CONST(IPUPlace, p2); } else { return BOOST_GET_CONST(CUDAPlace, p1) == BOOST_GET_CONST(CUDAPlace, p2); } diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h index 1ab2a62391157b663120d579e64ed1c9f75c43c5..fadc1e27e8a0ac6116b4d99cc6bc4adbdfbd3907 100644 --- a/paddle/fluid/platform/place.h +++ b/paddle/fluid/platform/place.h @@ -95,12 +95,25 @@ struct NPUPinnedPlace { inline bool operator!=(const NPUPinnedPlace &) const { return false; } inline bool operator<(const NPUPinnedPlace &) const { return false; } }; +struct IPUPlace { + IPUPlace() : IPUPlace(0) {} + explicit IPUPlace(int d) : device(d) {} + + inline int GetDeviceId() const { return device; } + // needed for variant equality comparison + inline bool operator==(const IPUPlace &o) const { return device == o.device; } + inline bool operator!=(const IPUPlace &o) const { return !(*this == o); } + inline bool operator<(const IPUPlace &o) const { return device < o.device; } + + int device; +}; struct IsCUDAPlace : public boost::static_visitor { bool operator()(const CPUPlace &) const { return false; } bool operator()(const XPUPlace &) const { return false; } bool operator()(const NPUPlace &) const { return false; } bool operator()(const NPUPinnedPlace &) const { return false; } + bool operator()(const IPUPlace &) const { return false; } bool operator()(const CUDAPlace &) const { return true; } bool operator()(const CUDAPinnedPlace &) const { return false; } }; @@ -110,6 +123,7 @@ struct IsCPUPlace : public boost::static_visitor { bool operator()(const XPUPlace &) const { return false; } bool operator()(const NPUPlace &) const { return false; } bool operator()(const NPUPinnedPlace &) const { return false; } + bool operator()(const IPUPlace &) const { return false; } bool operator()(const CUDAPlace &) const { return false; } bool operator()(const CUDAPinnedPlace &) const { return false; } }; @@ -119,6 +133,7 @@ struct IsCUDAPinnedPlace : public boost::static_visitor { bool operator()(const XPUPlace &) const { return false; } bool operator()(const NPUPlace &) const { return 
false; } bool operator()(const NPUPinnedPlace &) const { return false; } + bool operator()(const IPUPlace &) const { return false; } bool operator()(const CUDAPlace &) const { return false; } bool operator()(const CUDAPinnedPlace &cuda_pinned) const { return true; } }; @@ -128,6 +143,7 @@ struct IsXPUPlace : public boost::static_visitor { bool operator()(const XPUPlace &) const { return true; } bool operator()(const NPUPlace &) const { return false; } bool operator()(const NPUPinnedPlace &) const { return false; } + bool operator()(const IPUPlace &) const { return false; } bool operator()(const CUDAPlace &) const { return false; } bool operator()(const CUDAPinnedPlace &) const { return false; } }; @@ -137,6 +153,7 @@ struct IsNPUPlace : public boost::static_visitor { bool operator()(const XPUPlace &) const { return false; } bool operator()(const NPUPlace &) const { return true; } bool operator()(const NPUPinnedPlace &) const { return false; } + bool operator()(const IPUPlace &) const { return false; } bool operator()(const CUDAPlace &) const { return false; } bool operator()(const CUDAPinnedPlace &) const { return false; } }; @@ -145,22 +162,33 @@ struct IsNPUPinnedPlace : public boost::static_visitor { bool operator()(const CPUPlace &) const { return false; } bool operator()(const XPUPlace &) const { return false; } bool operator()(const NPUPlace &) const { return false; } + bool operator()(const IPUPlace &) const { return false; } + bool operator()(const CUDAPlace &) const { return false; } + bool operator()(const CUDAPinnedPlace &) const { return false; } bool operator()(const NPUPinnedPlace &) const { return true; } +}; +struct IsIPUPlace : public boost::static_visitor { + bool operator()(const CPUPlace &) const { return false; } + bool operator()(const XPUPlace &) const { return false; } + bool operator()(const NPUPlace &) const { return false; } + bool operator()(const IPUPlace &) const { return true; } bool operator()(const CUDAPlace &) const { return false; } bool operator()(const CUDAPinnedPlace &) const { return false; } + bool operator()(const NPUPinnedPlace &) const { return false; } }; class Place : public boost::variant { + CUDAPinnedPlace, NPUPinnedPlace, IPUPlace> { private: using PlaceBase = boost::variant; + CUDAPinnedPlace, NPUPinnedPlace, IPUPlace>; public: Place() = default; Place(const CPUPlace &cpu_place) : PlaceBase(cpu_place) {} // NOLINT Place(const XPUPlace &xpu_place) : PlaceBase(xpu_place) {} // NOLINT Place(const NPUPlace &npu_place) : PlaceBase(npu_place) {} // NOLINT + Place(const IPUPlace &ipu_place) : PlaceBase(ipu_place) {} // NOLINT Place(const CUDAPlace &cuda_place) : PlaceBase(cuda_place) {} // NOLINT Place(const CUDAPinnedPlace &cuda_pinned_place) // NOLINT : PlaceBase(cuda_pinned_place) {} @@ -180,6 +208,7 @@ using PlaceList = std::vector; bool is_gpu_place(const Place &); bool is_xpu_place(const Place &); bool is_npu_place(const Place &); +bool is_ipu_place(const Place &); bool is_cpu_place(const Place &); bool is_cuda_pinned_place(const Place &); bool is_npu_pinned_place(const Place &); @@ -228,6 +257,15 @@ struct PlaceVisitorWrapper return typename Visitor::result_type(); #endif } + typename Visitor::result_type operator()(const IPUPlace &ipu) const { +#ifdef PADDLE_WITH_IPU + return visitor_(ipu); +#else + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with IPU. 
Cannot visit ipu device")); + return typename Visitor::result_type(); +#endif + } typename Visitor::result_type operator()(const CUDAPlace &cuda) const { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/fluid/pybind/.gitignore b/paddle/fluid/pybind/.gitignore index d86562fe73c5748c4cf3e2fdffd0e0888f18ae2d..7b0459e583cd8b5ce4570521c201b183d7c95fcd 100644 --- a/paddle/fluid/pybind/.gitignore +++ b/paddle/fluid/pybind/.gitignore @@ -1,2 +1,3 @@ pybind.h -op_function_impl.h\ No newline at end of file +op_function_impl.h +eager_op_function_impl.h diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 635f3149773e8f4add54cbd2b9ab4c88583c7695..64bff04375bb429ea69e2c857d1e70aaeb078b67 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -130,6 +130,10 @@ limitations under the License. */ #endif #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" +#ifdef PADDLE_WITH_IPU +#include "paddle/fluid/platform/ipu/ipu_backend.h" +#include "paddle/fluid/platform/ipu_info.h" +#endif #ifdef PADDLE_WITH_CRYPTO #include "paddle/fluid/pybind/crypto.h" @@ -199,6 +203,14 @@ bool IsCompiledWithNPU() { #endif } +bool IsCompiledWithIPU() { +#ifndef PADDLE_WITH_IPU + return false; +#else + return true; +#endif +} + bool IsCompiledWithMKLDNN() { #ifndef PADDLE_WITH_MKLDNN return false; @@ -812,6 +824,8 @@ PYBIND11_MODULE(core_noavx, m) { py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) .def("set", SetTensorFromPyArray, py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) + .def("set", SetTensorFromPyArray, + py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) .def("set", SetTensorFromPyArray, py::arg("array"), py::arg("place"), py::arg("zero_copy") = false, R"DOC( @@ -819,7 +833,7 @@ PYBIND11_MODULE(core_noavx, m) { Args: lod (numpy.ndarray): The data to set. - place (CPUPlace|CUDAPlace|XPUPlace|CUDAPinnedPlace|NPUPlace): The place where the + place (CPUPlace|CUDAPlace|XPUPlace|IPUPlace|CUDAPinnedPlace|NPUPlace): The place where the LoDTensor is to be set. zero_copy (bool, optional): Whether to share memory with the input numpy array. This parameter only works with CPUPlace. Default: False. @@ -1909,6 +1923,58 @@ All parameter, weight, gradient are variables in Paddle. [](const platform::NPUPlace &self) { return self.GetDeviceId(); }) .def("__str__", string::to_string); + // IPUPlace + py::class_(m, "IPUPlace", R"DOC( + IPUPlace is a descriptor of a device. + It represents an IPU device on which a tensor will be allocated and a model will run. + + Examples: + .. code-block:: python + import paddle + + # required: ipu + + ipu_place = paddle.IPUPlace() + + )DOC") + .def("__init__", + [](platform::IPUPlace &self) { +#ifdef PADDLE_WITH_IPU + if (platform::GetIPUDeviceCount() == 0) { + LOG(ERROR) << "Cannot use IPU because there is no IPU " + "detected on your " + "machine."; + std::exit(-1); + } + // use ipu(0) to compile, while running with the number the user + // configures in sharding and pipelining.
+ new (&self) platform::IPUPlace(0); +#else + LOG(ERROR) << string::Sprintf( + "Cannot use IPU because you didn't install IPU version " + "PaddlePaddle.\n" + "If you want to use IPU, please try to install IPU version " + "PaddlePaddle by: pip install paddlepaddle*\n" + "If you only have CPU, please change IPUPlace to be " + "CPUPlace().\n"); + std::exit(-1); +#endif + }) + .def("_type", &PlaceIndex) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", + &IsSamePlace) +#ifdef PADDLE_WITH_IPU + .def("get_device_id", + [](const platform::IPUPlace &self) { return self.GetDeviceId(); }) +#endif + .def("__str__", string::to_string); + py::class_ platformplace(m, "Place"); g_place_pytype = reinterpret_cast(platformplace.ptr()); platformplace.def(py::init<>()) @@ -1918,6 +1984,7 @@ All parameter, weight, gradient are variables in Paddle. .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) .def("is_gpu_place", [](platform::Place &self) { return platform::is_gpu_place(self); }) @@ -1927,6 +1994,8 @@ All parameter, weight, gradient are variables in Paddle. [](platform::Place &self) { return platform::is_xpu_place(self); }) .def("is_npu_place", [](platform::Place &self) { return platform::is_npu_place(self); }) + .def("is_ipu_place", + [](platform::Place &self) { return platform::is_ipu_place(self); }) .def("is_cuda_pinned_place", [](platform::Place &self) { return platform::is_cuda_pinned_place(self); @@ -1943,6 +2012,10 @@ All parameter, weight, gradient are variables in Paddle. [](platform::Place &self) { return BOOST_GET_CONST(platform::NPUPlace, self).device; }) + .def("ipu_device_id", + [](platform::Place &self) { + return BOOST_GET_CONST(platform::IPUPlace, self).device; + }) .def("set_place", [](platform::Place &self, const platform::Place &other) { self = other; }) .def("set_place", @@ -1966,6 +2039,10 @@ All parameter, weight, gradient are variables in Paddle. [](platform::Place &self, const platform::NPUPlace &npu_place) { self = npu_place; }) + .def("set_place", + [](platform::Place &self, const platform::IPUPlace &ipu_place) { + self = ipu_place; + }) .def("__repr__", string::to_string) .def("__str__", string::to_string); @@ -2197,6 +2274,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("is_compiled_with_ascend", IsCompiledWithAscend); m.def("is_compiled_with_rocm", IsCompiledWithROCM); m.def("is_compiled_with_npu", IsCompiledWithNPU); + m.def("is_compiled_with_ipu", IsCompiledWithIPU); m.def("is_compiled_with_xpu", IsCompiledWithXPU); m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN); m.def("is_compiled_with_cinn", IsCompiledWithCINN); @@ -2516,6 +2594,10 @@ All parameter, weight, gradient are variables in Paddle. }); #endif +#ifdef PADDLE_WITH_IPU + m.def("get_ipu_device_count", platform::GetIPUDeviceCount); +#endif + py::enum_(m, "TracerOption", py::arithmetic()) .value("kDefault", platform::TracerOption::kDefault) .value("kOpDetail", platform::TracerOption::kOpDetail) @@ -2593,6 +2675,11 @@ All parameter, weight, gradient are variables in Paddle. 
bool val) { self.Set(name, new bool(val)); }) .def("set", [](ir::Pass &self, const std::string &name, int val) { self.Set(name, new int(val)); }) + .def("set", + [](ir::Pass &self, const std::string &name, + std::vector set) { + self.Set(name, new std::vector(set)); + }) .def("set", [](ir::Pass &self, const std::string &name, std::unordered_set set) { @@ -3425,6 +3512,118 @@ All parameter, weight, gradient are variables in Paddle. }) .def("device_count", &ParallelExecutor::DeviceCount); +#ifdef PADDLE_WITH_IPU + py::class_>(m, "IpuBackend") + .def(py::init(&platform::ipu::IpuBackend::GetNewInstance)) + .def("clear", &platform::ipu::IpuBackend::Clear) + .def("set_scope", &platform::ipu::IpuBackend::SetScope) + .def("set_ipu_strategy", &platform::ipu::IpuBackend::SetIpuStrategy); + + py::class_(m, "IpuStrategy") + .def(py::init()) + .def_property( + "num_ipus", + [](const platform::ipu::IpuStrategy &self) { return self.num_ipus; }, + [](platform::ipu::IpuStrategy &self, int num_ipus) { + self.num_ipus = num_ipus; + }, + R"DOC( + Int type, set the number of IPUs we need. Default 1. + )DOC") + .def_property( + "accumulationFactor", + [](const platform::ipu::IpuStrategy &self) { + return self.popart_options_.accumulationFactor; + }, + [](platform::ipu::IpuStrategy &self, int accumulationFactor) { + self.popart_options_.accumulationFactor = accumulationFactor; + }, + R"DOC( + Specify the number of micro-batches to accumulate before + applying the varUpdate. Default 1. + )DOC") + .def_property("batches_per_step", + [](const platform::ipu::IpuStrategy &self) { + return self.batches_per_step; + }, + [](platform::ipu::IpuStrategy &self, int batches_per_step) { + self.batches_per_step = batches_per_step; + }, + R"DOC( + Int type, set batches_per_step. Default 1. + )DOC") + .def_property("is_training", + [](const platform::ipu::IpuStrategy &self) { + return self.is_training; + }, + [](platform::ipu::IpuStrategy &self, bool is_training) { + self.is_training = is_training; + }, + R"DOC( + Bool type, True for training, False for inference. Default True. + )DOC") + .def_property( + "enable_pipelining", + [](const platform::ipu::IpuStrategy &self) { + return self.popart_options_.enablePipelining; + }, + [](platform::ipu::IpuStrategy &self, bool enable_pipelining) { + self.popart_options_.enablePipelining = enable_pipelining; + }, + R"DOC( + Bool type, True enable pipeline, otherwise disable. Default False. + )DOC") + .def_property( + "enable_manual_shard", + [](const platform::ipu::IpuStrategy &self) { + return self.popart_options_.virtualGraphMode == + platform::ipu::VirtualGraphMode::Manual; + }, + [](platform::ipu::IpuStrategy &self, bool enable_ipu_shard) { + if (enable_ipu_shard) { + self.popart_options_.virtualGraphMode = + platform::ipu::VirtualGraphMode::Manual; + } else { + self.popart_options_.virtualGraphMode = + platform::ipu::VirtualGraphMode::Off; + } + }, + R"DOC( + Bool type, True enable model sharding, otherwise disable. Default False. + )DOC") + .def_property("need_avg_shard", + [](const platform::ipu::IpuStrategy &self) { + return self.need_avg_shard; + }, + [](platform::ipu::IpuStrategy &self, bool need_avg_shard) { + self.need_avg_shard = need_avg_shard; + }, + R"DOC( + Bool type, True enable avg shard, otherwise disable. Default False.
+ )DOC") + .def_property("batch_size", + [](const platform::ipu::IpuStrategy &self) { + return self.batch_size; + }, + [](platform::ipu::IpuStrategy &self, int batch_size) { + self.batch_size = batch_size; + }, + R"DOC( + Int type, used to make batch size fixed. Default 1. + )DOC") + .def_property("enable_fp16", + [](const platform::ipu::IpuStrategy &self) { + return self.enable_fp16; + }, + [](platform::ipu::IpuStrategy &self, bool enable_fp16) { + self.enable_fp16 = enable_fp16; + }, + R"DOC( + Bool type, True enable float16 mode, otherwise disable. Default False.)DOC"); +#endif + BindFleetWrapper(&m); BindIO(&m); diff --git a/paddle/fluid/pybind/reader_py.cc b/paddle/fluid/pybind/reader_py.cc index 60b99a964a57fe90454ce4618ee2799aedd697ec..d4fa1b2c89abf3da5075fd1ead0508b720b1f125 100644 --- a/paddle/fluid/pybind/reader_py.cc +++ b/paddle/fluid/pybind/reader_py.cc @@ -37,6 +37,9 @@ PADDLE_DEFINE_EXPORTED_bool( "If set true, the queue.pop will only get data from queue but not " "remove the data from queue for speed testing"); +// disable auto conversion to list in Python +PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray); + namespace paddle { namespace pybind { diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index c5d0afb9a1716ab711c952140f7fa6e0c5c8d62a..935a6437338a751f8966fdb8c804333d8083cac8 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -313,6 +313,21 @@ void SetTensorFromPyArrayT( PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use XPUPlace in CPU/GPU version, " "Please recompile or reinstall Paddle with XPU support.")); +#endif + } else if (paddle::platform::is_ipu_place(place)) { +#ifdef PADDLE_WITH_IPU + if (zero_copy) { + auto holder = std::make_shared>(array); + auto type = framework::ToDataType(std::type_index(typeid(T))); + self->ResetHolderWithType(holder, type); + } else { + auto dst = self->mutable_data(place); + std::memcpy(dst, array.data(), array.nbytes()); + } +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Cannot use IPUPlace in CPU/GPU/XPU/NPU version, " + "Please recompile or reinstall Paddle with IPU support.")); #endif } else if (paddle::platform::is_npu_place(place)) { #ifdef PADDLE_WITH_ASCEND_CL diff --git a/paddle/pten/api/lib/CMakeLists.txt b/paddle/pten/api/lib/CMakeLists.txt index 96ad9ade8e3ad587a0289ae4b81965abc3a6104f..d1e60c4505d6b0ad76b695940740c7a03b1ede18 100644 --- a/paddle/pten/api/lib/CMakeLists.txt +++ b/paddle/pten/api/lib/CMakeLists.txt @@ -22,6 +22,10 @@ set(api_source_file ${CMAKE_SOURCE_DIR}/paddle/pten/api/lib/api.cc) set(api_header_file_tmp ${api_header_file}.tmp) set(api_source_file_tmp ${api_source_file}.tmp) +if (NOT PYTHON_EXECUTABLE) + find_package(PythonInterp REQUIRED) +endif() + add_custom_command( OUTPUT ${api_header_file} ${api_source_file} COMMAND ${PYTHON_EXECUTABLE} -m pip install pyyaml diff --git a/paddle/pten/api/lib/kernel_declare.h b/paddle/pten/api/lib/kernel_declare.h new file mode 100644 index 0000000000000000000000000000000000000000..8c21094a4af20257cb488a8b74d804dcba2bce87 --- /dev/null +++ b/paddle/pten/api/lib/kernel_declare.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/pten/core/kernel_registry.h" + +// TODO(chenweihang) After the kernel is split into a single file, +// the kernel declare statement is automatically generated according to the +// file name of the kernel, and this header file will be removed + +PT_DECLARE_KERNEL(full_like, CPU); +PT_DECLARE_KERNEL(dot, CPU); +PT_DECLARE_KERNEL(flatten, CPU); +PT_DECLARE_KERNEL(sign, CPU); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PT_DECLARE_KERNEL(full_like, CUDA); +PT_DECLARE_KERNEL(dot, CUDA); +PT_DECLARE_KERNEL(flatten, CUDA); +PT_DECLARE_KERNEL(sign, CUDA); +#endif + +#ifdef PADDLE_WITH_XPU +PT_DECLARE_KERNEL(flatten, XPU); +#endif diff --git a/paddle/pten/api/lib/utils.cc b/paddle/pten/api/lib/utils.cc index e17b19d9f689e18eecbeeb6f0d0730cfde1aa3e5..bfde9b14b0020d9a726bdd1fb6f211274a4e39d2 100644 --- a/paddle/pten/api/lib/utils.cc +++ b/paddle/pten/api/lib/utils.cc @@ -25,10 +25,14 @@ limitations under the License. */ #include "paddle/pten/include/core.h" #include "paddle/pten/include/infermeta.h" -PT_DECLARE_MODULE(UtilsCPU); +PT_DECLARE_KERNEL(copy, CPU); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_DECLARE_MODULE(UtilsCUDA); +PT_DECLARE_KERNEL(copy, CUDA); +#endif + +#ifdef PADDLE_WITH_XPU +PT_DECLARE_KERNEL(copy, XPU); #endif namespace paddle { diff --git a/paddle/pten/core/kernel_alias_name.h b/paddle/pten/core/kernel_alias_name.h index 0310b6e6fafb52db36d1308675478498b873238a..3b8347dec772e93cfe533f4263c3937979025878 100644 --- a/paddle/pten/core/kernel_alias_name.h +++ b/paddle/pten/core/kernel_alias_name.h @@ -27,13 +27,13 @@ const std::unordered_map kernel_alias_name_map = { {"fill_any_like", "full_like"}, {"fill_constant", "full"}, {"flatten_contiguous_range", "flatten"}, - // {"matmul_v2", "matmul"}, + {"matmul_v2", "matmul"}, {"reduce_mean", "mean"}, {"reduce_sum", "sum"}, {"reshape2", "reshape"}, // fluid kernel "mean/reshape/matmul/flatten/sum" should be deprecated {"flatten", "deprecated"}, - // {"matmul", "deprecated"}, + {"matmul", "deprecated"}, {"mean", "deprecated"}, {"reshape", "deprecated"}, {"sum", "deprecated"}}; diff --git a/paddle/pten/core/kernel_factory.h b/paddle/pten/core/kernel_factory.h index dbdf90b5bdbf4dafe92c42a3b5ad5201ab8ab8a8..4adfb703503a8d1c92b47e5fc60f7dc2fc1d0d35 100644 --- a/paddle/pten/core/kernel_factory.h +++ b/paddle/pten/core/kernel_factory.h @@ -265,12 +265,8 @@ class KernelFactory { KernelMap& kernels() { return kernels_; } - void InsertCompatibleOpType(const std::string& op_type) { - compatible_op_types_.insert(op_type); - } - bool HasCompatiblePtenKernel(const std::string& op_type) const { - return compatible_op_types_.count(TransToPtenKernelName(op_type)) > 0; + return kernels_.find(TransToPtenKernelName(op_type)) != kernels_.end(); } const Kernel& SelectKernelOrThrowError(const KernelName& kernel_name, @@ -288,9 +284,6 @@ class KernelFactory { KernelFactory() = default; KernelMap kernels_; - // Used to be compatible with the original execution system and - // quickly confirm whether the new kernel can be called - std::unordered_set compatible_op_types_; }; /** operator << 
overload **/ diff --git a/paddle/pten/core/kernel_registry.h b/paddle/pten/core/kernel_registry.h index cd6fa80906cfbdaee4514d6e4cfabca78bf0f26e..645e77fc60f8c2533eda3ee355b86a45f3510c23 100644 --- a/paddle/pten/core/kernel_registry.h +++ b/paddle/pten/core/kernel_registry.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include #include @@ -24,6 +25,8 @@ #include "paddle/pten/core/kernel_factory.h" #include "paddle/pten/core/kernel_utils.h" +#include "paddle/fluid/platform/enforce.h" + namespace pten { #define BACKEND(arg__) pten::Backend::arg__ @@ -140,7 +143,6 @@ struct KernelRegistrar { Kernel kernel(kernel_fn); args_parse_fn(kernel_key, kernel.mutable_args_def()); args_def_fn(&kernel); - KernelFactory::Instance().InsertCompatibleOpType(kernel_name.name()); KernelFactory::Instance().kernels()[kernel_name][kernel_key] = kernel; } }; @@ -193,64 +195,35 @@ struct KernelRegistrar { #define _PT_ARG_N(args) _PT_ARG_N_EXPAND args #define _PT_RESQ_N() 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +/** PT_REGISTER_KERNEL + * + * The most frequently used kernel registration macro, used for kernel + * registration with only data type as template parameter, and the function + * pointer of the corresponding data type is automatically instantiated + * during registration. + */ #define PT_REGISTER_KERNEL( \ kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \ - _PT_REGISTER_KERNEL(kernel_name, \ - PT_ID, \ - backend, \ - layout, \ - meta_kernel_fn, \ - cpp_dtype, \ - __VA_ARGS__) + PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ + pt_register_kernel_ns_check_##kernel_name, \ + "PT_REGISTER_KERNEL must be called in global namespace."); \ + _PT_REGISTER_KERNEL( \ + kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, __VA_ARGS__) + #ifndef _WIN32 -#define _PT_REGISTER_KERNEL( \ - kernel_name, func_id, backend, layout, meta_kernel_fn, cpp_dtype, ...) \ - PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - PT_CONCATENATE(pt_op_kernel_ns_check_, func_id), \ - "PT_REGISTER_KERNEL must be called in global namespace."); \ - PT_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, __VA_ARGS__); \ - static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \ - func_id)(::pten::Kernel*); \ - PT_KERNEL_REGISTRAR_INIT(kernel_name, \ - func_id, \ - backend, \ - layout, \ - &PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id), \ - meta_kernel_fn, \ - cpp_dtype, \ - __VA_ARGS__); \ - void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \ - func_id)(::pten::Kernel * kernel) +#define _PT_REGISTER_KERNEL( \ + kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \ + PT_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, __VA_ARGS__); \ + static void __PT_KERNEL_args_def_FN_##kernel_name(::pten::Kernel*); \ + PT_KERNEL_REGISTRAR_INIT(kernel_name, \ + backend, \ + layout, \ + &__PT_KERNEL_args_def_FN_##kernel_name, \ + meta_kernel_fn, \ + cpp_dtype, \ + __VA_ARGS__); \ + void __PT_KERNEL_args_def_FN_##kernel_name(::pten::Kernel* kernel) #else -#define _PT_REGISTER_KERNEL( \ - kernel_name, func_id, backend, layout, meta_kernel_fn, cpp_dtype, ...) 
\ - PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - PT_CONCATENATE(pt_op_kernel_ns_check_, func_id), \ - "PT_REGISTER_KERNEL must be called in global namespace."); \ - static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \ - func_id)(::pten::Kernel*); \ - PT_KERNEL_REGISTRAR_INIT(kernel_name, \ - func_id, \ - backend, \ - layout, \ - &PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id), \ - meta_kernel_fn, \ - cpp_dtype, \ - __VA_ARGS__); \ - void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \ - func_id)(::pten::Kernel * kernel) -#endif - -#define PT_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, ...) \ - _PT_KERNEL_INSTANTIATION(PT_NARGS(cpp_dtype, __VA_ARGS__), \ - meta_kernel_fn, \ - cpp_dtype, \ - __VA_ARGS__) - -#define _PT_KERNEL_INSTANTIATION(N, meta_kernel_fn, cpp_dtype, ...) \ - PT_CONCATENATE(_PT_KERNEL_INSTANTIATION_, N) \ - (meta_kernel_fn, cpp_dtype, __VA_ARGS__) - /** * `template decltype(fn) fn` can work on gcc and clang, * but msvc will failed, error like: @@ -261,8 +234,30 @@ struct KernelRegistrar { * * https://stackoverflow.com/questions/63989585/explicit-instantiation-of-function-using-decltype-work-on-g-but-not-on-visua * - * So we solve the explict instantiation of kernel by CMake + * And msvc can work without template instantiation */ +#define _PT_REGISTER_KERNEL( \ + kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \ + static void __PT_KERNEL_args_def_FN_##kernel_name(::pten::Kernel*); \ + PT_KERNEL_REGISTRAR_INIT(kernel_name, \ + backend, \ + layout, \ + &__PT_KERNEL_args_def_FN_##kernel_name, \ + meta_kernel_fn, \ + cpp_dtype, \ + __VA_ARGS__); \ + void __PT_KERNEL_args_def_FN_##kernel_name(::pten::Kernel* kernel) +#endif + +#define PT_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, ...) \ + _PT_KERNEL_INSTANTIATION(PT_NARGS(cpp_dtype, __VA_ARGS__), \ + meta_kernel_fn, \ + cpp_dtype, \ + __VA_ARGS__) + +#define _PT_KERNEL_INSTANTIATION(N, meta_kernel_fn, cpp_dtype, ...) \ + PT_CONCATENATE(_PT_KERNEL_INSTANTIATION_, N) \ + (meta_kernel_fn, cpp_dtype, __VA_ARGS__) #define _PT_KERNEL_INSTANTIATION_1(meta_kernel_fn, cpp_dtype, ...) \ template decltype(meta_kernel_fn) meta_kernel_fn @@ -309,22 +304,15 @@ struct KernelRegistrar { template decltype(meta_kernel_fn) meta_kernel_fn; \ PT_EXPAND(_PT_KERNEL_INSTANTIATION_14(meta_kernel_fn, __VA_ARGS__)) -#define PT_KERNEL_REGISTRAR_INIT(kernel_name, \ - func_id, \ - backend, \ - layout, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - _PT_KERNEL_REGISTRAR_INIT(PT_NARGS(cpp_dtype, __VA_ARGS__), \ - kernel_name, \ - func_id, \ - backend, \ - layout, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ +#define PT_KERNEL_REGISTRAR_INIT( \ + kernel_name, backend, layout, args_def_fn, meta_kernel_fn, cpp_dtype, ...) \ + _PT_KERNEL_REGISTRAR_INIT(PT_NARGS(cpp_dtype, __VA_ARGS__), \ + kernel_name, \ + backend, \ + layout, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ __VA_ARGS__) // clang-format off @@ -333,7 +321,6 @@ struct KernelRegistrar { and multi-line macros cannot be skipped with NOLINT.*/ #define _PT_KERNEL_REGISTRAR_INIT(N, \ kernel_name, \ - func_id, \ backend, \ layout, \ args_def_fn, \ @@ -342,7 +329,6 @@ struct KernelRegistrar { ...) \ PT_CONCATENATE(_PT_KERNEL_REGISTRAR_INIT_, N) ( \ kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -354,7 +340,6 @@ struct KernelRegistrar { // clang-format on #define _PT_KERNEL_REGISTRAR_INIT_1(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -363,17 +348,17 @@ struct KernelRegistrar { cpp_dtype, \ ...) 
\ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn)); + PT_KERNEL(meta_kernel_fn)); \ + int TouchKernelSymbolFor_##kernel_name##_##backend() { return 0; } #define _PT_KERNEL_REGISTRAR_INIT_2(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -382,8 +367,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -392,7 +377,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_1(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -400,7 +384,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_3(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -409,8 +392,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -419,7 +402,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_2(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -427,7 +409,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_4(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -436,8 +417,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -446,7 +427,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_3(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -454,7 +434,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_5(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -463,8 +442,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -473,7 +452,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_4(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -481,7 +459,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_6(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -490,8 +467,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) 
\ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -500,7 +477,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_5(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -508,7 +484,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_7(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -517,8 +492,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -527,7 +502,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_6(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -535,7 +509,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_8(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -544,8 +517,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -554,7 +527,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_7(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -562,7 +534,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_9(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -571,8 +542,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -581,7 +552,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_8(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -589,7 +559,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_10(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -598,8 +567,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) 
\ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -608,7 +577,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_9(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -616,7 +584,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_11(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -625,8 +592,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -635,7 +602,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_10(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -643,7 +609,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_12(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -652,8 +617,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -662,7 +627,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_11(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -670,7 +634,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_13(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -679,8 +642,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -689,7 +652,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_12(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -697,7 +659,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_14(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -706,8 +667,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) 
\ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -716,7 +677,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_13(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -724,7 +684,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_15(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -733,8 +692,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -743,7 +702,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_14(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -751,90 +709,59 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) -#define PT_REGISTER_KERNEL_STANDARD( \ - kernel_name, backend, layout, dtype, kernel_fn) \ - _PT_REGISTER_KERNEL_STANDARD( \ - kernel_name, PT_ID, backend, layout, dtype, kernel_fn) - -#define _PT_REGISTER_KERNEL_STANDARD( \ - kernel_name, func_id, backend, layout, dtype, kernel_fn) \ - PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - PT_CONCATENATE(pt_op_kernel_ns_check_, func_id), \ - "_PT_REGISTER_KERNEL_STANDARD must be called in global namespace."); \ - template decltype(kernel_fn) kernel_fn; \ - static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \ - func_id)(::pten::Kernel*); \ - static const ::pten::KernelRegistrar PT_CONCATENATE(__reg_pt_op_kernel_, \ - func_id)( \ - kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - DATATYPE(dtype), \ - ::pten::KernelArgsParseFunctor::Parse, \ - args_def_fn, \ - PT_KERNEL(kernel_fn)); \ - void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id)(::pten::Kernel*) - -// use to declare symbol -#define PT_REGISTER_MODULE(name) \ - int RegisterSymbolsFor##name() { return 0; } - -#define PT_DECLARE_MODULE(name) \ - extern int RegisterSymbolsFor##name(); \ - UNUSED static int use_kernel_module_##name = RegisterSymbolsFor##name() - -// only used in cpp tests - -#define PT_REGISTER_KERNEL_FOR_TEST( \ - kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \ - _PT_REGISTER_KERNEL_FOR_TEST(kernel_name, \ - PT_ID, \ - backend, \ - layout, \ - meta_kernel_fn, \ - cpp_dtype, \ - __VA_ARGS__) - -#define _PT_REGISTER_KERNEL_FOR_TEST( \ - kernel_name, func_id, backend, layout, meta_kernel_fn, cpp_dtype, ...) 
\ - PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - PT_CONCATENATE(pt_op_kernel_for_test_ns_check_, func_id), \ - "PT_REGISTER_KERNEL must be called in global namespace."); \ - static void PT_CONCATENATE(__PT_KERNEL_for_test_args_def_FN_, \ - func_id)(::pten::Kernel*); \ - PT_KERNEL_REGISTRAR_INIT( \ - kernel_name, \ - func_id, \ - backend, \ - layout, \ - &PT_CONCATENATE(__PT_KERNEL_for_test_args_def_FN_, func_id), \ - meta_kernel_fn, \ - cpp_dtype, \ - __VA_ARGS__); \ - void PT_CONCATENATE(__PT_KERNEL_for_test_args_def_FN_, \ - func_id)(::pten::Kernel * kernel) - -#define PT_REGISTER_KERNEL_WITH_NO_TYPE( \ - kernel_name, backend, layout, meta_kernel_fn) \ - _PT_REGISTER_KERNEL_WITH_NO_TYPE( \ - kernel_name, PT_ID, backend, layout, meta_kernel_fn) - -#define _PT_REGISTER_KERNEL_WITH_NO_TYPE( \ - kernel_name, func_id, backend, layout, meta_kernel_fn) \ - PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - PT_CONCATENATE(pt_op_kernel_ns_check_, func_id), \ - "PT_REGISTER_KERNEL must be called in global namespace."); \ - decltype(meta_kernel_fn) meta_kernel_fn; \ - static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \ - func_id)(::pten::Kernel*); \ - static const ::pten::KernelRegistrar PT_CONCATENATE(__reg_pt_op_kernel_, \ - func_id)( \ - kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::pten::KernelArgsParseFunctor::Parse, \ - &PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id), \ - PT_KERNEL(meta_kernel_fn)); \ - void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \ - func_id)(::pten::Kernel * kernel) +/** PT_REGISTER_SINGLE_KERNEL + * + * Used to register a single kernel, pass in the complete function pointer + * of the kernel, this registration macro will not do automatic template + * instantiation. + */ +#define PT_REGISTER_SINGLE_KERNEL( \ + kernel_name, backend, layout, dtype, kernel_fn) \ + PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ + pt_register_single_kernel_ns_check_##kernel_name, \ + "PT_REGISTER_SINGLE_KERNEL must be called in global namespace."); \ + static void __PT_SINGLE_KERNEL_args_def_FN_##kernel_name(::pten::Kernel*); \ + static const ::pten::KernelRegistrar __reg_pt_single_kernel_##kernel_name( \ + #kernel_name, \ + BACKEND(backend), \ + DATALAYOUT(layout), \ + DATATYPE(dtype), \ + ::pten::KernelArgsParseFunctor::Parse, \ + args_def_fn, \ + PT_KERNEL(kernel_fn)); \ + int TouchKernelSymbolFor_##kernel_name##_##backend() { return 0; } \ + void __PT_SINGLE_KERNEL_args_def_FN_##kernel_name(::pten::Kernel*) + +/** PT_REGISTER_KERNEL_ALL_DTYPE + * + * Used to register a kernel that supports all data types, such as copy and + * reshape that are not sensitive to data types. 
+ */ +#define PT_REGISTER_KERNEL_ALL_DTYPE(kernel_name, backend, layout, kernel_fn) \ + PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ + pt_register_kernel_all_dtype_ns_check_##kernel_name, \ + "PT_REGISTER_KERNEL_ALL_DTYPE must be called in global namespace."); \ + static void __PT_KERNEL_ALL_DTYPE_args_def_FN_##kernel_name( \ + ::pten::Kernel*); \ + static const ::pten::KernelRegistrar \ + __reg_pt_kernel_all_dtype_##kernel_name( \ + #kernel_name, \ + BACKEND(backend), \ + DATALAYOUT(layout), \ + ::pten::KernelArgsParseFunctor::Parse, \ + &__PT_KERNEL_ALL_DTYPE_args_def_FN_##kernel_name, \ + PT_KERNEL(kernel_fn)); \ + int TouchKernelSymbolFor_##kernel_name##_##backend() { return 0; } \ + void __PT_KERNEL_ALL_DTYPE_args_def_FN_##kernel_name(::pten::Kernel* kernel) + +/** PT_DECLARE_KERNEL + * + * Used to export the symbols of the file where the kernel is located, + * to avoid being removed by linker + */ +#define PT_DECLARE_KERNEL(kernel_name, backend) \ + extern int TouchKernelSymbolFor_##kernel_name##_##backend(); \ + UNUSED static int __declare_kernel_symbol_for_##kernel_name##_##backend = \ + TouchKernelSymbolFor_##kernel_name##_##backend() + } // namespace pten diff --git a/paddle/pten/kernels/cpu/creation.cc b/paddle/pten/kernels/cpu/creation.cc index 4f09fc489f8f67a8b2c53af7da16e716f4ad58fc..4175203410f8da092cb2aa7673d5651d17245fbb 100644 --- a/paddle/pten/kernels/cpu/creation.cc +++ b/paddle/pten/kernels/cpu/creation.cc @@ -61,9 +61,7 @@ void FillConstant(const CPUContext& dev_ctx, } // namespace pten -PT_REGISTER_MODULE(CreationCPU); - -PT_REGISTER_KERNEL("full_like", +PT_REGISTER_KERNEL(full_like, CPU, ANY, pten::FillAnyLike, @@ -74,7 +72,7 @@ PT_REGISTER_KERNEL("full_like", bool, paddle::platform::float16) {} -PT_REGISTER_KERNEL("full", +PT_REGISTER_KERNEL(full, CPU, ANY, pten::FillConstant, diff --git a/paddle/pten/kernels/cpu/linalg.cc b/paddle/pten/kernels/cpu/linalg.cc index 32411560b55168cc1caa2b95253014aea03eb98a..9f4f1be18259a53a5f945302112b961c757e7036 100644 --- a/paddle/pten/kernels/cpu/linalg.cc +++ b/paddle/pten/kernels/cpu/linalg.cc @@ -70,12 +70,10 @@ void Matmul(const CPUContext& dev_ctx, } // namespace pten -PT_REGISTER_MODULE(LinalgCPU); - using complex64 = ::paddle::platform::complex; using complex128 = ::paddle::platform::complex; -PT_REGISTER_KERNEL("dot", +PT_REGISTER_KERNEL(dot, CPU, ANY, pten::Dot, @@ -87,5 +85,4 @@ PT_REGISTER_KERNEL("dot", complex128) {} PT_REGISTER_KERNEL( - "matmul_v2", CPU, ANY, pten::Matmul, float, double, complex64, complex128) { -} + matmul, CPU, ANY, pten::Matmul, float, double, complex64, complex128) {} diff --git a/paddle/pten/kernels/cpu/manipulation.cc b/paddle/pten/kernels/cpu/manipulation.cc index e0e9cefbf671bf07c375a4452453a8ebef8cd982..61c6cb57a9f780010a5f83dd874cb790fb9879dd 100644 --- a/paddle/pten/kernels/cpu/manipulation.cc +++ b/paddle/pten/kernels/cpu/manipulation.cc @@ -130,12 +130,9 @@ void Cast(const CPUContext& dev_ctx, } // namespace pten -// TODO(chenweihang): replace by better impl -PT_REGISTER_MODULE(ManipulationCPU); - // TODO(yuanrisheng): "flatten_contiguous_range" is compatible with old kernel // architecture, kernel_name should be "flatten". 
-PT_REGISTER_KERNEL("flatten", +PT_REGISTER_KERNEL(flatten, CPU, ANY, pten::Flatten, @@ -145,8 +142,7 @@ PT_REGISTER_KERNEL("flatten", int8_t, int, int64_t) {} - -PT_REGISTER_KERNEL("flatten.mid", +PT_REGISTER_KERNEL(flatten_mid, CPU, ANY, pten::FlattenWithXShape, @@ -156,7 +152,8 @@ PT_REGISTER_KERNEL("flatten.mid", int8_t, int, int64_t) {} -PT_REGISTER_KERNEL("cast", + +PT_REGISTER_KERNEL(cast, CPU, ANY, pten::Cast, @@ -174,42 +171,33 @@ PT_REGISTER_KERNEL("cast", kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); } -// TODO(yuanrisheng): "reshape2" is compatible with old kernel -// architecture, kernel_name should be "reshape". -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape", - CPU, - ANY, - pten::ReshapeFromVectorVal) {} - -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mid", - CPU, - ANY, - pten::ReshapeFromVectorValWithXShape) {} - -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.host", CPU, ANY, pten::ReshapeFromDT) { +PT_REGISTER_KERNEL_ALL_DTYPE(reshape, CPU, ANY, pten::ReshapeFromVectorVal) {} +PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mid, + CPU, + ANY, + pten::ReshapeFromVectorValWithXShape) {} +PT_REGISTER_KERNEL_ALL_DTYPE(reshape_host, CPU, ANY, pten::ReshapeFromDT) { kernel->InputAt(1).SetBackend(pten::Backend::CPU); kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32); } - -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.host.mid", - CPU, - ANY, - pten::ReshapeFromDTWithXShape) { +PT_REGISTER_KERNEL_ALL_DTYPE(reshape_host_mid, + CPU, + ANY, + pten::ReshapeFromDTWithXShape) { kernel->InputAt(1).SetBackend(pten::Backend::CPU); kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32); } -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mulhost", - CPU, - ANY, - pten::ReshapeFromVectorDT) { +PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mulhost, + CPU, + ANY, + pten::ReshapeFromVectorDT) { kernel->InputAt(1).SetBackend(pten::Backend::CPU); kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32); } - -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mulhost.mid", - CPU, - ANY, - pten::ReshapeFromVectorDTWithXShape) { +PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mulhost_mid, + CPU, + ANY, + pten::ReshapeFromVectorDTWithXShape) { kernel->InputAt(1).SetBackend(pten::Backend::CPU); kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32); } diff --git a/paddle/pten/kernels/cpu/math.cc b/paddle/pten/kernels/cpu/math.cc index ddfb2f5f45854001c6d4d7000d64be23058153b5..2d556d96c2fcf7effa1dbe7f0f31adeded6a403a 100644 --- a/paddle/pten/kernels/cpu/math.cc +++ b/paddle/pten/kernels/cpu/math.cc @@ -106,18 +106,14 @@ DEFINE_CPU_ELEMENTWISE_OP(Mul) } // namespace pten -// TODO(chenweihang): replace by better impl -PT_REGISTER_MODULE(MathCPU); - using complex64 = ::paddle::platform::complex; using complex128 = ::paddle::platform::complex; // NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16 // using bfloat16 = ::paddle::platform::bfloat16; - -PT_REGISTER_KERNEL("sign", CPU, ANY, pten::Sign, float, double) {} -PT_REGISTER_KERNEL("mean", CPU, ANY, pten::Mean, float, double, bool) {} -PT_REGISTER_KERNEL("scale", +PT_REGISTER_KERNEL(sign, CPU, ANY, pten::Sign, float, double) {} +PT_REGISTER_KERNEL(mean, CPU, ANY, pten::Mean, float, double, bool) {} +PT_REGISTER_KERNEL(scale, CPU, ANY, pten::Scale, @@ -129,8 +125,7 @@ PT_REGISTER_KERNEL("scale", int16_t, int, int64_t) {} - -PT_REGISTER_KERNEL("add", +PT_REGISTER_KERNEL(add, CPU, ANY, pten::ElementwiseAdd, @@ -140,7 +135,7 @@ PT_REGISTER_KERNEL("add", int64_t, complex64, complex128) {} 
-PT_REGISTER_KERNEL("subtract", +PT_REGISTER_KERNEL(subtract, CPU, ANY, pten::ElementwiseSub, @@ -150,7 +145,7 @@ PT_REGISTER_KERNEL("subtract", int64_t, complex64, complex128) {} -PT_REGISTER_KERNEL("divide", +PT_REGISTER_KERNEL(divide, CPU, ANY, pten::ElementwiseDiv, @@ -160,7 +155,7 @@ PT_REGISTER_KERNEL("divide", int64_t, complex64, complex128) {} -PT_REGISTER_KERNEL("multiply", +PT_REGISTER_KERNEL(multiply, CPU, ANY, pten::ElementwiseMul, @@ -171,8 +166,7 @@ PT_REGISTER_KERNEL("multiply", bool, complex64, complex128) {} - -PT_REGISTER_KERNEL("sum", +PT_REGISTER_KERNEL(sum, CPU, ANY, pten::Sum, diff --git a/paddle/pten/kernels/cpu/utils.cc b/paddle/pten/kernels/cpu/utils.cc index b462ef70c2f06ecda8e21cc32a847f2331be978c..500b4664d638885d8faba393e267c2fb0e556577 100644 --- a/paddle/pten/kernels/cpu/utils.cc +++ b/paddle/pten/kernels/cpu/utils.cc @@ -57,7 +57,4 @@ void Copy(const CPUContext& dev_ctx, } // namespace pten -// TODO(chenweihang): replace by better impl -PT_REGISTER_MODULE(UtilsCPU); - -PT_REGISTER_KERNEL_WITH_NO_TYPE("copy", CPU, ANY, pten::Copy) {} +PT_REGISTER_KERNEL_ALL_DTYPE(copy, CPU, ANY, pten::Copy) {} diff --git a/paddle/pten/kernels/cuda/creation.cu b/paddle/pten/kernels/cuda/creation.cu index 8bc23fb6af056fa3f32b20ec021151b084d7c19d..dd29fd5fbb84d2536ef63c37c8fb0fd59d285258 100644 --- a/paddle/pten/kernels/cuda/creation.cu +++ b/paddle/pten/kernels/cuda/creation.cu @@ -62,9 +62,7 @@ void FillConstant(const CUDAContext& dev_ctx, } // namespace pten -PT_REGISTER_MODULE(CreationCUDA); - -PT_REGISTER_KERNEL("full_like", +PT_REGISTER_KERNEL(full_like, CUDA, ANY, pten::FillAnyLike, @@ -75,7 +73,7 @@ PT_REGISTER_KERNEL("full_like", bool, paddle::platform::float16) {} -PT_REGISTER_KERNEL("full", +PT_REGISTER_KERNEL(full, CUDA, ANY, pten::FillConstant, diff --git a/paddle/pten/kernels/cuda/linalg.cu b/paddle/pten/kernels/cuda/linalg.cu index fe2ac6f184ff7353df5bb55b9b7ff18911d3bf1d..2114bbcc71c75e7edd9d3b71a606d0d394aebd59 100644 --- a/paddle/pten/kernels/cuda/linalg.cu +++ b/paddle/pten/kernels/cuda/linalg.cu @@ -54,13 +54,11 @@ void Matmul(const CUDAContext& dev_ctx, } // namespace pten -PT_REGISTER_MODULE(LinalgCUDA); - using float16 = paddle::platform::float16; using complex64 = ::paddle::platform::complex; using complex128 = ::paddle::platform::complex; -PT_REGISTER_KERNEL("dot", +PT_REGISTER_KERNEL(dot, CUDA, ANY, pten::Dot, @@ -71,7 +69,7 @@ PT_REGISTER_KERNEL("dot", complex64, complex128) {} -PT_REGISTER_KERNEL("matmul_v2", +PT_REGISTER_KERNEL(matmul, CUDA, ANY, pten::Matmul, diff --git a/paddle/pten/kernels/cuda/manipulation.cu b/paddle/pten/kernels/cuda/manipulation.cu index acaf1ac2cc62b28002a6c15d564251cfe352f5dc..e668d1b04d7238e6bbcebc7710abb61f885b1659 100644 --- a/paddle/pten/kernels/cuda/manipulation.cu +++ b/paddle/pten/kernels/cuda/manipulation.cu @@ -129,13 +129,9 @@ void Cast(const CUDAContext& dev_ctx, } // namespace pten -// TODO(chenweihang): replace by better impl -PT_REGISTER_MODULE(ManipulationCUDA); - using float16 = paddle::platform::float16; -// TODO(yuanrisheng): "flatten_contiguous_range" is compatible with old kernel -// architecture, kernel_name should be "flatten". 
-PT_REGISTER_KERNEL("flatten", + +PT_REGISTER_KERNEL(flatten, CUDA, ANY, pten::Flatten, @@ -146,8 +142,7 @@ PT_REGISTER_KERNEL("flatten", int8_t, int, int64_t) {} - -PT_REGISTER_KERNEL("flatten.mid", +PT_REGISTER_KERNEL(flatten_mid, CUDA, ANY, pten::FlattenWithXShape, @@ -159,7 +154,7 @@ PT_REGISTER_KERNEL("flatten.mid", int64_t) {} #define PTEN_REGISTER_CAST_CUDA_BASE_TYPE(op_name, ...) \ - PT_REGISTER_KERNEL("cast", \ + PT_REGISTER_KERNEL(cast, \ CUDA, \ ANY, \ pten::Cast, \ @@ -184,44 +179,33 @@ PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast, paddle::platform::bfloat16) PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast) #endif -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape", - CUDA, - ANY, - pten::ReshapeFromVectorVal) {} - -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mid", - CUDA, - ANY, - pten::ReshapeFromVectorValWithXShape) {} - -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.host", - CUDA, - ANY, - pten::ReshapeFromDT) { +PT_REGISTER_KERNEL_ALL_DTYPE(reshape, CUDA, ANY, pten::ReshapeFromVectorVal) {} +PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mid, + CUDA, + ANY, + pten::ReshapeFromVectorValWithXShape) {} +PT_REGISTER_KERNEL_ALL_DTYPE(reshape_host, CUDA, ANY, pten::ReshapeFromDT) { kernel->InputAt(1).SetBackend(pten::Backend::CPU); kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32); } - -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.host.mid", - CUDA, - ANY, - pten::ReshapeFromDTWithXShape) { +PT_REGISTER_KERNEL_ALL_DTYPE(reshape_host_mid, + CUDA, + ANY, + pten::ReshapeFromDTWithXShape) { kernel->InputAt(1).SetBackend(pten::Backend::CPU); kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32); } - -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mulhost", - CUDA, - ANY, - pten::ReshapeFromVectorDT) { +PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mulhost, + CUDA, + ANY, + pten::ReshapeFromVectorDT) { kernel->InputAt(1).SetBackend(pten::Backend::CPU); kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32); } - -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mulhost.mid", - CUDA, - ANY, - pten::ReshapeFromVectorDTWithXShape) { +PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mulhost_mid, + CUDA, + ANY, + pten::ReshapeFromVectorDTWithXShape) { kernel->InputAt(1).SetBackend(pten::Backend::CPU); kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32); } diff --git a/paddle/pten/kernels/cuda/math.cu b/paddle/pten/kernels/cuda/math.cu index 388d42942c10a4afbdca2b4dd5c8ed9a0b9c319d..66aaf14dcd0f62ac2b752d803b136994815963df 100644 --- a/paddle/pten/kernels/cuda/math.cu +++ b/paddle/pten/kernels/cuda/math.cu @@ -111,16 +111,13 @@ void Sum(const CUDAContext& dev_ctx, } // namespace pten -// TODO(chenweihang): replace by better impl -PT_REGISTER_MODULE(MathCUDA); - using float16 = paddle::platform::float16; using complex64 = ::paddle::platform::complex; using complex128 = ::paddle::platform::complex; -PT_REGISTER_KERNEL("sign", CUDA, ANY, pten::Sign, float, double, float16) {} -PT_REGISTER_KERNEL("mean", CUDA, ANY, pten::Mean, float, double, bool) {} -PT_REGISTER_KERNEL("scale", +PT_REGISTER_KERNEL(sign, CUDA, ANY, pten::Sign, float, double, float16) {} +PT_REGISTER_KERNEL(mean, CUDA, ANY, pten::Mean, float, double, bool) {} +PT_REGISTER_KERNEL(scale, CUDA, ANY, pten::Scale, @@ -132,7 +129,7 @@ PT_REGISTER_KERNEL("scale", int16_t, int, int64_t) {} -PT_REGISTER_KERNEL("add", +PT_REGISTER_KERNEL(add, CUDA, ANY, pten::ElementwiseAdd, @@ -143,7 +140,7 @@ PT_REGISTER_KERNEL("add", float16, complex64, complex128) {} -PT_REGISTER_KERNEL("subtract", +PT_REGISTER_KERNEL(subtract, CUDA, ANY, pten::ElementwiseSub, @@ 
-154,7 +151,7 @@ PT_REGISTER_KERNEL("subtract", float16, complex64, complex128) {} -PT_REGISTER_KERNEL("divide", +PT_REGISTER_KERNEL(divide, CUDA, ANY, pten::ElementwiseDiv, @@ -165,7 +162,7 @@ PT_REGISTER_KERNEL("divide", float16, complex64, complex128) {} -PT_REGISTER_KERNEL("multiply", +PT_REGISTER_KERNEL(multiply, CUDA, ANY, pten::ElementwiseMul, @@ -177,7 +174,7 @@ PT_REGISTER_KERNEL("multiply", float16, complex64, complex128) {} -PT_REGISTER_KERNEL("sum", +PT_REGISTER_KERNEL(sum, CUDA, ANY, pten::Sum, diff --git a/paddle/pten/kernels/cuda/utils.cu b/paddle/pten/kernels/cuda/utils.cu index 24da650d1f3eb9664bd69923ba4247d4a341476c..49027e956b2d7dc28b77d72c2db684ae0007a15c 100644 --- a/paddle/pten/kernels/cuda/utils.cu +++ b/paddle/pten/kernels/cuda/utils.cu @@ -234,7 +234,4 @@ void Copy(const CUDAContext& dev_ctx, } } // namespace pten -// TODO(chenweihang): replace by better impl -PT_REGISTER_MODULE(UtilsCUDA); - -PT_REGISTER_KERNEL_WITH_NO_TYPE("copy", CUDA, ANY, pten::Copy) {} +PT_REGISTER_KERNEL_ALL_DTYPE(copy, CUDA, ANY, pten::Copy) {} diff --git a/paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h b/paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h index 94fab974aced173b4101f6b8cfc0812f011f92a7..1f1b8ddd5f412d494572a2e9cf154c9c93095b8d 100644 --- a/paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h +++ b/paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h @@ -769,6 +769,23 @@ static void LaunchReduceKernel(const Tx* x_data, } } +void TensorCopy(const DenseTensor& src, DenseTensor* dst) { + paddle::platform::DeviceContextPool& pool = + paddle::platform::DeviceContextPool::Instance(); + const paddle::platform::CUDADeviceContext* dev_ctx; + if (paddle::platform::is_gpu_place(dst->place()) || + paddle::platform::is_npu_place(dst->place())) { + dev_ctx = static_cast( + pool.Get(dst->place())); + + } else { + dev_ctx = static_cast( + pool.Get(src.place())); + } + + pten::Copy(*dev_ctx, src, false, dst); +} + template class ReduceOp> @@ -800,7 +817,7 @@ void TensorReduceFunctorImpl(const pten::DenseTensor& x, if (config.reduce_num == 1) { auto out_dims = y->dims(); if (x.dtype() == y->dtype()) { - pten::Copy(*dev_ctx, x, true, y); + TensorCopy(x, y); y->Resize(out_dims); } else { PD_VISIT_ALL_TYPES(y->dtype(), "CastKernelImpl", ([&] { diff --git a/paddle/pten/kernels/xpu/manipulation.cc b/paddle/pten/kernels/xpu/manipulation.cc index 5f1c0d42eb5a8faf2d6b488ebded52684abfab3c..f361933cad45a5e703b106588a8b3c5514269e78 100644 --- a/paddle/pten/kernels/xpu/manipulation.cc +++ b/paddle/pten/kernels/xpu/manipulation.cc @@ -95,12 +95,7 @@ void ReshapeFromVectorDT(const XPUContext& dev_ctx, } // namespace pten -// TODO(chenweihang): replace by better impl -PT_REGISTER_MODULE(ManipulationXPU); - -// TODO(yuanrisheng): "flatten_contiguous_range" is compatible with old kernel -// architecture, kernel_name should be "flatten". -PT_REGISTER_KERNEL("flatten_contiguous_range", +PT_REGISTER_KERNEL(flatten, XPU, ANY, pten::Flatten, @@ -112,7 +107,7 @@ PT_REGISTER_KERNEL("flatten_contiguous_range", int, int64_t) {} -PT_REGISTER_KERNEL("flatten_contiguous_range.mid", +PT_REGISTER_KERNEL(flatten_mid, XPU, ANY, pten::FlattenWithXShape, @@ -124,9 +119,4 @@ PT_REGISTER_KERNEL("flatten_contiguous_range.mid", int, int64_t) {} -// TODO(yuanrisheng): "reshape2" is compatible with old kernel -// architecture, kernel_name should be "reshape". 
-PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2", - XPU, - ANY, - pten::ReshapeFromVectorVal) {} +PT_REGISTER_KERNEL_ALL_DTYPE(reshape, XPU, ANY, pten::ReshapeFromVectorVal) {} diff --git a/paddle/pten/kernels/xpu/utils.cc b/paddle/pten/kernels/xpu/utils.cc index 329dc2baf87b58e03906aba0b83fbb2552fd56ca..5c98217f4ec2c41b39729f9df76ea4ffba277286 100644 --- a/paddle/pten/kernels/xpu/utils.cc +++ b/paddle/pten/kernels/xpu/utils.cc @@ -76,7 +76,4 @@ void Copy(const XPUDeviceContext& dev_ctx, } // namespace pten -// TODO(chenweihang): replace by better impl -PT_REGISTER_MODULE(UtilsXPU); - -PT_REGISTER_KERNEL_WITH_NO_TYPE("copy", XPU, ANY, pten::Copy) {} +PT_REGISTER_KERNEL_ALL_DTYPE(copy, XPU, ANY, pten::Copy) {} diff --git a/paddle/pten/tests/api/test_reshape_api.cc b/paddle/pten/tests/api/test_reshape_api.cc index b6179f11b1019e26ac248e92a48d8be2971a2fac..227dcc6e9568d7c44730eeab70dca27cace1f482 100644 --- a/paddle/pten/tests/api/test_reshape_api.cc +++ b/paddle/pten/tests/api/test_reshape_api.cc @@ -21,12 +21,6 @@ limitations under the License. */ #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/kernel_registry.h" -PT_DECLARE_MODULE(ManipulationCPU); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_DECLARE_MODULE(ManipulationCUDA); -#endif - namespace paddle { namespace tests { diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 44afeecec32da4d136705fead505c833c273ee09..6ffdd75f72b4ec2754b46feb6039a90874a5b89d 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -156,6 +156,9 @@ from .tensor.manipulation import roll # noqa: F401 from .tensor.manipulation import chunk # noqa: F401 from .tensor.manipulation import tolist # noqa: F401 from .tensor.manipulation import tensordot # noqa: F401 +from .tensor.manipulation import as_complex # noqa: F401 +from .tensor.manipulation import as_real # noqa: F401 + from .tensor.math import abs # noqa: F401 from .tensor.math import acos # noqa: F401 from .tensor.math import asin # noqa: F401 @@ -227,6 +230,8 @@ from .tensor.math import lgamma # noqa: F401 from .tensor.math import lerp # noqa: F401 from .tensor.math import rad2deg # noqa: F401 from .tensor.math import deg2rad # noqa: F401 +from .tensor.math import gcd # noqa: F401 +from .tensor.math import lcm # noqa: F401 from .tensor.math import diff # noqa: F401 from .tensor.math import angle # noqa: F401 @@ -260,6 +265,7 @@ from .framework.random import set_cuda_rng_state # noqa: F401 from .framework import ParamAttr # noqa: F401 from .framework import create_parameter # noqa: F401 from .framework import CPUPlace # noqa: F401 +from .framework import IPUPlace # noqa: F401 from .framework import CUDAPlace # noqa: F401 from .framework import NPUPlace # noqa: F401 from .framework import CUDAPinnedPlace # noqa: F401 @@ -291,6 +297,7 @@ from .fluid.framework import get_flags # noqa: F401 from .fluid.framework import set_flags # noqa: F401 from .device import is_compiled_with_xpu # noqa: F401 from .device import is_compiled_with_npu # noqa: F401 +from .device import is_compiled_with_ipu # noqa: F401 from .device import XPUPlace # noqa: F401 from .fluid.dygraph.base import enable_dygraph as disable_static # noqa: F401 @@ -478,6 +485,8 @@ __all__ = [ # noqa 'atan2', 'rad2deg', 'deg2rad', + 'gcd', + 'lcm', 'expand', 'broadcast_to', 'ones_like', @@ -553,6 +562,8 @@ __all__ = [ # noqa 'einsum', 'set_flags', 'get_flags', + 'as_complex', + 'as_real', 'diff', 'angle', ] diff --git a/python/paddle/device/__init__.py 
b/python/paddle/device/__init__.py index 95402898589f6ed1095ae4d54674c27d487c3fed..0a11d59d69c94c0345802b0f9d070aa23f4c0a24 100644 --- a/python/paddle/device/__init__.py +++ b/python/paddle/device/__init__.py @@ -28,7 +28,9 @@ __all__ = [ # noqa 'set_device', 'get_device', 'XPUPlace', + 'IPUPlace', 'is_compiled_with_xpu', + 'is_compiled_with_ipu', 'is_compiled_with_cinn', 'is_compiled_with_cuda', 'is_compiled_with_rocm', @@ -55,6 +57,36 @@ def is_compiled_with_npu(): return core.is_compiled_with_npu() +def is_compiled_with_ipu(): + """ + Whether paddle was built with WITH_IPU=ON to support Graphcore IPU. + + Returns (bool): `True` if IPU is supported, otherwise `False`. + + Examples: + .. code-block:: python + + import paddle + support_ipu = paddle.is_compiled_with_ipu() + """ + return core.is_compiled_with_ipu() + + +def IPUPlace(): + """ + Return a Graphcore IPU Place + + Examples: + .. code-block:: python + + # required: ipu + + import paddle + place = paddle.device.IPUPlace() + """ + return core.IPUPlace() + + def is_compiled_with_xpu(): """ Whether paddle was built with WITH_XPU=ON to support Baidu Kunlun @@ -143,13 +175,19 @@ def _convert_to_place(device): selected_npus = os.getenv("FLAGS_selected_npus", "0").split(",") device_id = int(selected_npus[0]) place = core.NPUPlace(device_id) + elif lower_device == 'ipu': + if not core.is_compiled_with_ipu(): + raise ValueError( + "The device should not be 'ipu', " \ + "since PaddlePaddle is not compiled with IPU") + place = core.IPUPlace() else: avaliable_gpu_device = re.match(r'gpu:\d+', lower_device) avaliable_xpu_device = re.match(r'xpu:\d+', lower_device) avaliable_npu_device = re.match(r'npu:\d+', lower_device) if not avaliable_gpu_device and not avaliable_xpu_device and not avaliable_npu_device: raise ValueError( - "The device must be a string which is like 'cpu', 'gpu', 'gpu:x', 'xpu', 'xpu:x', 'npu' or 'npu:x'" + "The device must be a string which is like 'cpu', 'gpu', 'gpu:x', 'xpu', 'xpu:x', 'npu', 'npu:x' or ipu" ) if avaliable_gpu_device: if not core.is_compiled_with_cuda(): @@ -183,13 +221,13 @@ def _convert_to_place(device): def set_device(device): """ - Paddle supports running calculations on various types of devices, including CPU, GPU, XPU and NPU. + Paddle supports running calculations on various types of devices, including CPU, GPU, XPU, NPU and IPU. They are represented by string identifiers. This function can specify the global device which the OP will run. Parameters: device(str): This parameter determines the specific running device. - It can be ``cpu``, ``gpu``, ``xpu``, ``npu``, ``gpu:x``, ``xpu:x`` and ``npu:x``, + It can be ``cpu``, ``gpu``, ``xpu``, ``npu``, ``gpu:x``, ``xpu:x``, ``npu:x`` and ``ipu``, where ``x`` is the index of the GPUs, XPUs or NPUs. 
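For reference, a minimal usage sketch of the IPU device APIs introduced above, assuming Paddle was built with WITH_IPU=ON (otherwise `set_device('ipu')` raises a ValueError):

.. code-block:: python

    import paddle

    if paddle.device.is_compiled_with_ipu():
        # select the IPU as the global device for subsequent ops
        paddle.device.set_device('ipu')
        place = paddle.device.IPUPlace()
        print(paddle.device.get_device())  # e.g. 'ipus:{0-N}'
    else:
        paddle.device.set_device('cpu')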
Examples: @@ -236,5 +274,10 @@ def get_device(): elif isinstance(place, core.NPUPlace): device_id = place.get_device_id() device = 'npu:' + str(device_id) + elif isinstance(place, core.IPUPlace): + num_devices = core.get_ipu_device_count() + device = "ipus:{{0-{}}}".format(num_devices - 1) + else: + raise ValueError("The device specification {} is invalid".format(place)) return device diff --git a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py index 786d24052e2f794954e36b9a2df80a0a47416bb3..7bda6a9a28348c0330a60895e46622ca4337dc2f 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py @@ -296,6 +296,83 @@ class DistributedMatmulImpl0(DistributedOperatorImpl): return False return True + def is_auto_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + y_name = op_desc.input('Y')[0] + out_name = op_desc.output('Out')[0] + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name) + + assert len(x_dims_mapping) >= len( + y_dims_mapping), "now just support x dims > y dims" + if len(x_dims_mapping) == len(y_dims_mapping) and len( + x_dims_mapping) == 4: + if x_dims_mapping[:2] != y_dims_mapping[:2]: + return False + if x_dims_mapping[:2] != out_dims_mapping[:2]: + return False + x_dims_mapping = x_dims_mapping[-2:] + y_dims_mapping = y_dims_mapping[-2:] + out_dims_mapping = out_dims_mapping[-2:] + elif len(x_dims_mapping) != len(y_dims_mapping) and len( + x_dims_mapping) == 3: + if x_dims_mapping[0] != out_dims_mapping[0]: + return False + x_dims_mapping = x_dims_mapping[-2:] + y_dims_mapping = y_dims_mapping[-2:] + out_dims_mapping = out_dims_mapping[-2:] + + if is_dim_replicate(out_dims_mapping[-1]): + return False + + for mapping in out_dims_mapping[1:-1]: + if is_dim_shard(mapping): + return False + + input_dims_mapping = [] + ordered_input_shard_dims_mapping = [] + + for dim in (x_dims_mapping + y_dims_mapping): + input_dims_mapping.append(dim) + + for item in input_dims_mapping: + if item not in ordered_input_shard_dims_mapping and item != -1: + ordered_input_shard_dims_mapping.append(item) + + for mapping in out_dims_mapping: + if mapping not in input_dims_mapping: + return False + + if is_dim_shard(x_dims_mapping[0]): + order_index = 0 + for idx, item in enumerate(out_dims_mapping): + if item != -1: + if item != ordered_input_shard_dims_mapping[order_index]: + return False + else: + order_index += 1 + if order_index != len(ordered_input_shard_dims_mapping): + return False + + if is_dim_shard(x_dims_mapping[-1]): + return False + if is_dim_shard(y_dims_mapping[0]) or is_dim_replicate(y_dims_mapping[ + 1]): + return False + for mapping in x_dims_mapping[1:-1]: + if is_dim_shard(mapping): + return False + + if is_dim_shard(x_dims_mapping[0]): + for mapping in y_dims_mapping[1:]: + if is_dim_shard(mapping) and mapping == x_dims_mapping[0]: + return False + + return True + def update_dims_mapping(self, dist_op): changed = False dim_changed = _update_dims_mapping_for_matmul(dist_op) @@ -510,6 +587,95 @@ class DistributedMatmulImpl1(DistributedOperatorImpl): return False return True + def is_auto_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = 
op_desc.input('X')[0] + y_name = op_desc.input('Y')[0] + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name) + + if op_desc.attr('transpose_X') or op_desc.attr('transpose_Y'): + return False + out_name = op_desc.output('Out')[0] + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + # for gpt2, x dims > y dims, this is a temporary solution + assert len(x_dims_mapping) >= len( + y_dims_mapping), "now just support x dims > y dims" + if len(x_dims_mapping) == len(y_dims_mapping) and len( + x_dims_mapping) == 4: + if x_dims_mapping[:2] != y_dims_mapping[:2]: + return False + if x_dims_mapping[:2] != out_dims_mapping[:2]: + return False + x_dims_mapping = x_dims_mapping[-2:] + y_dims_mapping = y_dims_mapping[-2:] + out_dims_mapping = out_dims_mapping[-2:] + elif len(x_dims_mapping) != len(y_dims_mapping) and len( + x_dims_mapping) == 3: + if x_dims_mapping[0] != out_dims_mapping[0]: + return False + x_dims_mapping = x_dims_mapping[-2:] + y_dims_mapping = y_dims_mapping[-2:] + out_dims_mapping = out_dims_mapping[-2:] + + if is_dim_shard(out_dims_mapping[-1]): + return False + # Other dimensions must be replicate except the batch dimension + for mapping in out_dims_mapping[1:-1]: + if is_dim_shard(mapping): + return False + + if is_dim_replicate(x_dims_mapping[-1]): + return False + + if is_dim_replicate(y_dims_mapping[-2]) or is_dim_shard(y_dims_mapping[ + -1]): + return False + + # Other dimensions must be replicate except the batch dimension + for mapping in x_dims_mapping[1:-1]: + if is_dim_shard(mapping): + return False + + x_shard_dim_count = 0 + x_shard_dims = [] + y_shard_dim_count = 0 + y_shard_dims = [] + for dim in x_dims_mapping: + if is_dim_shard(dim): + x_shard_dim_count += 1 + x_shard_dims.append(dim) + + for dim in y_dims_mapping: + if is_dim_shard(dim): + y_shard_dim_count += 1 + y_shard_dims.append(dim) + + if not x_shard_dims and not y_shard_dims: + return False + + if x_shard_dims[-1] != y_shard_dims[0]: + return False + + if x_shard_dim_count == y_shard_dim_count: + for dim in out_dims_mapping: + if is_dim_shard(dim): + return False + if x_shard_dims != y_shard_dims: + return False + else: + if x_shard_dim_count < y_shard_dim_count: + return False + output_shard_dims = [] + for dim in out_dims_mapping: + if is_dim_shard(dim): + output_shard_dims.append(dim) + if not output_shard_dims or output_shard_dims[0] != x_shard_dims[0]: + return False + + return True + def update_dims_mapping(self, dist_op): changed = False dim_changed = _update_dims_mapping_for_matmul(dist_op) @@ -710,6 +876,59 @@ class DistributedMatmulImpl2(DistributedOperatorImpl): return True + def is_auto_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + y_name = op_desc.input('Y')[0] + out_name = op_desc.output('Out')[0] + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name) + assert len(x_dims_mapping) >= len( + y_dims_mapping + ), "now just support x dims > y dims,but x:{0} and y:{1}".format( + x_dims_mapping, y_dims_mapping) + if len(x_dims_mapping) == len(y_dims_mapping) and len( + x_dims_mapping) == 4: + if x_dims_mapping[:2] != y_dims_mapping[:2]: + return False + if x_dims_mapping[:2] != out_dims_mapping[:2]: + return False + x_dims_mapping = x_dims_mapping[-2:] + y_dims_mapping = y_dims_mapping[-2:] + 
out_dims_mapping = out_dims_mapping[-2:] + elif len(x_dims_mapping) != len(y_dims_mapping) and len( + x_dims_mapping) == 3: + if x_dims_mapping[0] != out_dims_mapping[0]: + return False + x_dims_mapping = x_dims_mapping[-2:] + y_dims_mapping = y_dims_mapping[-2:] + out_dims_mapping = out_dims_mapping[-2:] + + if is_dim_shard(out_dims_mapping[-1]): + return False + + if is_valid_list_index(out_dims_mapping, + -2) and is_dim_shard(out_dims_mapping[-2]): + return False + + if is_dim_shard(x_dims_mapping[-1]): + return False + + if is_valid_list_index(x_dims_mapping, + -2) and is_dim_shard(x_dims_mapping[-2]): + return False + + if is_dim_shard(y_dims_mapping[-1]): + return False + + if is_valid_list_index(y_dims_mapping, + -2) and is_dim_shard(y_dims_mapping[-2]): + return False + + return True + def update_dims_mapping(self, dist_op): changed = False dim_changed = _update_dims_mapping_for_matmul(dist_op) @@ -777,6 +996,86 @@ class DistributedMatmulV2Impl0(DistributedOperatorImpl): return False return True + def is_auto_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + y_name = op_desc.input('Y')[0] + out_name = op_desc.output('Out')[0] + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name) + + if op_desc.attr('trans_x') or op_desc.attr('trans_y'): + return False + assert len(x_dims_mapping) >= len( + y_dims_mapping), "now just support x dims > y dims" + if len(x_dims_mapping) == len(y_dims_mapping) and len( + x_dims_mapping) == 4: + if x_dims_mapping[:2] != y_dims_mapping[:2]: + return False + if x_dims_mapping[:2] != out_dims_mapping[:2]: + return False + x_dims_mapping = x_dims_mapping[-2:] + y_dims_mapping = y_dims_mapping[-2:] + out_dims_mapping = out_dims_mapping[-2:] + elif len(x_dims_mapping) != len(y_dims_mapping) and len( + x_dims_mapping) == 3: + if x_dims_mapping[0] != out_dims_mapping[0]: + return False + x_dims_mapping = x_dims_mapping[-2:] + y_dims_mapping = y_dims_mapping[-2:] + out_dims_mapping = out_dims_mapping[-2:] + + if is_dim_replicate(out_dims_mapping[-1]): + return False + + for mapping in out_dims_mapping[1:-1]: + if is_dim_shard(mapping): + return False + input_dims_mapping = [] + ordered_input_shard_dims_mapping = [] + + for dim in (x_dims_mapping + y_dims_mapping): + input_dims_mapping.append(dim) + + for item in input_dims_mapping: + if item not in ordered_input_shard_dims_mapping and item != -1: + ordered_input_shard_dims_mapping.append(item) + + for mapping in out_dims_mapping: + if mapping not in input_dims_mapping: + return False + + if is_dim_shard(x_dims_mapping[0]): + order_index = 0 + for idx, item in enumerate(out_dims_mapping): + if item != -1: + if item != ordered_input_shard_dims_mapping[order_index]: + return False + else: + order_index += 1 + if order_index != len(ordered_input_shard_dims_mapping): + return False + + if is_dim_shard(x_dims_mapping[-1]): + return False + + if is_dim_shard(y_dims_mapping[0]) or is_dim_replicate(y_dims_mapping[ + 1]): + return False + + for mapping in x_dims_mapping[1:-1]: + if is_dim_shard(mapping): + return False + + if is_dim_shard(x_dims_mapping[0]): + for mapping in y_dims_mapping[1:]: + if is_dim_shard(mapping) and mapping == x_dims_mapping[0]: + return False + + return True + def update_dims_mapping(self, dist_op): changed = False dim_changed = _update_dims_mapping_for_matmul(dist_op) @@ -985,6 
+1284,94 @@ class DistributedMatmulV2Impl1(DistributedOperatorImpl): return False return True + def is_auto_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + y_name = op_desc.input('Y')[0] + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name) + if op_desc.attr('trans_x') or op_desc.attr('trans_y'): + return False + out_name = op_desc.output('Out')[0] + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + assert len(x_dims_mapping) >= len( + y_dims_mapping), "now just support x dims > y dims" + if len(x_dims_mapping) == len(y_dims_mapping) and len( + x_dims_mapping) == 4: + if x_dims_mapping[:2] != y_dims_mapping[:2]: + return False + if x_dims_mapping[:2] != out_dims_mapping[:2]: + return False + x_dims_mapping = x_dims_mapping[-2:] + y_dims_mapping = y_dims_mapping[-2:] + out_dims_mapping = out_dims_mapping[-2:] + + elif len(x_dims_mapping) != len(y_dims_mapping) and len( + x_dims_mapping) == 3: + if x_dims_mapping[0] != out_dims_mapping[0]: + return False + x_dims_mapping = x_dims_mapping[-2:] + y_dims_mapping = y_dims_mapping[-2:] + out_dims_mapping = out_dims_mapping[-2:] + + if is_dim_shard(out_dims_mapping[-1]): + return False + + # Other dimensions must be replicate except the batch dimension + for mapping in out_dims_mapping[1:-1]: + if is_dim_shard(mapping): + return False + + if is_dim_replicate(x_dims_mapping[-1]): + return False + + if is_dim_replicate(y_dims_mapping[-2]) or is_dim_shard(y_dims_mapping[ + -1]): + return False + + # Other dimensions must be replicate except the batch dimension + for mapping in x_dims_mapping[1:-1]: + if is_dim_shard(mapping): + return False + + x_shard_dim_count = 0 + x_shard_dims = [] + y_shard_dim_count = 0 + y_shard_dims = [] + for dim in x_dims_mapping: + if is_dim_shard(dim): + x_shard_dim_count += 1 + x_shard_dims.append(dim) + + for dim in y_dims_mapping: + if is_dim_shard(dim): + y_shard_dim_count += 1 + y_shard_dims.append(dim) + + if not x_shard_dims and not y_shard_dims: + return False + + if x_shard_dims[-1] != y_shard_dims[0]: + return False + + if x_shard_dim_count == y_shard_dim_count: + for dim in out_dims_mapping: + if is_dim_shard(dim): + return False + if x_shard_dims != y_shard_dims: + return False + else: + if x_shard_dim_count < y_shard_dim_count: + return False + output_shard_dims = [] + for dim in out_dims_mapping: + if is_dim_shard(dim): + output_shard_dims.append(dim) + if not output_shard_dims or output_shard_dims[0] != x_shard_dims[0]: + return False + return True + def update_dims_mapping(self, dist_op): changed = False dim_changed = _update_dims_mapping_for_matmul(dist_op) @@ -1183,6 +1570,61 @@ class DistributedMatmulV2Impl2(DistributedOperatorImpl): return True + def is_auto_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + y_name = op_desc.input('Y')[0] + out_name = op_desc.output('Out')[0] + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name) + assert len(x_dims_mapping) >= len( + y_dims_mapping + ), "now just support x dims > y dims,but x:{0} and y:{1}".format( + x_dims_mapping, y_dims_mapping) + + if len(x_dims_mapping) == len(y_dims_mapping) and len( + x_dims_mapping) == 4: + if x_dims_mapping[:2] != y_dims_mapping[:2]: + 
return False + if x_dims_mapping[:2] != out_dims_mapping[:2]: + return False + x_dims_mapping = x_dims_mapping[-2:] + y_dims_mapping = y_dims_mapping[-2:] + out_dims_mapping = out_dims_mapping[-2:] + + elif len(x_dims_mapping) != len(y_dims_mapping) and len( + x_dims_mapping) == 3: + if x_dims_mapping[0] != out_dims_mapping[0]: + return False + x_dims_mapping = x_dims_mapping[-2:] + y_dims_mapping = y_dims_mapping[-2:] + out_dims_mapping = out_dims_mapping[-2:] + + if is_dim_shard(out_dims_mapping[-1]): + return False + + if is_valid_list_index(out_dims_mapping, + -2) and is_dim_shard(out_dims_mapping[-2]): + return False + + if is_dim_shard(x_dims_mapping[-1]): + return False + + if is_valid_list_index(x_dims_mapping, + -2) and is_dim_shard(x_dims_mapping[-2]): + return False + + if is_dim_shard(y_dims_mapping[-1]): + return False + + if is_valid_list_index(y_dims_mapping, + -2) and is_dim_shard(y_dims_mapping[-2]): + return False + + return True + def update_dims_mapping(self, dist_op): changed = False dim_changed = _update_dims_mapping_for_matmul(dist_op) diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py index ffd24add50a4d01ad3dc28c8e31a15a6b14b9415..dc313c33ee3e287958a8518559e914592af3b912 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py @@ -27,11 +27,13 @@ from collections import OrderedDict import paddle import paddle.fluid as fluid from paddle import framework +from paddle.fluid import core import paddle.distributed as dist from paddle.optimizer import Optimizer +from paddle.fluid.clip import ClipGradByGlobalNorm from ...utils.internal_storage import ParamStorage -from ...meta_parallel.sharding.sharding_utils import Type +from ...meta_parallel.sharding.sharding_utils import Type, device_guard, ShardingClipGrad # CUDA alignment 256 bytes alignment = {"gpu": 256, } @@ -99,16 +101,41 @@ class ShardingOptimizerStage2(Optimizer): self.broadcast_fp16 = broadcast_fp16 self.param_storages = {} # {dtype: {rank: InternalStorage}} + + if isinstance(self._optim._grad_clip, ClipGradByGlobalNorm): + logging.warning( + "While using ClipGradByGlobalNorm in ShardingOptimizer, the grad clip of original optimizer will be changed." + ) + self._optim._grad_clip = ShardingClipGrad(self._optim._grad_clip, + group, + paddle.get_device()) + + if offload: + assert self._pfp16, "Only support offload strategy while using \'Adam\', \'AdamW\' and \'Momentum\' optimizer with AMP/Pure FP16" + self.offload = offload # Using for offload + self.offload_device = "cpu" + + self._master_params = {} # Update optimizer parameters and adjust parameter storage and use according to rank. 
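The offload path above keeps an FP32 master copy of each FP16 parameter on the CPU and runs the optimizer step there before writing the result back to the device. A rough, simplified sketch of that idea (not the actual class API; tensor names and the SGD-style update are illustrative, and a CUDA build is assumed):

.. code-block:: python

    import paddle

    paddle.set_device('gpu')
    param = paddle.ones([4], dtype='float16')             # FP16 parameter on the device
    master = param.cast('float32').cpu()                  # FP32 master weight kept on CPU

    grad = paddle.full([4], 0.1, dtype='float32').cpu()   # stand-in gradient on CPU
    master = master - 0.01 * grad                         # optimizer step on the CPU copy
    param = master.cuda().cast('float16')                 # copy back to the device as FP16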
self.update_opt_status() def _generate_master_params(self, trainable_params): - for param in trainable_params: - if param.dtype == Type.fp16.value: - self._optim._master_weights[param.name] = paddle.cast( - param, Type.fp32.value) + if self.offload: + for param in trainable_params: + if param.name not in self._master_params.keys(): + self._master_params[param.name] = core.VarBase( + name=param.name, + value=param.cast(dtype=Type.fp32.value).numpy(), + place=core.CPUPlace(), + stop_gradient=param.stop_gradient) + self._optim._master_weights = self._master_params + else: + for param in trainable_params: + if param.dtype == Type.fp16.value: + self._optim._master_weights[param.name] = paddle.cast( + param, Type.fp32.value) def update_opt_status(self): """Update optimizer status and parameter storage information, and special functions to be developed. @@ -243,22 +270,43 @@ class ShardingOptimizerStage2(Optimizer): A wrapper for Optimizer's step function to finish the update operation of the optimizer. """ - # Synchronize optimizer parameters for the current rank - if len(self.dtype_rank_params.keys( - )) == 1 and Type.fp32.value in self.dtype_rank_params.keys(): - self._optim._parameter_list = self.dtype_rank_params[ - Type.fp32.value][self.rank] - elif len(self.dtype_rank_params.keys( - )) == 1 and Type.fp16.value in self.dtype_rank_params.keys(): - self._optim._parameter_list = self.dtype_rank_params[ - Type.fp16.value][self.rank] + if self.offload: + self._optim._parameter_list = [ + param for name, param in self._master_params.items() + ] else: - self._optim._parameter_list = self.dtype_rank_params[ - Type.fp16.value][self.rank] + self.dtype_rank_params[ + # Synchronize optimizer parameters for the current rank + if len(self.dtype_rank_params.keys( + )) == 1 and Type.fp32.value in self.dtype_rank_params.keys(): + self._optim._parameter_list = self.dtype_rank_params[ Type.fp32.value][self.rank] + elif len(self.dtype_rank_params.keys( + )) == 1 and Type.fp16.value in self.dtype_rank_params.keys(): + self._optim._parameter_list = self.dtype_rank_params[ + Type.fp16.value][self.rank] + else: + self._optim._parameter_list = self.dtype_rank_params[ + Type.fp16.value][self.rank] + self.dtype_rank_params[ + Type.fp32.value][self.rank] # Run the optimizer of the current rank step - self._optim.step() + if self.offload: + with device_guard(self.rank, self.offload_device): + self._optim.step() + + for param in self._optim._parameter_list: + self._master_params[param.name].set_value(param) + + dev_id = 0 if paddle.get_device() == "cpu" else int( + paddle.get_device().split(":")[1]) + + for param in self._local_params: + if param.name in self._master_params.keys(): + param.set_value(self._master_params[param.name].cuda(dev_id) + .cast(dtype=param.dtype)) + self._master_params[param.name].clear_gradient(False) + else: + self._optim.step() # Synchronize all the updated shards in between the ranks self._broadcast_params() diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py index 37b85751149f71336ba431e2876e48eeaa85496d..fd49c2a7d658693ea70476e042109aa4473cc414 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py @@ -112,6 +112,18 @@ class ShardingStage2(nn.Layer): self._has_grad_storage = [] self._grad_storage_list = [] + # offload + # TODO(haohongxiang): Now it's not supported for 
multi-optimizers using Offload strategy + self._offload_optims = list( + filter(lambda optim: optim.offload, self._sharding_optimizers)) + if len(self._offload_optims) > 0: + assert len( + self._sharding_optimizers + ) == 1, "Only support offload strategy for single optimizer" + + self._offload = self._sharding_optimizers[0].offload + self._offload_device = "cpu" + # Set backward pass hooks self._bw_hooks = [] @@ -156,7 +168,8 @@ class ShardingStage2(nn.Layer): # Release grad storages for dtype in self._grad_storages.keys(): if self._rank in self._grad_storages[dtype].keys(): - self._grad_storages[dtype][self._rank].buffer.zero_() + if not self._offload: + self._grad_storages[dtype][self._rank].buffer.zero_() # Release params for param in self._trainable_params: @@ -167,17 +180,24 @@ class ShardingStage2(nn.Layer): """ Before the gradient accumulation, scale the gradient. """ - # Scale grad storages - for dtype in self._grad_storages.keys(): - if self._rank in self._grad_storages[dtype].keys(): - self._grad_storages[dtype][self._rank].buffer.scale_( - scale=self._world_size_scaling) - - # Scale params - for param in self._trainable_params: - if param.name in self._param_grads and param.grad is not None: - param.grad.scale_(scale=self._world_size_scaling) - param._reset_grad_inplace_version(True) + if self._offload: + for param in self._trainable_params: + if param.name in self._sharding_optimizers[ + 0]._master_params.keys(): + self._sharding_optimizers[0]._master_params[ + param.name].grad.scale_(scale=self._world_size_scaling) + else: + # Scale grad storages + for dtype in self._grad_storages.keys(): + if self._rank in self._grad_storages[dtype].keys(): + self._grad_storages[dtype][self._rank].buffer.scale_( + scale=self._world_size_scaling) + + # Scale params + for param in self._trainable_params: + if param.name in self._param_grads and param.grad is not None: + param.grad.scale_(scale=self._world_size_scaling) + param._reset_grad_inplace_version(True) def _init_internal_storage(self, needs_fresh): """ @@ -195,8 +215,14 @@ class ShardingStage2(nn.Layer): """ Synchronously or asynchronously convert the data type of the layer, the device is not supported now. """ + assert isinstance(device, str), "Device must be type str" assert device == self._default_device, "New devices are not supported, because of the optimizer state is not sync" + self._layer.to(device=device, dtype=dtype, blocking=blocking) + + # Re-build the buckets, hooks, etc.. + self._fresh_trainable() + def _fresh_trainable(self): """ Whether to update training parameters. 
""" @@ -283,12 +309,17 @@ class ShardingStage2(nn.Layer): self._grad_reduced[index] = False if not self._accumulate_grads: param.grad.scale_(scale=self._world_size_scaling) - param._reset_grad_inplace_version(True) + param._reset_grad_inplace_version(True) # Clear the gradient that does not belong to the current rank through the callback function def cleanup(): if dst_rank != self._rank: param.clear_gradient(False) + elif self._offload: + self._sharding_optimizers[0]._master_params[ + param.name]._copy_gradient_from(param.grad.cpu( + ).cast(dtype=Type.fp32.value)) + param.clear_gradient(False) # Synchronize the reduce parameter gradient self._tasks_flow.append( @@ -339,6 +370,15 @@ class ShardingStage2(nn.Layer): grad_storage.buffer.value().get_tensor()._clear( ) + elif self._offload: + grad_storage.to(device=self._offload_device) + for param in grad_storage._params: + self._sharding_optimizers[0]._master_params[ + param.name]._copy_gradient_from( + param.grad.cast( + dtype=Type.fp32.value)) + grad_storage.buffer.value().get_tensor()._clear( + ) # Reduce the bucket grad_storage.sent = True @@ -478,7 +518,7 @@ class ShardingStage2(nn.Layer): # Rebuild fp16/fp32 grad storages for dtype in self._grad_storages.keys(): for dst_rank, grad_storage in self._grad_storages[dtype].items(): - if dst_rank != self._rank: + if self._offload or dst_rank != self._rank: grad_storage.manumal_relase() grad_storage.rebuild() diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py index d4c443e385f6fedc6e8ef7cb2cfe3018f9da901e..651bed82396d1dd8e63101b943e459473a43671c 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py @@ -17,10 +17,17 @@ import contextlib from collections import abc from enum import Enum from math import inf +import numpy as np +from types import MethodType import paddle import paddle.distributed as dist +from paddle import _C_ops from paddle.fluid import core +from paddle.fluid import layers +from paddle.fluid.dygraph import to_variable +from paddle.fluid.framework import dygraph_only +from paddle.fluid.dygraph import base as imperative_base class Taskflow: @@ -41,6 +48,88 @@ class Type(Enum): fp32 = paddle.float32 +class ShardingClipGrad: + def __init__(self, clip, group, device): + self._clip = clip + self._group = group + self._device = device + + @imperative_base.no_grad + def _dygraph_clip(self, params_grads): + params_and_grads = [] + + sum_square_fp16 = [] + sum_square_fp32 = [] + + for p, g in params_grads: + if g is None or getattr(p, 'need_clip', True) is False: + continue + + merge_grad = g + if g.type == core.VarDesc.VarType.SELECTED_ROWS: + merge_grad = layers.get_tensor_from_selected_rows( + layers.merge_selected_rows(g)) + square = layers.square(merge_grad) + sum_square = layers.reduce_sum(square) + + if p.dtype == paddle.float16: + sum_square_fp16.append(sum_square) + elif p.dtype == paddle.float32: + sum_square_fp32.append(sum_square) + + # global norm of non-distributed FP16 params_and_grads + if len(sum_square_fp16) == 0: + global_norm_fp16 = paddle.to_tensor([0.], dtype=paddle.float32) + else: + global_norm_fp16 = layers.concat(sum_square_fp16) + global_norm_fp16 = layers.reduce_sum(global_norm_fp16) + global_norm_fp16 = paddle.cast( + global_norm_fp16, dtype=paddle.float32) + + # global norm of non-distributed FP32 params_and_grads + global_norm_fp32 = 
layers.concat(sum_square_fp32) if len( + sum_square_fp32) != 0 else paddle.to_tensor( + [0.], dtype=paddle.float32) + global_norm_fp32 = layers.reduce_sum(global_norm_fp32) + + global_norm_var = global_norm_fp16 + global_norm_fp32 + + # add all reduce to get global norm of distributed params_and_grads + dev_id = int(self._device.split(":")[1]) + with device_guard(dev_id, "gpu"): + paddle.distributed.all_reduce(global_norm_var, group=self._group) + + global_norm_var = layers.sqrt(global_norm_var) + max_global_norm = layers.fill_constant( + shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm) + + clip_var = layers.elementwise_div( + x=max_global_norm, + y=layers.elementwise_max( + x=global_norm_var, y=max_global_norm)) + clip_var_fp16 = paddle.cast(clip_var, paddle.float16) + + for p, g in params_grads: + if g is None: + continue + if getattr(p, 'need_clip', True) is False: + params_and_grads.append((p, g)) + continue + if p.dtype == paddle.float16: + new_grad = layers.elementwise_mul(x=g, y=clip_var_fp16) + else: + new_grad = layers.elementwise_mul(x=g, y=clip_var) + params_and_grads.append((p, new_grad)) + + return params_and_grads + + def __getattr__(self, item): + return getattr(self._clip, item) + + def __call__(self, params_grads): + return self._dygraph_clip(params_grads) + + @contextlib.contextmanager def device_guard(dev_id, device="cpu"): origin_device = paddle.device.get_device() @@ -52,3 +141,65 @@ def device_guard(dev_id, device="cpu"): yield finally: paddle.set_device(origin_device) + + +@dygraph_only +def ShardingScaler(scaler, sharding_group): + def unscale_method(self, optimizer): + if not self._enable: + return + param_grads = [] + param_grads_fp16 = [] + param_grads_fp32 = [] + + if getattr(optimizer, '_param_groups', None) and isinstance( + optimizer._param_groups[0], dict): + + for group in optimizer._param_groups: + for param in group['params']: + if param._grad_ivar() is not None: + param_grads.append(param._grad_ivar()) + if param._grad_ivar( + ).dtype == core.VarDesc.VarType.FP16: + param_grads_fp16.append(param._grad_ivar()) + else: + param_grads_fp32.append(param._grad_ivar()) + else: + param_grads = [ + param._grad_ivar() for param in optimizer._parameter_list + if param._grad_ivar() is not None + ] + param_grads_fp16 = [ + param._grad_ivar() for param in optimizer._parameter_list + if (param._grad_ivar() is not None + ) and (param._grad_ivar().dtype == core.VarDesc.VarType.FP16 + ) + ] + param_grads_fp32 = [ + param._grad_ivar() for param in optimizer._parameter_list + if (param._grad_ivar() is not None + ) and (param._grad_ivar().dtype == core.VarDesc.VarType.FP32 + ) + ] + temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool)) + temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool)) + if len(param_grads_fp16): + _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale, + param_grads_fp16, + temp_found_inf_fp16) + if len(param_grads_fp32): + _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale, + param_grads_fp32, + temp_found_inf_fp32) + + self._found_inf = 1 if temp_found_inf_fp16 or temp_found_inf_fp32 else 0 + is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32") + + paddle.distributed.all_reduce( + is_found_inf, + op=paddle.distributed.ReduceOp.MAX, + group=sharding_group) + self._found_inf = is_found_inf.numpy()[0] + + scaler._unscale = MethodType(unscale_method, scaler) + return scaler diff --git a/python/paddle/distributed/fleet/utils/internal_storage.py 
b/python/paddle/distributed/fleet/utils/internal_storage.py index ff41ca217e43bb0a39ff757adc583b3af4162f30..f44b57ede468b27379e194087c02d61c2a731aef 100644 --- a/python/paddle/distributed/fleet/utils/internal_storage.py +++ b/python/paddle/distributed/fleet/utils/internal_storage.py @@ -50,6 +50,29 @@ class InternalStorage: else: self.buffer = paddle.zeros(size, dtype=dtype) + def to(self, device, dtype=None, keep_alignment=True): + """ + Move the underlying buffer + """ + assert self.buffer is not None, "Cannot move a collapsed bucket, please rebuild it" + assert (dtype == Type.fp32.value or + Type.fp16.value), "Conversion type is not supported now" + + dev_id = 0 if paddle.get_device() == "cpu" else int(paddle.get_device() + .split(":")[1]) + + if self._device != device: + tmp_buffer = self.buffer.cuda( + dev_id) if device == "gpu" else self.buffer.cpu() + for param in self._params: + param.clear_gradient(False) + param._gradient_set_empty(False) + self.buffer.value().get_tensor()._clear() + self.buffer = tmp_buffer + + if dtype is not None: + self.buffer = self.buffer.cast(dtype=dtype) + class ParamStorage(InternalStorage): """ @@ -60,6 +83,16 @@ class ParamStorage(InternalStorage): super().__init__(size, dtype, device, convert_cpu=True) self.param2align = None + def to(self, device, dtype=None, keep_alignment=True): + """ + Move the underlying buffer + """ + + super().to(device, dtype) + + if keep_alignment: + self._array_params() + @fluid.dygraph.no_grad def add_rank_params(self, trainable_params, param2align): """ @@ -78,7 +111,7 @@ class ParamStorage(InternalStorage): p_shape = self._add_param_as_view(param, param2align[param.name]) cpu_param_shape.append(p_shape) - # buffer covert from cpu to cuda + # buffer convert from cpu to cuda dev_id = int(paddle.get_device().split(":")[1]) self.buffer = self.buffer.cuda(dev_id) self._fill = 0 @@ -109,7 +142,8 @@ class ParamStorage(InternalStorage): param.stop_gradient = origin_state # Copy the current param value - dev_id = int(paddle.get_device().split(":")[1]) + dev_id = 0 if paddle.get_device() == "cpu" else int(paddle.get_device() + .split(":")[1]) with device_guard(dev_id, "cpu"): tmp_var = core.VarBase(tensor=self.buffer._slice(self._fill, var_end)) @@ -134,6 +168,18 @@ class ParamStorage(InternalStorage): self._fill = offset + @fluid.dygraph.no_grad + def _array_params(self): + """ + Given the parameters which have been registered previously, rebuild the whole InternalStorage. + """ + assert len(self._params) > 0 + assert self.param2align is not None + + self._fill = 0 + for p in self._params: + self._convert_buffer(p, p.shape, self.param2align[p.name]) # modify + class GradStorage(InternalStorage): """ @@ -171,6 +217,18 @@ class GradStorage(InternalStorage): param.shape) + align <= self._max_size and id( param) not in self._param_ids + def to(self, device, dtype=None, keep_alignment=True): + """ + Move the underlying buffer + """ + if self._release: + self.rebuild() + + super().to(device, dtype) + + if keep_alignment: + self._array_grads() + @fluid.dygraph.no_grad def add_grad(self, param, align): """ @@ -206,17 +264,25 @@ class GradStorage(InternalStorage): """ Given the parameter gradients which have been registered previously, rebuild the whole InternalStorage. 
""" - assert len(self._params) > 0 if self._release: - self.buffer = paddle.zeros( - [self._max_size], dtype=self._params[0].dtype) + self.buffer = paddle.zeros([self._max_size], dtype=self._dtype) for p in self._params: self._add_grad_as_view(p, self._parm2align[p.name]) self._release = False + @fluid.dygraph.no_grad + def _array_grads(self): + """ + Given the parameters gradients which have been registered previously, rebuild the whole InternalStorage. + """ + if len(self._params) > 0: + self._fill = 0 + for p in self._params: + self._add_grad_as_view(p, self._parm2align[p.name]) + @fluid.dygraph.no_grad def _add_grad_as_view(self, param, align): assert np.prod( @@ -229,8 +295,17 @@ class GradStorage(InternalStorage): assert offset <= np.prod(self.buffer.shape) # Copy the current grad value to InternalStorage - assert self._device == "gpu" - tmp_var = core.VarBase(self.buffer._slice(self._fill, grad_end)) - param._copy_gradient_from(tmp_var) - tmp_var.value().get_tensor()._clear() + dev_id = 0 if paddle.get_device() == "cpu" else int(paddle.get_device() + .split(":")[1]) + if self._device == "cpu": + with device_guard(dev_id, self._device): + tmp_var = core.VarBase(self.buffer._slice(self._fill, grad_end)) + param._copy_gradient_from(tmp_var) + tmp_var.value().get_tensor()._clear() + + elif self._device == "gpu": + tmp_var = core.VarBase(self.buffer._slice(self._fill, grad_end)) + param._copy_gradient_from(tmp_var) + tmp_var.value().get_tensor()._clear() + self._fill = offset diff --git a/python/paddle/distribution.py b/python/paddle/distribution.py index e30d3e4c20a92a8deb1edb1dcc2493888adef691..cf198eab1e8e028037afd419a7504ba4321568e7 100644 --- a/python/paddle/distribution.py +++ b/python/paddle/distribution.py @@ -305,7 +305,8 @@ class Uniform(Distribution): else: output_shape = shape + batch_shape output = nn.uniform_random( - output_shape, seed=seed, dtype=self.dtype) * (tensor.zeros( + output_shape, dtype=self.dtype, min=0., max=1., + seed=seed) * (tensor.zeros( output_shape, dtype=self.dtype) + (self.high - self.low)) output = elementwise_add(output, self.low, name=name) if self.all_arg_is_float: diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 5482413dbbc5d10c1829514cf2d76c2236ba51a1..d8ee875e768e524525a67042eb4952df53901f05 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -71,7 +71,7 @@ from . 
import distribute_lookup_table from .param_attr import ParamAttr, WeightNormParamAttr from .data_feeder import DataFeeder from .core import LoDTensor, LoDTensorArray, Scope, _Scope -from .core import CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, NPUPlace +from .core import CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, NPUPlace, IPUPlace from .incubate import fleet from .transpiler import DistributeTranspiler, \ memory_optimize, release_memory, DistributeTranspilerConfig @@ -132,6 +132,7 @@ __all__ = framework.__all__ + executor.__all__ + \ 'CUDAPlace', 'CUDAPinnedPlace', 'NPUPlace', + 'IPUPlace', 'Tensor', 'ParamAttr', 'WeightNormParamAttr', @@ -197,6 +198,11 @@ def __bootstrap__(): if os.name == 'nt': remove_flag_if_exists('cpu_deterministic') + if core.is_compiled_with_ipu(): + # Currently we request all ipu available for training and testing + # finer control of pod of IPUs will be added later + read_env_flags += [] + core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)]) # Note(zhouwei25): sys may not have argv in some cases, # Such as: use Python/C API to call Python from C++ diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py index 125d9fa88d4aedf0e4418c4a109a023c9d5a0e87..5d29dc522b3ef6e008a769b3c899672fe3aa464b 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py @@ -484,7 +484,7 @@ class ImperativeQuantizeOutputs(object): model_filename=model_filename, params_filename=params_filename)) - self._gather_scales(infer_program, scope) + self._gather_scales(infer_program, scope, fetch_targets) self._set_skip_quant_attr(infer_program) @@ -520,10 +520,10 @@ class ImperativeQuantizeOutputs(object): return flag - def _gather_scales(self, program, scope): + def _gather_scales(self, program, scope, fetch_targets): """ Get all scales from fake ops, save them into the corresponding ops - and delete all moving_average_abs_max_scale ops. + and delete all moving_average_abs_max_scale ops. """ def _gather_input_scale(): @@ -580,6 +580,11 @@ class ImperativeQuantizeOutputs(object): for next_op in next_ops: next_op._rename_input(out_var_name, in_var_name) + # If next_op is `fetch` and out_var_name in fetch_targets, + # fetch_targets must update to in_var_name when rename input. 
+ for i in range(len(fetch_targets)): + if fetch_targets[i].name == out_var_name: + fetch_targets[i] = block.var(in_var_name) _gather_input_scale() _gather_output_scale() diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py index 3e033f70aca38a7c4870031ffb9027ff43fb0810..1ddb9c8e5fa9f4e58d1f9df991f16fc386b053b6 100644 --- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py +++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py @@ -410,6 +410,23 @@ class PostTrainingQuantization(object): for op_type in self._dynamic_quantize_op_type): self._collect_dynamic_quantize_op_threshold( self._dynamic_quantize_op_type) + + # Move sub blocks persistable var to global block + global_block = self._program.global_block() + for _op in global_block.ops: + if _op.type == "while": + _block_id = _op.attr("sub_block").id + _block = self._program.block(_block_id) + persistables = [] + for _name, _var in _block.vars.items(): + if _var.persistable: + global_block._clone_variable(_var) + persistables.append(_name) + for _name in persistables: + _block._remove_var(_name) + persistables.extend(_op.input('X')) + _op.desc.set_input("X", persistables) + return self._program def save_quantized_model(self, @@ -451,10 +468,6 @@ class PostTrainingQuantization(object): model_filename=self._model_filename, params_filename=self._params_filename) - if self._program.num_blocks > 1: - _logger.error("The post training quantization requires that the " - "program only has one block.") - if self._optimize_model: self._optimize_fp32_model() @@ -505,23 +518,26 @@ class PostTrainingQuantization(object): self._quantized_act_var_name.add(var_name) persistable_var_names = _all_persistable_var_names(self._program) - for op in self._program.global_block().ops: - op_type = op.type - if self._is_full_quantize and \ - op_type not in self._quantizable_op_type: - _logger.warning(op_type + " is not supported for quantization.") - # For quantized ops, sample inputs and outputs - if op_type in self._quantizable_op_type: - collect_var_name( - _get_op_input_var_names(op), persistable_var_names, op_type) - collect_var_name( - _get_op_output_var_names(op), persistable_var_names, - op_type) - # For other op, only sample output scale - elif op_type in self._out_scale_op_list: - collect_var_name( - _get_op_output_var_names(op), persistable_var_names, - op_type) + for block_id in range(len(self._program.blocks)): + for op in self._program.blocks[block_id].ops: + op_type = op.type + if self._is_full_quantize and \ + op_type not in self._quantizable_op_type: + _logger.warning(op_type + + " is not supported for quantization.") + # For quantized ops, sample inputs and outputs + if op_type in self._quantizable_op_type: + collect_var_name( + _get_op_input_var_names(op), persistable_var_names, + op_type) + collect_var_name( + _get_op_output_var_names(op), persistable_var_names, + op_type) + # For other op, only sample output scale + elif op_type in self._out_scale_op_list: + collect_var_name( + _get_op_output_var_names(op), persistable_var_names, + op_type) def _set_activation_persistable(self): ''' @@ -696,16 +712,17 @@ class PostTrainingQuantization(object): ''' assert self._algo == "min_max", \ "The algo should be min_max to save input threshold." 
- for op in self._program.global_block().ops: - if op.type in self._quantizable_op_type: - for var_name in _get_op_input_var_names(op): - assert var_name in self._quantized_var_min - assert var_name in self._quantized_var_max - op._set_attr(var_name + ".min", - self._quantized_var_min[var_name]) - op._set_attr(var_name + ".max", - self._quantized_var_max[var_name]) - op._set_attr("with_quant_attr", True) + for block_id in range(len(self._program.blocks)): + for op in self._program.blocks[block_id].ops: + if op.type in self._quantizable_op_type: + for var_name in _get_op_input_var_names(op): + assert var_name in self._quantized_var_min + assert var_name in self._quantized_var_max + op._set_attr(var_name + ".min", + self._quantized_var_min[var_name]) + op._set_attr(var_name + ".max", + self._quantized_var_max[var_name]) + op._set_attr("with_quant_attr", True) def _collect_activation_abs_min_max(self): ''' @@ -795,7 +812,12 @@ class PostTrainingQuantization(object): activation_quantize_type=self._activation_quantize_type, weight_quantize_type=self._weight_quantize_type, quantizable_op_type=major_quantizable_op_types) - transform_pass.apply(graph) + + for sub_graph in graph.all_sub_graphs(): + # Insert fake_quant/fake_dequantize op must in test graph, so + # set per graph's _for_test is True. + sub_graph._for_test = True + transform_pass.apply(sub_graph) # use AddQuantDequantPass to insert fake_quant_dequant op minor_quantizable_op_types = [] @@ -806,7 +828,10 @@ class PostTrainingQuantization(object): scope=self._scope, place=self._place, quantizable_op_type=minor_quantizable_op_types) - add_quant_dequant_pass.apply(graph) + + for sub_graph in graph.all_sub_graphs(): + sub_graph._for_test = True + add_quant_dequant_pass.apply(sub_graph) # save threshold to scale var node if self._algo in ["KL", "hist"]: @@ -836,7 +861,11 @@ class PostTrainingQuantization(object): activation_bits=self._activation_bits, weight_quantize_type=self._weight_quantize_type, quantizable_op_type=major_quantizable_op_types) - freeze_pass.apply(graph) + + for sub_graph in graph.all_sub_graphs(): + sub_graph._for_test = True + freeze_pass.apply(sub_graph) + self._program = graph.to_program() def _save_output_threshold(self): @@ -888,13 +917,15 @@ class PostTrainingQuantization(object): save_info(op_node, out_var_name, self._quantized_var_max, "out_max", "post_min_max") - for op in self._program.global_block().ops: - if op.type in (self._quantizable_op_type + self._out_scale_op_list): - out_var_names = _get_op_output_var_names(op) - assert len(out_var_names) == 1, "Post training " + \ - "quantization only support one output for " + op.type - for var_name in out_var_names: - analysis_and_save_info(op, var_name) + for block_id in range(len(self._program.blocks)): + for op in self._program.blocks[block_id].ops: + if op.type in ( + self._quantizable_op_type + self._out_scale_op_list): + out_var_names = _get_op_output_var_names(op) + assert len(out_var_names) == 1, "Post training " + \ + "quantization only support one output for " + op.type + for var_name in out_var_names: + analysis_and_save_info(op, var_name) def _collect_dynamic_quantize_op_threshold(self, target_ops_type): """ diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt index 94d7a2ed1534880445bce372fe29cf67d21a7dd7..494ea9697971974d20c917006225df55f531ff70 100644 --- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt +++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt @@ -139,6 
+139,7 @@ endfunction() if(WIN32) list(REMOVE_ITEM TEST_OPS test_light_nas) list(REMOVE_ITEM TEST_OPS test_post_training_quantization_mnist) + list(REMOVE_ITEM TEST_OPS test_post_training_quantization_while) list(REMOVE_ITEM TEST_OPS test_post_training_quantization_mobilenetv1) list(REMOVE_ITEM TEST_OPS test_post_training_quantization_resnet50) list(REMOVE_ITEM TEST_OPS test_post_training_quantization_lstm_model) @@ -336,6 +337,7 @@ if(NOT WIN32) set_tests_properties(test_post_training_quantization_mobilenetv1 PROPERTIES TIMEOUT 600 LABELS "RUN_TYPE=NIGHTLY") set_tests_properties(test_post_training_quantization_resnet50 PROPERTIES TIMEOUT 600 LABELS "RUN_TYPE=NIGHTLY") set_tests_properties(test_post_training_quantization_mnist PROPERTIES TIMEOUT 120) + set_tests_properties(test_post_training_quantization_while PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_ptq PROPERTIES TIMEOUT 120) set_tests_properties(test_weight_quantization_mobilenetv1 PROPERTIES TIMEOUT 120) endif() diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_while.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_while.py new file mode 100644 index 0000000000000000000000000000000000000000..3c3dfd08fccfa3e3bd235a0ffa1bcf22e8e10983 --- /dev/null +++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_while.py @@ -0,0 +1,313 @@ +# copyright (c) 2021 paddlepaddle authors. all rights reserved. +# +# licensed under the apache license, version 2.0 (the "license"); +# you may not use this file except in compliance with the license. +# you may obtain a copy of the license at +# +# http://www.apache.org/licenses/license-2.0 +# +# unless required by applicable law or agreed to in writing, software +# distributed under the license is distributed on an "as is" basis, +# without warranties or conditions of any kind, either express or implied. +# see the license for the specific language governing permissions and +# limitations under the license. 
+import unittest +import os +import time +import sys +import random +import math +import functools +import contextlib +import numpy as np +import paddle +import paddle.fluid as fluid +from paddle.dataset.common import download +from paddle.fluid.contrib.slim.quantization import PostTrainingQuantization + +paddle.enable_static() + +random.seed(0) +np.random.seed(0) + + +class TestPostTrainingQuantization(unittest.TestCase): + def setUp(self): + self.download_path = 'int8/download' + self.cache_folder = os.path.expanduser('~/.cache/paddle/dataset/' + + self.download_path) + self.timestamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime()) + self.int8_model_path = os.path.join(os.getcwd(), + "post_training_" + self.timestamp) + try: + os.system("mkdir -p " + self.int8_model_path) + except Exception as e: + print("Failed to create {} due to {}".format(self.int8_model_path, + str(e))) + sys.exit(-1) + + def tearDown(self): + try: + os.system("rm -rf {}".format(self.int8_model_path)) + except Exception as e: + print("Failed to delete {} due to {}".format(self.int8_model_path, + str(e))) + + def cache_unzipping(self, target_folder, zip_path): + cmd = 'tar xf {0} -C {1}'.format(zip_path, target_folder) + os.system(cmd) + + def download_model(self, data_url, data_md5, folder_name): + download(data_url, self.download_path, data_md5) + file_name = data_url.split('/')[-1] + zip_path = os.path.join(self.cache_folder, file_name) + print('Data is downloaded at {0}'.format(zip_path)) + + data_cache_folder = os.path.join(self.cache_folder, folder_name) + self.cache_unzipping(self.cache_folder, zip_path) + return data_cache_folder + + def run_program(self, model_path, batch_size, infer_iterations): + print("test model path:" + model_path) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + [infer_program, feed_dict, fetch_targets] = \ + fluid.io.load_inference_model(model_path, + model_filename='model.pdmodel', + params_filename='model.pdiparams', executor=exe) + val_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size) + + img_shape = [1, 28, 28] + test_info = [] + cnt = 0 + periods = [] + for batch_id, data in enumerate(val_reader()): + image = np.array( + [x[0].reshape(img_shape) for x in data]).astype("float32") + input_label = np.array([x[1] for x in data]).astype("int64") + + t1 = time.time() + out = exe.run(infer_program, + feed={feed_dict[0]: image}, + fetch_list=fetch_targets) + t2 = time.time() + period = t2 - t1 + periods.append(period) + + out_label = np.argmax(np.array(out[0]), axis=1) + top1_num = sum(input_label == out_label) + test_info.append(top1_num) + cnt += len(data) + + if (batch_id + 1) == infer_iterations: + break + + throughput = cnt / np.sum(periods) + latency = np.average(periods) + acc1 = np.sum(test_info) / cnt + return (throughput, latency, acc1) + + def generate_quantized_model(self, + model_path, + algo="KL", + quantizable_op_type=["conv2d"], + is_full_quantize=False, + is_use_cache_file=False, + is_optimize_model=False, + batch_size=10, + batch_nums=10): + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + scope = fluid.global_scope() + val_reader = paddle.dataset.mnist.train() + + ptq = PostTrainingQuantization( + executor=exe, + model_dir=model_path, + model_filename='model.pdmodel', + params_filename='model.pdiparams', + sample_generator=val_reader, + batch_size=batch_size, + batch_nums=batch_nums, + algo=algo, + quantizable_op_type=quantizable_op_type, + is_full_quantize=is_full_quantize, + optimize_model=is_optimize_model, + 
is_use_cache_file=is_use_cache_file) + ptq.quantize() + ptq.save_quantized_model( + self.int8_model_path, + model_filename='model.pdmodel', + params_filename='model.pdiparams') + + def run_test(self, + model_name, + data_url, + data_md5, + algo, + quantizable_op_type, + is_full_quantize, + is_use_cache_file, + is_optimize_model, + diff_threshold, + batch_size=10, + infer_iterations=10, + quant_iterations=5): + + origin_model_path = self.download_model(data_url, data_md5, model_name) + #origin_model_path = os.path.join(origin_model_path, model_name) + + print("Start FP32 inference for {0} on {1} images ...".format( + model_name, infer_iterations * batch_size)) + (fp32_throughput, fp32_latency, fp32_acc1) = self.run_program( + origin_model_path, batch_size, infer_iterations) + + print("Start INT8 post training quantization for {0} on {1} images ...". + format(model_name, quant_iterations * batch_size)) + self.generate_quantized_model( + origin_model_path, algo, quantizable_op_type, is_full_quantize, + is_use_cache_file, is_optimize_model, batch_size, quant_iterations) + + print("Start INT8 inference for {0} on {1} images ...".format( + model_name, infer_iterations * batch_size)) + (int8_throughput, int8_latency, int8_acc1) = self.run_program( + self.int8_model_path, batch_size, infer_iterations) + + print("---Post training quantization of {} method---".format(algo)) + print( + "FP32 {0}: batch_size {1}, throughput {2} img/s, latency {3} s, acc1 {4}.". + format(model_name, batch_size, fp32_throughput, fp32_latency, + fp32_acc1)) + print( + "INT8 {0}: batch_size {1}, throughput {2} img/s, latency {3} s, acc1 {4}.\n". + format(model_name, batch_size, int8_throughput, int8_latency, + int8_acc1)) + sys.stdout.flush() + + delta_value = fp32_acc1 - int8_acc1 + self.assertLess(delta_value, diff_threshold) + + +class TestPostTrainingKLForWhile(TestPostTrainingQuantization): + def test_post_training_kl(self): + model_name = "mnist_while" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz" + data_md5 = "2387390beeb37b51dec041c27b8a681f" + algo = "KL" + quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] + is_full_quantize = False + is_use_cache_file = False + is_optimize_model = True + diff_threshold = 0.01 + batch_size = 10 + infer_iterations = 50 + quant_iterations = 5 + self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type, + is_full_quantize, is_use_cache_file, is_optimize_model, + diff_threshold, batch_size, infer_iterations, + quant_iterations) + + +class TestPostTraininghistForWhile(TestPostTrainingQuantization): + def test_post_training_hist(self): + model_name = "mnist_while" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz" + data_md5 = "2387390beeb37b51dec041c27b8a681f" + algo = "hist" + quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] + is_full_quantize = False + is_use_cache_file = False + is_optimize_model = True + diff_threshold = 0.01 + batch_size = 10 + infer_iterations = 50 + quant_iterations = 5 + self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type, + is_full_quantize, is_use_cache_file, is_optimize_model, + diff_threshold, batch_size, infer_iterations, + quant_iterations) + + +class TestPostTrainingmseForWhile(TestPostTrainingQuantization): + def test_post_training_mse(self): + model_name = "mnist_while" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz" + data_md5 = "2387390beeb37b51dec041c27b8a681f" + algo = "mse" + 
quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] + is_full_quantize = False + is_use_cache_file = False + is_optimize_model = True + diff_threshold = 0.01 + batch_size = 10 + infer_iterations = 50 + quant_iterations = 5 + self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type, + is_full_quantize, is_use_cache_file, is_optimize_model, + diff_threshold, batch_size, infer_iterations, + quant_iterations) + + +class TestPostTrainingavgForWhile(TestPostTrainingQuantization): + def test_post_training_avg(self): + model_name = "mnist_while" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz" + data_md5 = "2387390beeb37b51dec041c27b8a681f" + algo = "avg" + quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] + is_full_quantize = False + is_use_cache_file = False + is_optimize_model = True + diff_threshold = 0.01 + batch_size = 10 + infer_iterations = 50 + quant_iterations = 5 + self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type, + is_full_quantize, is_use_cache_file, is_optimize_model, + diff_threshold, batch_size, infer_iterations, + quant_iterations) + + +class TestPostTrainingMinMaxForWhile(TestPostTrainingQuantization): + def test_post_training_min_max(self): + model_name = "mnist_while" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz" + data_md5 = "2387390beeb37b51dec041c27b8a681f" + algo = "min_max" + quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] + is_full_quantize = False + is_use_cache_file = False + is_optimize_model = True + diff_threshold = 0.01 + batch_size = 10 + infer_iterations = 50 + quant_iterations = 5 + self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type, + is_full_quantize, is_use_cache_file, is_optimize_model, + diff_threshold, batch_size, infer_iterations, + quant_iterations) + + +class TestPostTrainingAbsMaxForWhile(TestPostTrainingQuantization): + def test_post_training_abs_max(self): + model_name = "mnist_while" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz" + data_md5 = "2387390beeb37b51dec041c27b8a681f" + algo = "abs_max" + quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] + is_full_quantize = False + is_use_cache_file = False + is_optimize_model = True + diff_threshold = 0.01 + batch_size = 10 + infer_iterations = 50 + quant_iterations = 5 + self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type, + is_full_quantize, is_use_cache_file, is_optimize_model, + diff_threshold, batch_size, infer_iterations, + quant_iterations) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py index d8cb3e0918dc8d0e08c3895c5598b46479958a95..10a9358612960cca302f198b1f8eecd78536a9a0 100644 --- a/python/paddle/fluid/dataloader/dataloader_iter.py +++ b/python/paddle/fluid/dataloader/dataloader_iter.py @@ -273,6 +273,8 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase): else: if self._return_list: data = self._reader.read_next_list() + for i in range(len(data)): + data[i] = data[i]._move_to_list() data = [ _restore_batch(d, s) for d, s in zip(data, self._structure_infos[:len( @@ -718,6 +720,8 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): else: if self._return_list: data = self._reader.read_next_list() + for i in range(len(data)): + data[i] = data[i]._move_to_list() data = [ _restore_batch(d, s) for d, s in zip(data, self._structure_infos[:len( diff --git 
a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py index c4df01c4c7654a7874a4279869daf813efc543f1..51e85901e7d558c697382c5d433c2cc7661c3213 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py @@ -547,7 +547,11 @@ def func_to_source_code(function, dedent=True): raise TypeError( "The type of 'function' should be a function or method, but received {}.". format(type(function).__name__)) - source_code = inspect.getsource(function) + source_code_list, _ = inspect.getsourcelines(function) + source_code_list = [ + line for line in source_code_list if not line.lstrip().startswith('#') + ] + source_code = ''.join(source_code_list) if dedent: source_code = textwrap.dedent(source_code) diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 996cec29e185bd55db3d9869ca16819b729f99c9..f308af04e5e580579dc381cbed55b798ca88da0a 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -238,7 +238,7 @@ def monkey_patch_varbase(): "Tensor shape not match, Tensor of grad_tensor [ {} ] with shape {} mismatch Tensor [ {} ] with shape {}".format( grad_tensor.name, grad_tensor.shape, self.name, self.shape) - if paddle.is_compiled_with_xpu(): + if paddle.is_compiled_with_xpu() or paddle.is_compiled_with_npu(): # TODO(liuyuhui): Currently only for xpu. Will be removed in the future. scaled_loss = scale_loss(self) core.dygraph_run_backward([scaled_loss], [grad_tensor], diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index e601c1cb4c300506cc3f2b58288f8b737c15a2e7..d10564e21ea47b7297abc65f0a90e26b9880b8ab 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -1999,6 +1999,14 @@ class Executor(object): fetch_list=fetch_list, feed_var_name=feed_var_name, fetch_var_name=fetch_var_name) + main_block = cached_program.block(0) + for op in main_block.ops: + # set the op_role of fetch op to Optimize to avoid + # erase the fetched vars by gc for pipeline + if op.type == 'fetch': + op._set_attr( + 'op_role', + core.op_proto_and_checker_maker.OpRole.Optimize) self._add_program_cache(cache_key, cached_program) if cached_ctx is None: fleet_opt = program._pipeline_opt["fleet_opt"] @@ -2007,6 +2015,18 @@ class Executor(object): self._add_ctx_cache(cache_key, cached_ctx) if feed: self._feed_data(cached_program, feed, feed_var_name, cached_scope) + + from paddle.optimizer.lr import LRScheduler + if hasattr(program, 'lr_sheduler'): + lr_sheduler = program.lr_sheduler + assert isinstance(lr_sheduler, LRScheduler), "must be LRScheduler" + lr_value = lr_sheduler() + lr_var = program.global_block().vars[lr_sheduler._var_name] + data = np.array([lr_value]).astype(convert_dtype(lr_var.dtype)) + tensor = core.get_variable_tensor(cached_scope, + lr_sheduler._var_name) + tensor.set(data, self.place) + cached_ctx.run() if fetch_list: arr = cached_scope.find_var(fetch_var_name).get_fetch_list() diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index dfc887292e7cff408a4e87ffcf8e4e90523b1e84..83ccd1051bb669ae62bc4d67bed7c4a2f860d17e 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -1254,7 +1254,10 @@ class GeneratorLoader(DataLoaderBase): def __next__(self): try: if self._return_list: - return self._reader.read_next_list() + data = 
self._reader.read_next_list() + for i in range(len(data)): + data[i] = data[i]._move_to_list() + return data else: return self._reader.read_next() except StopIteration: diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py index 05008a3bc12f7ed6077342cf4e3723f2e7002b6c..2b4002ab9c9d49aae9d53bf7b01d174b38c58e3d 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py @@ -30,6 +30,7 @@ from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import Shar seed = 2021 epoch = 2 batch_size = 32 +linear_size = 10000 strategy = fleet.DistributedStrategy() strategy.hybrid_configs = { @@ -45,12 +46,12 @@ paddle.seed(seed) class MLP(fluid.Layer): - def __init__(self, param_attr=None, bias_attr=None): + def __init__(self, linear_size=10000, param_attr=None, bias_attr=None): super(MLP, self).__init__() - self._linear1 = Linear(10000, 10000) - self._linear2 = Linear(10000, 10000) - self._linear3 = Linear(10000, 10) + self._linear1 = Linear(linear_size, linear_size) + self._linear2 = Linear(linear_size, linear_size) + self._linear3 = Linear(linear_size, 10) def forward(self, inputs): y = self._linear1(inputs) @@ -59,10 +60,10 @@ class MLP(fluid.Layer): return y -def reader_decorator(): +def reader_decorator(linear_size=10000): def __reader__(): for _ in range(100): - img = np.random.rand(10000).astype('float32') + img = np.random.rand(linear_size).astype('float32') label = np.ones(1).astype('int64') yield img, label @@ -120,6 +121,9 @@ def train_mlp(model, use_multiprocess=True) train_loader.set_sample_list_generator(train_reader) + if sharding_stage == 2: + model.to(device="gpu") + for eop in range(epoch): model.train() @@ -153,9 +157,6 @@ def train_mlp(model, if all_test and batch_id == 2: return model.parameters() - if sharding_stage == 2: - model.to(device="gpu") - return model.parameters() diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py new file mode 100644 index 0000000000000000000000000000000000000000..8adcda9d24e1cabb40eee13296a928d9d39fffe0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py @@ -0,0 +1,115 @@ +# -*- coding: UTF-8 -*- + +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import argparse +import ast +import time +import paddle +import paddle.fluid as fluid +from paddle.fluid.dygraph.nn import Linear +from paddle.distributed import fleet +from paddle.fluid.dygraph import nn + +from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2 +from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import ShardingStage2 +from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import ShardingScaler + +from dygraph_sharding_stage2 import MLP, reader_decorator, optimizer_setting + +seed = 2021 +epoch = 2 +batch_size = 32 +linear_size = 8000 + +np.random.seed(seed) +paddle.seed(seed) + + +def train_mlp(model, offload=False): + group = paddle.distributed.new_group([0, 1]) + optimizer = optimizer_setting(model=model, use_pure_fp16=True) + + model = paddle.amp.decorate(models=model, level='O2', save_dtype='float32') + scaler = paddle.amp.GradScaler(init_loss_scaling=32768) + scaler = ShardingScaler(scaler, group) + + optimizer = ShardingOptimizerStage2( + params=model.parameters(), + optim=optimizer, + group=group, + offload=offload) + model = ShardingStage2(model, optimizer, group=group, accumulate_grads=True) + + train_reader = paddle.batch( + reader_decorator(linear_size), batch_size=batch_size, drop_last=True) + + train_loader = paddle.io.DataLoader.from_generator( + capacity=32, + use_double_buffer=True, + iterable=True, + return_list=True, + use_multiprocess=True) + train_loader.set_sample_list_generator(train_reader) + + for eop in range(epoch): + model.train() + + for batch_id, data in enumerate(train_loader()): + img, label = data + label.stop_gradient = True + img.stop_gradient = True + + with paddle.amp.auto_cast(True, level='O2'): + out = model(img) + loss = paddle.nn.functional.cross_entropy( + input=out, label=label) + + avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32)) + scaler.scale(avg_loss).backward() + + model.grad_scale() + scaler.step(optimizer) + scaler.update() + model.clear_gradients() + + for dtype in optimizer.param_storages: + for dst_rank, param_storage in optimizer.param_storages[dtype].items(): + param_storage.to(device="gpu", dtype=dtype) + + return model.parameters() + + +def test_sharding_stage2_offload(): + mlp = MLP(linear_size) + mlp_offload = MLP(linear_size) + mlp_offload.set_state_dict(mlp.state_dict()) + + mlp_params = train_mlp(mlp, offload=False) + mlp_offload_params = train_mlp(mlp_offload, offload=True) + + for i in range(len(mlp_params)): + for j in range(len(mlp_offload_params)): + if mlp_params[i].name == mlp_offload_params[j].name: + np.testing.assert_allclose( + mlp_params[i].numpy(), + mlp_offload_params[j].numpy(), + rtol=1e-6) + return + + +if __name__ == '__main__': + test_sharding_stage2_offload() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py index c08a8d350f8aa83eb2c7e2eae8726917c02bba4f..d4c41781078b760feadbb5e2c928968414d17063 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py @@ -26,6 +26,7 @@ import paddle.fluid as fluid from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator from paddle.fluid.dygraph.jit import declarative from paddle.fluid.dygraph.nn import Linear +from paddle.fluid.dygraph.dygraph_to_static.utils import 
func_to_source_code from ifelse_simple_func import dyfunc_with_if_else @@ -344,5 +345,18 @@ class TestFunctionTrainEvalMode(unittest.TestCase): net.foo.train() +class TestRemoveCommentInDy2St(unittest.TestCase): + def func_with_comment(self): + # Comment1 + x = paddle.to_tensor([1, 2, 3]) + # Comment2 + # Comment3 + y = paddle.to_tensor([4, 5, 6]) + + def test_remove_comment(self): + code_string = func_to_source_code(self.func_with_comment) + self.assertEqual('#' not in code_string, True) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py b/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py index 33754fac127a44d97befef52493d6bea531d217a..c05ad30da2797f21e5b01427de94aa636aab251b 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py @@ -322,14 +322,14 @@ class PassAutoScanTest(AutoScanTest): "Expected operator list after fusion is {}, but now it's {}".format( op_list_after_fusion, after_op_list), ) - def run_and_statis( - self, - quant=False, - max_examples=100, - reproduce=None, - min_success_num=25, - max_duration=180, - passes=None, ): + def run_and_statis(self, + quant=False, + max_examples=100, + reproduce=None, + min_success_num=25, + max_duration=180, + passes=None, + use_gpu_run_baseline=False): if os.getenv('HYPOTHESIS_TEST_PROFILE', 'ci') == "dev": max_examples *= 10 min_success_num *= 10 @@ -354,7 +354,10 @@ class PassAutoScanTest(AutoScanTest): return self.sample_program_config(draw) def run_test(prog_config): - return self.run_test(quant=quant, prog_configs=[prog_config]) + return self.run_test( + quant=quant, + prog_configs=[prog_config], + use_gpu_run_baseline=use_gpu_run_baseline) generator = st.composite(program_generator) loop_func = given(generator())(run_test) @@ -371,8 +374,8 @@ class PassAutoScanTest(AutoScanTest): logging.info("Number of Ran Programs: {}".format(self.num_ran_programs)) logging.info("Number of Ignore Tests: {}".format(self.num_ignore_tests)) successful_ran_programs = int(self.num_ran_programs - - self.num_ignore_tests / - self.num_predictor_kinds) + self.num_ignore_tests / max( + self.num_predictor_kinds, 1)) logging.info( "Number of successfully ran programs approximately equal to {}". 
format(successful_ran_programs)) @@ -391,7 +394,10 @@ class PassAutoScanTest(AutoScanTest): format(max_duration)) assert False - def run_test(self, quant=False, prog_configs=None): + def run_test(self, + quant=False, + prog_configs=None, + use_gpu_run_baseline=False): status = True for prog_config in prog_configs: @@ -413,7 +419,9 @@ class PassAutoScanTest(AutoScanTest): results: List[Dict[str, np.ndarray]] = [] # baseline: cpu no ir_optim run - base_config = self.create_inference_config(ir_optim=False) + + base_config = self.create_inference_config( + ir_optim=False, use_gpu=use_gpu_run_baseline) logging.info('RUN program_config: ' + str(prog_config)) results.append( self.run_test_config(model, params, prog_config, base_config, diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_adaptive_pool2d_convert_global_pass_autoscan.py b/python/paddle/fluid/tests/unittests/ir/inference/test_adaptive_pool2d_convert_global_pass_autoscan.py index 8cb6af1dcf044150bbaa9f43f241c0e3d988d9a6..96c2a175208faa5b5292b98dc2c040b78379446a 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_adaptive_pool2d_convert_global_pass_autoscan.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_adaptive_pool2d_convert_global_pass_autoscan.py @@ -109,7 +109,7 @@ class TestAdaptivePool2dConvertGlobalPass(PassAutoScanTest): def test(self): self.run_and_statis( quant=False, - max_examples=100, + max_examples=300, passes=["adaptive_pool2d_convert_global_pass"], min_success_num=40) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_fc_elementwise_layernorm_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_fc_elementwise_layernorm_fuse_pass.py new file mode 100644 index 0000000000000000000000000000000000000000..2ccb9de5d5470f977956142bbbc46555544bd7bb --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_fc_elementwise_layernorm_fuse_pass.py @@ -0,0 +1,135 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from auto_scan_test import PassAutoScanTest, IgnoreReasons +from program_config import TensorConfig, ProgramConfig, OpConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest + +import hypothesis +from hypothesis import given, settings, seed, example, assume, reproduce_failure +import hypothesis.strategies as st + + +class TestFCElementwiseLayerNormFusePass(PassAutoScanTest): + """ + x_var w(persistable) bias_var(persistable) + \ | / + fc + | + fc_out_var bias_var(persistable) + \ / + elementwise_add bias_var(persistable) scale_var(persistable) + \ | / + layer_norm + / | \ + Y mean_var variance_var + """ + + def sample_predictor_configs(self, program_config): + # for gpu + config = self.create_inference_config(use_gpu=True) + yield config, ["fused_fc_elementwise_layernorm"], (1e-5, 1e-5) + + def sample_program_config(self, draw): + # 1. 
Generate shape of input:X of fc + x_shape = draw( + st.lists( + st.integers( + min_value=1, max_value=8), min_size=2, max_size=5)) + x_shape = [2, 1] + x_rank = len(x_shape) + # 2. Generate attr:in_num_col_dims of fc + in_num_col_dims = draw(st.integers(min_value=1, max_value=x_rank - 1)) + # 3. Generate legal shape of input:W/bias of fc + w_shape = draw( + st.lists( + st.integers( + min_value=1, max_value=8), min_size=2, max_size=2)) + w_shape[0] = int(np.prod(x_shape[in_num_col_dims:])) + w_shape = [1, 2] + fc_bias_shape = [w_shape[1], ] + if draw(st.booleans()): + fc_bias_shape.insert(0, 1) + fc_bias_shape = [2, ] + fc_out_shape = x_shape[:in_num_col_dims] + w_shape[1:] + # 4. Generate legal attr:axis/shape of elementwise_add + add_bias_shape = fc_out_shape[:] + axis = draw(st.integers(min_value=-1, max_value=0)) + # 5. Generate legal shape of layer_norm + begin_norm_axis = draw( + st.integers( + min_value=1, max_value=len(fc_out_shape) - 1)) + layer_norm_shape = [int(np.prod(fc_out_shape[begin_norm_axis:]))] + epsilon = 1e-5 + + fc_op = OpConfig( + "fc", + inputs={"Input": ["fc_x"], + "W": ["fc_w"], + "Bias": ["fc_bias"]}, + outputs={"Out": ["fc_out"]}, + in_num_col_dims=in_num_col_dims, + padding_weights=False, + activation_type="", + use_quantizer=False, + use_mkldnn=False, ) + add_op = OpConfig( + "elementwise_add", + inputs={"X": ["fc_out"], + "Y": ["add_bias"]}, + outputs={"Out": ["add_out"]}, + axis=axis, ) + layer_norm_op = OpConfig( + "layer_norm", + inputs={ + "X": ["add_out"], + "Scale": ["scale"], + "Bias": ["layer_norm_bias"] + }, + outputs={ + "Y": ["layer_norm_out"], + "Mean": ["layer_norm_mean"], + "Variance": ["layer_norm_var"] + }, + begin_norm_axis=begin_norm_axis, + epsilon=epsilon) + + ops = [fc_op, add_op, layer_norm_op] + program_config = ProgramConfig( + ops=ops, + weights={ + "fc_w": TensorConfig(shape=w_shape), + "fc_bias": TensorConfig(shape=fc_bias_shape), + "add_bias": TensorConfig(shape=add_bias_shape), + "scale": TensorConfig(shape=layer_norm_shape), + "layer_norm_bias": TensorConfig(shape=layer_norm_shape), + }, + inputs={"fc_x": TensorConfig(shape=x_shape), }, + outputs=ops[-1].outputs["Y"], ) + return program_config + + def test(self): + self.run_and_statis( + quant=False, + max_examples=300, + passes=["fc_elementwise_layernorm_fuse_pass"], + use_gpu_run_baseline=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py index 83d4b7091cb3276ba8e2c1ff9fd7dca9b1692c63..64c3042b63cf8e9bd800dd69f3077161a03fe928 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,72 +12,147 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from auto_scan_test import PassAutoScanTest, IgnoreReasons +from program_config import TensorConfig, ProgramConfig, OpConfig +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set import unittest -import numpy as np -from inference_pass_test import InferencePassTest -import paddle.fluid as fluid -import paddle.fluid.core as core -from paddle.fluid.core import PassVersionChecker +import hypothesis +from hypothesis import given, settings, seed, example, assume, reproduce_failure +import hypothesis.strategies as st -class TransposeFlattenConcatFusePassTest(InferencePassTest): - def setUp(self): - with fluid.program_guard(self.main_program, self.startup_program): - data1 = fluid.data(name="data1", shape=[5, 5, 5], dtype="float32") - data2 = fluid.data(name="data2", shape=[5, 5, 5], dtype="float32") - trans1 = fluid.layers.transpose(data1, perm=[2, 1, 0]) - trans2 = fluid.layers.transpose(data2, perm=[2, 1, 0]) - flatt1 = fluid.layers.flatten(trans1) - flatt2 = fluid.layers.flatten(trans2) - concat_out = fluid.layers.concat([flatt1, flatt2]) - # There is no parameters for above structure. - # Hence, append a batch_norm to avoid failure caused by load_combined. - out = fluid.layers.batch_norm(concat_out, is_test=True) - self.feeds = { - "data1": np.random.random([5, 5, 5]).astype("float32"), - "data2": np.random.random([5, 5, 5]).astype("float32") - } - self.fetch_list = [out] +class TestTransposeFlattenConcatFusePass(PassAutoScanTest): + """ + x_1_var x_2_var + | | + transpose2 transpose2 + | | + flatten2 flatten2 + \ / + flatten2_out_var flatten2_out_var + \ / + concat + """ - def test_check_output(self): - # There is no cpu pass for transpose_flatten_concat_fuse - if core.is_compiled_with_cuda(): - use_gpu = True - self.check_output_with_option(use_gpu) + def sample_predictor_configs(self, program_config): + # TRT + # after tensorrt_subgraph_pass ,The pass needs to be deleted on TRT - PassVersionChecker.IsCompatible('transpose_flatten_concat_fuse_pass') + # for gpu + config = self.create_inference_config(use_gpu=True) + yield config, ["fusion_transpose_flatten_concat", ], (1e-5, 1e-5) + def is_program_valid(self, prog_config): + concat_axis = prog_config.ops[-1].attrs["axis"] + ops_num = len(prog_config.ops) - 1 + if ops_num % 2 != 0: + return False + input_num = ops_num // 2 + flatten_shape = 0 + x_trans_axis = prog_config.ops[0].attrs["axis"] + x_flatten_axis = prog_config.ops[1].attrs["axis"] + for i in range(input_num): + input_name = "transpose2_x" + str(i) + input_shape = prog_config.inputs[input_name].shape + trans_axis = prog_config.ops[i * 2].attrs["axis"] + if x_trans_axis != trans_axis: + return False + # calculate shape after transpose + input_shape = [input_shape[j] for j in trans_axis] + # calculate shape after flateen + flatten_axis = prog_config.ops[i * 2 + 1].attrs["axis"] + if x_flatten_axis != flatten_axis: + return False + flatten_shape1 = flatten_shape2 = 1 + for j in range(len(input_shape)): + if j < flatten_axis: + flatten_shape1 *= input_shape[j] + else: + flatten_shape2 *= input_shape[j] + if concat_axis == 0: + if i == 0: + flatten_shape = flatten_shape2 + elif flatten_shape != flatten_shape2: + return False + else: + if i == 0: + flatten_shape = flatten_shape1 + elif flatten_shape != flatten_shape1: + return False + return True -class TransposeFlattenConcatFusePassWithAxisTest(InferencePassTest): - def setUp(self): - with fluid.program_guard(self.main_program, self.startup_program): - data1 = fluid.data(name="data1", shape=[5, 
5, 5], dtype="float32") - data2 = fluid.data(name="data2", shape=[5, 5, 5], dtype="float32") - trans1 = fluid.layers.transpose(data1, perm=[2, 1, 0]) - trans2 = fluid.layers.transpose(data2, perm=[2, 1, 0]) - flatt1 = fluid.layers.flatten(trans1, axis=2) - flatt2 = fluid.layers.flatten(trans2, axis=2) - concat_out = fluid.layers.concat([flatt1, flatt2], axis=1) - # There is no parameters for above structure. - # Hence, append a batch_norm to avoid failure caused by load_combined. - out = fluid.layers.batch_norm(concat_out, is_test=True) + def sample_program_config(self, draw): + times = draw(st.integers(min_value=1, max_value=6)) + concat_axis = draw(st.integers(min_value=0, max_value=1)) + ops = [] + concat_input = [] + inputs = {} + x_shape_rank = draw(st.integers(min_value=2, max_value=5)) + # Generate axis of transpose + trans_axis = [j for j in range(x_shape_rank)] + for j in range(x_shape_rank - 1): + if draw(st.booleans()): + trans_axis[j], trans_axis[-1] = trans_axis[-1], trans_axis[j] + # Generate axis of flatten + flatten_axis = draw( + st.integers( + min_value=0, max_value=x_shape_rank - 1)) + for i in range(times): + # Generate x_shape of transpose + x_shape = draw( + st.lists( + st.integers( + min_value=1, max_value=10), + min_size=x_shape_rank, + max_size=x_shape_rank)) - self.feeds = { - "data1": np.random.random([5, 5, 5]).astype("float32"), - "data2": np.random.random([5, 5, 5]).astype("float32") - } - self.fetch_list = [out] + str_i = str(i) + transpose_op = OpConfig( + "transpose2", + inputs={"X": ["transpose2_x" + str_i], }, + axis=trans_axis, + outputs={ + "Out": ["trans_out" + str_i], + "XShape": ["trans_shape" + str_i] + }, ) + ops.append(transpose_op) + flatten_op = OpConfig( + "flatten2", + inputs={"X": ["trans_out" + str_i], }, + axis=flatten_axis, + outputs={ + "Out": ["flatten2_out" + str_i], + "XShape": ["xshape" + str_i] + }, ) + concat_input.append("flatten2_out" + str_i) + ops.append(flatten_op) + inputs["transpose2_x" + str_i] = TensorConfig(shape=x_shape) - def test_check_output(self): - # There is no cpu pass for transpose_flatten_concat_fuse - if core.is_compiled_with_cuda(): - use_gpu = True - self.check_output_with_option(use_gpu) + concat_op = OpConfig( + "concat", + inputs={ + "X": concat_input, + "AxisTensor": [], + }, + outputs={"Out": ["concat_out"]}, + axis=concat_axis, ) - self.assertTrue( - PassVersionChecker.IsCompatible( - 'transpose_flatten_concat_fuse_pass')) + ops.append(concat_op) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs=inputs, + outputs=ops[-1].outputs["Out"], ) + return program_config + + def test(self): + self.run_and_statis( + quant=False, + max_examples=300, + passes=["transpose_flatten_concat_fuse_pass"]) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_auto_search_dist_matmul_op.py b/python/paddle/fluid/tests/unittests/test_auto_search_dist_matmul_op.py new file mode 100644 index 0000000000000000000000000000000000000000..82178e1b62dfbc60387bc84b8b852194f5bdc0df --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_auto_search_dist_matmul_op.py @@ -0,0 +1,397 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import print_function +import unittest +import copy + +import numpy as np + +import paddle +import paddle.nn as nn +import paddle.static as static +import paddle.nn.functional as F +import paddle.utils as utils +import paddle.fluid.core as core +from paddle.fluid import layers +from paddle.distributed.auto_parallel.operators.common import DistributedOperatorImplContainer +from paddle.distributed.auto_parallel.operators.common import DistributedOperatorImpl +from paddle.distributed.auto_parallel.operators.common import get_distributed_operator_impl_container +from paddle.distributed.auto_parallel.dist_context import DistributedContext, DistributedOperatorContext +from paddle.distributed.auto_parallel.dist_attribute import OperatorDistributedAttribute, TensorDistributedAttribute +from paddle.distributed.auto_parallel.dist_op import DistributedOperator +paddle.enable_static() +device = "gpu" if core.is_compiled_with_cuda() else "cpu" + + +class MLPLayer(nn.Layer): + def __init__(self, + hidden_size=1024, + intermediate_size=4 * 1024, + initializer_range=0.02): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal( + mean=0.0, std=initializer_range)) + bias_attr = None + + self.linear0 = nn.Linear( + d_model, dim_feedforward, weight_attr, bias_attr=bias_attr) + self.linear1 = nn.Linear( + dim_feedforward, d_model, weight_attr, bias_attr=bias_attr) + self.norm = nn.LayerNorm(d_model, epsilon=1e-5) + + def forward(self, input): + out = self.norm(input) + out = self.linear0(out) + out = F.gelu(out, approximate=True) + out = self.linear1(out) + + return out + + +def mlp_forward(train_program, start_program): + with static.program_guard(train_program, + start_program), utils.unique_name.guard(): + batch_size = 4 + hidden_size = 1024 + sqrt_hidden_size = 32 + double_hidden_size = 64 + + input = static.data(name="input", shape=[8, 8, 16], dtype='int32') + input = paddle.reshape(input, [hidden_size]) + input = paddle.reshape(input, [sqrt_hidden_size, sqrt_hidden_size]) + embedding = paddle.nn.Embedding(2, batch_size, sparse=True) + input = embedding(input) + input = paddle.reshape(input, [hidden_size, batch_size]) + input = paddle.transpose(input, perm=[1, 0]) + matmulinput = static.data( + name="matmulinput", + shape=[hidden_size, hidden_size], + dtype='float32') + input = layers.matmul(x=input, y=matmulinput) + label = static.data( + name="label", shape=[batch_size, 1], dtype='float32') + mlp = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + initializer_range=0.02) + + predict = mlp(input) + error_cost = paddle.nn.functional.square_error_cost(predict, label) + loss = paddle.mean(error_cost) + m = paddle.nn.Softmax() + loss = m(loss) + return loss, train_program, start_program + + +class Testcompatible(unittest.TestCase): + def test_matmulv2_matmul_2_compatible(self): + valid_op_dist_attr_list = [] + program = paddle.static.Program() + startup_program = paddle.static.Program() + loss, program, start_program = mlp_forward(program, 
startup_program) + + with static.program_guard(program, + start_program), utils.unique_name.guard(): + matmulx3 = static.data( + name="matmulx3", shape=[6, 2, 6], dtype='float32') + matmuly3 = static.data( + name="matmuly3", shape=[6, 6], dtype='float32') + output1 = paddle.matmul(x=matmulx3, y=matmuly3) + output_1 = layers.matmul(x=matmulx3, y=matmuly3) + matmulx4 = static.data( + name="matmulx4", shape=[6, 6, 2, 6], dtype='float32') + matmuly4 = static.data( + name="matmuly4", shape=[6, 6, 6, 6], dtype='float32') + output2 = paddle.matmul(x=matmulx4, y=matmuly4) + output_2 = layers.matmul(x=matmulx4, y=matmuly4) + ops = program.global_block().ops + vars = program.global_block().vars + for idx, op in enumerate(ops): + if op.type == 'matmul_v2' or op.type == 'matmul': + dist_op_impl_container = get_distributed_operator_impl_container( + op.type) + impls = dist_op_impl_container.get_impls() + op_dist_attr = OperatorDistributedAttribute() + X = op.input_arg_names[0] + Y = op.input_arg_names[1] + out = op.output_arg_names[0] + if len(vars[X].shape) == 2 and len(vars[Y].shape) == 2: + op_dist_attr.set_input_dims_mapping(X, [-1, -1]) + op_dist_attr.set_input_dims_mapping(Y, [-1, -1]) + op_dist_attr.set_output_dims_mapping(out, [-1, -1]) + self.assertTrue(impls[2].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(X, [1, -1]) + self.assertFalse(impls[2].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(X, [-1, 1]) + self.assertFalse(impls[2].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(Y, [1, -1]) + self.assertFalse(impls[2].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(Y, [-1, 1]) + self.assertFalse(impls[2].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_output_dims_mapping(out, [-1, 1]) + self.assertFalse(impls[2].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_output_dims_mapping(out, [1, -1]) + self.assertFalse(impls[2].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + if len(vars[X].shape) == 3 and len(vars[Y].shape) == 2: + op_dist_attr.set_input_dims_mapping(X, [-1, -1, -1]) + op_dist_attr.set_input_dims_mapping(Y, [-1, -1]) + op_dist_attr.set_output_dims_mapping(out, [-1, -1, -1]) + self.assertTrue(impls[2].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_output_dims_mapping(out, [1, -1, -1]) + op_dist_attr.set_input_dims_mapping(X, [-1, -1, 1]) + self.assertFalse(impls[2].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(Y, [1, -1]) + self.assertFalse(impls[2].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + self.assertFalse(impls[2].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_output_dims_mapping(out, [-1, 1, -1]) + self.assertFalse(impls[2].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + if len(vars[X].shape) == 4 and len(vars[Y].shape) == 4: + op_dist_attr.set_input_dims_mapping(X, [-1, -1, -1, -1]) + op_dist_attr.set_input_dims_mapping(Y, [-1, -1, -1, -1]) + op_dist_attr.set_output_dims_mapping(out, [-1, -1, -1, -1]) + self.assertTrue(impls[2].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(Y, [0, -1, -1, -1]) + self.assertFalse(impls[2].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + 
op_dist_attr.set_output_dims_mapping(out, [0, -1, -1, -1]) + self.assertFalse(impls[2].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(Y, [-1, -1, 0, -1]) + self.assertFalse(impls[2].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_output_dims_mapping(out, [-1, -1, 0, -1]) + self.assertFalse(impls[2].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(Y, [-1, -1, -1, 1]) + self.assertFalse(impls[2].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_output_dims_mapping(out, [-1, -1, 0, -1]) + self.assertFalse(impls[2].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + + def test_matmulv2_matmul_1_compatible(self): + valid_op_dist_attr_list = [] + program = paddle.static.Program() + startup_program = paddle.static.Program() + loss, program, start_program = mlp_forward(program, startup_program) + with static.program_guard(program, + start_program), utils.unique_name.guard(): + matmulx3 = static.data( + name="matmulx3", shape=[6, 2, 6], dtype='float32') + matmuly3 = static.data( + name="matmuly3", shape=[6, 6], dtype='float32') + output1 = paddle.matmul(x=matmulx3, y=matmuly3) + output_1 = layers.matmul(x=matmulx3, y=matmuly3) + matmulx4 = static.data( + name="matmulx4", shape=[6, 6, 6, 6], dtype='float32') + matmuly4 = static.data( + name="matmuly4", shape=[6, 6, 6, 6], dtype='float32') + output2 = paddle.matmul(x=matmulx4, y=matmuly4) + output_2 = layers.matmul(x=matmulx4, y=matmuly4) + ops = program.global_block().ops + vars = program.global_block().vars + for idx, op in enumerate(ops): + if op.type == 'matmul_v2' or op.type == 'matmul': + dist_op_impl_container = get_distributed_operator_impl_container( + op.type) + impls = dist_op_impl_container.get_impls() + op_dist_attr = OperatorDistributedAttribute() + X = op.input_arg_names[0] + Y = op.input_arg_names[1] + out = op.output_arg_names[0] + if len(vars[X].shape) == 2 and len(vars[Y].shape) == 2: + op_dist_attr.set_input_dims_mapping(X, [-1, 1]) + op_dist_attr.set_input_dims_mapping(Y, [1, -1]) + op_dist_attr.set_output_dims_mapping(out, [-1, -1]) + dist_op = DistributedOperator(op, op_dist_attr) + op_dist_attr.set_output_dims_mapping(out, [1, -1]) + self.assertFalse(impls[1].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(X, [-1, -1]) + self.assertFalse(impls[1].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(Y, [-1, -1]) + self.assertFalse(impls[1].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + if len(vars[X].shape) == 3 and len(vars[Y].shape) == 2: + op_dist_attr.set_input_dims_mapping(X, [-1, -1, 1]) + op_dist_attr.set_input_dims_mapping(Y, [1, -1]) + op_dist_attr.set_output_dims_mapping(out, [-1, -1, -1]) + self.assertTrue(impls[1].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_output_dims_mapping(out, [1, -1, 1]) + self.assertFalse(impls[1].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(out, [-1, -1, -1]) + self.assertFalse(impls[1].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_output_dims_mapping(out, [-1, 0, -1]) + self.assertFalse(impls[1].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(X, [-1, -1, -1]) + self.assertFalse(impls[1].is_auto_compatible( + 
DistributedOperator(op, op_dist_attr))) + if len(vars[X].shape) == 4 and len(vars[Y].shape) == 4: + op_dist_attr.set_input_dims_mapping(X, [-1, -1, -1, 1]) + op_dist_attr.set_input_dims_mapping(Y, [-1, -1, 1, -1]) + op_dist_attr.set_output_dims_mapping(out, [-1, -1, -1, -1]) + self.assertTrue(impls[1].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(Y, [0, -1, -1, -1]) + self.assertFalse(impls[1].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_output_dims_mapping(out, [0, -1, -1, -1]) + self.assertFalse(impls[1].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(Y, [-1, -1, 0, -1]) + self.assertFalse(impls[1].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_output_dims_mapping(out, [-1, -1, 0, -1]) + self.assertFalse(impls[1].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(Y, [-1, -1, -1, 1]) + self.assertFalse(impls[1].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_output_dims_mapping(out, [-1, -1, 0, -1]) + self.assertFalse(impls[1].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + + def test_matmulv2_matmul_0_compatible(self): + valid_op_dist_attr_list = [] + program = paddle.static.Program() + startup_program = paddle.static.Program() + loss, program, start_program = mlp_forward(program, startup_program) + with static.program_guard(program, + start_program), utils.unique_name.guard(): + matmulx3 = static.data( + name="matmulx3", shape=[6, 2, 6], dtype='float32') + matmuly3 = static.data( + name="matmuly3", shape=[6, 6], dtype='float32') + output1 = paddle.matmul(x=matmulx3, y=matmuly3) + output_1 = layers.matmul(x=matmulx3, y=matmuly3) + matmulx4 = static.data( + name="matmulx4", shape=[6, 6, 2, 6], dtype='float32') + matmuly4 = static.data( + name="matmuly4", shape=[6, 6, 6, 6], dtype='float32') + output2 = paddle.matmul(x=matmulx4, y=matmuly4) + output_2 = layers.matmul(x=matmulx4, y=matmuly4) + ops = program.global_block().ops + vars = program.global_block().vars + for idx, op in enumerate(ops): + if op.type == 'matmul_v2' or op.type == 'matmul': + dist_op_impl_container = get_distributed_operator_impl_container( + op.type) + impls = dist_op_impl_container.get_impls() + op_dist_attr = OperatorDistributedAttribute() + X = op.input_arg_names[0] + Y = op.input_arg_names[1] + out = op.output_arg_names[0] + if len(vars[X].shape) == 2 and len(vars[Y].shape) == 2: + op_dist_attr.set_input_dims_mapping(X, [-1, -1]) + op_dist_attr.set_input_dims_mapping(Y, [-1, 1]) + op_dist_attr.set_output_dims_mapping(out, [-1, 1]) + self.assertTrue(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(X, [-1, 1]) + self.assertFalse(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(Y, [1, 1]) + self.assertFalse(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_output_dims_mapping(out, [0, 0]) + self.assertFalse(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(X, [0, -1]) + op_dist_attr.set_output_dims_mapping(out, [1, 1]) + self.assertFalse(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(Y, [1, -1]) + self.assertFalse(impls[0].is_auto_compatible( + DistributedOperator(op, 
op_dist_attr))) + if len(vars[X].shape) == 3 and len(vars[Y].shape) == 2: + op_dist_attr.set_input_dims_mapping(X, [-1, -1, -1]) + op_dist_attr.set_input_dims_mapping(Y, [-1, 1]) + op_dist_attr.set_output_dims_mapping(out, [-1, -1, 1]) + self.assertTrue(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(X, [-1, 0, -1]) + self.assertFalse(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(X, [-1, 1, -1]) + self.assertFalse(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(Y, [-1, -1]) + self.assertFalse(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_output_dims_mapping(out, [1, -1, 1]) + self.assertFalse(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_output_dims_mapping(out, [-1, -1, -1]) + self.assertFalse(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_output_dims_mapping(out, [-1, 1, -1]) + self.assertFalse(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + if len(vars[X].shape) == 4 and len(vars[Y].shape) == 4: + op_dist_attr.set_input_dims_mapping(X, [-1, -1, -1, -1]) + op_dist_attr.set_input_dims_mapping(Y, [-1, -1, -1, 1]) + op_dist_attr.set_output_dims_mapping(out, [-1, -1, -1, 1]) + self.assertTrue(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_output_dims_mapping(out, [0, -1, -1, 1]) + self.assertFalse(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(X, [-1, 1, 1, -1]) + self.assertFalse(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(X, [-1, 1, -1, -1]) + self.assertFalse(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(X, [-1, -1, 1, -1]) + self.assertFalse(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(Y, [0, -1, -1, 1]) + self.assertFalse(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_output_dims_mapping(out, [-1, 1, 1, 1]) + self.assertFalse(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(Y, [-1, -1, -1, -1]) + self.assertFalse(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_output_dims_mapping(out, [-1, -1, 1, -1]) + self.assertFalse(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(Y, [-1, -1, 1, -1]) + self.assertFalse(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_complex_view_op.py b/python/paddle/fluid/tests/unittests/test_complex_view_op.py new file mode 100644 index 0000000000000000000000000000000000000000..5dac121ff3e382f5ee6f453d25195ca26e614345 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_complex_view_op.py @@ -0,0 +1,127 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest + +import paddle +from paddle.fluid import dygraph +from paddle import static +paddle.enable_static() + + +def ref_view_as_complex(x): + real, imag = np.take(x, 0, axis=-1), np.take(x, 1, axis=-1) + return real + 1j * imag + + +def ref_view_as_real(x): + return np.stack([x.real, x.imag], -1) + + +class TestViewAsComplexOp(OpTest): + def setUp(self): + self.op_type = "as_complex" + x = np.random.randn(10, 10, 2).astype("float64") + out_ref = ref_view_as_complex(x) + self.out_grad = np.ones( + [10, 10], dtype="float64") + 1j * np.ones( + [10, 10], dtype="float64") + self.inputs = {'X': x} + self.outputs = {'Out': out_ref} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad( + ['X'], + 'Out', + user_defined_grads=[ref_view_as_real(self.out_grad)], + user_defined_grad_outputs=[self.out_grad]) + + +class TestViewAsRealOp(OpTest): + def setUp(self): + self.op_type = "as_real" + real = np.random.randn(10, 10).astype("float64") + imag = np.random.randn(10, 10).astype("float64") + x = real + 1j * imag + out_ref = ref_view_as_real(x) + self.inputs = {'X': x} + self.outputs = {'Out': out_ref} + self.out_grad = np.ones([10, 10, 2], dtype="float64") + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad( + ['X'], + 'Out', + user_defined_grads=[ref_view_as_complex(self.out_grad)], + user_defined_grad_outputs=[self.out_grad]) + + +class TestViewAsComplexAPI(unittest.TestCase): + def setUp(self): + self.x = np.random.randn(10, 10, 2) + self.out = ref_view_as_complex(self.x) + + def test_dygraph(self): + with dygraph.guard(): + x = paddle.to_tensor(self.x) + out_np = paddle.as_complex(x).numpy() + self.assertTrue(np.allclose(self.out, out_np)) + + def test_static(self): + mp, sp = static.Program(), static.Program() + with static.program_guard(mp, sp): + x = static.data("x", shape=[10, 10, 2], dtype="float64") + out = paddle.as_complex(x) + + exe = static.Executor() + exe.run(sp) + [out_np] = exe.run(mp, feed={"x": self.x}, fetch_list=[out]) + self.assertTrue(np.allclose(self.out, out_np)) + + +class TestViewAsRealAPI(unittest.TestCase): + def setUp(self): + self.x = np.random.randn(10, 10) + 1j * np.random.randn(10, 10) + self.out = ref_view_as_real(self.x) + + def test_dygraph(self): + with dygraph.guard(): + x = paddle.to_tensor(self.x) + out_np = paddle.as_real(x).numpy() + self.assertTrue(np.allclose(self.out, out_np)) + + def test_static(self): + mp, sp = static.Program(), static.Program() + with static.program_guard(mp, sp): + x = static.data("x", shape=[10, 10], dtype="complex128") + out = paddle.as_real(x) + + exe = static.Executor() + exe.run(sp) + [out_np] = exe.run(mp, feed={"x": self.x}, fetch_list=[out]) + self.assertTrue(np.allclose(self.out, out_np)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_cuda_graph.py b/python/paddle/fluid/tests/unittests/test_cuda_graph.py index 
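The new test_complex_view_op.py above validates paddle.as_complex / paddle.as_real against plain numpy reference functions. As a minimal, patch-independent sketch of the semantics those references encode (a last axis of size 2 is read as real/imaginary parts, and the two views are inverses of each other):

```python
import numpy as np

def view_as_complex(x):
    # (*, 2) real tensor -> (*,) complex tensor: the last axis holds (real, imag).
    real, imag = np.take(x, 0, axis=-1), np.take(x, 1, axis=-1)
    return real + 1j * imag

def view_as_real(z):
    # (*,) complex tensor -> (*, 2) real tensor: stack real/imag on a new last axis.
    return np.stack([z.real, z.imag], axis=-1)

x = np.arange(12, dtype="float64").reshape(2, 3, 2)
z = view_as_complex(x)                   # shape (2, 3), complex128
assert np.allclose(view_as_real(z), x)   # the round trip recovers the input
```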
8b4eae8ada4e8bd93eea09e5da66c041cb30c5fa..66228856effe4a76fcc7b4b517f0e8b724e2cc49 100644 --- a/python/paddle/fluid/tests/unittests/test_cuda_graph.py +++ b/python/paddle/fluid/tests/unittests/test_cuda_graph.py @@ -34,7 +34,8 @@ class TestCUDAGraph(unittest.TestCase): paddle.set_flags({ 'FLAGS_allocator_strategy': 'auto_growth', 'FLAGS_sync_nccl_allreduce': False, - 'FLAGS_cudnn_deterministic': True + 'FLAGS_cudnn_deterministic': True, + 'FLAGS_use_stream_safe_cuda_allocator': False, }) def random_tensor(self, shape): @@ -187,6 +188,48 @@ class TestCUDAGraph(unittest.TestCase): finally: graph.reset() + def test_dataloader(self): + if not can_use_cuda_graph(): + return + + class AutoIncDataset(paddle.io.Dataset): + def __init__(self, n, dtype): + self.n = n + self.dtype = dtype + + def __len__(self): + return self.n + + def __getitem__(self, idx): + return np.array([idx]).astype(self.dtype) + + n = 100 + dtype = 'int64' + dataset = AutoIncDataset(n, dtype) + data_loader = paddle.io.DataLoader( + dataset, batch_size=1, num_workers=2, use_buffer_reader=True) + x = None + y = None + + graph = None + for i, data in enumerate(data_loader): + if graph is None: + x = data + x = x.cuda() + graph = CUDAGraph() + graph.capture_begin() + y = x * x + graph.capture_end() + else: + x.copy_(data, False) + x = x.cuda() + + graph.replay() + actual_x = np.array([[i]]).astype(dtype) + actual_y = np.array([[i * i]]).astype(dtype) + self.assertTrue(np.array_equal(actual_x, x.numpy())) + self.assertTrue(np.array_equal(actual_y, y.numpy())) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_distribution.py b/python/paddle/fluid/tests/unittests/test_distribution.py index f1c12c90490c2513e945900f67030ac5c22c3409..6cf2c5f6e2ca4b1af01c33c8086876e5dbcfbf1a 100644 --- a/python/paddle/fluid/tests/unittests/test_distribution.py +++ b/python/paddle/fluid/tests/unittests/test_distribution.py @@ -336,6 +336,29 @@ class UniformTest11(UniformTest): name='values', shape=[dims], dtype='float32') +class UniformTestSample(unittest.TestCase): + def setUp(self): + self.init_param() + + def init_param(self): + self.low = 3.0 + self.high = 4.0 + + def test_uniform_sample(self): + paddle.disable_static() + uniform = Uniform(low=self.low, high=self.high) + s = uniform.sample([100]) + self.assertTrue((s >= self.low).all()) + self.assertTrue((s < self.high).all()) + paddle.enable_static() + + +class UniformTestSample2(UniformTestSample): + def init_param(self): + self.low = -5.0 + self.high = 2.0 + + class NormalNumpy(DistributionNumpy): def __init__(self, loc, scale): self.loc = np.array(loc) diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py index c5cf8c5d5ed690c908ff1bfe4f2330f71ebbc008..f76dcb5687c2ab77e411e7ef3c4de64200d99c66 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py @@ -26,6 +26,9 @@ class TestDygraphShardingStage2(TestMultipleGpus): def test_dygraph_sharding_optimizer_stage2(self): self.run_mnist_2gpu('dygraph_sharding_stage2.py') + def test_dygraph_sharding_optimizer_stage2_offload(self): + self.run_mnist_2gpu('dygraph_sharding_stage2_offload.py') + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_executor.py b/python/paddle/fluid/tests/unittests/test_fleet_executor.py index 
09f9fa6ce105dfe09228978446e58fbb66b7729f..fbc5db341e5e9bb5b20cd2e9b2e2f82ce3e568eb 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_executor.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_executor.py @@ -47,6 +47,18 @@ class TestFleetExecutor(unittest.TestCase): name='y', shape=y_data.shape, dtype=y_data.dtype) z = x + y a = 2 * x + 3 * y + loss = paddle.mean(a) + base_lr = 0.1 + passes = [30, 60, 80, 90] + steps_per_pass = 10 + bd = [steps_per_pass * p for p in passes] + lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] + lr_val = paddle.optimizer.lr.PiecewiseDecay( + boundaries=bd, values=lr) + opt = paddle.optimizer.AdamW( + learning_rate=lr_val, + grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)) + opt.minimize(loss) # TODO: section_program will be removed in the future empty_program._pipeline_opt = { "fleet_opt": self.fake_fleet_opt(), diff --git a/python/paddle/fluid/tests/unittests/test_gcd.py b/python/paddle/fluid/tests/unittests/test_gcd.py new file mode 100644 index 0000000000000000000000000000000000000000..820216dc56cd60e75aaca4ef0c3bcb3d93743450 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_gcd.py @@ -0,0 +1,93 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid import Program, program_guard +from op_test import OpTest + +paddle.enable_static() + + +class TestGcdAPI(unittest.TestCase): + def setUp(self): + self.x_np = 12 + self.y_np = 20 + self.x_shape = [1] + self.y_shape = [1] + + def test_static_graph(self): + startup_program = fluid.Program() + train_program = fluid.Program() + with fluid.program_guard(startup_program, train_program): + x = fluid.data(name='input1', dtype='int32', shape=self.x_shape) + y = fluid.data(name='input2', dtype='int32', shape=self.y_shape) + out = paddle.gcd(x, y) + + place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + exe = fluid.Executor(place) + res = exe.run(fluid.default_main_program(), + feed={'input1': self.x_np, + 'input2': self.y_np}, + fetch_list=[out]) + self.assertTrue((np.array(res[0]) == np.gcd(self.x_np, self.y_np) + ).all()) + + def test_dygraph(self): + paddle.disable_static() + x = paddle.to_tensor(self.x_np) + y = paddle.to_tensor(self.y_np) + result = paddle.gcd(x, y) + self.assertEqual( + np.allclose(np.gcd(self.x_np, self.y_np), result.numpy()), True) + + paddle.enable_static() + + +class TestGcdAPI2(TestGcdAPI): + def setUp(self): + self.x_np = np.arange(6).astype(np.int32) + self.y_np = np.array([20]).astype(np.int32) + self.x_shape = [6] + self.y_shape = [1] + + +class TestGcdAPI3(TestGcdAPI): + def setUp(self): + self.x_np = 0 + self.y_np = 20 + self.x_shape = [1] + self.y_shape = [1] + + +class TestGcdAPI4(TestGcdAPI): + def setUp(self): + self.x_np = 0 + self.y_np = 0 + self.x_shape = [1] + self.y_shape = [1] + + +class TestGcdAPI5(TestGcdAPI): + def setUp(self): + self.x_np = 12 + self.y_np = -20 + self.x_shape = [1] + self.y_shape = [1] diff --git a/python/paddle/fluid/tests/unittests/test_lcm.py b/python/paddle/fluid/tests/unittests/test_lcm.py new file mode 100644 index 0000000000000000000000000000000000000000..123c3e3d444e1be7062986e4820ff7bfc3df882c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_lcm.py @@ -0,0 +1,93 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
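The TestGcdAPI variants above, and the TestLcmAPI variants in the file that follows, exercise the same edge cases (zero operands, negative operands, broadcasting a vector against a scalar) and compare against numpy. A small illustration of the expected values, assuming nothing beyond np.gcd / np.lcm:

```python
import numpy as np

print(np.gcd(12, 20))            # 4
print(np.gcd(0, 20))             # 20 -> gcd(0, y) == |y|
print(np.gcd(0, 0))              # 0
print(np.gcd(12, -20))           # 4  -> signs are ignored
print(np.gcd(np.arange(6), 20))  # [20  1  2  1  4  5]
print(np.lcm(12, 20))            # 60
print(np.lcm(0, 20))             # 0  -> lcm(0, y) == 0
print(np.lcm(12, -20))           # 60
```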
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid import Program, program_guard +from op_test import OpTest + +paddle.enable_static() + + +class TestLcmAPI(unittest.TestCase): + def setUp(self): + self.x_np = 12 + self.y_np = 20 + self.x_shape = [1] + self.y_shape = [1] + + def test_static_graph(self): + startup_program = fluid.Program() + train_program = fluid.Program() + with fluid.program_guard(startup_program, train_program): + x1 = fluid.data(name='input1', dtype='int32', shape=self.x_shape) + x2 = fluid.data(name='input2', dtype='int32', shape=self.y_shape) + out = paddle.lcm(x1, x2) + + place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + exe = fluid.Executor(place) + res = exe.run(fluid.default_main_program(), + feed={'input1': self.x_np, + 'input2': self.y_np}, + fetch_list=[out]) + self.assertTrue((np.array(res[0]) == np.lcm(self.x_np, self.y_np) + ).all()) + + def test_dygraph(self): + paddle.disable_static() + x1 = paddle.to_tensor(self.x_np) + x2 = paddle.to_tensor(self.y_np) + result = paddle.lcm(x1, x2) + self.assertEqual( + np.allclose(np.lcm(self.x_np, self.y_np), result.numpy()), True) + + paddle.enable_static() + + +class TestLcmAPI2(TestLcmAPI): + def setUp(self): + self.x_np = np.arange(6).astype(np.int32) + self.y_np = np.array([20]).astype(np.int32) + self.x_shape = [6] + self.y_shape = [1] + + +class TestLcmAPI3(TestLcmAPI): + def setUp(self): + self.x_np = 0 + self.y_np = 20 + self.x_shape = [1] + self.y_shape = [1] + + +class TestLcmAPI4(TestLcmAPI): + def setUp(self): + self.x_np = 0 + self.y_np = 0 + self.x_shape = [1] + self.y_shape = [1] + + +class TestLcmAPI5(TestLcmAPI): + def setUp(self): + self.x_np = 12 + self.y_np = -20 + self.x_shape = [1] + self.y_shape = [1] diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index ce84fb739c0009c62bfd2c9c9d9fd74255c96312..722003c034091957ef66cd2202139a8e2f00cd8c 100644 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -23,6 +23,7 @@ from .framework import set_grad_enabled # noqa: F401 from ..fluid.param_attr import ParamAttr # noqa: F401 from ..fluid.layers.tensor import create_parameter # noqa: F401 from ..fluid.core import CPUPlace # noqa: F401 +from ..fluid.core import IPUPlace # noqa: F401 from ..fluid.core import CUDAPlace # noqa: F401 from ..fluid.core import CUDAPinnedPlace # noqa: F401 from ..fluid.core import NPUPlace # noqa: F401 diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index 53001c071598941449fb0dc6f43a7f4843ea7c32..36dfd717a12a0fa7664fb50cd94646034303ead5 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -111,6 +111,9 @@ from .manipulation import unbind # noqa: F401 from .manipulation import roll # noqa: F401 from .manipulation import chunk # noqa: F401 from .manipulation import tensordot # noqa: F401 +from .manipulation import as_complex # noqa: F401 +from .manipulation import as_real # noqa: F401 + from .math import abs # noqa: F401 from .math import acos # noqa: F401 from .math import asin # noqa: F401 @@ -194,6 +197,8 @@ from .math import lerp # noqa: F401 from .math import lerp_ # noqa: F401 from .math import rad2deg # noqa: F401 from .math import deg2rad # noqa: F401 +from .math import gcd # noqa: F401 +from .math import lcm # noqa: F401 from .math import diff # noqa: F401 from .math import 
angle # noqa: F401 @@ -409,6 +414,12 @@ tensor_method_func = [ #noqa 'multi_dot', 'solve', 'triangular_solve', + 'as_complex', + 'as_real', + 'rad2deg', + 'deg2rad', + 'gcd', + 'lcm', 'diff', 'lerp', 'lerp_', diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index b4f00ab4ffb8beb7bd67d05f440a7d306ea1bedc..a81d8c54ffc42fd4e571c1901db5ed4e42ab23f0 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -34,6 +34,7 @@ from ..fluid import layers from ..fluid.dygraph.inplace_utils import inplace_apis_in_dygraph_only import paddle from paddle import _C_ops +from paddle.tensor.attribute import _complex_to_real_dtype, _real_to_complex_dtype __all__ = [] @@ -2488,3 +2489,94 @@ def tensordot(x, y, axes=2, name=None): [contraction_size, not_contraction_size_y]) out = x.matmul(y).reshape(shape_out) return out + + +def as_complex(x, name=None): + """Transform a real tensor to a complex tensor. + + The data type of the input tensor is 'float32' or 'float64', and the data + type of the returned tensor is 'complex64' or 'complex128', respectively. + + The shape of the input tensor is ``(*, 2)``, (``*`` means arbitrary shape), i.e. + the size of the last axis should be 2, which represents the real and imaginary parts + of a complex number. The shape of the returned tensor is ``(*,)``. + + Args: + x (Tensor): The input tensor. Data type is 'float32' or 'float64'. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor: The output. Data type is 'complex64' or 'complex128', with the same precision as the input. + + Examples: + .. code-block:: python + + import paddle + x = paddle.arange(12, dtype=paddle.float32).reshape([2, 3, 2]) + y = paddle.as_complex(x) + print(y.numpy()) + + # [[ 0. +1.j 2. +3.j 4. +5.j] + # [ 6. +7.j 8. +9.j 10.+11.j]] + """ + if in_dygraph_mode(): + return paddle._C_ops.as_complex(x) + + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'as_complex') + op_type = "as_complex" + helper = LayerHelper(op_type, **locals()) + inputs = {"X": x} + out = helper.create_variable_for_type_inference( + dtype=_real_to_complex_dtype(x.dtype)) + outputs = {"Out": out} + attrs = {} + helper.append_op(type=op_type, inputs=inputs, attrs=attrs, outputs=outputs) + return out + + +def as_real(x, name=None): + """Transform a complex tensor to a real tensor. + + The data type of the input tensor is 'complex64' or 'complex128', and the data + type of the returned tensor is 'float32' or 'float64', respectively. + + When the shape of the input tensor is ``(*, )``, (``*`` means arbitrary shape), + the shape of the output tensor is ``(*, 2)``, i.e. the shape of the output is + the shape of the input appended by an extra ``2``. + + Args: + x (Tensor): The input tensor. Data type is 'complex64' or 'complex128'. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor: The output. Data type is 'float32' or 'float64', with the same precision as the input. + + Examples: + .. code-block:: python + + import paddle + x = paddle.arange(12, dtype=paddle.float32).reshape([2, 3, 2]) + y = paddle.as_complex(x) + z = paddle.as_real(y) + print(z.numpy()) + + # [[[ 0. 1.] + # [ 2. 3.] + # [ 4. 5.]] + + # [[ 6. 7.] + # [ 8. 9.] + # [10.
11.]]] + """ + if in_dygraph_mode(): + return paddle._C_ops.as_real(x) + + check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], 'as_real') + op_type = "as_real" + helper = LayerHelper(op_type, **locals()) + inputs = {"X": x} + out = helper.create_variable_for_type_inference( + dtype=_complex_to_real_dtype(x.dtype)) + outputs = {"Out": out} + helper.append_op(type=op_type, inputs=inputs, outputs=outputs) + return out diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index f705510f848be122cef73d6a694e14dd0d464839..fefaecaf604a079e4cce4d3119019b0a0e25cf22 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -2624,9 +2624,9 @@ def lerp(x, y, weight, name=None): lerp(x, y, weight) = x + weight * (y - x). Args: - x (Tensor): An N-D Tensor, the data type is float32, float64. - y (Tensor): An N-D Tensor, the data type is float32, float64. - weight (float|Tensor): the weight for the interpolation formula. + x (Tensor): An N-D Tensor with starting points, the data type is float32, float64. + y (Tensor): An N-D Tensor with ending points, the data type is float32, float64. + weight (float|Tensor): The weight for the interpolation formula. When weight is Tensor, the data type is float32, float64. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -2788,6 +2788,139 @@ def deg2rad(x, name=None): type='scale', inputs={'X':out_cast}, outputs={'Out': out}, attrs={'scale': deg2rad_scale}) return out +def gcd(x, y, name=None): + """ + Computes the element-wise greatest common divisor (GCD) of input |x| and |y|. + Both x and y must have integer types. + + Note: + gcd(0,0)=0, gcd(0, y)=|y| + + Args: + x, y (Tensor): An N-D Tensor, the data type is int8,int16,int32,int64,uint8. + If x.shape != y.shape, they must be broadcastable to a common shape (which becomes the shape of the output). + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + out (Tensor): An N-D Tensor, the data type is the same with input. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + x1 = paddle.to_tensor(12) + x2 = paddle.to_tensor(20) + paddle.gcd(x1, x2) + # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # [4]) + + x3 = paddle.to_tensor(np.arange(6)) + paddle.gcd(x3, x2) + # Tensor(shape=[6], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # [20, 1 , 2 , 1 , 4 , 5]) + + x4 = paddle.to_tensor(0) + paddle.gcd(x4, x2) + # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # [20]) + + paddle.gcd(x4, x4) + # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # [0]) + + x5 = paddle.to_tensor(-20) + paddle.gcd(x1, x5) + # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # [4]) + """ + shape = paddle.broadcast_shape(x.shape, y.shape) + x = paddle.broadcast_to(x, shape) + y = paddle.broadcast_to(y, shape) + x = paddle.abs(x) + y = paddle.abs(y) + + def _gcd_cond_fn(x, y): + return paddle.any(y != 0) + + def _gcd_body_fn(x, y): + # paddle.mod will raise an error when any element of y is 0. To avoid + # that, we change those zeros to ones. Their values don't matter because + # they won't be used. 
+ y_not_equal_0 = (y != 0) + y_safe = paddle.where(y_not_equal_0, y, paddle.ones(y.shape, y.dtype)) + x, y = (paddle.where(y_not_equal_0, y, x), + paddle.where(y_not_equal_0, paddle.mod(x, y_safe),paddle.zeros(y.shape, y.dtype))) + return (paddle.where(x < y, y, x), paddle.where(x < y, x, y)) + + if in_dygraph_mode(): + while _gcd_cond_fn(x, y): + x, y = _gcd_body_fn(x, y) + + return x + else: + check_variable_and_dtype(x, 'x', ['int32', 'int64', 'int8', 'int16', 'uint8'], 'gcd') + check_variable_and_dtype(y, 'y', ['int32', 'int64', 'int8', 'int16', 'uint8'], 'gcd') + out, _ = paddle.static.nn.while_loop(_gcd_cond_fn, _gcd_body_fn, [x, y]) + return out + +def lcm(x, y, name=None): + """ + Computes the element-wise least common multiple (LCM) of input |x| and |y|. + Both x and y must have integer types. + + Note: + lcm(0,0)=0, lcm(0, y)=0 + + Args: + x, y (Tensor): An N-D Tensor, the data type is int8,int16,int32,int64,uint8. + If x.shape != y.shape, they must be broadcastable to a common shape (which becomes the shape of the output). + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + out (Tensor): An N-D Tensor, the data type is the same with input. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + x1 = paddle.to_tensor(12) + x2 = paddle.to_tensor(20) + paddle.lcm(x1, x2) + # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # [60]) + + x3 = paddle.to_tensor(np.arange(6)) + paddle.lcm(x3, x2) + # Tensor(shape=[6], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # [0, 20, 20, 60, 20, 20]) + + x4 = paddle.to_tensor(0) + paddle.lcm(x4, x2) + # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # [0]) + + paddle.lcm(x4, x4) + # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # [0]) + + x5 = paddle.to_tensor(-20) + paddle.lcm(x1, x5) + # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # [60]) + """ + d = paddle.gcd(x, y) + # paddle.mod will raise an error when any element of y is 0. To avoid + # that, we change those zeros to ones. Their values don't matter because + # they won't be used. + d_equal_0 = paddle.equal(d, 0) + d_safe = paddle.where(d_equal_0, paddle.ones(d.shape, d.dtype), d) + out = paddle.where(d_equal_0, paddle.zeros(d.shape, d.dtype), paddle.abs(x * y) // d_safe) + return out + def diff(x, n=1, axis=-1, prepend=None, append=None, name=None): r""" Computes the n-th forward difference along the given axis. @@ -2949,7 +3082,6 @@ def diff(x, n=1, axis=-1, prepend=None, append=None, name=None): return out - def angle(x, name=None): r""" Element-wise angle of complex numbers. For non-negative real numbers, the angle is 0 while @@ -2965,7 +3097,7 @@ def angle(x, name=None): name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - out (Tensor): y (Tensor): An N-D Tensor of real data type with the same precision as that of x's data type. + Tensor: An N-D Tensor of real data type with the same precision as that of x's data type. Examples: .. 
code-block:: python diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 3d61caae002e99d3b78cfaf1338338b02a1f800d..2c47bbe4566d6be70c524150bc2f2d6cdbbf81ce 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -76,7 +76,7 @@ infer_meta : func : MatmulInferMeta kernel : - func : matmul_v2 + func : matmul - api : mean args : (const Tensor& x, const std::vector& axis, bool keep_dim) diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py index 5506ee95bd7c9ed40f4b53a23109530f2e7a46cf..ed3bb1dc5f1f0122eb5be79ac07b5067c1288ed1 100644 --- a/python/paddle/utils/code_gen/api_gen.py +++ b/python/paddle/utils/code_gen/api_gen.py @@ -345,6 +345,7 @@ def source_include(header_file_path): #include "glog/logging.h" #include "paddle/pten/api/lib/api_registry.h" +#include "paddle/pten/api/lib/kernel_declare.h" #include "paddle/pten/api/lib/kernel_dispatch.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/kernel_registry.h" @@ -353,22 +354,6 @@ def source_include(header_file_path): """ -def module_declare(): - return """ -PT_DECLARE_MODULE(CreationCPU); -PT_DECLARE_MODULE(LinalgCPU); -PT_DECLARE_MODULE(ManipulationCPU); -PT_DECLARE_MODULE(MathCPU); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_DECLARE_MODULE(CreationCUDA); -PT_DECLARE_MODULE(LinalgCUDA); -PT_DECLARE_MODULE(ManipulationCUDA); -PT_DECLARE_MODULE(MathCUDA); -#endif -""" - - def api_register(): return """ PT_REGISTER_API(Creation); @@ -405,7 +390,6 @@ def generate_api(api_yaml_path, header_file_path, source_file_path): include_header_file = "paddle/pten/api/include/api.h" source_file.write(source_include(include_header_file)) - source_file.write(module_declare()) source_file.write(namespace[0]) for api in apis: diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index 79a742c314bd07f5a39c2d63a3821b64696b6521..46b415e24d8ca214ef1788156c6e00c205a7e7fa 100644 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -202,7 +202,7 @@ HIGH_PARALLEL_JOB_NEW = [ 'test_fleet_runtime', 'test_rnn_cudnn_params_packing', 'test_mkldnn_placement_pass', - 'test_fc_elementwise_layernorm_fuse_pass', + 'test_fc_elementwise_layernorm_fuse_pass_cc', 'program_desc_test', 'test_simplify_with_basic_ops_pass', 'test_dygraph_mode_of_unittest', @@ -1417,7 +1417,7 @@ CPU_PARALLEL_JOB = [ 'test_fc_mkldnn_op', 'test_fc_lstm_fuse_pass', 'test_fc_gru_fuse_pass', - 'test_fc_elementwise_layernorm_fuse_pass', + 'test_fc_elementwise_layernorm_fuse_pass_cc', 'test_fc_bf16_mkldnn_op', 'test_executor_feed_non_tensor', 'test_executor_check_feed',
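Stepping back to the paddle.gcd / paddle.lcm helpers added in python/paddle/tensor/math.py above: gcd runs a vectorized Euclidean iteration in which elements whose divisor has already reached zero are masked out, so that paddle.mod never divides by zero, and lcm is then derived as |x * y| // gcd(x, y) with the same zero guard. A minimal numpy sketch of that control flow (illustrative only; it mirrors, but is not, the patched implementation):

```python
import numpy as np

def gcd_iterative(x, y):
    # Vectorized Euclid: repeat (x, y) -> (y, x mod y) until every y is zero,
    # masking finished elements so the modulo never sees a zero divisor.
    x, y = np.broadcast_arrays(np.abs(x), np.abs(y))
    x, y = x.copy(), y.copy()
    while np.any(y != 0):
        active = y != 0
        y_safe = np.where(active, y, 1)            # dummy divisor where y == 0
        x, y = (np.where(active, y, x),
                np.where(active, x % y_safe, 0))
    return x

def lcm_from_gcd(x, y):
    d = gcd_iterative(x, y)
    d_safe = np.where(d == 0, 1, d)                # lcm(0, 0) is defined as 0
    return np.where(d == 0, 0, np.abs(np.multiply(x, y)) // d_safe)

print(gcd_iterative(np.arange(6), 20))             # [20  1  2  1  4  5]
print(lcm_from_gcd(12, -20))                       # 60
```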