Unverified commit 883ee1a3 authored by W wanghuancoder and committed by GitHub

Merge branch 'develop' into revert-37926-eager_coreops_500

......@@ -75,6 +75,11 @@ class Carrier final {
bool IsInit() const;
// NOTE: This mutex will be used in the interceptor's RunOps function.
// This mutex is used to prevent forward ops and backward ops from running
// simultaneously, which can lead to a random hang for some sync ops.
std::mutex run;
DISABLE_COPY_AND_ASSIGN(Carrier);
private:
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/distributed/fleet_executor/compute_interceptor.h"
#include "paddle/fluid/distributed/fleet_executor/carrier.h"
#include "paddle/fluid/distributed/fleet_executor/task_node.h"
#include "paddle/fluid/framework/executor_gc_helper.h"
......@@ -169,6 +170,8 @@ void ComputeInterceptor::ReplyCompletedToUpStream() {
}
void ComputeInterceptor::RunOps() {
Carrier& carrier_instance = Carrier::Instance();
std::unique_lock<std::mutex> lock(carrier_instance.run);
VLOG(3) << "ComputeInterceptor " << interceptor_id_ << " running ops for the "
<< step_ + 1 << " time.";
for (auto op : node_->ops()) {
......
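The NOTE in carrier.h above explains why RunOps takes the Carrier::run mutex: forward and backward interceptors must not execute their ops at the same time. As a minimal standalone sketch of that locking pattern (it assumes nothing about Paddle's real Carrier or interceptor classes; FakeCarrier and the thread bodies below are illustrative stand-ins), two threads sharing one mutex serialize their loop bodies in the same way:

#include <iostream>
#include <mutex>
#include <thread>

struct FakeCarrier {
  std::mutex run;  // plays the role of Carrier::run above
};

void RunOps(FakeCarrier* carrier, const char* tag, int steps) {
  for (int step = 0; step < steps; ++step) {
    std::unique_lock<std::mutex> lock(carrier->run);
    // While the lock is held, the "forward" and "backward" bodies cannot
    // overlap, which is what avoids the random hang described above.
    std::cout << tag << " running ops for the " << step + 1 << " time.\n";
  }
}

int main() {
  FakeCarrier carrier;
  std::thread forward(RunOps, &carrier, "forward interceptor", 3);
  std::thread backward(RunOps, &carrier, "backward interceptor", 3);
  forward.join();
  backward.join();
  return 0;
}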
......@@ -116,6 +116,22 @@ class TensorAddFunctor : public boost::static_visitor<> {
}
#endif
#ifdef PADDLE_WITH_IPU
void operator()(const paddle::platform::IPUPlace& place) {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#else
void operator()(const paddle::platform::IPUPlace& place) {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#endif
void operator()(const paddle::platform::NPUPinnedPlace& place) {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
......
......@@ -81,6 +81,11 @@ struct DLDeviceVisitor : public boost::static_visitor<::DLDevice> {
return device;
}
inline ::DLDevice operator()(const platform::IPUPlace &place) const {
PADDLE_THROW(
platform::errors::Unimplemented("platform::IPUPlace is not supported"));
}
inline ::DLDevice operator()(const platform::XPUPlace &place) const {
PADDLE_THROW(
platform::errors::Unimplemented("platform::XPUPlace is not supported"));
......
......@@ -463,6 +463,14 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
#else
PADDLE_THROW(
platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle"));
#endif
} else if (platform::is_ipu_place(place_)) {
#ifdef PADDLE_WITH_IPU
gc.reset(new IPUGarbageCollector(
BOOST_GET_CONST(platform::IPUPlace, place_), max_memory_size));
#else
PADDLE_THROW(
platform::errors::Unimplemented("No IPU gc found in CPU/IPU paddle"));
#endif
} else if (platform::is_npu_place(place_)) {
#ifdef PADDLE_WITH_ASCEND_CL
......
......@@ -156,7 +156,7 @@ cc_test(test_seqpool_cvm_concat_fuse_pass SRCS seqpool_cvm_concat_fuse_pass_test
cc_test(test_repeated_fc_relu_fuse_pass_cc SRCS repeated_fc_relu_fuse_pass_tester.cc DEPS repeated_fc_relu_fuse_pass framework_proto)
cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
cc_test(test_simplify_with_basic_ops_pass SRCS simplify_with_basic_ops_pass_tester.cc DEPS simplify_with_basic_ops_pass)
cc_test(test_fc_elementwise_layernorm_fuse_pass SRCS fc_elementwise_layernorm_fuse_pass_tester.cc DEPS fc_elementwise_layernorm_fuse_pass)
cc_test(test_fc_elementwise_layernorm_fuse_pass_cc SRCS fc_elementwise_layernorm_fuse_pass_tester.cc DEPS fc_elementwise_layernorm_fuse_pass)
cc_test(test_skip_layernorm_fuse_pass SRCS skip_layernorm_fuse_pass_tester.cc DEPS skip_layernorm_fuse_pass)
cc_test(test_multihead_matmul_fuse_pass SRCS multihead_matmul_fuse_pass_tester.cc DEPS multihead_matmul_fuse_pass)
cc_test(test_conv_bn_fuse_pass_cc SRCS conv_bn_fuse_pass_tester.cc DEPS conv_bn_fuse_pass)
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle {
namespace framework {
......@@ -338,3 +339,9 @@ void FCElementwiseLayerNormFusePass::ApplyImpl(ir::Graph *graph) const {
REGISTER_PASS(fc_elementwise_layernorm_fuse_pass,
paddle::framework::ir::FCElementwiseLayerNormFusePass);
REGISTER_PASS_CAPABILITY(fc_elementwise_layernorm_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("fc", 0)
.LE("elementwise_add", 1)
.EQ("layer_norm", 0));
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/ipu/avg_shard_pass.h"
#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
namespace paddle {
namespace framework {
namespace ir {
void AvgShardPass::ApplyImpl(ir::Graph* graph) const {
VLOG(10) << "enter AvgShardPass::ApplyImpl";
std::shared_ptr<platform::ipu::IpuBackend> ipu_backend =
platform::ipu::IpuBackend::GetInstance();
if (ipu_backend->GetIpuStrategy()->need_avg_shard) {
VLOG(10) << "start AvgShardPass";
auto nodes = ir::TopologySortOperations(*graph);
auto num_ipus = ipu_backend->GetIpuStrategy()->num_ipus;
int shard_position = nodes.size() / num_ipus;
int index_and_stage = -1;
for (int i = 0; i < nodes.size(); i++) {
if ((i % shard_position) == 0 && index_and_stage < num_ipus - 1) {
index_and_stage++;
}
nodes[i]->Op()->SetAttr("ipu_index", index_and_stage);
nodes[i]->Op()->SetAttr("ipu_stage", index_and_stage);
}
VLOG(10) << "end AvgShardPass";
}
VLOG(10) << "leave AvgShardPass::ApplyImpl";
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(avg_shard_pass, paddle::framework::ir::AvgShardPass);
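AvgShardPass above splits the topologically sorted op list into num_ipus roughly equal contiguous chunks and tags every op in chunk k with ipu_index == ipu_stage == k. Below is a standalone sketch of that index arithmetic only (the op and IPU counts are made up, and it assumes there are at least as many ops as IPUs):

#include <iostream>
#include <vector>

int main() {
  const int num_ops = 10;  // stand-in for nodes.size()
  const int num_ipus = 4;  // stand-in for IpuStrategy::num_ipus
  const int shard_position = num_ops / num_ipus;  // ops per IPU (floor)

  int index_and_stage = -1;
  std::vector<int> ipu_index(num_ops);
  for (int i = 0; i < num_ops; i++) {
    if ((i % shard_position) == 0 && index_and_stage < num_ipus - 1) {
      index_and_stage++;
    }
    ipu_index[i] = index_and_stage;  // the pass sets ipu_index/ipu_stage here
  }
  // Prints ops 0-1 -> IPU 0, 2-3 -> IPU 1, 4-5 -> IPU 2, 6-9 -> IPU 3:
  // the last IPU absorbs the remainder.
  for (int i = 0; i < num_ops; i++) {
    std::cout << "op " << i << " -> IPU " << ipu_index[i] << "\n";
  }
  return 0;
}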
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
class AvgShardPass : public IPUPassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
namespace paddle {
namespace framework {
namespace ir {
void ForwardGraphExtractPass::ApplyImpl(ir::Graph* graph) const {
VLOG(10) << "enter ForwardGraphExtractPass::ApplyImpl";
std::unordered_map<OpRole, std::unordered_set<ir::Node*>> all_ops{
{OpRole::kForward, {}}, {OpRole::kBackward, {}},
{OpRole::kOptimize, {}}, {OpRole::kRPC, {}},
{OpRole::kDist, {}}, {OpRole::kLRSched, {}},
{OpRole::kLoss, {}}, {OpRole::kNotSpecified, {}}};
for (auto* node : graph->Nodes()) {
if (!node->IsOp()) {
continue;
}
auto op_role = BOOST_GET_MUTABLE(int, node->Op()->GetAttr("op_role"));
if (op_role == static_cast<int>(OpRole::kForward)) {
all_ops[OpRole::kForward].insert(node);
} else if (op_role == static_cast<int>(OpRole::kBackward)) {
all_ops[OpRole::kBackward].insert(node);
} else if (op_role == static_cast<int>(OpRole::kOptimize)) {
all_ops[OpRole::kOptimize].insert(node);
} else if (op_role == static_cast<int>(OpRole::kRPC)) {
} else if (op_role == static_cast<int>(OpRole::kDist)) {
} else if (op_role == static_cast<int>(OpRole::kLRSched)) {
} else if (op_role == static_cast<int>(OpRole::kLoss)) {
all_ops[OpRole::kLoss].insert(node);
} else if (op_role == static_cast<int>(OpRole::kNotSpecified)) {
LOG(WARNING) << "Op: " << node->Name() << " OpRole is NotSpecified ";
}
}
std::unordered_set<ir::Node*> forward_vars;
std::unordered_set<ir::Node*> backward_vars;
std::unordered_set<ir::Node*> control_vars;
// forward_vars
for (auto& nodes : std::array<std::unordered_set<ir::Node*>, 2>{
all_ops[OpRole::kForward], all_ops[OpRole::kLoss]}) {
for (auto* node : nodes) {
for (auto* in_node : node->inputs) {
forward_vars.insert(in_node);
}
for (auto* out_node : node->outputs) {
forward_vars.insert(out_node);
}
}
}
// control_vars & backward_vars
for (auto* node : graph->Nodes()) {
if (!node->IsVar()) {
continue;
}
if (node->IsCtrlVar()) {
control_vars.insert(node);
}
for (auto* in_node : node->inputs) {
if (all_ops[OpRole::kOptimize].count(in_node)) {
backward_vars.insert(node);
}
}
}
// all removed node
std::unordered_set<ir::Node*> rm_nodes;
for (auto* node : graph->Nodes()) {
if (backward_vars.count(node)) {
rm_nodes.insert(node);
} else if (control_vars.count(node)) {
rm_nodes.insert(node);
} else if (all_ops[OpRole::kBackward].count(node)) {
rm_nodes.insert(node);
} else if (all_ops[OpRole::kForward].count(node) == 0 &&
all_ops[OpRole::kLoss].count(node) == 0 &&
forward_vars.count(node) == 0) {
rm_nodes.insert(node);
} else if (node->Name() == "feed" || node->Name() == "fetch") {
rm_nodes.insert(node);
}
}
VLOG(10) << "Remove Node: ";
for (auto* node : rm_nodes) {
// remove node relations
for (auto* node_in : node->inputs) {
for (size_t i = 0; i < node_in->outputs.size(); ++i) {
if (node_in->outputs[i] == node) {
node_in->outputs.erase(node_in->outputs.begin() + i);
break;
}
}
}
for (auto* node_out : node->outputs) {
for (size_t i = 0; i < node_out->inputs.size(); ++i) {
if (node_out->inputs[i] == node) {
node_out->inputs.erase(node_out->inputs.begin() + i);
break;
}
}
}
VLOG(10) << "\t" << node->Name();
graph->RemoveNode(node);
}
VLOG(10) << "Post Graph: ";
VLOG(10) << DebugString(graph);
VLOG(10) << "leave ForwardGraphExtractPass::ApplyImpl";
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(forward_graph_extract_pass,
paddle::framework::ir::ForwardGraphExtractPass);
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
class ForwardGraphExtractPass : public IPUPassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/ipu/infer_shape_pass.h"
#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/variable_helper.h"
namespace paddle {
namespace framework {
namespace ir {
void InferShapePass::ApplyImpl(ir::Graph* graph) const {
VLOG(10) << "enter InferShapePass::ApplyImpl";
VLOG(10) << "Raw Graph: ";
VLOG(10) << DebugString(graph);
std::shared_ptr<platform::ipu::IpuBackend> ipu_backend =
platform::ipu::IpuBackend::GetInstance();
auto batch_size = ipu_backend->GetIpuStrategy()->batch_size;
auto feed_list = Get<std::vector<std::string>>("feed_list");
for (auto node : graph->Nodes()) {
if (!node->IsVar()) {
continue;
}
bool is_feed = std::find(feed_list.begin(), feed_list.end(),
node->Name()) != feed_list.end();
if (is_feed) {
auto input_shape = node->Var()->GetShape();
if (input_shape[0] <= -1) {
input_shape[0] = batch_size;
node->Var()->SetShape(input_shape);
}
// int64->int32
if (node->Var()->GetDataType() == proto::VarType::INT64) {
node->Var()->SetDataType(proto::VarType::INT32);
}
}
}
// temp scope for shape inference
std::shared_ptr<paddle::framework::Scope> scope(
new paddle::framework::Scope());
for (auto node : graph->Nodes()) {
if (!node->IsVar()) {
continue;
}
auto var_desc = node->Var();
auto* ptr = scope->Var(var_desc->Name());
paddle::framework::InitializeVariable(ptr, var_desc->GetType());
auto tensor = ptr->GetMutable<paddle::framework::LoDTensor>();
tensor->Resize(paddle::framework::make_ddim(var_desc->GetShape()));
}
// infer shape
auto nodes = ir::TopologySortOperations(*graph);
for (auto node : nodes) {
auto op_desc = node->Op();
auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
paddle::framework::RuntimeContext ctx(op->Inputs(), op->Outputs(), *scope);
op->RuntimeInferShape(*scope, paddle::platform::CPUPlace(), ctx);
for (auto it = ctx.outputs.begin(); it != ctx.outputs.end(); it++) {
for (int i = 0; i < it->second.size(); i++) {
auto output_name = op_desc->Output(it->first)[i];
auto dim =
it->second[i]->GetMutable<paddle::framework::LoDTensor>()->dims();
auto new_shape = paddle::framework::vectorize(dim);
for (auto output_node : node->outputs) {
if (output_node->Name() == output_name) {
output_node->Var()->SetShape(new_shape);
}
}
}
}
}
// release the temp scope
scope.reset();
VLOG(10) << "Post Graph: ";
VLOG(10) << DebugString(graph);
VLOG(10) << "leave InferShapePass::ApplyImpl";
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(infer_shape_pass, paddle::framework::ir::InferShapePass)
.RequirePassAttr("feed_list");
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
class InferShapePass : public IPUPassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/ipu/inference_postprocess_pass.h"
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
#include "paddle/fluid/platform/device/ipu/ipu_strategy.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace ir {
void InferencePostprocessPass::ApplyImpl(ir::Graph *graph) const {
VLOG(10) << "enter InferencePostprocessPass::ApplyImpl";
std::vector<std::string> feed_list;
feed_list = Get<std::vector<std::string>>("feed_list");
std::vector<std::string> fetch_list;
fetch_list = Get<std::vector<std::string>>("fetch_list");
auto *feed_var = new paddle::framework::VarDesc("feed");
feed_var->SetType(proto::VarType::FEED_MINIBATCH);
auto *feed_var_node = graph->CreateVarNode(feed_var);
auto *fetch_var = new paddle::framework::VarDesc("fetch");
fetch_var->SetType(proto::VarType::FETCH_LIST);
auto *fetch_var_node = graph->CreateVarNode(fetch_var);
for (int i = 0; i < feed_list.size(); i++) {
for (auto node : graph->Nodes()) {
if (node->Name() == feed_list[i]) {
auto *op = new paddle::framework::OpDesc();
op->SetType("feed");
op->SetInput("X", {"feed"});
op->SetOutput("Out", {node->Name()});
op->SetAttr("col", i);
auto *op_node = graph->CreateOpNode(op);
node->inputs.push_back(op_node);
op_node->outputs.push_back(node);
feed_var_node->outputs.push_back(op_node);
op_node->inputs.push_back(feed_var_node);
break;
}
}
}
for (int i = 0; i < fetch_list.size(); i++) {
for (auto node : graph->Nodes()) {
if (node->Name() == fetch_list[i]) {
auto *op = new paddle::framework::OpDesc();
op->SetType("fetch");
op->SetInput("X", {node->Name()});
op->SetOutput("Out", {"fetch"});
op->SetAttr("col", i);
auto *op_node = graph->CreateOpNode(op);
node->outputs.push_back(op_node);
op_node->inputs.push_back(node);
fetch_var_node->inputs.push_back(op_node);
op_node->outputs.push_back(fetch_var_node);
break;
}
}
}
VLOG(10) << "leave InferencePostprocessPass::ApplyImpl";
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(inference_postprocess_pass,
paddle::framework::ir::InferencePostprocessPass)
.RequirePassAttr("feed_list")
.RequirePassAttr("fetch_list");
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
class InferencePostprocessPass : public IPUPassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/ipu/inference_process_pass.h"
#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
#include "paddle/fluid/platform/device/ipu/ipu_strategy.h"
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace ir {
void InferenceProcessPass::ApplyImpl(ir::Graph* graph) const {
VLOG(10) << "enter InferenceProcessPass::ApplyImpl";
// Get a new instance of ipu_backend
std::shared_ptr<platform::ipu::IpuBackend> ipu_backend =
platform::ipu::IpuBackend::GetNewInstance();
// Set scope
auto& scope = graph->Get<Scope>(kParamScopeAttr);
ipu_backend->SetScope(scope);
// Set ipu_strategy
static std::shared_ptr<platform::ipu::IpuStrategy> ipu_strategy_instance_(
new platform::ipu::IpuStrategy());
ipu_strategy_instance_->is_training = false;
auto num_ipus = graph->Get<int>("num_ipus");
ipu_strategy_instance_->num_ipus = num_ipus;
if (num_ipus > 1) {
ipu_strategy_instance_->popart_options_.virtualGraphMode =
platform::ipu::VirtualGraphMode::Manual;
} else {
ipu_strategy_instance_->popart_options_.virtualGraphMode =
platform::ipu::VirtualGraphMode::Off;
}
auto enable_pipelining = graph->Get<bool>("enable_pipelining");
ipu_strategy_instance_->popart_options_.enablePipelining = enable_pipelining;
if (enable_pipelining) {
auto batches_per_step = graph->Get<int>("batches_per_step");
PADDLE_ENFORCE_GE(
batches_per_step, num_ipus,
platform::errors::InvalidArgument("Batched per step should be equal or "
"greater than the number of IPUs"));
ipu_strategy_instance_->batches_per_step = batches_per_step;
}
ipu_strategy_instance_->batch_size = graph->Get<int>("batch_size");
ipu_strategy_instance_->need_avg_shard = graph->Get<bool>("need_avg_shard");
ipu_backend->SetIpuStrategy(*(ipu_strategy_instance_.get()));
// Get feed_list and fetch list
std::vector<std::string> feed_list = {};
std::vector<std::string> fetch_list = {};
for (auto node : graph->Nodes()) {
if (node->Name() == "feed") {
if (node->IsOp()) {
feed_list.push_back("");
}
} else if (node->Name() == "fetch") {
if (node->IsOp()) {
fetch_list.push_back("");
}
}
}
for (auto node : graph->Nodes()) {
if (node->Name() == "feed") {
if (node->IsOp()) {
feed_list[BOOST_GET_CONST(int, node->Op()->GetAttr("col"))] =
node->outputs[0]->Name();
}
} else if (node->Name() == "fetch") {
if (node->IsOp()) {
fetch_list[BOOST_GET_CONST(int, node->Op()->GetAttr("col"))] =
node->inputs[0]->Name();
}
}
}
// Run passes
std::vector<std::string> graph_pass = {"forward_graph_extract_pass",
"infer_shape_pass", "avg_shard_pass",
"popart_canonicalization_pass"};
std::vector<std::string> compile_pass = {
"ipu_inplace_pass", "ipu_graph_builder_pass", "ipu_runtime_replacer_pass",
"inference_postprocess_pass"};
for (auto pass_name : graph_pass) {
auto pass = PassRegistry::Instance().Get(pass_name);
if (pass_name == "infer_shape_pass") {
pass->Set("feed_list", new std::vector<std::string>(feed_list.begin(),
feed_list.end()));
}
pass->Apply(graph);
}
for (auto pass_name : compile_pass) {
auto pass = PassRegistry::Instance().Get(pass_name);
pass->Set("feed_list",
new std::vector<std::string>(feed_list.begin(), feed_list.end()));
pass->Set("fetch_list", new std::vector<std::string>(fetch_list.begin(),
fetch_list.end()));
pass->Apply(graph);
}
VLOG(10) << "leave InferenceProcessPass::ApplyImpl";
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(inference_process_pass,
paddle::framework::ir::InferenceProcessPass);
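InferenceProcessPass above recovers ordered feed and fetch lists in two phases: a first loop over the graph only counts feed/fetch ops to size the vectors, and a second loop places each variable name at the index given by the op's "col" attribute, so the lists come out in column order no matter how the graph is iterated. A standalone sketch of that two-phase fill (the op names and column indices below are made up):

#include <iostream>
#include <string>
#include <utility>
#include <vector>

int main() {
  // Stand-ins for feed ops discovered in arbitrary graph order: {col, name}.
  std::vector<std::pair<int, std::string>> feed_ops = {
      {2, "image"}, {0, "label"}, {1, "mask"}};

  std::vector<std::string> feed_list(feed_ops.size());  // phase 1: size it
  for (const auto& op : feed_ops) {
    feed_list[op.first] = op.second;  // phase 2: fill by the "col" index
  }
  for (size_t i = 0; i < feed_list.size(); ++i) {
    std::cout << "col " << i << ": " << feed_list[i] << "\n";
  }
  return 0;
}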
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
class InferenceProcessPass : public IPUPassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
namespace paddle {
namespace framework {
namespace ir {
void IpuGraphBuilderPass::ApplyImpl(ir::Graph* graph) const {
VLOG(10) << "enter IpuGraphBuilderPass::ApplyImpl";
VLOG(10) << "Raw Graph: ";
VLOG(10) << DebugString(graph);
std::vector<std::string> feed_list;
feed_list = Get<std::vector<std::string>>("feed_list");
std::vector<std::string> fetch_list;
fetch_list = Get<std::vector<std::string>>("fetch_list");
std::shared_ptr<platform::ipu::IpuBackend> ipu_backend =
platform::ipu::IpuBackend::GetInstance();
ipu_backend->Compile(graph, feed_list, fetch_list);
VLOG(10) << "Post Graph: ";
VLOG(10) << DebugString(graph);
VLOG(10) << "leave IpuGraphBuilderPass::ApplyImpl";
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(ipu_graph_builder_pass,
paddle::framework::ir::IpuGraphBuilderPass)
.RequirePassAttr("feed_list")
.RequirePassAttr("fetch_list");
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
class IpuGraphBuilderPass : public IPUPassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/ipu/ipu_inplace_pass.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
namespace paddle {
namespace framework {
namespace ir {
std::string GenerateVarName(Node *node) {
return node->Name() + "_" + std::to_string(node->id());
}
void IpuInplacePass::ApplyImpl(ir::Graph *graph) const {
// Use this pass after forward_graph_extract_pass.
// Raise an error if an inplaced var appears in both feed_list and fetch_list.
VLOG(10) << "enter IpuInplacePass::ApplyImpl";
VLOG(10) << "Raw Graph: ";
VLOG(10) << DebugString(graph);
std::vector<std::string> feed_list;
feed_list = Get<std::vector<std::string>>("feed_list");
std::vector<std::string> fetch_list;
fetch_list = Get<std::vector<std::string>>("fetch_list");
std::map<std::string, int> var_name;
for (auto *node : graph->Nodes()) {
if (node->IsVar()) {
if (var_name.find(node->Name()) == var_name.end()) {
var_name.emplace(node->Name(), 1);
} else {
var_name[node->Name()]++;
}
}
}
for (auto *node : graph->Nodes()) {
if (node->IsVar()) {
if (var_name[node->Name()] > 1) {
auto is_feed = (std::find(feed_list.begin(), feed_list.end(),
node->Name()) != feed_list.end()) &&
(node->inputs.size() == 0);
auto is_fetch = (std::find(fetch_list.begin(), fetch_list.end(),
node->Name()) != fetch_list.end()) &&
(node->outputs.size() == 0);
if (!is_feed && !is_fetch && !node->Var()->Persistable()) {
auto old_name = node->Name();
auto new_name = GenerateVarName(node);
node->RenameVar(new_name);
for (auto *op_in : node->inputs) {
op_in->Op()->RenameOutput(old_name, new_name);
}
for (auto *op_out : node->outputs) {
op_out->Op()->RenameInput(old_name, new_name);
}
}
}
}
}
VLOG(10) << "Post Graph: ";
VLOG(10) << DebugString(graph);
VLOG(10) << "leave IpuInplacePass::ApplyImpl";
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(ipu_inplace_pass, paddle::framework::ir::IpuInplacePass)
.RequirePassAttr("feed_list")
.RequirePassAttr("fetch_list");
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
class IpuInplacePass : public IPUPassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
void IPUPassBase::Init(const std::string& repr, Graph* graph) const {
repr_ = repr;
graph_ = graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
namespace ir {
class IPUPassBase : public Pass {
public:
void Init(const std::string& repr, Graph* graph) const;
virtual ~IPUPassBase() {}
protected:
mutable Graph* graph_;
mutable std::string repr_;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
namespace paddle {
namespace framework {
namespace ir {
void IpuRuntimeReplacerPass::ApplyImpl(ir::Graph* graph) const {
VLOG(10) << "enter IpuRuntimeReplacerPass::ApplyImpl";
VLOG(10) << "Raw Graph: ";
VLOG(10) << DebugString(graph);
std::vector<std::string> feed_list;
feed_list = Get<std::vector<std::string>>("feed_list");
std::vector<std::string> fetch_list;
fetch_list = Get<std::vector<std::string>>("fetch_list");
framework::OpDesc ipu_rt_op_desc;
ipu_rt_op_desc.SetType("ipu_runtime");
ipu_rt_op_desc.SetInput("FeedList", feed_list);
ipu_rt_op_desc.SetOutput("FetchList", fetch_list);
ipu_rt_op_desc.Flush();
// Create a new node for the ipu_runtime_op.
auto* ipu_rt_node = graph->CreateOpNode(&ipu_rt_op_desc);
for (auto* node : graph->Nodes()) {
if (node->IsVar()) {
for (auto feed : feed_list) {
if (node->Name() == feed) {
IR_NODE_LINK_TO(node, ipu_rt_node);
}
}
for (auto fetch : fetch_list) {
if (node->Name() == fetch) {
IR_NODE_LINK_TO(ipu_rt_node, node);
}
}
}
}
// set ipu_runtime_op dtype attr
if (fetch_list.size() == 1) {
for (auto* node : graph->Nodes()) {
if (node->IsVar()) {
for (auto fetch : fetch_list) {
if (node->Name() == fetch) {
ipu_rt_node->Op()->SetAttr("dtype", node->Var()->GetDataType());
}
}
}
}
}
// Remove unneeded nodes.
std::unordered_set<const Node*> marked_nodes;
for (auto* node : graph->Nodes()) {
if (node->IsOp()) {
auto* op_desc = node->Op();
if (op_desc->Type() != "ipu_runtime") {
marked_nodes.insert(node);
}
}
}
GraphSafeRemoveNodes(graph, marked_nodes);
VLOG(10) << "Post Graph: ";
VLOG(10) << DebugString(graph);
VLOG(10) << "leave IpuRuntimeReplacerPass::ApplyImpl";
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(ipu_runtime_replacer_pass,
paddle::framework::ir::IpuRuntimeReplacerPass)
.RequirePassAttr("feed_list")
.RequirePassAttr("fetch_list");
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
class IpuRuntimeReplacerPass : public IPUPassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/ipu/optimizer_extract_pass.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
namespace paddle {
namespace framework {
namespace ir {
void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const {
VLOG(10) << "enter IpuOptimizerExtractPass::ApplyImpl";
VLOG(10) << "Raw Graph: ";
VLOG(10) << DebugString(graph);
auto ipu_backend = paddle::platform::ipu::IpuBackend::GetInstance();
for (auto* node : graph->Nodes()) {
if (node->IsOp() && node->Op()) {
int op_role = BOOST_GET_CONST(
int, node->Op()->GetAttr(
framework::OpProtoAndCheckerMaker::OpRoleAttrName()));
// A graph usually has multiple optimizer nodes for different parameters,
// and these nodes usually have the same type and attr values.
if ((op_role == static_cast<int>(framework::OpRole::kOptimize))) {
ipu_backend->GetExecutor().SetOptimizerType(node->Op()->Type());
VLOG(10) << "found optimizer type: " << node->Op()->Type();
for (const std::string& attr_name : node->Op()->AttrNames()) {
auto attr_type = node->Op()->GetAttrType(attr_name);
// with Adam, the attrs are floats
if (attr_type == proto::AttrType::FLOAT) {
auto attr_value =
BOOST_GET_CONST(float, node->Op()->GetAttr(attr_name));
ipu_backend->GetExecutor().SetOptimizerAttr(attr_name, attr_value);
} else {
VLOG(10) << "Skip " << attr_type;
}
}
auto lr_var_name = node->Op()->Input("LearningRate");
PADDLE_ENFORCE_EQ(lr_var_name.size(), 1u,
platform::errors::InvalidArgument(
"In op(%s), find input(LearningRate) failed.",
node->Op()->Type()));
ipu_backend->GetExecutor().SetLRVarName(lr_var_name[0]);
}
if ((op_role == static_cast<int>(framework::OpRole::kLoss))) {
VLOG(10) << "found loss op type: " << node->Op()->Type();
auto outputs = node->Op()->Outputs();
PADDLE_ENFORCE_EQ(
outputs.size(), 1,
platform::errors::InvalidArgument("Can only support one loss key"));
auto losses_name = outputs.begin()->second;
PADDLE_ENFORCE_EQ(losses_name.size(), 1,
platform::errors::InvalidArgument(
"Can only support one loss name"));
ipu_backend->GetExecutor().SetLoss(losses_name[0]);
}
}
}
VLOG(10) << "Post Graph: ";
VLOG(10) << DebugString(graph);
VLOG(10) << "leave IpuOptimizerExtractPass::ApplyImpl";
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(optimizer_extract_pass,
paddle::framework::ir::IpuOptimizerExtractPass);
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
class IpuOptimizerExtractPass : public IPUPassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
#include "paddle/fluid/platform/device/ipu/common.h"
#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
namespace paddle {
namespace framework {
namespace ir {
using paddle::platform::ipu::IpuBackend;
using framework::ir::Graph;
using framework::ir::Node;
void IpuOptimizerStateAlignPass::ApplyImpl(ir::Graph* graph) const {
VLOG(10) << "enter IpuOptimizerStateAlignPass::ApplyImpl";
VLOG(10) << "Raw Graph: ";
VLOG(10) << DebugString(graph);
auto ipu_backend = IpuBackend::GetInstance();
const auto* scope_ = ipu_backend->GetScope();
for (auto* node : graph->Nodes()) {
if (node->IsOp() && node->Op()) {
int op_role = BOOST_GET_CONST(
int, node->Op()->GetAttr(
framework::OpProtoAndCheckerMaker::OpRoleAttrName()));
if ((op_role == static_cast<int>(framework::OpRole::kOptimize))) {
auto inputs = node->Op()->Inputs();
if (inputs.count(platform::ipu::sBeta1Pow)) {
auto var = scope_->GetVar(inputs.at(platform::ipu::sBeta1Pow)[0]);
auto data = var->GetMutable<framework::LoDTensor>()->data<float>();
auto beta = BOOST_GET_CONST(
float, node->Op()->GetAttr(platform::ipu::sBeta1));
// Ensure the current save is beta1pow, rather than step.
// beta1pow = beta1 ^ (step + 1). Just set beta1pow because popart
// supports a single Step__
bool save_with_beta1pow = (data[0] < 1.0f) && (data[0] > 0.0f);
float step = 0;
float beta_acc = beta;
while (beta_acc > data[0] && save_with_beta1pow) {
beta_acc *= beta;
step += 1;
}
if (save_with_beta1pow) {
data[0] = step;
}
}
}
}
}
VLOG(10) << "Post Graph: ";
VLOG(10) << DebugString(graph);
VLOG(10) << "leave IpuOptimizerStateAlignPass::ApplyImpl";
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(optimizer_state_align_pass,
paddle::framework::ir::IpuOptimizerStateAlignPass);
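The alignment loop in IpuOptimizerStateAlignPass above converts a stored beta1pow value back into a step count by repeatedly multiplying by beta1, using the relation beta1pow = beta1 ^ (step + 1). A standalone numeric sketch of that recovery (the beta1 value and update count below are made up):

#include <iostream>

int main() {
  const float beta1 = 0.9f;
  // Pretend the optimizer state holds beta1pow after five updates,
  // i.e. beta1pow = beta1 ^ 5, so step + 1 == 5.
  float stored_beta1pow = 1.0f;
  for (int i = 0; i < 5; ++i) stored_beta1pow *= beta1;

  float step = 0;
  float beta_acc = beta1;
  while (beta_acc > stored_beta1pow) {  // same loop shape as the pass
    beta_acc *= beta1;
    step += 1;
  }
  std::cout << "recovered step = " << step << "\n";  // prints 4
  return 0;
}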
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
/*
 * This pass should only affect optimizers that need bias correction,
 * including Adam/Lamb.
*/
class IpuOptimizerStateAlignPass : public IPUPassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h"
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/post_canonicalization.h"
namespace paddle {
namespace framework {
namespace ir {
using framework::ir::Graph;
using framework::ir::Node;
using platform::ipu::SymbolHandler;
void PopartCanonicalizationPass::ApplyImpl(ir::Graph* graph) const {
VLOG(10) << "enter PopartCanonicalizationPass::ApplyImpl";
VLOG(10) << "Raw Graph: ";
VLOG(10) << DebugString(graph);
auto nodes = graph->Nodes();
for (auto* node : nodes) {
if (!node->IsOp()) {
continue;
}
auto* op = node->Op();
auto op_type = op->Type();
ir::Node* new_node = nullptr;
SymbolHandler handler = platform::ipu::GetHandler(op_type);
if (handler) {
VLOG(11) << "Raw Paddle Node:";
VLOG(11) << node->Op()->Proto()->DebugString();
new_node = handler(graph, node);
VLOG(11) << "Post Popart Node:";
VLOG(11) << new_node->Op()->Proto()->DebugString();
platform::ipu::ClearNode(node);
graph->RemoveNode(node);
} else {
LOG(ERROR) << "Can not find OpHandler for op_type: " << op_type;
}
}
VLOG(10) << "Post Graph: ";
VLOG(10) << DebugString(graph);
VLOG(10) << "leave PopartCanonicalizationPass::ApplyImpl";
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(popart_canonicalization_pass,
paddle::framework::ir::PopartCanonicalizationPass);
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
class PopartCanonicalizationPass : public IPUPassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -84,13 +84,16 @@ void TransposeFlattenConcatFusePass::RunTransposeFlattenConcatFuse(
LOG(WARNING) << "Pass in op compat failed.";
return;
}
const int kNumFields = 5;
const int kTransOffset = 1;
const int kTransOutOffset = 2;
const int kFlattenOffset = 3;
const int kFlattenOutOffset = 4;
std::vector<Node *> nodes;
std::vector<Node *> nodes;
std::vector<int> trans_axis0;
int flatten_axis0;
for (int i = 0; i < times; i++) {
PADDLE_ENFORCE_NOT_NULL(
subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i))),
......@@ -112,6 +115,33 @@ void TransposeFlattenConcatFusePass::RunTransposeFlattenConcatFuse(
platform::errors::NotFound("Can not find %s in subgraph.",
input_nodes[i]->name()));
if (i == 0) {
trans_axis0 = BOOST_GET_CONST(
std::vector<int>,
subgraph.at(pattern.GetPDNode("transpose" + std::to_string(0)))
->Op()
->GetAttr("axis"));
flatten_axis0 = BOOST_GET_CONST(
int, subgraph.at(pattern.GetPDNode("flatten" + std::to_string(0)))
->Op()
->GetAttr("axis"));
} else {
std::vector<int> trans_axis = BOOST_GET_CONST(
std::vector<int>,
subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i)))
->Op()
->GetAttr("axis"));
// All axes of the transposes should be the same
if (trans_axis0 != trans_axis) return;
int flatten_axis = BOOST_GET_CONST(
int, subgraph.at(pattern.GetPDNode("flatten" + std::to_string(0)))
->Op()
->GetAttr("axis"));
// All axes of the flattens should be the same
if (flatten_axis0 != flatten_axis) return;
}
nodes.push_back(subgraph.at(input_nodes[i]));
nodes.push_back(
subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i))));
......
......@@ -327,6 +327,9 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
#define REGISTER_OP_CPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
#define REGISTER_OP_IPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, IPU, ::paddle::platform::IPUPlace, __VA_ARGS__)
#define REGISTER_OP_XPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, XPU, ::paddle::platform::XPUPlace, __VA_ARGS__)
......
......@@ -2,7 +2,7 @@ cc_library(cinn_cache_key SRCS cinn_cache_key.cc DEPS boost graph graph_helper l
cc_library(build_cinn_pass SRCS build_cinn_pass.cc DEPS pass subgraph_detector graph_pattern_detector cinn_compiler errors enforce)
cc_library(transform_desc SRCS transform_desc.cc DEPS proto_desc cinn)
cc_library(cinn_graph_symbolization SRCS cinn_graph_symbolization.cc DEPS lod_tensor graph transform_desc cinn)
cc_library(cinn_compiler SRCS cinn_compiler.cc DEPS framework_proto graph lod_tensor cinn_cache_key cinn_graph_symbolization cinn)
cc_library(cinn_compiler SRCS cinn_compiler.cc DEPS framework_proto graph lod_tensor cinn_cache_key cinn_graph_symbolization cinn cinn_launch_context)
if (WITH_TESTING)
cc_test(cinn_lib_test SRCS cinn_lib_test.cc DEPS cinn)
......
......@@ -29,55 +29,32 @@ namespace paddle {
namespace framework {
namespace paddle2cinn {
using GraphHashStrategy = CinnCacheKey::GraphHashStrategy;
CinnCacheKey::CinnCacheKey(GraphHashStrategy graph_hash)
: graph_hash_(graph_hash) {}
CinnCacheKey::CinnCacheKey(
const ir::Graph& graph,
const std::map<std::string, const LoDTensor*>& input_tensors,
const std::string& arch_str) {
const std::string& arch_str, GraphHashStrategy graph_hash)
: graph_hash_(graph_hash) {
this->SetKey(graph, input_tensors, arch_str);
}
CinnCacheKey::CinnCacheKey(const ir::Graph& graph,
const std::map<std::string, DDim>& input_shapes,
const std::string& arch_str) {
const std::string& arch_str,
GraphHashStrategy graph_hash)
: graph_hash_(graph_hash) {
this->SetKey(graph, input_shapes, arch_str);
}
size_t CinnCacheKey::HashGraph(const ir::Graph& graph) {
// using Dot to uniquely represent the graph
inference::analysis::Dot dot;
std::unordered_map<const ir::Node*, std::string> node2dot;
int id = 0;
// Create nodes
// graph.Nodes() returns an unordered_set, so the same graph may
// return a different result?
for (const ir::Node* n : graph.Nodes()) {
std::string node_id = std::to_string(id++);
dot.AddNode(node_id, {}, n->Name(), true);
node2dot[n] = node_id;
}
// Create edges
for (const ir::Node* n : graph.Nodes()) {
const auto& src_id = node2dot.at(n);
for (auto* out : n->outputs) {
const auto& dest_id = node2dot.at(out);
dot.AddEdge(src_id, dest_id, {});
}
}
const std::string& viz_graph = dot.Build();
VLOG(1) << "The hash graph:\n" << viz_graph;
size_t hash_val = std::hash<std::string>()(viz_graph);
VLOG(4) << "The graph's hash value is: " << hash_val;
return hash_val;
}
void CinnCacheKey::SetKey(
const ir::Graph& graph,
const std::map<std::string, const LoDTensor*>& input_tensors,
const std::string& arch_str) {
graph_serialize_str_ = std::to_string(HashGraph(graph));
graph_hash_val_ = graph_hash_(graph);
for (const auto& name_tensor : input_tensors) {
input_shapes_[name_tensor.first] = name_tensor.second->dims();
}
......@@ -87,7 +64,7 @@ void CinnCacheKey::SetKey(
void CinnCacheKey::SetKey(const ir::Graph& graph,
const std::map<std::string, DDim>& input_shapes,
const std::string& arch_str) {
graph_serialize_str_ = std::to_string(HashGraph(graph));
graph_hash_val_ = graph_hash_(graph);
input_shapes_ = input_shapes;
arch_str_ = arch_str;
}
......@@ -97,7 +74,7 @@ bool CinnCacheKey::operator!=(const CinnCacheKey& other) const {
}
bool CinnCacheKey::operator==(const CinnCacheKey& other) const {
return graph_serialize_str_ == other.graph_serialize_str_ &&
return graph_hash_val_ == other.graph_hash_val_ &&
input_shapes_ == other.input_shapes_ && arch_str_ == other.arch_str_;
}
......@@ -114,11 +91,48 @@ size_t CinnCacheKey::Hash::operator()(const CinnCacheKey& key) const {
ret = hash_combine(ret, string_hasher(name_shape.second.to_str()));
}
ret = hash_combine(ret, string_hasher(key.graph_serialize_str_));
ret = hash_combine(ret, key.graph_hash_val_);
ret = hash_combine(ret, string_hasher(key.arch_str_));
return ret;
}
size_t CinnCacheKeyByStructure::HashGraph(const ir::Graph& graph) {
// sort graph nodes by name and id.
auto compare = [](ir::Node* n1, ir::Node* n2) {
return (n1->Name() == n2->Name()) ? (n1->id() < n2->id())
: (n1->Name() < n2->Name());
};
// graph.Nodes() returns an unordered_set; an ordered set is used here so that
// the same graph does not produce different results
std::set<ir::Node *, bool (*)(ir::Node *, ir::Node *)> node_set(compare),
output_set(compare);
node_set.insert(graph.Nodes().begin(), graph.Nodes().end());
std::string hash_str;
for (ir::Node* n : node_set) {
hash_str.append(n->Name());
output_set.clear();
output_set.insert(n->outputs.begin(), n->outputs.end());
for (auto* out : output_set) {
hash_str.append(out->Name());
}
}
VLOG(1) << "The hash graph:\n" << hash_str;
size_t hash_val = std::hash<std::string>()(hash_str);
VLOG(4) << "The graph's hash value by graph structure is: " << hash_val;
return hash_val;
}
size_t CinnCacheKeyByAddress::HashGraph(const ir::Graph& graph) {
size_t hash_val = reinterpret_cast<size_t>(&graph);
VLOG(4) << "The graph's hash value by graph address is: " << hash_val;
return hash_val;
}
} // namespace paddle2cinn
} // namespace framework
} // namespace paddle
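CinnCacheKeyByStructure::HashGraph above makes the hash deterministic by visiting nodes in name/id order and appending each node's sorted output names, so the result no longer depends on unordered_set iteration order. A standalone sketch of that idea using a toy node type instead of ir::Node (ToyNode and the tiny x -> mul -> y graph below are made up for illustration):

#include <functional>
#include <iostream>
#include <set>
#include <string>
#include <vector>

struct ToyNode {
  std::string name;
  int id;
  std::vector<const ToyNode*> outputs;
};

int main() {
  ToyNode x{"x", 0, {}}, y{"y", 1, {}}, mul{"mul", 2, {}};
  mul.outputs = {&y};
  x.outputs = {&mul};

  auto compare = [](const ToyNode* a, const ToyNode* b) {
    return (a->name == b->name) ? (a->id < b->id) : (a->name < b->name);
  };
  std::set<const ToyNode*, decltype(compare)> node_set(compare);
  node_set.insert(&mul);
  node_set.insert(&x);
  node_set.insert(&y);

  std::string hash_str;
  for (const ToyNode* n : node_set) {
    hash_str.append(n->name);
    std::set<const ToyNode*, decltype(compare)> output_set(compare);
    output_set.insert(n->outputs.begin(), n->outputs.end());
    for (const ToyNode* out : output_set) hash_str.append(out->name);
  }
  std::cout << "hash string: " << hash_str << "\n";  // "mulyxmuly"
  std::cout << "hash value: " << std::hash<std::string>()(hash_str) << "\n";
  return 0;
}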
......@@ -14,6 +14,7 @@
#pragma once
#include <functional>
#include <map>
#include "paddle/fluid/framework/ddim.h"
......@@ -33,14 +34,18 @@ namespace paddle2cinn {
// shapes.
class CinnCacheKey {
public:
using GraphHashStrategy = std::function<size_t(const ir::Graph&)>;
explicit CinnCacheKey(GraphHashStrategy graph_hash);
CinnCacheKey(const ir::Graph& graph,
const std::map<std::string, const LoDTensor*>& input_tensors,
const std::string& arch_str);
const std::string& arch_str, GraphHashStrategy graph_hash);
CinnCacheKey(const ir::Graph& graph,
const std::map<std::string, DDim>& input_shapes,
const std::string& arch_str);
const std::string& arch_str, GraphHashStrategy graph_hash);
~CinnCacheKey() {}
~CinnCacheKey() = default;
void SetKey(const ir::Graph& graph,
const std::map<std::string, const LoDTensor*>& input_tensors,
......@@ -58,13 +63,38 @@ class CinnCacheKey {
};
private:
size_t HashGraph(const ir::Graph& graph);
std::string graph_serialize_str_;
GraphHashStrategy graph_hash_;
size_t graph_hash_val_;
std::map<std::string, DDim> input_shapes_;
std::string arch_str_;
};
#define CINN_CACHE_KEY_CREATE(NAME) \
class NAME : public CinnCacheKey { \
public: \
NAME() : CinnCacheKey(HashGraph) {} \
\
NAME(const ir::Graph& graph, \
const std::map<std::string, const LoDTensor*>& input_tensors, \
const std::string& arch_str) \
: CinnCacheKey(graph, input_tensors, arch_str, HashGraph) {} \
\
NAME(const ir::Graph& graph, \
const std::map<std::string, DDim>& input_shapes, \
const std::string& arch_str) \
: CinnCacheKey(graph, input_shapes, arch_str, HashGraph) {} \
\
private: \
static size_t HashGraph(const ir::Graph& graph); \
};
// Class to store the keys by graph address for compiling CINN.
CINN_CACHE_KEY_CREATE(CinnCacheKeyByAddress)
// Class to store the keys by graph structure for compiling CINN.
CINN_CACHE_KEY_CREATE(CinnCacheKeyByStructure)
#undef CINN_CACHE_KEY_CREATE
} // namespace paddle2cinn
} // namespace framework
} // namespace paddle
......@@ -26,8 +26,8 @@ namespace paddle {
namespace framework {
namespace paddle2cinn {
TEST(CinnCacheKeyTest, TestAsUnorderedKey) {
std::unordered_set<CinnCacheKey, CinnCacheKey::Hash> test_set;
TEST(CinnCacheKeyTest, TestAsUnorderedKeyByStructure) {
std::unordered_set<CinnCacheKeyByStructure, CinnCacheKey::Hash> test_set;
ProgramDesc empty_program;
ir::Graph empty_graph(empty_program);
......@@ -47,19 +47,20 @@ TEST(CinnCacheKeyTest, TestAsUnorderedKey) {
DDim ddim = paddle::framework::make_ddim({1, 2, 3});
std::map<std::string, DDim> feed_shapes = {{"X", ddim}};
CinnCacheKey cache_key0(empty_graph, feed_tensors, "x86");
CinnCacheKey cache_key1(empty_graph, feed_shapes, "x86");
CinnCacheKeyByStructure cache_key0(empty_graph, feed_tensors, "x86");
CinnCacheKeyByStructure cache_key1(empty_graph, feed_shapes, "x86");
EXPECT_EQ(cache_key0, cache_key1);
CinnCacheKey cache_key2(graph, feed_shapes, "x86");
CinnCacheKey cache_key3(graph, feed_shapes, "nvgpu");
CinnCacheKey cache_key4(graph, feed_tensors, "nvgpu");
CinnCacheKeyByStructure cache_key2(graph, feed_shapes, "x86");
CinnCacheKeyByStructure cache_key3(graph, feed_shapes, "nvgpu");
CinnCacheKeyByStructure cache_key4(graph, feed_tensors, "nvgpu");
EXPECT_NE(cache_key2, cache_key3);
EXPECT_EQ(cache_key3, cache_key4);
CinnCacheKey cache_key5(empty_graph,
std::map<std::string, const LoDTensor *>(), "unk");
CinnCacheKey cache_key6(empty_graph, std::map<std::string, DDim>(), "unk");
CinnCacheKeyByStructure cache_key5(
empty_graph, std::map<std::string, const LoDTensor *>(), "unk");
CinnCacheKeyByStructure cache_key6(empty_graph, std::map<std::string, DDim>(),
"unk");
EXPECT_EQ(cache_key5, cache_key6);
EXPECT_NE(cache_key1, cache_key3);
......@@ -98,6 +99,107 @@ TEST(CinnCacheKeyTest, TestAsUnorderedKey) {
EXPECT_EQ(test_set.find(cache_key6), test_set.end());
}
TEST(CinnCacheKeyTest, TestAsUnorderedKeyByAddress) {
std::unordered_set<CinnCacheKeyByAddress, CinnCacheKey::Hash> test_set;
ProgramDesc empty_program;
ir::Graph empty_graph(empty_program);
ProgramDesc program;
auto *global_block = program.MutableBlock(0);
auto *x = global_block->Var("X");
x->SetType(proto::VarType::LOD_TENSOR);
ir::Graph graph(program);
LoDTensor tensor;
tensor.Resize({1, 2, 3});
const LoDTensor *tensor_pointer = &tensor;
std::map<std::string, const LoDTensor *> feed_tensors = {
{"X", tensor_pointer}};
DDim ddim = paddle::framework::make_ddim({1, 2, 3});
std::map<std::string, DDim> feed_shapes = {{"X", ddim}};
CinnCacheKeyByAddress cache_key0(empty_graph, feed_tensors, "x86");
CinnCacheKeyByAddress cache_key1(empty_graph, feed_shapes, "x86");
EXPECT_EQ(cache_key0, cache_key1);
CinnCacheKeyByAddress cache_key2(graph, feed_shapes, "x86");
CinnCacheKeyByAddress cache_key3(graph, feed_shapes, "nvgpu");
CinnCacheKeyByAddress cache_key4(graph, feed_tensors, "nvgpu");
EXPECT_NE(cache_key2, cache_key3);
EXPECT_EQ(cache_key3, cache_key4);
CinnCacheKeyByAddress cache_key5(
empty_graph, std::map<std::string, const LoDTensor *>(), "unk");
CinnCacheKeyByAddress cache_key6(empty_graph, std::map<std::string, DDim>(),
"unk");
EXPECT_EQ(cache_key5, cache_key6);
EXPECT_NE(cache_key1, cache_key3);
EXPECT_NE(cache_key4, cache_key2);
EXPECT_NE(cache_key3, cache_key5);
EXPECT_NE(cache_key6, cache_key4);
EXPECT_NE(cache_key5, cache_key1);
EXPECT_NE(cache_key2, cache_key6);
test_set.insert(cache_key0);
test_set.insert(cache_key1);
test_set.insert(cache_key3);
test_set.insert(cache_key4);
test_set.insert(cache_key5);
test_set.insert(cache_key6);
EXPECT_EQ(test_set.size(), 3U);
auto iter = test_set.find(cache_key0);
EXPECT_NE(iter, test_set.end());
test_set.erase(iter);
EXPECT_EQ(test_set.size(), 2U);
EXPECT_EQ(test_set.find(cache_key1), test_set.end());
iter = test_set.find(cache_key3);
EXPECT_NE(iter, test_set.end());
test_set.erase(iter);
EXPECT_EQ(test_set.size(), 1U);
EXPECT_EQ(test_set.find(cache_key4), test_set.end());
iter = test_set.find(cache_key5);
EXPECT_NE(iter, test_set.end());
test_set.erase(iter);
EXPECT_EQ(test_set.size(), 0U);
EXPECT_EQ(test_set.find(cache_key6), test_set.end());
}
TEST(CinnCacheKeyTest, TestSameGraph) {
ProgramDesc program1;
auto *global_block1 = program1.MutableBlock(0);
auto *x1 = global_block1->Var("X");
x1->SetType(proto::VarType::LOD_TENSOR);
ir::Graph graph1(program1);
ProgramDesc program2;
auto *global_block2 = program2.MutableBlock(0);
auto *x2 = global_block2->Var("X");
x2->SetType(proto::VarType::LOD_TENSOR);
ir::Graph graph2(program2);
LoDTensor tensor;
tensor.Resize({1, 2, 3});
const LoDTensor *tensor_pointer = &tensor;
std::map<std::string, const LoDTensor *> feed_tensors = {
{"X", tensor_pointer}};
CinnCacheKeyByAddress cache_key_by_address1(graph1, feed_tensors, "x86");
CinnCacheKeyByAddress cache_key_by_address2(graph2, feed_tensors, "x86");
EXPECT_NE(cache_key_by_address1, cache_key_by_address2);
CinnCacheKeyByStructure cache_key_by_struct1(graph1, feed_tensors, "x86");
CinnCacheKeyByStructure cache_key_by_struct2(graph2, feed_tensors, "x86");
EXPECT_EQ(cache_key_by_struct1, cache_key_by_struct2);
}
} // namespace paddle2cinn
} // namespace framework
} // namespace paddle
......@@ -41,6 +41,7 @@
#include "paddle/fluid/framework/rw_lock.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/inference/analysis/dot.h"
#include "paddle/fluid/operators/cinn/cinn_launch_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/string/string_helper.h"
......@@ -68,23 +69,41 @@ const CinnCompiledObject& CinnCompiler::Compile(
const std::map<std::string, const LoDTensor*>& input_tensors,
const Target& target, void* stream) {
VLOG(1) << "-- The graph to be compiled is:\n" << VizGraph(graph);
CinnCacheKey cur_key(graph, input_tensors, target.arch_str());
CinnCacheKeyByAddress cur_key_by_address(graph, input_tensors,
target.arch_str());
CinnCacheKeyByStructure cur_key_by_struct;
bool exist = false;
{
AutoRDLock r_guard{&rwlock_};
exist = cache_.count(cur_key) != 0;
exist = cache_by_address_.count(cur_key_by_address) != 0;
// If the graph cannot be found by its address, check whether the same
// graph structure has already been stored in the cache.
if (!exist) {
// generate the structure cache key
cur_key_by_struct.SetKey(graph, input_tensors, target.arch_str());
// If the graph structure is found, store the graph address in the
// cache to speed up the next query.
if (cache_by_struct_.count(cur_key_by_struct) != 0) {
exist = true;
cache_by_address_[cur_key_by_address] =
cache_by_struct_.at(cur_key_by_struct).get();
}
}
}
if (!exist) {
std::int64_t compiled_num = real_compiled_num_.fetch_add(1);
auto compiled_res =
CompileGraph(graph, input_tensors, target, compiled_num, stream);
AutoWRLock w_guard{&rwlock_};
if (!cache_.count(cur_key)) {
cache_[cur_key] = std::move(compiled_res);
if (!cache_by_struct_.count(cur_key_by_struct)) {
cache_by_address_[cur_key_by_address] = compiled_res.get();
cache_by_struct_[cur_key_by_struct] = std::move(compiled_res);
}
}
AutoRDLock guard{&rwlock_};
const auto& cached_boj = *cache_[cur_key];
const auto& cached_boj = *cache_by_address_[cur_key_by_address];
return cached_boj;
}
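The two-level lookup above (first by graph address, then by graph structure) can be summarized in isolation. The following is an illustrative sketch with placeholder types (AddrKey, StructKey, Compiled stand in for the real key and compiled-object classes), not the patched implementation:
#include <memory>
#include <unordered_map>
// Illustrative sketch only: the address cache holds non-owning pointers,
// while the structure cache owns the compiled objects.
template <typename AddrKey, typename StructKey, typename Compiled,
          typename Hash>
Compiled* LookupTwoLevel(
    const AddrKey& by_addr, const StructKey& by_struct,
    std::unordered_map<AddrKey, Compiled*, Hash>* addr_cache,
    std::unordered_map<StructKey, std::unique_ptr<Compiled>, Hash>*
        struct_cache) {
  auto it = addr_cache->find(by_addr);
  if (it != addr_cache->end()) return it->second;  // hit by graph address
  auto sit = struct_cache->find(by_struct);
  if (sit == struct_cache->end()) return nullptr;  // caller must compile
  // A structurally identical graph was compiled before: memoize this
  // address so the next query takes the fast path.
  (*addr_cache)[by_addr] = sit->second.get();
  return sit->second.get();
}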
......@@ -181,7 +200,8 @@ void CinnCompiler::Clear() {
{
AutoWRLock guard{&rwlock_};
graphs_.clear();
cache_.clear();
cache_by_address_.clear();
cache_by_struct_.clear();
}
real_compiled_num_.store(0);
}
......@@ -217,6 +237,9 @@ std::unique_ptr<CinnCompiledObject> CinnCompiler::CompileGraph(
*compiled_obj = {std::move(graph_compiler),
std::move(compiled_res.runtime_program), scope,
symbol.var_model_to_program_map()};
compiled_obj->launch_context =
std::make_unique<operators::details::CinnLaunchContext>(
compiled_obj->paddle2cinn_varmap, compiled_obj->scope);
return compiled_obj;
}
......
......@@ -31,6 +31,13 @@
#include "paddle/fluid/platform/macros.h"
namespace paddle {
namespace operators {
namespace details {
class CinnLaunchContext;
} // namespace details
} // namespace operators
namespace framework {
namespace paddle2cinn {
......@@ -39,6 +46,7 @@ struct CinnCompiledObject {
std::unique_ptr<::cinn::hlir::framework::Program> runtime_program;
std::shared_ptr<::cinn::hlir::framework::Scope> scope;
std::unordered_map<std::string, std::string> paddle2cinn_varmap;
std::unique_ptr<operators::details::CinnLaunchContext> launch_context;
};
// Entrance to use CINN.
......@@ -87,9 +95,12 @@ class CinnCompiler {
void* stream = nullptr) const;
std::unordered_map<std::string, std::unique_ptr<ir::Graph>> graphs_;
std::unordered_map<CinnCacheKey, std::unique_ptr<CinnCompiledObject>,
std::unordered_map<CinnCacheKeyByAddress, CinnCompiledObject*,
CinnCacheKey::Hash>
cache_;
cache_by_address_;
std::unordered_map<CinnCacheKeyByStructure,
std::unique_ptr<CinnCompiledObject>, CinnCacheKey::Hash>
cache_by_struct_;
std::atomic_int64_t real_compiled_num_{0};
mutable RWLock rwlock_;
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#include <sstream>
#include "paddle/fluid/framework/pten_utils.h"
#include "paddle/pten/core/convert_utils.h"
#include "paddle/pten/core/kernel_factory.h"
#include "paddle/fluid/framework/lod_tensor.h"
......@@ -190,8 +191,9 @@ KernelArgsNameMakerByOpProto::GetAttrsArgsNames() {
}
KernelSignature KernelArgsNameMakerByOpProto::GetKernelSignature() {
return KernelSignature(op_proto_->type(), GetInputArgsNames(),
GetAttrsArgsNames(), GetOutputArgsNames());
return KernelSignature(pten::TransToPtenKernelName(op_proto_->type()),
GetInputArgsNames(), GetAttrsArgsNames(),
GetOutputArgsNames());
}
std::string KernelSignatureToString(const KernelSignature& signature) {
......
......@@ -76,6 +76,22 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
}
#ifdef PADDLE_WITH_IPU
else if (platform::is_ipu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::IPUPlace, src_place), src_ptr, size);
} else if (platform::is_cpu_place(src_place) &&
platform::is_ipu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::IPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
} else if (platform::is_ipu_place(src_place) &&
platform::is_ipu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::IPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::IPUPlace, src_place), src_ptr, size);
}
#endif
#ifdef PADDLE_WITH_XPU
else if (platform::is_xpu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
......@@ -386,17 +402,33 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
}
#ifdef PADDLE_WITH_IPU
else if (platform::is_ipu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::IPUPlace, src_place), src_ptr, size);
} else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_ipu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::IPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
} else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_XPU
else if (platform::is_xpu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size);
} else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_xpu_place(dst_place)) {
}
else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_xpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
} else if (platform::is_xpu_place(src_place) && // NOLINT
platform::is_xpu_place(dst_place)) {
}
else if (platform::is_xpu_place(src_place) && // NOLINT
platform::is_xpu_place(dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
......@@ -404,7 +436,8 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
}
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size);
} else { // NOLINT
}
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"Copy from %s to %s is not supported.", src_place, dst_place));
}
......@@ -571,6 +604,11 @@ class AnyVisitor : public boost::static_visitor<bool> {
platform::errors::Unimplemented("Not supported on place (%s) ", npu));
// return GetResultHelper(out, npu);
}
bool GetResult(const framework::Tensor& out,
const platform::IPUPlace& ipu) const {
PADDLE_THROW(
platform::errors::Unimplemented("Not supported on place (%s) ", ipu));
}
bool GetResult(const framework::Tensor& out,
const platform::NPUPinnedPlace& cpu) const {
......@@ -762,6 +800,9 @@ struct BothFalseVisitor : public boost::static_visitor<> {
void VisitorImpl(const platform::XPUPlace& xpu) const {
PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
}
void VisitorImpl(const platform::IPUPlace& ipu) const {
PADDLE_THROW(platform::errors::Unimplemented("IPUPlace is not supported"));
}
void VisitorImpl(const platform::CUDAPlace& gpu) const {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
......
......@@ -155,6 +155,13 @@ class TensorAddFunctor : public boost::static_visitor<> {
"is not supported in imperative mode",
place));
}
// There is NO support for gradient accumulation on IPUPlace.
void operator()(const platform::IPUPlace& place) {
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
private:
int64_t numel_;
......
......@@ -487,6 +487,14 @@ static void PreparedOpRunImpl(
op.Type(), outs, dev_ctx->GetPlace());
}
if (FLAGS_benchmark) {
dev_ctx->Wait();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError());
VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error";
#endif
}
/**
* [ Why do we need to convert complex gradients to real gradients? ]
*
......
......@@ -211,70 +211,6 @@ void SplitTensorsWithType<platform::XPUDeviceContext>(
}
#endif
// NOTE(liubo48): Only implement operators::math::SplitFunctor for npu now.
// If later the operators::StridedMemcpyWithAxis0 is supported,
// then this specific SplitTensorsForAllReduce can be removed.
#ifdef PADDLE_WITH_ASCEND_CL
template <>
void SplitTensorsForAllReduce<platform::NPUDeviceContext, float>(
const platform::NPUDeviceContext &context,
framework::Variable *p_dense_contents,
std::vector<framework::Tensor> *p_dense_tensors) {
auto *in = p_dense_contents->GetMutable<framework::LoDTensor>();
std::vector<framework::Tensor *> outs;
std::vector<const framework::Tensor *> shape_refer;
outs.reserve(p_dense_tensors->size());
shape_refer.reserve(p_dense_tensors->size());
for (auto &tensor : *p_dense_tensors) {
outs.emplace_back(&tensor);
shape_refer.emplace_back(&tensor);
}
operators::math::SplitFunctor<platform::NPUDeviceContext, float>
split_functor_;
split_functor_(context, *in, shape_refer, 0, &outs);
}
template <>
void ConcatTensorsWithType<platform::NPUDeviceContext>(
const platform::NPUDeviceContext &context,
const std::vector<framework::Tensor> &dense_tensors_,
framework::Variable *p_dense_contents,
framework::proto::VarType::Type type) {
switch (type) {
case framework::proto::VarType::FP32:
ConcatTensorsForAllReduce<platform::NPUDeviceContext, float>(
context, dense_tensors_, p_dense_contents);
break;
default:
PADDLE_THROW(platform::errors::Unimplemented(
"Data type (%s) is not supported when it concats tensors for "
"allreduce.",
framework::DataTypeToString(type)));
}
}
template <>
void SplitTensorsWithType<platform::NPUDeviceContext>(
const platform::NPUDeviceContext &context,
framework::Variable *p_dense_contents,
std::vector<framework::Tensor> *p_dense_tensors,
framework::proto::VarType::Type type) {
switch (type) {
case framework::proto::VarType::FP32:
SplitTensorsForAllReduce<platform::NPUDeviceContext, float>(
context, p_dense_contents, p_dense_tensors);
break;
default:
PADDLE_THROW(platform::errors::Unimplemented(
"Data type (%s) is not supported when it splits tensors for "
"allreduce.",
framework::DataTypeToString(type)));
}
}
#endif
void Group::ConcatTensors(const platform::DeviceContext &context) {
auto place = context.GetPlace();
if (platform::is_gpu_place(place)) {
......
......@@ -348,13 +348,14 @@ class AllocatorFacadePrivate {
const AllocatorMap& GetAllocatorMap() {
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
auto id = platform::CUDAGraph::CapturingID();
auto iter = cuda_graph_allocator_map_.find(id);
PADDLE_ENFORCE_NE(
iter, cuda_graph_allocator_map_.end(),
platform::errors::PermissionDenied(
"No memory pool is prepared for CUDA Graph capturing."));
VLOG(10) << "Choose CUDA Graph memory pool to allocate memory";
return iter->second->allocators_;
} else {
return allocators_;
......@@ -405,7 +406,7 @@ class AllocatorFacadePrivate {
#if defined(PADDLE_WITH_HIP)
auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
cuda_allocators_[p][stream] = std::make_shared<AutoGrowthBestFitAllocator>(
cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk_);
cuda_allocator, platform::GpuMinChunkSize(), 0, allow_free_idle_chunk_);
#endif
#if defined(PADDLE_WITH_CUDA)
......
......@@ -116,6 +116,34 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
return GetCPUBuddyAllocator()->Used();
}
// For Graphcore IPU
template <>
void *Alloc<platform::IPUPlace>(const platform::IPUPlace &place, size_t size) {
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
VLOG(10) << "IPUPlace, Allocate on cpu.";
void *p = GetCPUBuddyAllocator()->Alloc(size);
if (FLAGS_init_allocated_mem) {
memset(p, 0xEF, size);
}
VLOG(10) << " pointer=" << p;
return p;
}
template <>
void Free<platform::IPUPlace>(const platform::IPUPlace &place, void *p,
size_t size) {
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
GetCPUBuddyAllocator()->Free(p);
}
template <>
uint64_t Release<platform::IPUPlace>(const platform::IPUPlace &place) {
return GetCPUBuddyAllocator()->Release();
}
template <>
size_t Used<platform::IPUPlace>(const platform::IPUPlace &place) {
return GetCPUBuddyAllocator()->Used();
}
// For kunlun XPU
template <>
void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
......
......@@ -32,9 +32,34 @@ CinnLaunchContext::CinnLaunchContext(
[](const auto& name_view) { return std::string(name_view.data()); });
}
bool CinnLaunchContext::IsVariableUsed(const std::string& paddle_name) {
return paddle2cinn_varmap_.count(paddle_name) > 0 &&
cinn_variable_names_.count(paddle2cinn_varmap_.at(paddle_name)) > 0;
void CinnLaunchContext::UpdateCapturedEnv(const framework::Scope& scope,
const platform::Place& place) {
if (std::addressof(scope) == cached_scope_ &&
std::addressof(place) == cached_place_) {
VLOG(4) << "Captured scope:" << cached_scope_ << ", place:" << cached_place_
<< " are not changed";
return;
}
cached_scope_ = std::addressof(scope);
cached_place_ = std::addressof(place);
cached_temp_scope_ = scope.NewTmpScope();
VLOG(4) << "Captured env is updated, scope:" << cached_scope_ << "->"
<< std::addressof(scope) << ", place:" << cached_place_ << "->"
<< std::addressof(place);
}
bool CinnLaunchContext::IsArgumentsInitialized() const {
if (hold_buffers_.empty() || name2argument_.empty()) {
return false;
}
return true;
}
bool CinnLaunchContext::IsVariableUsed(
const std::string& paddle_var_name) const {
return paddle2cinn_varmap_.count(paddle_var_name) > 0 &&
cinn_variable_names_.count(paddle2cinn_varmap_.at(paddle_var_name)) >
0;
}
CinnTensor CinnLaunchContext::GetCinnTensor(const std::string& var_name) {
......@@ -53,99 +78,101 @@ std::unordered_set<std::string> CinnLaunchContext::GetInternalVariableNames() {
return all_parameters;
}
void CinnLaunchContext::CheckTensorEquivalent(const std::string& paddle_name,
const LoDTensor& paddle_tensor,
const CinnTensor& cinn_tensor) {
void CinnLaunchContext::CheckTensorEquivalent(
const std::string& paddle_var_name, const LoDTensor& paddle_tensor,
const CinnTensor& cinn_tensor) {
// check dimension
auto cinn_dims = framework::make_ddim(cinn_tensor->shape().data());
PADDLE_ENFORCE_EQ(paddle_tensor.dims(), cinn_dims,
platform::errors::PreconditionNotMet(
"Tensors' shapes in variable(%s) are not equivalent, "
"paddle's shape = [%s], but cinn's shape = [%s].",
paddle_name, paddle_tensor.dims(), cinn_dims));
paddle_var_name, paddle_tensor.dims(), cinn_dims));
// TODO(CtfGo): check the underlying data type after CINN ready
}
void CinnLaunchContext::AssignExternalVariable(const std::string& paddle_name,
const platform::Place& place,
LoDTensor* paddle_tensor) {
PADDLE_ENFORCE_EQ(IsVariableUsed(paddle_name), true,
platform::errors::InvalidArgument(
"Paddle variable(%s) not used by cinn", paddle_name));
const auto& cinn_name = paddle2cinn_varmap_.at(paddle_name);
CinnTensor cinn_tensor = GetCinnTensor(cinn_name);
if (!paddle_tensor->IsInitialized()) {
paddle_tensor->Resize(framework::make_ddim(cinn_tensor->shape().data()));
void CinnLaunchContext::AssignExternalVariable(
const std::string& paddle_var_name) {
PADDLE_ENFORCE_EQ(
IsVariableUsed(paddle_var_name), true,
platform::errors::InvalidArgument("Paddle variable(%s) not used by cinn",
paddle_var_name));
const auto& cinn_var_name = paddle2cinn_varmap_.at(paddle_var_name);
const auto& paddle_tensor =
cached_scope_->GetVar(paddle_var_name)->Get<LoDTensor>();
CinnTensor cinn_tensor = GetCinnTensor(cinn_var_name);
if (paddle_tensor.IsInitialized()) {
CheckTensorEquivalent(paddle_var_name, paddle_tensor, cinn_tensor);
}
CheckTensorEquivalent(paddle_name, *paddle_tensor, cinn_tensor);
return SetArgument(cinn_name, place, /* free_mem_callback = */ false,
paddle_tensor);
}
void CinnLaunchContext::AssignInternalVariable(const std::string& cinn_name,
const platform::Place& place,
LoDTensor* paddle_tensor) {
PADDLE_ENFORCE_GT(cinn_variable_names_.count(cinn_name), 0,
platform::errors::InvalidArgument(
"Variable(%s) not found in cinn scope.", cinn_name));
CinnTensor cinn_tensor = GetCinnTensor(cinn_name);
if (!paddle_tensor->IsInitialized()) {
paddle_tensor->Resize(framework::make_ddim(cinn_tensor->shape().data()));
}
CheckTensorEquivalent(cinn_name, *paddle_tensor, cinn_tensor);
return SetArgument(cinn_name, place, /* free_mem_callback = */ true,
paddle_tensor);
}
auto cinn_buffer = std::make_unique<cinn_buffer_t>();
// assign dimensions and alloc/free callback of cinn_buffer_t
cinn_buffer->resize(cinn_tensor->shape().data().data(),
cinn_tensor->shape().data().size());
cinn_buffer->external_malloc = new std::function<int(void*, cinn_buffer_t*)>(
[this, paddle_var_name](void* ctx, cinn_buffer_t* buffer) {
auto* tensor =
cached_scope_->GetVar(paddle_var_name)->GetMutable<LoDTensor>();
tensor->Resize(framework::DDim(buffer->dims, buffer->dimensions));
buffer->memory = reinterpret_cast<uint8_t*>(
tensor->mutable_data<float>(*cached_place_));
return 0;
});
std::unique_ptr<cinn_buffer_t> CinnLaunchContext::ShareTensorWithCinnBuffer(
const platform::Place& place, bool free_mem_callback, LoDTensor* tensor) {
// convert paddle dimensions array to cinn format
std::vector<cinn_dimension_t> cinn_dims(tensor->dims().size());
for (auto i = 0; i < tensor->dims().size(); ++i) {
cinn_dims[i] = static_cast<cinn_dimension_t>(tensor->dims().at(i));
}
// external variables will be recycled by global gc, so do nothing here
cinn_buffer->external_free = new std::function<int(void*, cinn_buffer_t*)>(
[](void* ctx, cinn_buffer_t* buffer) {
// Do nothing
return 0;
});
return SetArgument(cinn_var_name, std::move(cinn_buffer));
}
void CinnLaunchContext::AssignInternalVariable(
const std::string& cinn_var_name) {
PADDLE_ENFORCE_GT(
cinn_variable_names_.count(cinn_var_name), 0,
platform::errors::InvalidArgument("Variable(%s) not found in cinn scope.",
cinn_var_name));
CinnTensor cinn_tensor = GetCinnTensor(cinn_var_name);
auto cinn_buffer = std::make_unique<cinn_buffer_t>();
// assign size and memory
cinn_buffer->resize(cinn_dims.data(), cinn_dims.size());
// assign dimensions and alloc/free callback of cinn_buffer_t
cinn_buffer->resize(cinn_tensor->shape().data().data(),
cinn_tensor->shape().data().size());
cinn_buffer->external_malloc = new std::function<int(void*, cinn_buffer_t*)>(
[place, tensor](void* ctx, cinn_buffer_t* buffer) {
buffer->memory =
reinterpret_cast<uint8_t*>(tensor->mutable_data<float>(place));
[this, cinn_var_name](void* ctx, cinn_buffer_t* buffer) {
auto* tensor =
cached_temp_scope_->Var(cinn_var_name)->GetMutable<LoDTensor>();
tensor->Resize(framework::DDim(buffer->dims, buffer->dimensions));
buffer->memory = reinterpret_cast<uint8_t*>(
tensor->mutable_data<float>(*cached_place_));
return 0;
});
if (free_mem_callback) {
cinn_buffer->external_free = new std::function<int(void*, cinn_buffer_t*)>(
[tensor](void* ctx, cinn_buffer_t* buffer) {
tensor->clear();
return 0;
});
return cinn_buffer;
}
// Internal variables should release their buffers immediately
// once no instruction uses them.
cinn_buffer->external_free = new std::function<int(void*, cinn_buffer_t*)>(
[](void* ctx, cinn_buffer_t* buffer) {
// Do nothing
[this, cinn_var_name](void* ctx, cinn_buffer_t* buffer) {
auto* tensor =
cached_temp_scope_->GetVar(cinn_var_name)->GetMutable<LoDTensor>();
tensor->clear();
return 0;
});
return cinn_buffer;
return SetArgument(cinn_var_name, std::move(cinn_buffer));
}
void CinnLaunchContext::SetArgument(const std::string& cinn_name,
const platform::Place& place,
bool free_mem_callback,
LoDTensor* paddle_tensor) {
auto buffer =
ShareTensorWithCinnBuffer(place, free_mem_callback, paddle_tensor);
name2argument_.emplace(cinn_name, buffer.get());
void CinnLaunchContext::SetArgument(const std::string& cinn_var_name,
std::unique_ptr<cinn_buffer_t>&& buffer) {
VLOG(4) << "SetArgument-" << name2argument_.size() << ": name("
<< cinn_var_name << "), dims("
<< framework::DDim(buffer->dims, buffer->dimensions) << ").";
name2argument_.emplace(cinn_var_name, buffer.get());
hold_buffers_.emplace_back(std::move(buffer));
VLOG(4) << "SetArgument-" << name2argument_.size() << ": "
<< "name(" << cinn_name << "), dims(" << paddle_tensor->dims()
<< ").";
}
const std::map<std::string, cinn_pod_value_t>&
......
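The bookkeeping in SetArgument above follows a common ownership split: name2argument_ stores non-owning raw pointers handed to the CINN runtime, while hold_buffers_ keeps the unique_ptrs alive for the lifetime of the context. A stand-alone sketch of that pattern (the names below are illustrative, not the real members):
#include <map>
#include <memory>
#include <string>
#include <vector>
struct Buffer {};  // stand-in for cinn_buffer_t
class ArgumentHolder {
 public:
  void Set(const std::string& name, std::unique_ptr<Buffer>&& buffer) {
    args_.emplace(name, buffer.get());       // non-owning view for callers
    owned_.emplace_back(std::move(buffer));  // ownership stays here
  }
  const std::map<std::string, Buffer*>& Arguments() const { return args_; }
 private:
  std::map<std::string, Buffer*> args_;
  std::vector<std::unique_ptr<Buffer>> owned_;
};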
......@@ -24,7 +24,7 @@
#include "cinn/runtime/cinn_runtime.h"
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace operators {
......@@ -40,16 +40,22 @@ class CinnLaunchContext {
const std::unordered_map<std::string, std::string>& paddle2cinn_varmap,
const std::shared_ptr<CinnScope>& cinn_scope);
// Explicitly update the environment (scope and place) captured
// by the callbacks of the execution arguments.
void UpdateCapturedEnv(const framework::Scope& scope,
const platform::Place& place);
// Return whether the execution arguments have been initialized
bool IsArgumentsInitialized() const;
// Return whether a Paddle variable is used by the compiled kernels
bool IsVariableUsed(const std::string& var_name);
bool IsVariableUsed(const std::string& paddle_var_name) const;
// Assign tensor buffer to input or output variables
void AssignExternalVariable(const std::string& var_name,
const platform::Place& place, LoDTensor* tensor);
void AssignExternalVariable(const std::string& paddle_var_name);
// Assign tensor buffer to internal variables
void AssignInternalVariable(const std::string& var_name,
const platform::Place& place, LoDTensor* tensor);
void AssignInternalVariable(const std::string& cinn_var_name);
// Extract internal variable names from CinnScope
// by excluding used input and output variables
......@@ -58,10 +64,6 @@ class CinnLaunchContext {
// Finalize all execution arguments and return them
const std::map<std::string, cinn_pod_value_t>& FinalizeArguments() const;
std::vector<std::unique_ptr<cinn_buffer_t>> HandoverBuffers() {
return std::move(hold_buffers_);
}
private:
// Get CinnTensor with CINN variable name
CinnTensor GetCinnTensor(const std::string& var_name);
......@@ -72,16 +74,15 @@ class CinnLaunchContext {
const LoDTensor& paddle_tensor,
const CinnTensor& cinn_tensor);
// Share the buffer of a Paddle tensor to CINN by delivering memory address
// to a cinn_buffer_t object
std::unique_ptr<cinn_buffer_t> ShareTensorWithCinnBuffer(
const platform::Place& place, bool free_mem_callback, LoDTensor* tensor);
// Set an argument with (cinn name)->(paddle tensor) pair
void SetArgument(const std::string& cinn_name, const platform::Place& place,
bool free_mem_callback, LoDTensor* paddle_tensor);
// Set an argument with (cinn name)->(cinn_buffer_t) pair
void SetArgument(const std::string& cinn_var_name,
std::unique_ptr<cinn_buffer_t>&& buffer);
private:
const framework::Scope* cached_scope_ = nullptr;
const platform::Place* cached_place_ = nullptr;
std::unique_ptr<framework::Scope> cached_temp_scope_ = nullptr;
// a variable name map from paddle to cinn
const std::unordered_map<std::string, std::string>& paddle2cinn_varmap_;
// the variable scope of cinn
......
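Taken together, the interface above is meant to be driven in a fixed order. A minimal usage sketch, mirroring CinnLaunchOpKernel further below (launch_context, scope, place and the hypothetical all_io_names list are assumed to exist in the caller):
// Illustrative call order only; error handling omitted.
launch_context->UpdateCapturedEnv(scope, place);
if (!launch_context->IsArgumentsInitialized()) {
  for (const auto& name : all_io_names) {  // hypothetical input/output names
    if (launch_context->IsVariableUsed(name)) {
      launch_context->AssignExternalVariable(name);
    }
  }
  for (const auto& name : launch_context->GetInternalVariableNames()) {
    launch_context->AssignInternalVariable(name);
  }
}
const auto& args = launch_context->FinalizeArguments();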
......@@ -45,81 +45,86 @@ std::unique_ptr<CinnLaunchContext> CreateDefaultLaunchContext() {
return std::make_unique<CinnLaunchContext>(paddle2cinn_varmap, cinn_scope);
}
TEST(CinnLaunchContextTest, TestIsVariableUsed) {
TEST(CinnLaunchContextTest, TestBasic) {
auto launch_context = CreateDefaultLaunchContext();
// test IsVariableUsed
ASSERT_EQ(launch_context->IsVariableUsed("var1"), true);
ASSERT_EQ(launch_context->IsVariableUsed("var4"), false);
}
TEST(CinnLaunchContextTest, TestGetInternalVariableNames) {
auto launch_context = CreateDefaultLaunchContext();
auto internal_variable_names = launch_context->GetInternalVariableNames();
ASSERT_EQ(internal_variable_names.size(), 3);
EXPECT_NE(internal_variable_names.find("cinn_var2"),
internal_variable_names.end());
// test UpdateCapturedEnv
platform::CPUPlace place;
framework::Scope scope;
ASSERT_NO_THROW(launch_context->UpdateCapturedEnv(scope, place));
// test IsArgumentsInitialized
ASSERT_FALSE(launch_context->IsArgumentsInitialized());
}
TEST(CinnLaunchContextTest, TestCheckTensorEquivalent) {
auto launch_context = CreateDefaultLaunchContext();
platform::CPUPlace place;
framework::Scope scope;
auto launch_context = CreateDefaultLaunchContext();
launch_context->UpdateCapturedEnv(scope, place);
auto* tensor1 = scope.Var("var1")->GetMutable<LoDTensor>();
// CheckTensorEquivalent: tensor dimension not equivalent
tensor1->mutable_data<float>(framework::make_ddim({3, 5}), place);
ASSERT_THROW(launch_context->AssignExternalVariable("var1", place, tensor1),
ASSERT_THROW(launch_context->AssignExternalVariable("var1"),
paddle::platform::EnforceNotMet);
}
TEST(CinnLaunchContextTest, TestAssignVariablePreCondition) {
auto launch_context = CreateDefaultLaunchContext();
platform::CPUPlace place;
framework::Scope scope;
auto launch_context = CreateDefaultLaunchContext();
launch_context->UpdateCapturedEnv(scope, place);
auto* tensor4 = scope.Var("var4")->GetMutable<LoDTensor>();
// not used
ASSERT_THROW(launch_context->AssignExternalVariable("var4", place, tensor4),
ASSERT_THROW(launch_context->AssignExternalVariable("var4"),
paddle::platform::EnforceNotMet);
// not found
ASSERT_THROW(
launch_context->AssignExternalVariable("cinn_var4", place, tensor4),
paddle::platform::EnforceNotMet);
ASSERT_THROW(launch_context->AssignInternalVariable("cinn_var4"),
paddle::platform::EnforceNotMet);
}
TEST(CinnLaunchContextTest, TestSetArgument) {
platform::CPUPlace cpu_place;
platform::Place place(cpu_place);
framework::Scope scope;
auto launch_context = CreateDefaultLaunchContext();
launch_context->UpdateCapturedEnv(scope, place);
platform::CPUPlace place;
framework::Scope scope;
// assign external variables
auto* tensor1 = scope.Var("var1")->GetMutable<LoDTensor>();
float* data1 =
tensor1->mutable_data<float>(framework::make_ddim({3, 4}), place);
data1[0] = 9.99f;
data1[10] = 19.99f;
ASSERT_NO_THROW(launch_context->AssignExternalVariable("var1"));
// assign external variable
ASSERT_NO_THROW(
launch_context->AssignExternalVariable("var1", place, tensor1));
auto* tensor2 = scope.Var("var2")->GetMutable<LoDTensor>();
tensor2->mutable_data<float>(framework::make_ddim({6, 7, 8}), place);
ASSERT_NO_THROW(
launch_context->AssignInternalVariable("cinn_var2", place, tensor2));
// FinalizeArguments not missed check
ASSERT_THROW(launch_context->FinalizeArguments(),
paddle::platform::EnforceNotMet);
auto* tensor3 = scope.Var("var3")->GetMutable<LoDTensor>();
tensor3->mutable_data<float>(framework::make_ddim({10, 16}), place);
ASSERT_NO_THROW(
launch_context->AssignExternalVariable("var3", place, tensor3));
ASSERT_NO_THROW(launch_context->AssignExternalVariable("var3"));
// FinalizeArguments missed check
ASSERT_THROW(launch_context->FinalizeArguments(),
paddle::platform::EnforceNotMet);
// test get internal variables
auto internal_variable_names = launch_context->GetInternalVariableNames();
ASSERT_EQ(internal_variable_names.size(), 1);
EXPECT_EQ(*internal_variable_names.begin(), "cinn_var2");
auto* tensor2 = scope.Var("var2")->GetMutable<LoDTensor>();
tensor2->mutable_data<float>(framework::make_ddim({6, 7, 8}), place);
ASSERT_NO_THROW(launch_context->AssignInternalVariable("cinn_var2"));
// check argument is set correctly and alloc/free callbacks work well
auto name2argument = launch_context->FinalizeArguments();
ASSERT_EQ(name2argument.size(), 3);
ASSERT_EQ(name2argument.count("cinn_var1"), 1);
// check ShareTensorWithCinnBuffer
ASSERT_TRUE(launch_context->IsArgumentsInitialized());
auto* cinn_buffer =
static_cast<cinn_buffer_t*>(name2argument.at("cinn_var1"));
ASSERT_EQ(cinn_buffer->memory, nullptr);
cinn_buffer->external_malloc->operator()(nullptr, cinn_buffer);
ASSERT_NE(cinn_buffer->memory, nullptr);
......
......@@ -31,26 +31,6 @@ namespace operators {
namespace details {
#ifdef PADDLE_WITH_CUDA
void CUDART_CB ReleaseScope(void* data) {
auto* temp_scope = static_cast<framework::Scope*>(data);
delete temp_scope;
}
void CUDART_CB ReleaseBuffers(void* data) {
auto* buffers =
static_cast<std::vector<std::unique_ptr<cinn_buffer_t>>*>(data);
delete buffers;
}
template <>
void ReleaseResource<platform::CUDADeviceContext>(
const std::vector<void*>& resources, void* stream) {
PADDLE_ENFORCE_GPU_SUCCESS(cudaLaunchHostFunc(
static_cast<gpuStream_t>(stream), ReleaseScope, resources[0]));
PADDLE_ENFORCE_GPU_SUCCESS(cudaLaunchHostFunc(
static_cast<gpuStream_t>(stream), ReleaseBuffers, resources[1]));
}
template <>
void* GetStream<platform::CUDADeviceContext>(
const framework::ExecutionContext& ctx) {
......
......@@ -56,25 +56,12 @@ void LaunchCinnExecution(const CinnCompiledObject& compiled_obj,
// Set cinn FLAGS (such as FLAGS_cinn_cudnn_deterministic) with paddle's FLAGS.
void SetCinnRuntimeFlags();
template <typename DeviceContext>
void ReleaseResource(const std::vector<void*>& resources, void* stream) {
auto* temp_scope = static_cast<framework::Scope*>(resources[0]);
auto* buffers =
static_cast<std::vector<std::unique_ptr<cinn_buffer_t>>*>(resources[1]);
delete temp_scope;
delete buffers;
}
template <typename DeviceContext>
void* GetStream(const framework::ExecutionContext& ctx) {
return nullptr;
}
#ifdef PADDLE_WITH_CUDA
template <>
void ReleaseResource<platform::CUDADeviceContext>(
const std::vector<void*>& resources, void* stream);
template <>
void* GetStream<platform::CUDADeviceContext>(
const framework::ExecutionContext& ctx);
......@@ -116,56 +103,54 @@ class CinnLaunchOpKernel : public framework::OpKernel<T> {
compilation_key, inputs_name2tensor, target, stream);
details::DebugCinnCompiledResult(cinn_compiled_object);
auto launch_context = std::make_unique<details::CinnLaunchContext>(
cinn_compiled_object.paddle2cinn_varmap, cinn_compiled_object.scope);
auto* launch_context = cinn_compiled_object.launch_context.get();
// Step 3. Prepare arguments needed for the compiled executable program.
VLOG(4) << "CinnLaunchOp prepare arguments";
// 3.1 Prepare input variables: tensors of the input variables have
// been initialized before the graph was compiled, so just check the
// equality between the paddle tensors and the cinn tensors.
for (const auto& var_name : input_variable_names) {
if (!launch_context->IsVariableUsed(var_name)) {
// Some input variables are not needed by cinn because they are
// eliminated by optimization passes or because some cinn operators
// use fewer variables.
VLOG(4) << "Input variable(" << var_name << ") not used by cinn";
continue;
launch_context->UpdateCapturedEnv(scope, place);
if (!launch_context->IsArgumentsInitialized()) {
VLOG(4) << "CinnLaunchOp prepare arguments";
// 3.1 Prepare input variables: tensors of the input variables have
// been initialized before the graph was compiled, so just check the
// equality between the paddle tensors and the cinn tensors.
for (const auto& var_name : input_variable_names) {
if (!launch_context->IsVariableUsed(var_name)) {
// Some input variables are not needed by cinn because they are
// eliminated by optimization passes or because some cinn operators
// use fewer variables.
VLOG(4) << "Input variable(" << var_name << ") not used by cinn";
continue;
}
launch_context->AssignExternalVariable(var_name);
}
launch_context->AssignExternalVariable(
var_name, place, scope.GetVar(var_name)->GetMutable<LoDTensor>());
}
// 3.2 Prepare output variables: all output variables should
// be initialized and have their buffers allocated before
// the runtime program starts execution. The compilation result
// includes details of their buffer assignment, and we use that to
// allocate space in Paddle. For variables that are already allocated,
// like persistable parameters, just check the equality between the
// Paddle allocation and the CINN buffer assignment.
auto output_variable_names = ctx.OutputNames(kOutputs);
for (const auto var_name : output_variable_names) {
PADDLE_ENFORCE_EQ(launch_context->IsVariableUsed(var_name), true,
platform::errors::InvalidArgument(
"Output variable(%s) not used by cinn", var_name));
auto* tensor = scope.GetVar(var_name)->GetMutable<LoDTensor>();
launch_context->AssignExternalVariable(var_name, place, tensor);
}
// 3.2 Prepare output variables: all output variables should
// be initialized and have their buffers allocated before
// the runtime program starts execution. The compilation result
// includes details of their buffer assignment, and we use that to
// allocate space in Paddle. For variables that are already allocated,
// like persistable parameters, just check the equality between the
// Paddle allocation and the CINN buffer assignment.
auto output_variable_names = ctx.OutputNames(kOutputs);
for (const auto var_name : output_variable_names) {
PADDLE_ENFORCE_EQ(
launch_context->IsVariableUsed(var_name), true,
platform::errors::InvalidArgument(
"Output variable(%s) not used by cinn", var_name));
launch_context->AssignExternalVariable(var_name);
}
// 3.3 Prepare internal or temporary variables: create a temporary
// scope to keep the internal variables within the graph, plus any
// temporary variables needed by the compiled runtime program.
// Here we directly use the names from CinnScope as Paddle variable
// names, because they will not be used outside the graph
// and should be destroyed after the computation finishes.
auto internal_variable_names = launch_context->GetInternalVariableNames();
framework::Scope* temp_scope = scope.NewTmpScope().release();
for (const auto& var_name : internal_variable_names) {
auto* tensor = temp_scope->Var(var_name)->GetMutable<LoDTensor>();
launch_context->AssignInternalVariable(var_name, place, tensor);
// 3.3 Prepare internal or temporary variables: create a temporary
// scope to keep the internal variables within the graph, plus any
// temporary variables needed by the compiled runtime program.
// Here we directly use the names from CinnScope as Paddle variable
// names, because they will not be used outside the graph
// and should be destroyed after the computation finishes.
auto internal_variable_names = launch_context->GetInternalVariableNames();
for (const auto& var_name : internal_variable_names) {
launch_context->AssignInternalVariable(var_name);
}
}
// Step 4. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic.
......@@ -175,12 +160,6 @@ class CinnLaunchOpKernel : public framework::OpKernel<T> {
VLOG(4) << "Run Cinn compiled executable program with stream: " << stream;
details::LaunchCinnExecution(cinn_compiled_object, *launch_context, stream);
VLOG(4) << "CinnLaunchOp launch execution done.";
// Step 6. Release some resources, such as `temp_scope` and cinn_buffers.
auto* buffers_holder = new std::vector<std::unique_ptr<cinn_buffer_t>>{
launch_context->HandoverBuffers()};
details::ReleaseResource<DeviceContext>({temp_scope, buffers_holder},
stream);
}
};
......
......@@ -130,8 +130,9 @@ TEST(CinnLaunchOpTest, TestElementwiseAddPass) {
scope.Var(test_out_name)->GetMutable<LoDTensor>();
scope.Var(expected_out_name)->GetMutable<LoDTensor>();
cinn_launch_op->Run(scope, place);
elementwise_add_op->Run(scope, place);
platform::Place run_place(place);
cinn_launch_op->Run(scope, run_place);
elementwise_add_op->Run(scope, run_place);
LoDTensor test_out, expected_out;
TensorCopySync(scope.Var(test_out_name)->Get<LoDTensor>(),
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/complex_view_op.h"
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
class AsComplexOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "as_complex");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "as_complex");
auto in_dims = ctx->GetInputDim("X");
const int input_rank = in_dims.size();
PADDLE_ENFORCE_GE(
input_rank, 1,
platform::errors::InvalidArgument(
"The rank of input(X) is less than 1. "
"Expected the rank of input(X) to be equal to or greater than 1. "
"But received rank of input(X) = %d",
input_rank));
const int last_dim_size = in_dims[input_rank - 1];
PADDLE_ENFORCE_EQ(
last_dim_size, 2,
platform::errors::InvalidArgument(
"The size of the last dimension of input(X) "
"does not equal 2. "
"Expected the size of the last dimension of input(X) to be 2. "
"But received %d",
last_dim_size));
const framework::DDim out_dims(in_dims.Get(), input_rank - 1);
ctx->SetOutputDim("Out", out_dims);
ctx->ShareLoD("X", /*->*/ "Out");
}
};
class AsComplexOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "(Tensor), The input tensor of view_as_complex op.");
AddOutput("Out", "(Tensor), The output tensor of view_as_complex op.");
AddComment(R"DOC(
As_complex Operator.
This operator is used to return a complex tensor represented by a real
tensor whose last dimension has size 2; the two elements of that dimension
correspond to the 'real' and 'imaginary' parts, respectively.
)DOC");
}
};
template <typename T>
class AsComplexGradMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
void Apply(GradOpPtr<T> retv) const override {
retv->SetType("as_real");
retv->SetInput("X", this->OutputGrad("Out"));
retv->SetAttrMap(this->Attrs());
retv->SetOutput("Out", this->InputGrad("X"));
}
};
class AsRealOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "as_real");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "as_real");
auto out_dims_v = framework::vectorize(ctx->GetInputDim("X"));
out_dims_v.push_back(2);
const framework::DDim out_dims = framework::make_ddim(out_dims_v);
ctx->SetOutputDim("Out", out_dims);
ctx->ShareLoD("X", /*->*/ "Out");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
auto input_data_type =
framework::OperatorWithKernel::IndicateVarDataType(ctx, "X");
return framework::OpKernelType(framework::ToRealType(input_data_type),
ctx.GetPlace());
}
};
class AsRealOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "(Tensor), The input tensor of as_real op.");
AddOutput("Out", "(Tensor), The output tensor of as_real op.");
AddComment(R"DOC(
AsReal Operator.
This operator is used to return a real tensor viewed from a complex tensor.
The size of the last dimension of the output tensor is 2, and the two
elements correspond to the 'real' and 'imaginary' parts, respectively.
)DOC");
}
};
template <typename T>
class AsRealGradMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
void Apply(GradOpPtr<T> retv) const override {
retv->SetType("as_complex");
retv->SetInput("X", this->OutputGrad("Out"));
retv->SetAttrMap(this->Attrs());
retv->SetOutput("Out", this->InputGrad("X"));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(as_complex, ops::AsComplexOp, ops::AsComplexOpMaker,
ops::AsComplexGradMaker<paddle::framework::OpDesc>,
ops::AsComplexGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(as_real, ops::AsRealOp, ops::AsRealOpMaker,
ops::AsRealGradMaker<paddle::framework::OpDesc>,
ops::AsRealGradMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(
as_complex, ops::AsComplexKernel<paddle::platform::CPUDeviceContext, float>,
ops::AsComplexKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
as_real, ops::AsRealKernel<paddle::platform::CPUDeviceContext, float>,
ops::AsRealKernel<paddle::platform::CPUDeviceContext, double>);
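The net effect of the two InferShape functions above is a pure view change on shape and dtype; expressed with the same DDim helpers (illustrative values only):
// as_complex: float tensor [3, 4, 2]  ->  complex<float> tensor [3, 4]
framework::DDim in = framework::make_ddim({3, 4, 2});
framework::DDim complex_dims(in.Get(), in.size() - 1);          // {3, 4}
// as_real: complex<float> tensor [3, 4]  ->  float tensor [3, 4, 2]
auto real_dims_v = framework::vectorize(complex_dims);
real_dims_v.push_back(2);
framework::DDim real_dims = framework::make_ddim(real_dims_v);  // {3, 4, 2}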
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/complex_view_op.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/enforce.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
as_complex,
ops::AsComplexKernel<paddle::platform::CUDADeviceContext, float>,
ops::AsComplexKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
as_real, ops::AsRealKernel<paddle::platform::CUDADeviceContext, float>,
ops::AsRealKernel<paddle::platform::CUDADeviceContext, double>);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/math/complex_functors.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/for_range.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class AsComplexKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const auto* x = context.Input<framework::LoDTensor>("X");
auto* out = context.Output<framework::LoDTensor>("Out");
out->mutable_data<platform::complex<T>>(context.GetPlace());
// TensorCopy also changes output's shape & dtype
const framework::DDim out_dims_original = out->dims();
framework::TensorCopy(*x, context.GetPlace(), out);
out->Resize(out_dims_original); // restore the shape
out->mutable_data<platform::complex<T>>(
context.GetPlace()); // restore the dtype
}
};
template <typename DeviceContext, typename T>
class AsRealKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const auto* x = context.Input<framework::LoDTensor>("X");
auto* out = context.Output<framework::LoDTensor>("Out");
out->mutable_data<T>(context.GetPlace());
const framework::DDim out_dims_original = out->dims();
framework::TensorCopy(*x, context.GetPlace(), out);
out->Resize(out_dims_original); // restore the shape
out->mutable_data<T>(context.GetPlace()); // restore the dtype
}
};
} // namespace operators
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/ipu_runtime_op.h"
namespace paddle {
namespace operators {
class IpuRuntimeOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::proto::VarType::Type(ctx.Attr<int>("dtype")),
ctx.device_context());
}
};
class IpuRuntimeOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("FeedList", "FeedList of Graph").AsDuplicable();
AddOutput("FetchList", "FetchList of Graph").AsDuplicable();
AddAttr<int>("dtype",
"(int, default 5 (FP32)) "
"Output data type")
.SetDefault(framework::proto::VarType::FP32);
AddComment(R"DOC(
Run graph by PopART runtime.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(ipu_runtime, ops::IpuRuntimeOp, ops::IpuRuntimeOpMaker);
REGISTER_OP_IPU_KERNEL(ipu_runtime, ops::IpuRuntimeKernel<float>,
ops::IpuRuntimeKernel<double>,
ops::IpuRuntimeKernel<int>,
ops::IpuRuntimeKernel<int64_t>,
ops::IpuRuntimeKernel<bool>,
ops::IpuRuntimeKernel<int8_t>,
ops::IpuRuntimeKernel<paddle::platform::float16>);
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#ifdef PADDLE_WITH_IPU
#include "paddle/fluid/framework/ipu/ipu_backend.h"
#include "paddle/fluid/framework/tensor.h"
#endif
namespace paddle {
namespace operators {
template <typename T>
class IpuRuntimeKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#ifdef PADDLE_WITH_IPU
auto ipu_backend = framework::ipu::IpuBackend::GetInstance();
if (!ipu_backend->DeviceIsAttached()) {
const platform::IPUDeviceContext& ipu_ctx =
reinterpret_cast<const platform::IPUDeviceContext&>(
ctx.device_context());
ipu_backend->AttachDevice(ipu_ctx.DeviceId());
}
auto inputs = ctx.MultiInput<framework::Tensor>("FeedList");
auto outputs = ctx.MultiOutput<framework::Tensor>("FetchList");
auto output_names = ctx.OutputNames("FetchList");
VLOG(4) << "IpuRuntime Kernel, begin to run graph";
ipu_backend->Run(inputs, outputs, ctx);
// post-run
// resize tensor when tensor.dims() is empty
for (size_t i = 0; i < outputs.size(); ++i) {
auto* out = outputs[i];
if (out->dims().size() == 0) {
auto tensor_dtype = out->type();
auto sizeof_dtype = framework::SizeOfType(tensor_dtype);
int64_t dim = out->memory_size() / sizeof_dtype;
out->Resize({dim});
VLOG(10) << "set ipu_runtime_op output: " << output_names[i]
<< " dims from () to: "
<< "(" << dim << ")";
}
}
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Please compile with the WITH_IPU option to enable the ipu_runtime op"));
#endif
}
};
} // namespace operators
} // namespace paddle
......@@ -173,6 +173,13 @@ void set_constant_with_place<platform::NPUPinnedPlace>(
platform::errors::Unimplemented("NPUPinnedPlace is not supported"));
}
template <>
void set_constant_with_place<platform::IPUPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
PADDLE_THROW(platform::errors::Unimplemented("IPUPlace is not supported"));
}
template <>
void set_constant_with_place<platform::CPUPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
......
......@@ -54,7 +54,7 @@ class PyLayerOp : public framework::OperatorWithKernel {
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
auto data_type = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
return framework::OpKernelType(data_type, ctx.device_context());
}
......
......@@ -555,10 +555,10 @@ class Reshape2Op : public ReshapeOp {
const framework::ExecutionContext &ctx) const override {
auto multi_inputs = ctx.MultiInput<framework::Tensor>("ShapeTensor");
if (multi_inputs.size() > 0) {
return framework::KernelSignature("reshape.mulhost", {"X", "ShapeTensor"},
return framework::KernelSignature("reshape_mulhost", {"X", "ShapeTensor"},
{}, {"Out"});
} else if (ctx.HasInput("Shape")) {
return framework::KernelSignature("reshape.host", {"X", "Shape"}, {},
return framework::KernelSignature("reshape_host", {"X", "Shape"}, {},
{"Out"});
} else {
return framework::KernelSignature("reshape", {"X"}, {"shape"}, {"Out"});
......
......@@ -18,6 +18,7 @@ namespace paddle {
namespace platform {
std::unique_ptr<CUDAGraph> CUDAGraph::capturing_graph_{nullptr};
paddle::optional<std::thread::id> CUDAGraph::capturing_thread_id_{paddle::none};
void CUDAGraph::Reset() {
if (is_reset_) return;
......@@ -58,6 +59,13 @@ void CUDAGraph::BeginSegmentCapture() {
IsCapturing(), true,
errors::PermissionDenied("BeginSegmentCapture should be called when CUDA "
"Graph is capturing."));
if (IsThreadLocalCapturing()) {
PADDLE_ENFORCE_EQ(IsThisThreadCapturing(), true,
platform::errors::PermissionDenied(
"When capturing a CUDA Graph in thread-local mode, "
"segmented capturing cannot begin on a thread other "
"than the one that started the capture."));
}
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamBeginCapture(
capturing_graph_->stream_, capturing_graph_->capture_mode_));
PADDLE_ENFORCE_EQ(IsValidCapturing(), true,
......@@ -82,6 +90,11 @@ void CUDAGraph::BeginCapture(platform::CUDAPlace place, cudaStream_t stream,
capturing_graph_->place_ = place;
capturing_graph_->stream_ = stream;
capturing_graph_->capture_mode_ = mode;
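  // In thread-local capture mode, remember which thread started the capture
  // so segment captures issued from other threads can be rejected.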
if (mode == cudaStreamCaptureModeThreadLocal) {
capturing_thread_id_ = std::this_thread::get_id();
VLOG(10) << "Capturing CUDA Graph in thread local mode, thread id: "
<< capturing_thread_id_;
}
BeginSegmentCapture();
#endif
}
......@@ -115,6 +128,7 @@ void CUDAGraph::EndSegmentCapture() {
std::unique_ptr<CUDAGraph> CUDAGraph::EndCapture() {
EndSegmentCapture();
capturing_thread_id_ = paddle::none;
return std::move(capturing_graph_);
}
......
......@@ -18,6 +18,7 @@
#include <functional>
#include <memory>
#include <mutex>
#include <thread>
#include <vector>
#include "cuda.h" // NOLINT
#include "cuda_runtime.h" // NOLINT
......@@ -26,6 +27,7 @@
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/utils/optional.h"
namespace paddle {
namespace platform {
......@@ -99,6 +101,25 @@ class CUDAGraph {
// supported during capturing CUDA Graph.
static bool IsValidCapturing();
static bool IsThreadLocalCapturing() {
#if CUDA_VERSION >= 10010
return IsCapturing() &&
capturing_graph_->capture_mode_ == cudaStreamCaptureModeThreadLocal;
#else
return false;
#endif
}
static bool IsThisThreadCapturing() {
if (UNLIKELY(IsCapturing())) {
return IsThreadLocalCapturing()
? capturing_thread_id_.get() == std::this_thread::get_id()
: true;
} else {
return false;
}
}
private:
static CUDAGraphID UniqueID() {
static std::atomic<CUDAGraphID> id;
......@@ -118,6 +139,7 @@ class CUDAGraph {
bool is_reset_{false};
std::mutex mtx_;
static paddle::optional<std::thread::id> capturing_thread_id_;
static std::unique_ptr<CUDAGraph> capturing_graph_;
};
......
......@@ -101,6 +101,20 @@ inline static __device__ uint32_t add_to_high_half(uint32_t val, float x) {
return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
}
#if CUDA_VERSION >= 10000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
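// sm_70+ with CUDA >= 10.0 provides a native __half atomicAdd; the wrappers
// below only reinterpret between paddle float16 and __half.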
static __device__ __forceinline__ float16 CUDAFP16ToPDFP16(__half x) {
return *reinterpret_cast<float16 *>(&x);
}
static __device__ __forceinline__ __half PDFP16ToCUDAFP16(float16 x) {
return *reinterpret_cast<__half *>(&x);
}
CUDA_ATOMIC_WRAPPER(Add, float16) {
return CUDAFP16ToPDFP16(
atomicAdd(reinterpret_cast<__half *>(address), PDFP16ToCUDAFP16(val)));
}
#else
CUDA_ATOMIC_WRAPPER(Add, float16) {
  // A packed float16 value may live in either the lower or the higher 16 bits
  // of the aligned 32-bit word at this address.
......@@ -133,6 +147,7 @@ CUDA_ATOMIC_WRAPPER(Add, float16) {
}
}
#endif
#endif
CUDA_ATOMIC_WRAPPER(Add, complex<float>) {
float *real = reinterpret_cast<float *>(address);
......
# IPU
IF(WITH_IPU)
FILE(GLOB POPART_CANONICALIZATION_SRC ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/device/ipu/popart_canonicalization/*.cc)
cc_library(ipu_device SRCS device.cc DEPS enforce popart)
cc_library(ipu_utils SRCS ipu_utils.cc DEPS memory framework_proto popart)
cc_library(ipu_strategy SRCS ipu_strategy.cc DEPS popart graph framework_proto enforce)
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h"
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
namespace ipu {
namespace {
Node *activation_op_handler(Graph *graph, Node *node, const std::string &type) {
auto new_node = CreateBaseOp(graph, node, type, {GetInputVarNode("X", node)},
node->outputs);
return new_node;
}
Node *relu_handler(Graph *graph, Node *node) {
return activation_op_handler(graph, node, "popart_relu");
}
Node *tanh_handler(Graph *graph, Node *node) {
return activation_op_handler(graph, node, "popart_tanh");
}
Node *log_handler(Graph *graph, Node *node) {
return activation_op_handler(graph, node, "popart_log");
}
Node *sigmoid_handler(Graph *graph, Node *node) {
return activation_op_handler(graph, node, "popart_sigmoid");
}
Node *sqrt_handler(Graph *graph, Node *node) {
return activation_op_handler(graph, node, "popart_sqrt");
}
Node *gelu_handler(Graph *graph, Node *node) {
return activation_op_handler(graph, node, "popart_gelu_v2");
}
Node *log_softmax_handler(Graph *graph, Node *node) {
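  // log_softmax(x) == log(softmax(x)): build an opset-11 softmax on `axis`
  // and feed its output into popart_log.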
auto axis = BOOST_GET_CONST(int, node->Op()->GetAttr("axis"));
auto new_softmax = CreateSoftmaxOpset11(graph, node, node->inputs, {}, axis);
return CreateBaseOp(graph, node, "popart_log", new_softmax->outputs,
node->outputs);
}
REGISTER_HANDLER(relu, relu_handler);
REGISTER_HANDLER(tanh, tanh_handler);
REGISTER_HANDLER(log, log_handler);
REGISTER_HANDLER(sigmoid, sigmoid_handler);
REGISTER_HANDLER(sqrt, sqrt_handler);
REGISTER_HANDLER(gelu, gelu_handler);
REGISTER_HANDLER(log_softmax, log_softmax_handler);
} // namespace
} // namespace ipu
} // namespace platform
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h"
namespace paddle {
namespace platform {
namespace ipu {
// This avoids the static initialisation order fiasco.
std::unordered_map<std::string, SymbolHandler> &SymbolHandlers() {
static std::unordered_map<std::string, SymbolHandler> symbol_handlers;
return symbol_handlers;
}
bool RegisterHandler(const std::string &symbol, const SymbolHandler &handler) {
if (SymbolHandlers().count(symbol) != 0) {
LOG(WARNING) << "Trying to register popart handler twice for operator: "
<< symbol;
return false;
}
bool new_handler = SymbolHandlers().emplace(symbol, handler).second;
return new_handler;
}
// Return the handler registered for this kind of node,
// or an empty std::function otherwise.
SymbolHandler GetHandler(const std::string &kind) {
auto it = SymbolHandlers().find(kind);
if (it != SymbolHandlers().end()) {
return it->second;
}
return {};
}
void ConnectNodes(Node *first_node, Node *next_node) {
first_node->outputs.push_back(next_node);
next_node->inputs.push_back(first_node);
}
void DisConnectNodes(Node *first_node, Node *next_node) {
auto rm_by_value = [&](std::vector<Node *> &vec, Node *n) {
vec.erase(std::remove(vec.begin(), vec.end(), n), vec.end());
};
rm_by_value(first_node->outputs, next_node);
rm_by_value(next_node->inputs, first_node);
rm_by_value(first_node->inputs, next_node);
rm_by_value(next_node->outputs, first_node);
}
void ClearNode(Node *node) {
auto rm_by_value = [&](std::vector<Node *> &vec, Node *n) {
vec.erase(std::remove(vec.begin(), vec.end(), n), vec.end());
};
for (auto *node_in : node->inputs) {
rm_by_value(node_in->outputs, node);
}
for (auto *node_out : node->outputs) {
rm_by_value(node_out->inputs, node);
}
}
void CopyOpAttr(const std::string &attr_name, OpDesc *op, OpDesc *new_op,
bool override) {
if (new_op->HasAttr(attr_name) && !override) {
return;
}
if (op->HasAttr(attr_name)) {
VLOG(10) << "Copying attr: " << attr_name << " from " << op->Type()
<< " to " << new_op->Type();
new_op->SetAttr(attr_name, op->GetAttr(attr_name));
new_op->Flush();
}
}
const int VarType2OnnxDtype(const int type) {
auto dtype = static_cast<framework::proto::VarType::Type>(type);
switch (dtype) {
case framework::proto::VarType::BOOL:
return static_cast<int>(ONNXDataType::BOOL);
case framework::proto::VarType::INT16:
return static_cast<int>(ONNXDataType::INT16);
case framework::proto::VarType::INT32:
return static_cast<int>(ONNXDataType::INT32);
case framework::proto::VarType::INT64:
return static_cast<int>(ONNXDataType::INT64);
case framework::proto::VarType::FP16:
return static_cast<int>(ONNXDataType::FLOAT16);
case framework::proto::VarType::FP32:
return static_cast<int>(ONNXDataType::FLOAT);
case framework::proto::VarType::FP64:
return static_cast<int>(ONNXDataType::DOUBLE);
case framework::proto::VarType::UINT8:
return static_cast<int>(ONNXDataType::UINT8);
case framework::proto::VarType::INT8:
return static_cast<int>(ONNXDataType::INT8);
case framework::proto::VarType::BF16:
return static_cast<int>(ONNXDataType::BFLOAT16);
case framework::proto::VarType::COMPLEX64:
return static_cast<int>(ONNXDataType::COMPLEX64);
case framework::proto::VarType::COMPLEX128:
return static_cast<int>(ONNXDataType::COMPLEX128);
default:
PADDLE_THROW(
platform::errors::Unimplemented("Unsupported data type: %d.", dtype));
}
}
const std::string VarType2PopStr(const int type) {
auto dtype = static_cast<framework::proto::VarType::Type>(type);
switch (dtype) {
case framework::proto::VarType::UINT8:
return "UINT8";
case framework::proto::VarType::INT8:
return "INT8";
case framework::proto::VarType::INT16:
return "INT16";
case framework::proto::VarType::INT32:
return "INT32";
case framework::proto::VarType::INT64:
return "INT64";
case framework::proto::VarType::BOOL:
return "BOOL";
case framework::proto::VarType::FP64:
return "DOUBLE";
case framework::proto::VarType::FP32:
return "FLOAT";
case framework::proto::VarType::FP16:
return "FLOAT16";
default:
PADDLE_THROW(
paddle::platform::errors::Unavailable("Unsupported data type."));
}
}
Node *GetInputVarNode(const std::string &input_name, const Node *op_node,
const int id) {
auto var_name = op_node->Op()->Input(input_name).at(id);
return GetInputVarNodeByVarName(var_name, op_node);
}
Node *GetOutputVarNode(const std::string &output_name, const Node *op_node,
const int id) {
auto var_name = op_node->Op()->Output(output_name).at(id);
return GetOutputVarNodeByVarName(var_name, op_node);
}
Node *GetInputVarNodeByVarName(const std::string &var_name,
const Node *op_node) {
for (auto *var : op_node->inputs) {
if (var->Name() == var_name) {
return var;
}
}
return nullptr;
}
Node *GetOutputVarNodeByVarName(const std::string &var_name,
const Node *op_node) {
for (auto *var : op_node->outputs) {
if (var->Name() == var_name) {
return var;
}
}
return nullptr;
}
const bool is_float_equal(float a, float b, float eps) {
return std::fabs(a - b) <= eps;
}
} // namespace ipu
} // namespace platform
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/platform/device/ipu/ipu_utils.h"
namespace paddle {
namespace platform {
namespace ipu {
using framework::ir::Graph;
using framework::ir::Node;
using framework::OpDesc;
#define REGISTER_HANDLER(name, func) \
static bool __UNUSED_##name = \
paddle::platform::ipu::RegisterHandler(#name, func)
using SymbolHandler = std::function<Node *(Graph *, Node *)>;
std::unordered_map<std::string, SymbolHandler> &SymbolHandlers();
bool RegisterHandler(const std::string &, const SymbolHandler &);
SymbolHandler GetHandler(const std::string &);
void ConnectNodes(Node *first_node, Node *next_node);
void DisConnectNodes(Node *first_node, Node *next_node);
void ClearNode(Node *node);
void CopyOpAttr(const std::string &attr_name, OpDesc *op, OpDesc *new_op,
bool override = false);
const int VarType2OnnxDtype(const int type);
const std::string VarType2PopStr(const int type);
Node *GetInputVarNode(const std::string &input_name, const Node *op_node,
const int id = 0);
Node *GetOutputVarNode(const std::string &output_name, const Node *op_node,
const int id = 0);
Node *GetInputVarNodeByVarName(const std::string &var_name,
const Node *op_node);
Node *GetOutputVarNodeByVarName(const std::string &var_name,
const Node *op_node);
const bool is_float_equal(float a, float b, float eps = 1e-8);
} // namespace ipu
} // namespace platform
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h"
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
namespace ipu {
namespace {
Node *elementwise_op_handler(Graph *graph, Node *node,
const std::string &type) {
auto *op = node->Op();
auto x_shape = GetInputVarNode("X", node)->Var()->GetShape();
int64_t x_rank = x_shape.size();
auto y_shape = GetInputVarNode("Y", node)->Var()->GetShape();
int64_t y_rank = y_shape.size();
auto axis = BOOST_GET_CONST(int, op->GetAttr("axis"));
if (axis == -1 || axis == x_rank - 1 || x_rank == y_rank) {
auto new_node =
CreateBaseOp(graph, node, type,
{GetInputVarNode("X", node), GetInputVarNode("Y", node)},
node->outputs);
return new_node;
} else {
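    // Paddle broadcasts Y starting at `axis`; reshape Y to rank(X) by padding
    // its shape with 1s outside [axis, axis + rank(Y)) so the popart op can
    // broadcast it directly.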
auto y_new_shape = std::vector<int64_t>(x_rank, 1);
for (int i = axis; i < axis + y_rank; ++i) {
y_new_shape[i] = y_shape[i - axis];
}
auto attrs = AttributeMap{
{"value", y_new_shape},
{"dims", std::vector<int64_t>{x_rank}},
{"dtype", ONNXDataType::INT64},
};
// constant
auto new_node_const = CreateConst(graph, node, {}, {}, attrs);
// reshape
auto new_node_reshape = CreateBaseOp(
graph, node, "popart_reshape",
{GetInputVarNode("Y", node), new_node_const->outputs[0]}, {});
// elementwise_op
auto new_node =
CreateBaseOp(graph, node, type,
{GetInputVarNode("X", node), new_node_reshape->outputs[0]},
node->outputs);
return new_node;
}
}
Node *elementwise_add_handler(Graph *graph, Node *node) {
return elementwise_op_handler(graph, node, "popart_add");
}
Node *elementwise_sub_handler(Graph *graph, Node *node) {
return elementwise_op_handler(graph, node, "popart_sub");
}
Node *elementwise_div_handler(Graph *graph, Node *node) {
return elementwise_op_handler(graph, node, "popart_div");
}
Node *elementwise_mul_handler(Graph *graph, Node *node) {
return elementwise_op_handler(graph, node, "popart_mul");
}
Node *elementwise_min_handler(Graph *graph, Node *node) {
return elementwise_op_handler(graph, node, "popart_min");
}
Node *elementwise_max_handler(Graph *graph, Node *node) {
return elementwise_op_handler(graph, node, "popart_max");
}
Node *elementwise_pow_handler(Graph *graph, Node *node) {
return elementwise_op_handler(graph, node, "popart_pow");
}
Node *elementwise_mod_handler(Graph *graph, Node *node) {
return elementwise_op_handler(graph, node, "popart_mod");
}
REGISTER_HANDLER(elementwise_add, elementwise_add_handler);
REGISTER_HANDLER(elementwise_sub, elementwise_sub_handler);
REGISTER_HANDLER(elementwise_div, elementwise_div_handler);
REGISTER_HANDLER(elementwise_mul, elementwise_mul_handler);
REGISTER_HANDLER(elementwise_min, elementwise_min_handler);
REGISTER_HANDLER(elementwise_max, elementwise_max_handler);
REGISTER_HANDLER(elementwise_pow, elementwise_pow_handler);
REGISTER_HANDLER(elementwise_mod, elementwise_mod_handler);
} // namespace
} // namespace ipu
} // namespace platform
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h"
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
namespace ipu {
namespace {
Node *equal_handler(Graph *graph, Node *node) {
auto new_node = CreateBaseOp(
graph, node, "popart_equal",
{GetInputVarNode("X", node), GetInputVarNode("Y", node)}, node->outputs);
return new_node;
}
REGISTER_HANDLER(equal, equal_handler);
} // namespace
} // namespace ipu
} // namespace platform
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h"
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
namespace ipu {
namespace {
Node *mean_handler(Graph *graph, Node *node) {
return CreateBaseOp(graph, node, "popart_reducemean",
{GetInputVarNode("X", node)},
{GetOutputVarNode("Out", node)},
{
{"keepdims", int64_t{0}},
});
}
Node *pow_handler(Graph *graph, Node *node) {
auto *op = node->Op();
if (op->HasInput("FactorTensor") && !op->Input("FactorTensor").empty()) {
return CreateBaseOp(
graph, node, "popart_pow",
{GetInputVarNode("X", node), GetInputVarNode("FactorTensor", node)},
node->outputs);
} else {
    // Op(pow) -> Op(Constant) -> Var(const_out) -> Op(Pow)
auto value_ = BOOST_GET_CONST(float, op->GetAttr("factor"));
auto attrs =
MakeConstAttrMapFromValue<float>(value_, {1}, ONNXDataType::FLOAT);
auto new_node_const = CreateConst(graph, node, {}, {}, attrs);
return CreateBaseOp(graph, node, "popart_pow", {GetInputVarNode("X", node),
new_node_const->outputs[0]},
node->outputs);
}
}
Node *mul_handler(Graph *graph, Node *node) {
auto *op = node->Op();
auto x_num_col_dims = BOOST_GET_CONST(int, op->GetAttr("x_num_col_dims"));
auto y_num_col_dims = BOOST_GET_CONST(int, op->GetAttr("y_num_col_dims"));
auto x_shape_ = GetInputVarNode("X", node)->Var()->GetShape();
auto y_shape_ = GetInputVarNode("Y", node)->Var()->GetShape();
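  // mul: flatten X at x_num_col_dims and Y at y_num_col_dims to 2-D, matmul
  // the two matrices, then reshape the result back to the expected shape.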
// build the shape for reshape
std::vector<int64_t> reshape_shape_{};
for (int left = 0; left < x_num_col_dims; left++) {
reshape_shape_.push_back(int64_t(x_shape_[left]));
}
for (int right = y_num_col_dims; right < y_shape_.size(); right++) {
reshape_shape_.push_back(int64_t(y_shape_[right]));
}
auto x_flatten =
CreateBaseOp(graph, node, "popart_flatten", {GetInputVarNode("X", node)},
{}, {{"axis", int64_t(x_num_col_dims)}});
auto y_flatten =
CreateBaseOp(graph, node, "popart_flatten", {GetInputVarNode("Y", node)},
{}, {{"axis", int64_t(y_num_col_dims)}});
auto matmul =
CreateBaseOp(graph, node, "popart_matmul",
{x_flatten->outputs[0], y_flatten->outputs[0]}, {}, {});
auto reshape_const = CreateConst(
graph, node, {}, {},
{{"value", reshape_shape_},
{"dims", std::vector<int64_t>{int64_t(reshape_shape_.size())}},
{"dtype", ONNXDataType::INT64}});
return CreateBaseOp(graph, node, "popart_reshape",
{matmul->outputs[0], reshape_const->outputs[0]},
node->outputs, {});
}
Node *matmul_handler(Graph *graph, Node *node) {
auto *op = node->Op();
auto transpose_x = BOOST_GET_CONST(bool, op->GetAttr("transpose_X"));
auto transpose_y = BOOST_GET_CONST(bool, op->GetAttr("transpose_Y"));
auto alpha = BOOST_GET_CONST(float, op->GetAttr("alpha"));
auto x_shape = GetInputVarNode("X", node)->Var()->GetShape();
auto y_shape = GetInputVarNode("Y", node)->Var()->GetShape();
int x_rank = x_shape.size();
std::vector<int64_t> perm;
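  // For rank > 2, transpose_X / transpose_Y are emulated with popart_transpose
  // using a perm that swaps the two innermost dims; rank 2 maps to a gemm.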
if (x_rank == 1) {
perm = std::vector<int64_t>{0};
} else if (x_rank == 2) {
return CreateGemm(graph, node,
{GetInputVarNode("X", node), GetInputVarNode("Y", node)},
node->outputs, transpose_x, transpose_y, alpha);
} else if (x_rank == 3) {
perm = std::vector<int64_t>{0, 2, 1};
} else if (x_rank == 4) {
perm = std::vector<int64_t>{0, 1, 3, 2};
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"op matmul with input rank == %d", x_rank));
}
Node *x_node = GetInputVarNode("X", node);
Node *y_node = GetInputVarNode("Y", node);
if (transpose_x) {
x_node = CreateBaseOp(graph, node, "popart_transpose",
{GetInputVarNode("X", node)}, {}, {{"perm", perm}});
x_node = x_node->outputs[0];
}
if (transpose_y) {
y_node = CreateBaseOp(graph, node, "popart_transpose",
{GetInputVarNode("Y", node)}, {}, {{"perm", perm}});
y_node = y_node->outputs[0];
}
  if (is_float_equal(alpha, 1.0)) {
    return CreateBaseOp(graph, node, "popart_matmul", {x_node, y_node},
                        node->outputs);
  } else {
    // scale the matmul result by the constant alpha
    auto o_node =
        CreateBaseOp(graph, node, "popart_matmul", {x_node, y_node}, {});
    auto attr = MakeConstAttrMapFromValue(alpha, {1}, ONNXDataType::FLOAT);
    auto const_node = CreateConst(graph, node, {}, {}, attr);
    return CreateBaseOp(graph, node, "popart_mul",
                        {o_node->outputs[0], const_node->outputs[0]},
                        node->outputs);
  }
}
Node *sum_handler(Graph *graph, Node *node) {
return CreateBaseOp(graph, node, "popart_sum", node->inputs, node->outputs);
}
Node *softmax_handler(Graph *graph, Node *node) {
auto *op = node->Op();
auto axis = BOOST_GET_CONST(int, op->GetAttr("axis"));
return CreateSoftmaxOpset11(graph, node, node->inputs, node->outputs, axis);
}
Node *scale_handler(Graph *graph, Node *node) {
auto *op = node->Op();
auto scale_ = BOOST_GET_CONST(float, op->GetAttr("scale"));
auto bias_ = BOOST_GET_CONST(float, op->GetAttr("bias"));
auto bias_after_scale_ =
BOOST_GET_CONST(bool, op->GetAttr("bias_after_scale"));
auto data_type_ = GetInputVarNode("X", node)->Var()->GetDataType();
auto new_node_bias_var =
CreateConst(graph, node, {}, {}, {{"value", std::vector<float>{bias_}},
{"dims", std::vector<int64_t>{1}},
{"dtype", ONNXDataType::FLOAT}});
new_node_bias_var = new_node_bias_var->outputs[0];
Node *new_node_scale_var = nullptr;
if (op->HasInput("ScaleTensor") && !op->Input("ScaleTensor").empty()) {
new_node_scale_var = GetInputVarNode("ScaleTensor", node);
} else {
new_node_scale_var =
CreateConst(graph, node, {}, {}, {{"value", std::vector<float>{scale_}},
{"dims", std::vector<int64_t>{1}},
{"dtype", ONNXDataType::FLOAT}});
new_node_scale_var = new_node_scale_var->outputs[0];
}
// convert to float32
auto new_node_cast =
CreateCast(graph, node, {GetInputVarNode("X", node)}, {},
static_cast<int>(framework::proto::VarType::FP32));
Node *result = nullptr;
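  // bias_after_scale: Out = scale * X + bias; otherwise Out = scale * (X + bias).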
if (bias_after_scale_) {
auto new_node_mul =
CreateBaseOp(graph, node, "popart_mul",
{new_node_cast->outputs[0], new_node_scale_var}, {}, {});
result =
CreateBaseOp(graph, node, "popart_add",
{new_node_mul->outputs[0], new_node_bias_var}, {}, {});
} else {
auto new_node_add =
CreateBaseOp(graph, node, "popart_add",
{new_node_cast->outputs[0], new_node_bias_var}, {}, {});
result =
CreateBaseOp(graph, node, "popart_mul",
{new_node_add->outputs[0], new_node_scale_var}, {}, {});
}
auto result_after_cast =
CreateCast(graph, node, result->outputs, node->outputs,
static_cast<int>(data_type_));
return result_after_cast;
}
Node *cross_entropy2_handler(Graph *graph, Node *node) {
auto *op = node->Op();
auto ignoreIndex = BOOST_GET_CONST(int, op->GetAttr("ignore_index"));
auto new_cast = CreateCast(graph, node, {GetInputVarNode("Label", node)}, {},
framework::proto::VarType::INT32);
auto label_shape_ = GetInputVarNode("Label", node)->Var()->GetShape();
if (label_shape_.size() == 1) {
return CreateBaseOp(graph, node, "popart_nllloss",
{GetInputVarNode("X", node), new_cast->outputs[0]},
{GetOutputVarNode("Y", node)},
{
{"ignoreIndex", ignoreIndex},
});
} else {
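    // popart_nllloss expects a 1-D label: flatten the (N, 1) label, compute
    // the loss, then reshape the loss back to the original label shape.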
std::vector<int64_t> new_shape_{label_shape_[0]};
auto const_before_loss = CreateBaseOp(
graph, node, "popart_constant", {}, {},
{{"value", new_shape_},
{"dims",
std::vector<int64_t>{static_cast<int64_t>(new_shape_.size())}},
{"dtype", ONNXDataType::INT64}});
auto reshape_before_loss = CreateBaseOp(
graph, node, "popart_reshape",
{new_cast->outputs[0], const_before_loss->outputs[0]}, {}, {});
auto nllloss = CreateBaseOp(
graph, node, "popart_nllloss",
{GetInputVarNode("X", node), reshape_before_loss->outputs[0]}, {},
{
{"ignoreIndex", ignoreIndex},
});
auto const_after_loss = CreateBaseOp(
graph, node, "popart_constant", {}, {},
{{"value", label_shape_},
{"dims",
std::vector<int64_t>{static_cast<int64_t>(label_shape_.size())}},
{"dtype", ONNXDataType::INT64}});
auto reshape_after_loss =
CreateBaseOp(graph, node, "popart_reshape",
{nllloss->outputs[0], const_after_loss->outputs[0]},
{GetOutputVarNode("Y", node)}, {});
return reshape_after_loss;
}
}
REGISTER_HANDLER(mean, mean_handler);
REGISTER_HANDLER(pow, pow_handler);
REGISTER_HANDLER(mul, mul_handler);
REGISTER_HANDLER(matmul, matmul_handler);
REGISTER_HANDLER(sum, sum_handler);
REGISTER_HANDLER(softmax, softmax_handler);
REGISTER_HANDLER(scale, scale_handler);
REGISTER_HANDLER(cross_entropy2, cross_entropy2_handler);
} // namespace
} // namespace ipu
} // namespace platform
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h"
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
namespace ipu {
namespace {
Node *conv2d_handler(Graph *graph, Node *node) {
OpDesc *op = node->Op();
auto dilations_ = BOOST_GET_CONST(std::vector<int>, op->GetAttr("dilations"));
auto dilations = std::vector<int64_t>{dilations_.begin(), dilations_.end()};
auto group_ = BOOST_GET_CONST(int, op->GetAttr("groups"));
auto pads_ = BOOST_GET_CONST(std::vector<int>, op->GetAttr("paddings"));
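  // Paddle may give only 2 padding values (h, w); repeat the pair so 4 values
  // are passed on to CreateConv.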
if (pads_.size() == 2) {
pads_.push_back(pads_[0]);
pads_.push_back(pads_[1]);
}
auto pads = std::vector<int64_t>{pads_.begin(), pads_.end()};
auto stride_ = BOOST_GET_CONST(std::vector<int>, op->GetAttr("strides"));
auto stride = std::vector<int64_t>{stride_.begin(), stride_.end()};
if (op->HasInput("Bias") && !op->Input("Bias").empty()) {
return CreateConv(
graph, node,
{
GetInputVarNode("Input", node), GetInputVarNode("Filter", node),
GetInputVarNode("Bias", node),
},
node->outputs, dilations, group_, {}, pads, stride);
} else {
return CreateConv(
graph, node,
{
GetInputVarNode("Input", node), GetInputVarNode("Filter", node),
},
node->outputs, dilations, group_, {}, pads, stride);
}
}
Node *batch_norm_handler(Graph *graph, Node *node) {
auto *op = node->Op();
std::vector<Node *> inputs;
inputs.push_back(GetInputVarNode("X", node));
inputs.push_back(GetInputVarNode("Scale", node));
inputs.push_back(GetInputVarNode("Bias", node));
inputs.push_back(GetInputVarNode("Mean", node));
inputs.push_back(GetInputVarNode("Variance", node));
int64_t num_outputs = 1;
std::vector<Node *> outputs;
auto is_test_type = op->GetAttrType("is_test");
bool is_test;
if (is_test_type == 0) {
// int
is_test = BOOST_GET_CONST(int, op->GetAttr("is_test"));
} else {
// bool
is_test = BOOST_GET_CONST(bool, op->GetAttr("is_test"));
}
outputs.push_back(GetOutputVarNode("Y", node));
if (!is_test) {
outputs.push_back(GetOutputVarNode("MeanOut", node));
outputs.push_back(GetOutputVarNode("VarianceOut", node));
outputs.push_back(GetOutputVarNode("SavedMean", node));
outputs.push_back(GetOutputVarNode("SavedVariance", node));
num_outputs = 5;
}
// outputs.push_back(GetOutputVarNode("ReserveSpace", node));
auto momentum = BOOST_GET_CONST(float, op->GetAttr("momentum"));
auto epsilon = BOOST_GET_CONST(float, op->GetAttr("epsilon"));
// data_layout
return CreateBaseOp(graph, node, "popart_batchnormalization", inputs, outputs,
{
{"momentum", momentum},
{"epsilon", epsilon},
{"num_outputs", num_outputs},
});
}
Node *pool2d_handler(Graph *graph, Node *node) {
auto *op = node->Op();
auto pooling_type = BOOST_GET_CONST(std::string, op->GetAttr("pooling_type"));
auto global_pooling = BOOST_GET_CONST(bool, op->GetAttr("global_pooling"));
if (global_pooling) {
if (pooling_type == "max") {
return CreateBaseOp(graph, node, "popart_globalmaxpool", node->inputs,
node->outputs);
} else if (pooling_type == "avg") {
return CreateBaseOp(graph, node, "popart_globalaveragepool", node->inputs,
node->outputs);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"op pool2d with unkonwn pooling_type: %s", pooling_type));
}
}
if (op->HasAttr("padding_algorithm")) {
auto padding_algorithm =
BOOST_GET_CONST(std::string, op->GetAttr("padding_algorithm"));
if (padding_algorithm != "EXPLICIT") {
PADDLE_THROW(platform::errors::InvalidArgument(
"op pool2d with unkonwn padding_algorithm: %s", padding_algorithm));
}
}
auto ksize = BOOST_GET_CONST(std::vector<int>, op->GetAttr("ksize"));
auto kernel_shape = std::vector<int64_t>{ksize.begin(), ksize.end()};
auto ceil_mode_ = BOOST_GET_CONST(bool, op->GetAttr("ceil_mode"));
auto ceil_mode = int64_t(ceil_mode_ ? 1 : 0);
auto paddings = BOOST_GET_CONST(std::vector<int>, op->GetAttr("paddings"));
auto pads = std::vector<int64_t>{paddings.begin(), paddings.end()};
if (pads.size() == 2) {
pads.push_back(paddings[0]);
pads.push_back(paddings[1]);
}
auto strides_ = BOOST_GET_CONST(std::vector<int>, op->GetAttr("strides"));
auto strides = std::vector<int64_t>{strides_.begin(), strides_.end()};
if (pooling_type == "max") {
int64_t num_outputs = 1;
auto dilations = std::vector<int64_t>{};
int64_t storage_order = 0;
return CreateBaseOp(graph, node, "popart_maxpool", node->inputs,
node->outputs, {
{"num_outputs", num_outputs},
{"kernel_shape", kernel_shape},
{"ceil_mode", ceil_mode},
{"dilations", dilations},
{"pads", pads},
{"storage_order", storage_order},
{"strides", strides},
});
} else if (pooling_type == "avg") {
int64_t count_include_pad = 0;
return CreateBaseOp(graph, node, "popart_averagepool", node->inputs,
node->outputs,
{
{"kernel_shape", kernel_shape},
{"ceil_mode", ceil_mode},
{"count_include_pad", count_include_pad},
{"pads", pads},
{"strides", strides},
});
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"op pool2d with unkonwn pooling_type: %s", pooling_type));
}
}
Node *group_norm_handler(Graph *graph, Node *node) {
auto *op = node->Op();
auto epsilon_ = BOOST_GET_CONST(float, op->GetAttr("epsilon"));
auto groups_ = BOOST_GET_CONST(int, op->GetAttr("groups"));
auto groups = int64_t{groups_};
auto attrs_ = AttributeMap{{"epsilon", epsilon_}, {"num_groups", groups}};
std::vector<Node *> inputs_ = {GetInputVarNode("X", node),
GetInputVarNode("Scale", node),
GetInputVarNode("Bias", node)};
std::vector<Node *> outputs_ = {GetOutputVarNode("Y", node),
GetOutputVarNode("Mean", node),
GetOutputVarNode("Variance", node)};
return CreateBaseOp(graph, node, "popart_groupnormalization_v2", inputs_,
outputs_, attrs_);
}
Node *instance_norm_handler(Graph *graph, Node *node) {
auto *op = node->Op();
auto epsilon_ = BOOST_GET_CONST(float, op->GetAttr("epsilon"));
auto attrs_ = AttributeMap{{"epsilon", epsilon_}};
std::vector<Node *> inputs_ = {GetInputVarNode("X", node),
GetInputVarNode("Scale", node),
GetInputVarNode("Bias", node)};
std::vector<Node *> outputs_ = {GetOutputVarNode("Y", node)};
return CreateBaseOp(graph, node, "popart_instancenormalization", inputs_,
outputs_, attrs_);
}
Node *layer_norm_handler(Graph *graph, Node *node) {
auto *op = node->Op();
auto begin_norm_axis_ = BOOST_GET_CONST(int, op->GetAttr("begin_norm_axis"));
auto input_shape_ = GetInputVarNode("X", node)->Var()->GetShape();
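  // Lower layer_norm to group normalization with a single group: collapse the
  // input to [outer, inner] around begin_norm_axis, normalize, reshape back.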
std::vector<int64_t> norm_shape_{1, 1};
for (int i = 0; i < input_shape_.size(); i++) {
if (i < begin_norm_axis_) {
norm_shape_[0] *= input_shape_[i];
} else {
norm_shape_[1] *= input_shape_[i];
}
}
auto attrs1 = AttributeMap{
{"value", norm_shape_},
{"dims", std::vector<int64_t>{static_cast<int64_t>(norm_shape_.size())}},
{"dtype", ONNXDataType::INT64}};
auto reshape1_const =
CreateBaseOp(graph, node, "popart_constant", {}, {}, attrs1);
auto new_node_reshape1 = CreateBaseOp(
graph, node, "popart_reshape",
{GetInputVarNode("X", node), reshape1_const->outputs[0]}, {}, {});
auto epsilon_ = BOOST_GET_CONST(float, op->GetAttr("epsilon"));
int64_t groups_ = 1;
auto groupnorm_attrs_ =
AttributeMap{{"epsilon", epsilon_}, {"num_groups", groups_}};
auto out_Y_ = MakeVarNode(graph, node);
CreateBaseOp(graph, node, "popart_groupnormalization_v2",
{new_node_reshape1->outputs[0], GetInputVarNode("Scale", node),
GetInputVarNode("Bias", node)},
{out_Y_, GetOutputVarNode("Mean", node),
GetOutputVarNode("Variance", node)},
groupnorm_attrs_);
auto attrs2 = AttributeMap{
{"value", input_shape_},
{"dims", std::vector<int64_t>{static_cast<int64_t>(input_shape_.size())}},
{"dtype", ONNXDataType::INT64}};
auto reshape2_const =
CreateBaseOp(graph, node, "popart_constant", {}, {}, attrs2);
auto new_node_reshape2 = CreateBaseOp(graph, node, "popart_reshape",
{out_Y_, reshape2_const->outputs[0]},
{GetOutputVarNode("Y", node)}, {});
return new_node_reshape2;
}
Node *dropout_handler(Graph *graph, Node *node) {
auto *op = node->Op();
auto dropout_prob_ = BOOST_GET_CONST(float, op->GetAttr("dropout_prob"));
auto dropout_implementation_ =
BOOST_GET_CONST(std::string, op->GetAttr("dropout_implementation"));
auto is_test_type_ = op->GetAttrType("is_test");
bool is_test_;
if (is_test_type_ == 0) {
// int
is_test_ = BOOST_GET_CONST(int, op->GetAttr("is_test"));
} else {
// bool
is_test_ = BOOST_GET_CONST(bool, op->GetAttr("is_test"));
}
if (is_test_) {
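    // At inference time, dropout is an identity for upscale_in_train and a
    // (1 - dropout_prob) scaling for downgrade_in_infer.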
if (dropout_implementation_ == "upscale_in_train") {
return CreateBaseOp(graph, node, "popart_identity",
{GetInputVarNode("X", node)},
{GetOutputVarNode("Out", node)}, {});
} else if (dropout_implementation_ == "downgrade_in_infer") {
auto scale =
CreateConst(graph, node, {}, {},
{{"value", std::vector<float>{1 - dropout_prob_}},
{"dims", std::vector<int64_t>{1}},
{"dtype", ONNXDataType::FLOAT}});
return CreateBaseOp(graph, node, "popart_mul",
{GetInputVarNode("X", node), scale->outputs[0]},
{GetOutputVarNode("Out", node)}, {});
} else {
PADDLE_THROW(
platform::errors::InvalidArgument("Invalid dropout_implementation"));
}
} else {
if (dropout_implementation_ == "upscale_in_train") {
auto attrs_ =
AttributeMap{{"num_outputs", (int64_t)1}, {"ratio", dropout_prob_}};
return CreateBaseOp(graph, node, "popart_dropout",
{GetInputVarNode("X", node)},
{GetOutputVarNode("Out", node)}, attrs_);
} else if (dropout_implementation_ == "downgrade_in_infer") {
PADDLE_THROW(platform::errors::InvalidArgument(
"Do not support downgrade_in_infer with training"));
} else {
PADDLE_THROW(
platform::errors::InvalidArgument("Invalid dropout_implementation"));
}
}
}
REGISTER_HANDLER(pool2d, pool2d_handler);
REGISTER_HANDLER(batch_norm, batch_norm_handler);
REGISTER_HANDLER(group_norm, group_norm_handler);
REGISTER_HANDLER(instance_norm, instance_norm_handler);
REGISTER_HANDLER(layer_norm, layer_norm_handler);
REGISTER_HANDLER(conv2d, conv2d_handler);
REGISTER_HANDLER(dropout, dropout_handler);
} // namespace
} // namespace ipu
} // namespace platform
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h"
namespace paddle {
namespace platform {
namespace ipu {
// file-local counters used to generate unique variable/op names
static int var_count = 0;
static int op_count = 0;
const std::string GenerateVarName() {
return std::string("_gen_var_") + std::to_string(var_count++);
}
const std::string GenerateOpName() {
return std::string("_gen_op_") + std::to_string(op_count++);
}
const std::string CreateOpIdentifyId(Node *node) {
  // format: op_type|out_var0|out_var1|...|_gen_*
  // This name is used as the op name when exporting the ONNX model from PopART.
auto op_type = node->Name();
std::string op_out = "";
for (auto *out_node : node->outputs) {
op_out += "|";
op_out += out_node->Name();
}
return {op_type + op_out + "|" + GenerateOpName()};
}
Node *MakeVarNode(Graph *graph, Node *node) {
auto var_name = GenerateVarName();
auto var_desc = std::make_unique<framework::VarDesc>(var_name);
auto var = graph->CreateVarNode(var_desc.get());
return var;
}
Node *MakeOpNode(Graph *graph, Node *node, const std::string &type,
const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs) {
auto op_desc = std::make_unique<framework::OpDesc>();
op_desc->SetType(type);
auto op = graph->CreateOpNode(op_desc.get());
for (auto *in : inputs) {
ConnectNodes(in, op);
}
if (outputs.empty()) {
auto var = MakeVarNode(graph, node);
ConnectNodes(op, var);
} else {
for (auto *out : outputs) {
ConnectNodes(op, out);
}
}
  // record the connected input/output var names on the op desc
std::vector<std::string> input_names;
for (auto node : op->inputs) {
input_names.push_back(node->Name());
}
op->Op()->SetInput("__inputs__", input_names);
std::vector<std::string> output_names;
for (auto node : op->outputs) {
output_names.push_back(node->Name());
}
op->Op()->SetOutput("__outputs__", output_names);
op->Op()->Flush();
return op;
}
Node *CreateBaseOp(Graph *graph, Node *node, const std::string &type,
const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs,
const AttributeMap &attrs) {
auto new_node = MakeOpNode(graph, node, type, inputs, outputs);
if (!attrs.empty()) {
new_node->Op()->SetAttrMap(attrs);
}
// deal special attr
if (!new_node->Op()->HasAttr(sIpuIndexAttr)) {
CopyOpAttr(sIpuIndexAttr, node->Op(), new_node->Op());
}
if (!new_node->Op()->HasAttr(sIpuStageAttr)) {
CopyOpAttr(sIpuStageAttr, node->Op(), new_node->Op());
}
{
new_node->Op()->SetAttr(sOpIdentifyIdAttr, CreateOpIdentifyId(node));
new_node->Op()->Flush();
}
return new_node;
}
Node *CreateConst(Graph *graph, Node *node, const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs,
const AttributeMap &attrs) {
return CreateBaseOp(graph, node, "popart_constant", inputs, outputs, attrs);
}
Node *CreateCast(Graph *graph, Node *node, const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs, const int otype) {
auto to = VarType2PopStr(otype);
return CreateBaseOp(graph, node, "popart_cast", inputs, outputs,
{{"to", to}});
}
Node *CreateGemm(Graph *graph, Node *node, const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs, int64_t transA,
int64_t transB, float alpha, float beta) {
return CreateBaseOp(graph, node, "popart_gemm", inputs, outputs,
{
{"alpha", alpha},
{"beta", beta},
{"transA", transA},
{"transB", transB},
});
}
Node *CreateReshape(Graph *graph, Node *node, const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs,
const std::vector<int64_t> &oshape) {
auto attr = AttributeMap{
{"value", oshape},
{"dims", std::vector<int64_t>{static_cast<int64_t>(oshape.size())}},
{"dtype", ONNXDataType::INT64}};
auto new_node_const =
CreateBaseOp(graph, node, "popart_constant", {}, {}, attr);
auto new_node_reshape =
CreateBaseOp(graph, node, "popart_reshape",
{inputs[0], new_node_const->outputs[0]}, outputs);
return new_node_reshape;
}
Node *CreateConv(Graph *graph, Node *node, const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs,
const std::vector<int64_t> &dilations, int64_t group,
const std::vector<int64_t> &kernel_shape,
const std::vector<int64_t> &pads,
const std::vector<int64_t> &strides) {
auto attrs = AttributeMap{
{"dilations", dilations}, {"group", group},
{"kernel_shape", kernel_shape}, {"pads", pads},
{"strides", strides},
};
return CreateBaseOp(graph, node, "popart_conv", inputs, outputs, attrs);
}
Node *CreateSoftmaxOpset11(Graph *graph, Node *node,
const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs, int64_t axis) {
PADDLE_ENFORCE_EQ(inputs.size(), 1, platform::errors::InvalidArgument(
"Softmax op only support one input"));
auto x_shape = inputs[0]->Var()->GetShape();
int x_rank = x_shape.size();
if (axis < 0) {
axis = axis + x_rank;
}
if (axis == x_rank - 1) {
return CreateBaseOp(graph, node, "popart_softmax", inputs, outputs,
{{"axis", int64_t{-1}}});
} else {
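    // Softmax on a non-innermost axis: swap `axis` with the last dim, apply
    // softmax on -1, then apply the same perm again to undo the swap.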
auto perm = std::vector<int64_t>(x_rank);
std::iota(perm.begin(), perm.end(), 0);
perm[x_rank - 1] = axis;
perm[axis] = x_rank - 1;
auto new_transpose_pre = CreateBaseOp(graph, node, "popart_transpose",
inputs, {}, {{"perm", perm}});
auto new_softmax =
CreateBaseOp(graph, node, "popart_softmax", new_transpose_pre->outputs,
{}, {{"axis", int64_t{-1}}});
return CreateBaseOp(graph, node, "popart_transpose", new_softmax->outputs,
outputs, {{"perm", perm}});
}
}
} // namespace ipu
} // namespace platform
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/platform/device/ipu/common.h"
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h"
namespace paddle {
namespace platform {
namespace ipu {
using paddle::framework::AttributeMap;
template <typename T>
AttributeMap MakeConstAttrMap(std::vector<T> value, std::vector<int64_t> dims,
int dtype) {
return AttributeMap{{"value", value}, {"dims", dims}, {"dtype", dtype}};
}
template <typename T>
AttributeMap MakeConstAttrMapFromValue(T v, std::vector<int64_t> dims,
int dtype) {
size_t size = 1;
for (auto &dim : dims) {
size *= dim;
}
return MakeConstAttrMap<T>(std::vector<T>(size, v), dims, dtype);
}
const std::string GenerateVarName();
const std::string CreateOpIdentifyId(Node *node);
Node *MakeVarNode(Graph *graph, Node *node);
Node *MakeOpNode(Graph *graph, Node *node, const std::string &type,
const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs);
Node *CreateBaseOp(Graph *graph, Node *node, const std::string &type,
const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs,
const AttributeMap &attrs = {});
Node *CreateConst(Graph *graph, Node *node, const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs,
const AttributeMap &attrs);
// otype is proto::VarType::Type
Node *CreateCast(Graph *graph, Node *node, const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs, const int otype);
Node *CreateGemm(Graph *graph, Node *node, const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs, int64_t transA = 0,
int64_t transB = 0, float alpha = 1.0f, float beta = 1.0f);
Node *CreateReshape(Graph *graph, Node *node, const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs,
const std::vector<int64_t> &oshape);
Node *CreateConv(Graph *graph, Node *node, const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs,
const std::vector<int64_t> &dilations = {1, 1},
int64_t group = 1,
const std::vector<int64_t> &kernel_shape = {},
const std::vector<int64_t> &pads = {0, 0, 0, 0},
const std::vector<int64_t> &strides = {1, 1});
Node *CreateSoftmaxOpset11(Graph *graph, Node *node,
const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs, int64_t axis);
} // namespace ipu
} // namespace platform
} // namespace paddle
......@@ -195,3 +195,5 @@ OP_DECL(popart_sqrt, aiOnnxOpset.sqrt, NONE) // NOLINT
OP_DECL(popart_tanh, aiOnnxOpset.tanh, NONE) // NOLINT
OP_DECL(popart_tile, aiOnnxOpset.tile, NONE) // NOLINT
OP_DECL(popart_transpose, aiOnnxOpset.transpose, ARG(INT_VEC,perm) ) // NOLINT
// clang-format on
......@@ -16,6 +16,9 @@ limitations under the License. */
#include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
#ifdef PADDLE_WITH_IPU
#include "paddle/fluid/platform/ipu/ipu_backend.h"
#endif
#include "glog/logging.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -96,8 +99,9 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
if (it == device_contexts_.end()) {
PADDLE_THROW(platform::errors::Unimplemented(
"Place %s is not supported. Please check that your paddle compiles "
"with WITH_GPU, WITH_XPU or WITH_ASCEND_CL option or check that "
"your train process set the correct device id if you use Executor.",
"with WITH_GPU, WITH_XPU, WITH_IPU or WITH_ASCEND_CL option or check "
"that your train process set the correct device id if you use "
"Executor.",
place));
}
return it->second.get().get();
......@@ -158,6 +162,14 @@ DeviceContextPool::DeviceContextPool(
PADDLE_THROW(
platform::errors::Unimplemented("XPUPlace is not supported. Please "
"re-compile with WITH_XPU option."));
#endif
} else if (platform::is_ipu_place(p)) {
#ifdef PADDLE_WITH_IPU
EmplaceDeviceContext<IPUDeviceContext, IPUPlace>(&device_contexts_, p);
#else
PADDLE_THROW(
platform::errors::Unimplemented("IPUPlace is not supported. Please "
"re-compile with WITH_IPU option."));
#endif
} else if (platform::is_npu_place(p)) {
#ifdef PADDLE_WITH_ASCEND_CL
......@@ -195,6 +207,22 @@ Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const {
Place CPUDeviceContext::GetPlace() const { return place_; }
#ifdef PADDLE_WITH_IPU
IPUDeviceContext::IPUDeviceContext(IPUPlace place) : place_(place) {
int id = place.GetDeviceId();
std::shared_ptr<platform::ipu::IpuBackend> ipu_backend =
platform::ipu::IpuBackend::GetInstance();
device_ = ipu_backend->GetDevice(id);
}
Place IPUDeviceContext::GetPlace() const { return place_; }
void IPUDeviceContext::Wait() const {
  /*! \brief Wait for the completion of all operations in the stream. */
}
IPUDeviceContext::~IPUDeviceContext() {}
#endif
#ifdef PADDLE_WITH_XPU
XPUDeviceContext::XPUDeviceContext() {
context_ = xpu::create_context();
......
......@@ -36,6 +36,7 @@ class PlacePrinter : public boost::static_visitor<> {
void operator()(const XPUPlace &p) { os_ << "XPUPlace(" << p.device << ")"; }
void operator()(const NPUPlace &p) { os_ << "NPUPlace(" << p.device << ")"; }
void operator()(const NPUPinnedPlace &p) { os_ << "NPUPinnedPlace"; }
void operator()(const IPUPlace &p) { os_ << "IPUPlace(" << p.device << ")"; }
void operator()(const CUDAPinnedPlace &p) { os_ << "CUDAPinnedPlace"; }
private:
......@@ -56,6 +57,10 @@ bool is_npu_place(const Place &p) {
return boost::apply_visitor(IsNPUPlace(), p);
}
bool is_ipu_place(const Place &p) {
return boost::apply_visitor(IsIPUPlace(), p);
}
bool is_cpu_place(const Place &p) {
return boost::apply_visitor(IsCPUPlace(), p);
}
......@@ -80,6 +85,8 @@ bool is_same_place(const Place &p1, const Place &p2) {
return BOOST_GET_CONST(XPUPlace, p1) == BOOST_GET_CONST(XPUPlace, p2);
} else if (is_npu_place(p1)) {
return BOOST_GET_CONST(NPUPlace, p1) == BOOST_GET_CONST(NPUPlace, p2);
} else if (is_ipu_place(p1)) {
return BOOST_GET_CONST(IPUPlace, p1) == BOOST_GET_CONST(IPUPlace, p2);
} else {
return BOOST_GET_CONST(CUDAPlace, p1) == BOOST_GET_CONST(CUDAPlace, p2);
}
......
......@@ -95,12 +95,25 @@ struct NPUPinnedPlace {
inline bool operator!=(const NPUPinnedPlace &) const { return false; }
inline bool operator<(const NPUPinnedPlace &) const { return false; }
};
struct IPUPlace {
IPUPlace() : IPUPlace(0) {}
explicit IPUPlace(int d) : device(d) {}
inline int GetDeviceId() const { return device; }
// needed for variant equality comparison
inline bool operator==(const IPUPlace &o) const { return device == o.device; }
inline bool operator!=(const IPUPlace &o) const { return !(*this == o); }
inline bool operator<(const IPUPlace &o) const { return device < o.device; }
int device;
};
struct IsCUDAPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &) const { return false; }
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const NPUPlace &) const { return false; }
bool operator()(const NPUPinnedPlace &) const { return false; }
bool operator()(const IPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return true; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
};
......@@ -110,6 +123,7 @@ struct IsCPUPlace : public boost::static_visitor<bool> {
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const NPUPlace &) const { return false; }
bool operator()(const NPUPinnedPlace &) const { return false; }
bool operator()(const IPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
};
......@@ -119,6 +133,7 @@ struct IsCUDAPinnedPlace : public boost::static_visitor<bool> {
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const NPUPlace &) const { return false; }
bool operator()(const NPUPinnedPlace &) const { return false; }
bool operator()(const IPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &cuda_pinned) const { return true; }
};
......@@ -128,6 +143,7 @@ struct IsXPUPlace : public boost::static_visitor<bool> {
bool operator()(const XPUPlace &) const { return true; }
bool operator()(const NPUPlace &) const { return false; }
bool operator()(const NPUPinnedPlace &) const { return false; }
bool operator()(const IPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
};
......@@ -137,6 +153,7 @@ struct IsNPUPlace : public boost::static_visitor<bool> {
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const NPUPlace &) const { return true; }
bool operator()(const NPUPinnedPlace &) const { return false; }
bool operator()(const IPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
};
......@@ -145,22 +162,33 @@ struct IsNPUPinnedPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &) const { return false; }
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const NPUPlace &) const { return false; }
bool operator()(const IPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
bool operator()(const NPUPinnedPlace &) const { return true; }
};
struct IsIPUPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &) const { return false; }
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const NPUPlace &) const { return false; }
bool operator()(const IPUPlace &) const { return true; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
bool operator()(const NPUPinnedPlace &) const { return false; }
};
class Place : public boost::variant<CUDAPlace, XPUPlace, NPUPlace, CPUPlace,
CUDAPinnedPlace, NPUPinnedPlace> {
CUDAPinnedPlace, NPUPinnedPlace, IPUPlace> {
private:
using PlaceBase = boost::variant<CUDAPlace, XPUPlace, NPUPlace, CPUPlace,
CUDAPinnedPlace, NPUPinnedPlace>;
CUDAPinnedPlace, NPUPinnedPlace, IPUPlace>;
public:
Place() = default;
Place(const CPUPlace &cpu_place) : PlaceBase(cpu_place) {} // NOLINT
Place(const XPUPlace &xpu_place) : PlaceBase(xpu_place) {} // NOLINT
Place(const NPUPlace &npu_place) : PlaceBase(npu_place) {} // NOLINT
Place(const IPUPlace &ipu_place) : PlaceBase(ipu_place) {} // NOLINT
Place(const CUDAPlace &cuda_place) : PlaceBase(cuda_place) {} // NOLINT
Place(const CUDAPinnedPlace &cuda_pinned_place) // NOLINT
: PlaceBase(cuda_pinned_place) {}
......@@ -180,6 +208,7 @@ using PlaceList = std::vector<Place>;
bool is_gpu_place(const Place &);
bool is_xpu_place(const Place &);
bool is_npu_place(const Place &);
bool is_ipu_place(const Place &);
bool is_cpu_place(const Place &);
bool is_cuda_pinned_place(const Place &);
bool is_npu_pinned_place(const Place &);
......@@ -228,6 +257,15 @@ struct PlaceVisitorWrapper
return typename Visitor::result_type();
#endif
}
typename Visitor::result_type operator()(const IPUPlace &ipu) const {
#ifdef PADDLE_WITH_IPU
return visitor_(ipu);
#else
PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with IPU. Cannot visit ipu device"));
return typename Visitor::result_type();
#endif
}
typename Visitor::result_type operator()(const CUDAPlace &cuda) const {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
......
pybind.h
op_function_impl.h
\ No newline at end of file
op_function_impl.h
eager_op_function_impl.h
......@@ -130,6 +130,10 @@ limitations under the License. */
#endif
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#ifdef PADDLE_WITH_IPU
#include "paddle/fluid/platform/ipu/ipu_backend.h"
#include "paddle/fluid/platform/ipu_info.h"
#endif
#ifdef PADDLE_WITH_CRYPTO
#include "paddle/fluid/pybind/crypto.h"
......@@ -199,6 +203,14 @@ bool IsCompiledWithNPU() {
#endif
}
bool IsCompiledWithIPU() {
#ifndef PADDLE_WITH_IPU
return false;
#else
return true;
#endif
}
bool IsCompiledWithMKLDNN() {
#ifndef PADDLE_WITH_MKLDNN
return false;
......@@ -812,6 +824,8 @@ PYBIND11_MODULE(core_noavx, m) {
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
.def("set", SetTensorFromPyArray<paddle::platform::NPUPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
.def("set", SetTensorFromPyArray<paddle::platform::IPUPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
.def("set", SetTensorFromPyArray<paddle::platform::CUDAPinnedPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false,
R"DOC(
......@@ -819,7 +833,7 @@ PYBIND11_MODULE(core_noavx, m) {
Args:
lod (numpy.ndarray): The data to set.
place (CPUPlace|CUDAPlace|XPUPlace|CUDAPinnedPlace|NPUPlace): The place where the
place (CPUPlace|CUDAPlace|XPUPlace|IPUPlace|CUDAPinnedPlace|NPUPlace): The place where the
LoDTensor is to be set.
zero_copy (bool, optional): Whether to share memory with the input numpy array.
This parameter only works with CPUPlace. Default: False.
......@@ -1909,6 +1923,58 @@ All parameter, weight, gradient are variables in Paddle.
[](const platform::NPUPlace &self) { return self.GetDeviceId(); })
.def("__str__", string::to_string<const platform::NPUPlace &>);
// IPUPlace
py::class_<platform::IPUPlace>(m, "IPUPlace", R"DOC(
IPUPlace is a descriptor of a device.
It represents an IPU device on which a tensor will be allocated and a model will run.
Examples:
.. code-block:: python
import paddle
# required: ipu
ipu_place = paddle.IPUPlace()
)DOC")
.def("__init__",
[](platform::IPUPlace &self) {
#ifdef PADDLE_WITH_IPU
if (platform::GetIPUDeviceCount() == 0) {
LOG(ERROR) << "Cannot use IPU because there is no IPU "
"detected on your "
"machine.";
std::exit(-1);
}
// use ipu(0) to compile, while running with the number of IPUs the user
// configures in sharding and pipelining.
new (&self) platform::IPUPlace(0);
#else
LOG(ERROR) << string::Sprintf(
"Cannot use IPU because you didn't install IPU version "
"PaddlePaddle.\n"
"If you want to use IPU, please try to install IPU version "
"PaddlePaddle by: pip install paddlepaddle*\n"
"If you only have CPU, please change IPUPlace to be "
"CPUPlace().\n");
std::exit(-1);
#endif
})
.def("_type", &PlaceIndex<platform::IPUPlace>)
.def("_equals", &IsSamePlace<platform::IPUPlace, platform::Place>)
.def("_equals", &IsSamePlace<platform::IPUPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::IPUPlace, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::IPUPlace, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::IPUPlace, platform::NPUPlace>)
.def("_equals", &IsSamePlace<platform::IPUPlace, platform::IPUPlace>)
.def("_equals",
&IsSamePlace<platform::IPUPlace, platform::CUDAPinnedPlace>)
#ifdef PADDLE_WITH_IPU
.def("get_device_id",
[](const platform::IPUPlace &self) { return self.GetDeviceId(); })
#endif
.def("__str__", string::to_string<const platform::IPUPlace &>);
py::class_<platform::Place> platformplace(m, "Place");
g_place_pytype = reinterpret_cast<PyTypeObject *>(platformplace.ptr());
platformplace.def(py::init<>())
......@@ -1918,6 +1984,7 @@ All parameter, weight, gradient are variables in Paddle.
.def("_equals", &IsSamePlace<platform::Place, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::NPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::IPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::CUDAPinnedPlace>)
.def("is_gpu_place",
[](platform::Place &self) { return platform::is_gpu_place(self); })
......@@ -1927,6 +1994,8 @@ All parameter, weight, gradient are variables in Paddle.
[](platform::Place &self) { return platform::is_xpu_place(self); })
.def("is_npu_place",
[](platform::Place &self) { return platform::is_npu_place(self); })
.def("is_ipu_place",
[](platform::Place &self) { return platform::is_ipu_place(self); })
.def("is_cuda_pinned_place",
[](platform::Place &self) {
return platform::is_cuda_pinned_place(self);
......@@ -1943,6 +2012,10 @@ All parameter, weight, gradient are variables in Paddle.
[](platform::Place &self) {
return BOOST_GET_CONST(platform::NPUPlace, self).device;
})
.def("ipu_device_id",
[](platform::Place &self) {
return BOOST_GET_CONST(platform::IPUPlace, self).device;
})
.def("set_place", [](platform::Place &self,
const platform::Place &other) { self = other; })
.def("set_place",
......@@ -1966,6 +2039,10 @@ All parameter, weight, gradient are variables in Paddle.
[](platform::Place &self, const platform::NPUPlace &npu_place) {
self = npu_place;
})
.def("set_place",
[](platform::Place &self, const platform::IPUPlace &ipu_place) {
self = ipu_place;
})
.def("__repr__", string::to_string<const platform::Place &>)
.def("__str__", string::to_string<const platform::Place &>);
......@@ -2197,6 +2274,7 @@ All parameter, weight, gradient are variables in Paddle.
m.def("is_compiled_with_ascend", IsCompiledWithAscend);
m.def("is_compiled_with_rocm", IsCompiledWithROCM);
m.def("is_compiled_with_npu", IsCompiledWithNPU);
m.def("is_compiled_with_ipu", IsCompiledWithIPU);
m.def("is_compiled_with_xpu", IsCompiledWithXPU);
m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN);
m.def("is_compiled_with_cinn", IsCompiledWithCINN);
......@@ -2516,6 +2594,10 @@ All parameter, weight, gradient are variables in Paddle.
});
#endif
#ifdef PADDLE_WITH_IPU
m.def("get_ipu_device_count", platform::GetIPUDeviceCount);
#endif
py::enum_<platform::TracerOption>(m, "TracerOption", py::arithmetic())
.value("kDefault", platform::TracerOption::kDefault)
.value("kOpDetail", platform::TracerOption::kOpDetail)
......@@ -2593,6 +2675,11 @@ All parameter, weight, gradient are variables in Paddle.
bool val) { self.Set<bool>(name, new bool(val)); })
.def("set", [](ir::Pass &self, const std::string &name,
int val) { self.Set<const int>(name, new int(val)); })
.def("set",
[](ir::Pass &self, const std::string &name,
std::vector<std::string> set) {
self.Set(name, new std::vector<std::string>(set));
})
.def("set",
[](ir::Pass &self, const std::string &name,
std::unordered_set<std::string> set) {
......@@ -3425,6 +3512,118 @@ All parameter, weight, gradient are variables in Paddle.
})
.def("device_count", &ParallelExecutor::DeviceCount);
#ifdef PADDLE_WITH_IPU
py::class_<platform::ipu::IpuBackend,
std::shared_ptr<platform::ipu::IpuBackend>>(m, "IpuBackend")
.def(py::init(&platform::ipu::IpuBackend::GetNewInstance))
.def("clear", &platform::ipu::IpuBackend::Clear)
.def("set_scope", &platform::ipu::IpuBackend::SetScope)
.def("set_ipu_strategy", &platform::ipu::IpuBackend::SetIpuStrategy);
py::class_<platform::ipu::IpuStrategy>(m, "IpuStrategy")
.def(py::init())
.def_property(
"num_ipus",
[](const platform::ipu::IpuStrategy &self) { return self.num_ipus; },
[](platform::ipu::IpuStrategy &self, int num_ipus) {
self.num_ipus = num_ipus;
},
R"DOC(
Int type, set the number of IPUs we need. Default 1.
)DOC")
.def_property(
"accumulationFactor",
[](const platform::ipu::IpuStrategy &self) {
return self.popart_options_.accumulationFactor;
},
[](platform::ipu::IpuStrategy &self, int accumulationFactor) {
self.popart_options_.accumulationFactor = accumulationFactor;
},
R"DOC(
Specify the number of micro-batches to accumulate before
applying the varUpdate. Default 1.
)DOC")
.def_property("batches_per_step",
[](const platform::ipu::IpuStrategy &self) {
return self.batches_per_step;
},
[](platform::ipu::IpuStrategy &self, int batches_per_step) {
self.batches_per_step = batches_per_step;
},
R"DOC(
Int type, set batches_per_step. Default 1.
)DOC")
.def_property("is_training",
[](const platform::ipu::IpuStrategy &self) {
return self.is_training;
},
[](platform::ipu::IpuStrategy &self, bool is_training) {
self.is_training = is_training;
},
R"DOC(
Bool type, True for training, False for inference. Default True.
)DOC")
.def_property(
"enable_pipelining",
[](const platform::ipu::IpuStrategy &self) {
return self.popart_options_.enablePipelining;
},
[](platform::ipu::IpuStrategy &self, bool enable_pipelining) {
self.popart_options_.enablePipelining = enable_pipelining;
},
R"DOC(
Bool type, True to enable pipelining, otherwise disable. Default False.
)DOC")
.def_property(
"enable_manual_shard",
[](const platform::ipu::IpuStrategy &self) {
return self.popart_options_.virtualGraphMode ==
platform::ipu::VirtualGraphMode::Manual;
},
[](platform::ipu::IpuStrategy &self, bool enable_ipu_shard) {
if (enable_ipu_shard) {
self.popart_options_.virtualGraphMode =
platform::ipu::VirtualGraphMode::Manual;
} else {
self.popart_options_.virtualGraphMode =
platform::ipu::VirtualGraphMode::Off;
}
},
R"DOC(
Bool type, True to enable model sharding, otherwise disable. Default False.
)DOC")
.def_property("need_avg_shard",
[](const platform::ipu::IpuStrategy &self) {
return self.need_avg_shard;
},
[](platform::ipu::IpuStrategy &self, bool need_avg_shard) {
self.need_avg_shard = need_avg_shard;
},
R"DOC(
Bool type, True to enable avg shard, otherwise disable. Default False.
)DOC")
.def_property("batch_size",
[](const platform::ipu::IpuStrategy &self) {
return self.batch_size;
},
[](platform::ipu::IpuStrategy &self, int batch_size) {
self.batch_size = batch_size;
},
R"DOC(
Int type, used to make batch size fixed. Default 1.
)DOC")
.def_property("enable_fp16",
[](const platform::ipu::IpuStrategy &self) {
return self.enable_fp16;
},
[](platform::ipu::IpuStrategy &self, bool enable_fp16) {
self.enable_fp16 = enable_fp16;
},
R"DOC(
Bool type, True to enable float16 mode, otherwise disable. Default False.)DOC");
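  // NOTE: a minimal Python-side usage sketch for the bindings above. The
  // module path (paddle.fluid.core) is an assumption for illustration, not an
  // official API example:
  //   ipu_strategy = core.IpuStrategy()
  //   ipu_strategy.num_ipus = 2
  //   ipu_strategy.is_training = True
  //   ipu_backend = core.IpuBackend()
  //   ipu_backend.set_ipu_strategy(ipu_strategy)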
#endif
BindFleetWrapper(&m);
BindIO(&m);
......
......@@ -37,6 +37,9 @@ PADDLE_DEFINE_EXPORTED_bool(
"If set true, the queue.pop will only get data from queue but not "
"remove the data from queue for speed testing");
// disable auto conversion to list in Python
PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray);
namespace paddle {
namespace pybind {
......
......@@ -313,6 +313,21 @@ void SetTensorFromPyArrayT(
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use XPUPlace in CPU/GPU version, "
"Please recompile or reinstall Paddle with XPU support."));
#endif
} else if (paddle::platform::is_ipu_place(place)) {
#ifdef PADDLE_WITH_IPU
if (zero_copy) {
auto holder = std::make_shared<details::NumpyAllocation<T>>(array);
auto type = framework::ToDataType(std::type_index(typeid(T)));
self->ResetHolderWithType(holder, type);
} else {
auto dst = self->mutable_data<T>(place);
std::memcpy(dst, array.data(), array.nbytes());
}
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use IPUPlace in CPU/GPU/XPU/NPU version, "
"Please recompile or reinstall Paddle with IPU support."));
#endif
} else if (paddle::platform::is_npu_place(place)) {
#ifdef PADDLE_WITH_ASCEND_CL
......
......@@ -22,6 +22,10 @@ set(api_source_file ${CMAKE_SOURCE_DIR}/paddle/pten/api/lib/api.cc)
set(api_header_file_tmp ${api_header_file}.tmp)
set(api_source_file_tmp ${api_source_file}.tmp)
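# Ensure PYTHON_EXECUTABLE is defined for the code-generation command below.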
if (NOT PYTHON_EXECUTABLE)
find_package(PythonInterp REQUIRED)
endif()
add_custom_command(
OUTPUT ${api_header_file} ${api_source_file}
COMMAND ${PYTHON_EXECUTABLE} -m pip install pyyaml
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/pten/core/kernel_registry.h"
// TODO(chenweihang): After the kernels are split into individual files,
// the kernel declaration statements will be generated automatically from the
// kernel file names, and this header file will be removed.
PT_DECLARE_KERNEL(full_like, CPU);
PT_DECLARE_KERNEL(dot, CPU);
PT_DECLARE_KERNEL(flatten, CPU);
PT_DECLARE_KERNEL(sign, CPU);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PT_DECLARE_KERNEL(full_like, CUDA);
PT_DECLARE_KERNEL(dot, CUDA);
PT_DECLARE_KERNEL(flatten, CUDA);
PT_DECLARE_KERNEL(sign, CUDA);
#endif
#ifdef PADDLE_WITH_XPU
PT_DECLARE_KERNEL(flatten, XPU);
#endif
......@@ -25,10 +25,14 @@ limitations under the License. */
#include "paddle/pten/include/core.h"
#include "paddle/pten/include/infermeta.h"
PT_DECLARE_MODULE(UtilsCPU);
PT_DECLARE_KERNEL(copy, CPU);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PT_DECLARE_MODULE(UtilsCUDA);
PT_DECLARE_KERNEL(copy, CUDA);
#endif
#ifdef PADDLE_WITH_XPU
PT_DECLARE_KERNEL(copy, XPU);
#endif
namespace paddle {
......
......@@ -27,13 +27,13 @@ const std::unordered_map<std::string, std::string> kernel_alias_name_map = {
{"fill_any_like", "full_like"},
{"fill_constant", "full"},
{"flatten_contiguous_range", "flatten"},
// {"matmul_v2", "matmul"},
{"matmul_v2", "matmul"},
{"reduce_mean", "mean"},
{"reduce_sum", "sum"},
{"reshape2", "reshape"},
// fluid kernel "mean/reshape/matmul/flatten/sum" should be deprecated
{"flatten", "deprecated"},
// {"matmul", "deprecated"},
{"matmul", "deprecated"},
{"mean", "deprecated"},
{"reshape", "deprecated"},
{"sum", "deprecated"}};
......
......@@ -265,12 +265,8 @@ class KernelFactory {
KernelMap& kernels() { return kernels_; }
void InsertCompatibleOpType(const std::string& op_type) {
compatible_op_types_.insert(op_type);
}
bool HasCompatiblePtenKernel(const std::string& op_type) const {
return compatible_op_types_.count(TransToPtenKernelName(op_type)) > 0;
return kernels_.find(TransToPtenKernelName(op_type)) != kernels_.end();
}
const Kernel& SelectKernelOrThrowError(const KernelName& kernel_name,
......@@ -288,9 +284,6 @@ class KernelFactory {
KernelFactory() = default;
KernelMap kernels_;
// Used to be compatible with the original execution system and
// quickly confirm whether the new kernel can be called
std::unordered_set<std::string> compatible_op_types_;
};
/** operator << overload **/
......
......@@ -15,6 +15,7 @@
#pragma once
#include <cstring>
#include <string>
#include <type_traits>
#include <typeindex>
#include <typeinfo>
......@@ -24,6 +25,8 @@
#include "paddle/pten/core/kernel_factory.h"
#include "paddle/pten/core/kernel_utils.h"
#include "paddle/fluid/platform/enforce.h"
namespace pten {
#define BACKEND(arg__) pten::Backend::arg__
......@@ -140,7 +143,6 @@ struct KernelRegistrar {
Kernel kernel(kernel_fn);
args_parse_fn(kernel_key, kernel.mutable_args_def());
args_def_fn(&kernel);
KernelFactory::Instance().InsertCompatibleOpType(kernel_name.name());
KernelFactory::Instance().kernels()[kernel_name][kernel_key] = kernel;
}
};
......@@ -193,64 +195,35 @@ struct KernelRegistrar {
#define _PT_ARG_N(args) _PT_ARG_N_EXPAND args
#define _PT_RESQ_N() 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
/** PT_REGISTER_KERNEL
*
* The most frequently used kernel registration macro, used for kernel
* registration with only the data type as the template parameter; the
* function pointer for each listed data type is automatically instantiated
* during registration.
*/
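// For example, the CPU math kernels later in this change register with:
//   PT_REGISTER_KERNEL(sign, CPU, ANY, pten::Sign, float, double) {}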
#define PT_REGISTER_KERNEL( \
kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \
_PT_REGISTER_KERNEL(kernel_name, \
PT_ID, \
backend, \
layout, \
meta_kernel_fn, \
cpp_dtype, \
__VA_ARGS__)
PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \
pt_register_kernel_ns_check_##kernel_name, \
"PT_REGISTER_KERNEL must be called in global namespace."); \
_PT_REGISTER_KERNEL( \
kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, __VA_ARGS__)
#ifndef _WIN32
#define _PT_REGISTER_KERNEL( \
kernel_name, func_id, backend, layout, meta_kernel_fn, cpp_dtype, ...) \
PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \
PT_CONCATENATE(pt_op_kernel_ns_check_, func_id), \
"PT_REGISTER_KERNEL must be called in global namespace."); \
PT_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, __VA_ARGS__); \
static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \
func_id)(::pten::Kernel*); \
PT_KERNEL_REGISTRAR_INIT(kernel_name, \
func_id, \
backend, \
layout, \
&PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id), \
meta_kernel_fn, \
cpp_dtype, \
__VA_ARGS__); \
void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \
func_id)(::pten::Kernel * kernel)
#define _PT_REGISTER_KERNEL( \
kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \
PT_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, __VA_ARGS__); \
static void __PT_KERNEL_args_def_FN_##kernel_name(::pten::Kernel*); \
PT_KERNEL_REGISTRAR_INIT(kernel_name, \
backend, \
layout, \
&__PT_KERNEL_args_def_FN_##kernel_name, \
meta_kernel_fn, \
cpp_dtype, \
__VA_ARGS__); \
void __PT_KERNEL_args_def_FN_##kernel_name(::pten::Kernel* kernel)
#else
#define _PT_REGISTER_KERNEL( \
kernel_name, func_id, backend, layout, meta_kernel_fn, cpp_dtype, ...) \
PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \
PT_CONCATENATE(pt_op_kernel_ns_check_, func_id), \
"PT_REGISTER_KERNEL must be called in global namespace."); \
static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \
func_id)(::pten::Kernel*); \
PT_KERNEL_REGISTRAR_INIT(kernel_name, \
func_id, \
backend, \
layout, \
&PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id), \
meta_kernel_fn, \
cpp_dtype, \
__VA_ARGS__); \
void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \
func_id)(::pten::Kernel * kernel)
#endif
#define PT_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, ...) \
_PT_KERNEL_INSTANTIATION(PT_NARGS(cpp_dtype, __VA_ARGS__), \
meta_kernel_fn, \
cpp_dtype, \
__VA_ARGS__)
#define _PT_KERNEL_INSTANTIATION(N, meta_kernel_fn, cpp_dtype, ...) \
PT_CONCATENATE(_PT_KERNEL_INSTANTIATION_, N) \
(meta_kernel_fn, cpp_dtype, __VA_ARGS__)
/**
* `template decltype(fn) fn` can work on gcc and clang,
* but msvc will failed, error like:
......@@ -261,8 +234,30 @@ struct KernelRegistrar {
*
* https://stackoverflow.com/questions/63989585/explicit-instantiation-of-function-using-decltype-work-on-g-but-not-on-visua
*
* So we solve the explicit instantiation of the kernels via CMake,
* and msvc can work without template instantiation.
*/
#define _PT_REGISTER_KERNEL( \
kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \
static void __PT_KERNEL_args_def_FN_##kernel_name(::pten::Kernel*); \
PT_KERNEL_REGISTRAR_INIT(kernel_name, \
backend, \
layout, \
&__PT_KERNEL_args_def_FN_##kernel_name, \
meta_kernel_fn, \
cpp_dtype, \
__VA_ARGS__); \
void __PT_KERNEL_args_def_FN_##kernel_name(::pten::Kernel* kernel)
#endif
#define PT_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, ...) \
_PT_KERNEL_INSTANTIATION(PT_NARGS(cpp_dtype, __VA_ARGS__), \
meta_kernel_fn, \
cpp_dtype, \
__VA_ARGS__)
#define _PT_KERNEL_INSTANTIATION(N, meta_kernel_fn, cpp_dtype, ...) \
PT_CONCATENATE(_PT_KERNEL_INSTANTIATION_, N) \
(meta_kernel_fn, cpp_dtype, __VA_ARGS__)
#define _PT_KERNEL_INSTANTIATION_1(meta_kernel_fn, cpp_dtype, ...) \
template decltype(meta_kernel_fn<cpp_dtype>) meta_kernel_fn<cpp_dtype>
......@@ -309,22 +304,15 @@ struct KernelRegistrar {
template decltype(meta_kernel_fn<cpp_dtype>) meta_kernel_fn<cpp_dtype>; \
PT_EXPAND(_PT_KERNEL_INSTANTIATION_14(meta_kernel_fn, __VA_ARGS__))
#define PT_KERNEL_REGISTRAR_INIT(kernel_name, \
func_id, \
backend, \
layout, \
args_def_fn, \
meta_kernel_fn, \
cpp_dtype, \
...) \
_PT_KERNEL_REGISTRAR_INIT(PT_NARGS(cpp_dtype, __VA_ARGS__), \
kernel_name, \
func_id, \
backend, \
layout, \
args_def_fn, \
meta_kernel_fn, \
cpp_dtype, \
#define PT_KERNEL_REGISTRAR_INIT( \
kernel_name, backend, layout, args_def_fn, meta_kernel_fn, cpp_dtype, ...) \
_PT_KERNEL_REGISTRAR_INIT(PT_NARGS(cpp_dtype, __VA_ARGS__), \
kernel_name, \
backend, \
layout, \
args_def_fn, \
meta_kernel_fn, \
cpp_dtype, \
__VA_ARGS__)
// clang-format off
......@@ -333,7 +321,6 @@ struct KernelRegistrar {
and multi-line macros cannot be skipped with NOLINT.*/
#define _PT_KERNEL_REGISTRAR_INIT(N, \
kernel_name, \
func_id, \
backend, \
layout, \
args_def_fn, \
......@@ -342,7 +329,6 @@ struct KernelRegistrar {
...) \
PT_CONCATENATE(_PT_KERNEL_REGISTRAR_INIT_, N) ( \
kernel_name, \
func_id, \
PT_ID, \
backend, \
layout, \
......@@ -354,7 +340,6 @@ struct KernelRegistrar {
// clang-format on
#define _PT_KERNEL_REGISTRAR_INIT_1(kernel_name, \
func_id, \
registrar_id, \
backend, \
layout, \
......@@ -363,17 +348,17 @@ struct KernelRegistrar {
cpp_dtype, \
...) \
static const ::pten::KernelRegistrar PT_CONCATENATE( \
__reg_pt_op_kernel_##func_id##_, registrar_id)( \
kernel_name, \
__reg_pt_kernel_##kernel_name##_, registrar_id)( \
#kernel_name, \
BACKEND(backend), \
DATALAYOUT(layout), \
::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
::pten::KernelArgsParseFunctor<decltype( \
&meta_kernel_fn<cpp_dtype>)>::Parse, \
args_def_fn, \
PT_KERNEL(meta_kernel_fn<cpp_dtype>));
PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
int TouchKernelSymbolFor_##kernel_name##_##backend() { return 0; }
#define _PT_KERNEL_REGISTRAR_INIT_2(kernel_name, \
func_id, \
registrar_id, \
backend, \
layout, \
......@@ -382,8 +367,8 @@ struct KernelRegistrar {
cpp_dtype, \
...) \
static const ::pten::KernelRegistrar PT_CONCATENATE( \
__reg_pt_op_kernel_##func_id##_, registrar_id)( \
kernel_name, \
__reg_pt_kernel_##kernel_name##_, registrar_id)( \
#kernel_name, \
BACKEND(backend), \
DATALAYOUT(layout), \
::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
......@@ -392,7 +377,6 @@ struct KernelRegistrar {
args_def_fn, \
PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_1(kernel_name, \
func_id, \
PT_ID, \
backend, \
layout, \
......@@ -400,7 +384,6 @@ struct KernelRegistrar {
meta_kernel_fn, \
__VA_ARGS__))
#define _PT_KERNEL_REGISTRAR_INIT_3(kernel_name, \
func_id, \
registrar_id, \
backend, \
layout, \
......@@ -409,8 +392,8 @@ struct KernelRegistrar {
cpp_dtype, \
...) \
static const ::pten::KernelRegistrar PT_CONCATENATE( \
__reg_pt_op_kernel_##func_id##_, registrar_id)( \
kernel_name, \
__reg_pt_kernel_##kernel_name##_, registrar_id)( \
#kernel_name, \
BACKEND(backend), \
DATALAYOUT(layout), \
::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
......@@ -419,7 +402,6 @@ struct KernelRegistrar {
args_def_fn, \
PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_2(kernel_name, \
func_id, \
PT_ID, \
backend, \
layout, \
......@@ -427,7 +409,6 @@ struct KernelRegistrar {
meta_kernel_fn, \
__VA_ARGS__))
#define _PT_KERNEL_REGISTRAR_INIT_4(kernel_name, \
func_id, \
registrar_id, \
backend, \
layout, \
......@@ -436,8 +417,8 @@ struct KernelRegistrar {
cpp_dtype, \
...) \
static const ::pten::KernelRegistrar PT_CONCATENATE( \
__reg_pt_op_kernel_##func_id##_, registrar_id)( \
kernel_name, \
__reg_pt_kernel_##kernel_name##_, registrar_id)( \
#kernel_name, \
BACKEND(backend), \
DATALAYOUT(layout), \
::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
......@@ -446,7 +427,6 @@ struct KernelRegistrar {
args_def_fn, \
PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_3(kernel_name, \
func_id, \
PT_ID, \
backend, \
layout, \
......@@ -454,7 +434,6 @@ struct KernelRegistrar {
meta_kernel_fn, \
__VA_ARGS__))
#define _PT_KERNEL_REGISTRAR_INIT_5(kernel_name, \
func_id, \
registrar_id, \
backend, \
layout, \
......@@ -463,8 +442,8 @@ struct KernelRegistrar {
cpp_dtype, \
...) \
static const ::pten::KernelRegistrar PT_CONCATENATE( \
__reg_pt_op_kernel_##func_id##_, registrar_id)( \
kernel_name, \
__reg_pt_kernel_##kernel_name##_, registrar_id)( \
#kernel_name, \
BACKEND(backend), \
DATALAYOUT(layout), \
::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
......@@ -473,7 +452,6 @@ struct KernelRegistrar {
args_def_fn, \
PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_4(kernel_name, \
func_id, \
PT_ID, \
backend, \
layout, \
......@@ -481,7 +459,6 @@ struct KernelRegistrar {
meta_kernel_fn, \
__VA_ARGS__))
#define _PT_KERNEL_REGISTRAR_INIT_6(kernel_name, \
func_id, \
registrar_id, \
backend, \
layout, \
......@@ -490,8 +467,8 @@ struct KernelRegistrar {
cpp_dtype, \
...) \
static const ::pten::KernelRegistrar PT_CONCATENATE( \
__reg_pt_op_kernel_##func_id##_, registrar_id)( \
kernel_name, \
__reg_pt_kernel_##kernel_name##_, registrar_id)( \
#kernel_name, \
BACKEND(backend), \
DATALAYOUT(layout), \
::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
......@@ -500,7 +477,6 @@ struct KernelRegistrar {
args_def_fn, \
PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_5(kernel_name, \
func_id, \
PT_ID, \
backend, \
layout, \
......@@ -508,7 +484,6 @@ struct KernelRegistrar {
meta_kernel_fn, \
__VA_ARGS__))
#define _PT_KERNEL_REGISTRAR_INIT_7(kernel_name, \
func_id, \
registrar_id, \
backend, \
layout, \
......@@ -517,8 +492,8 @@ struct KernelRegistrar {
cpp_dtype, \
...) \
static const ::pten::KernelRegistrar PT_CONCATENATE( \
__reg_pt_op_kernel_##func_id##_, registrar_id)( \
kernel_name, \
__reg_pt_kernel_##kernel_name##_, registrar_id)( \
#kernel_name, \
BACKEND(backend), \
DATALAYOUT(layout), \
::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
......@@ -527,7 +502,6 @@ struct KernelRegistrar {
args_def_fn, \
PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_6(kernel_name, \
func_id, \
PT_ID, \
backend, \
layout, \
......@@ -535,7 +509,6 @@ struct KernelRegistrar {
meta_kernel_fn, \
__VA_ARGS__))
#define _PT_KERNEL_REGISTRAR_INIT_8(kernel_name, \
func_id, \
registrar_id, \
backend, \
layout, \
......@@ -544,8 +517,8 @@ struct KernelRegistrar {
cpp_dtype, \
...) \
static const ::pten::KernelRegistrar PT_CONCATENATE( \
__reg_pt_op_kernel_##func_id##_, registrar_id)( \
kernel_name, \
__reg_pt_kernel_##kernel_name##_, registrar_id)( \
#kernel_name, \
BACKEND(backend), \
DATALAYOUT(layout), \
::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
......@@ -554,7 +527,6 @@ struct KernelRegistrar {
args_def_fn, \
PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_7(kernel_name, \
func_id, \
PT_ID, \
backend, \
layout, \
......@@ -562,7 +534,6 @@ struct KernelRegistrar {
meta_kernel_fn, \
__VA_ARGS__))
#define _PT_KERNEL_REGISTRAR_INIT_9(kernel_name, \
func_id, \
registrar_id, \
backend, \
layout, \
......@@ -571,8 +542,8 @@ struct KernelRegistrar {
cpp_dtype, \
...) \
static const ::pten::KernelRegistrar PT_CONCATENATE( \
__reg_pt_op_kernel_##func_id##_, registrar_id)( \
kernel_name, \
__reg_pt_kernel_##kernel_name##_, registrar_id)( \
#kernel_name, \
BACKEND(backend), \
DATALAYOUT(layout), \
::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
......@@ -581,7 +552,6 @@ struct KernelRegistrar {
args_def_fn, \
PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_8(kernel_name, \
func_id, \
PT_ID, \
backend, \
layout, \
......@@ -589,7 +559,6 @@ struct KernelRegistrar {
meta_kernel_fn, \
__VA_ARGS__))
#define _PT_KERNEL_REGISTRAR_INIT_10(kernel_name, \
func_id, \
registrar_id, \
backend, \
layout, \
......@@ -598,8 +567,8 @@ struct KernelRegistrar {
cpp_dtype, \
...) \
static const ::pten::KernelRegistrar PT_CONCATENATE( \
__reg_pt_op_kernel_##func_id##_, registrar_id)( \
kernel_name, \
__reg_pt_kernel_##kernel_name##_, registrar_id)( \
#kernel_name, \
BACKEND(backend), \
DATALAYOUT(layout), \
::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
......@@ -608,7 +577,6 @@ struct KernelRegistrar {
args_def_fn, \
PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_9(kernel_name, \
func_id, \
PT_ID, \
backend, \
layout, \
......@@ -616,7 +584,6 @@ struct KernelRegistrar {
meta_kernel_fn, \
__VA_ARGS__))
#define _PT_KERNEL_REGISTRAR_INIT_11(kernel_name, \
func_id, \
registrar_id, \
backend, \
layout, \
......@@ -625,8 +592,8 @@ struct KernelRegistrar {
cpp_dtype, \
...) \
static const ::pten::KernelRegistrar PT_CONCATENATE( \
__reg_pt_op_kernel_##func_id##_, registrar_id)( \
kernel_name, \
__reg_pt_kernel_##kernel_name##_, registrar_id)( \
#kernel_name, \
BACKEND(backend), \
DATALAYOUT(layout), \
::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
......@@ -635,7 +602,6 @@ struct KernelRegistrar {
args_def_fn, \
PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_10(kernel_name, \
func_id, \
PT_ID, \
backend, \
layout, \
......@@ -643,7 +609,6 @@ struct KernelRegistrar {
meta_kernel_fn, \
__VA_ARGS__))
#define _PT_KERNEL_REGISTRAR_INIT_12(kernel_name, \
func_id, \
registrar_id, \
backend, \
layout, \
......@@ -652,8 +617,8 @@ struct KernelRegistrar {
cpp_dtype, \
...) \
static const ::pten::KernelRegistrar PT_CONCATENATE( \
__reg_pt_op_kernel_##func_id##_, registrar_id)( \
kernel_name, \
__reg_pt_kernel_##kernel_name##_, registrar_id)( \
#kernel_name, \
BACKEND(backend), \
DATALAYOUT(layout), \
::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
......@@ -662,7 +627,6 @@ struct KernelRegistrar {
args_def_fn, \
PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_11(kernel_name, \
func_id, \
PT_ID, \
backend, \
layout, \
......@@ -670,7 +634,6 @@ struct KernelRegistrar {
meta_kernel_fn, \
__VA_ARGS__))
#define _PT_KERNEL_REGISTRAR_INIT_13(kernel_name, \
func_id, \
registrar_id, \
backend, \
layout, \
......@@ -679,8 +642,8 @@ struct KernelRegistrar {
cpp_dtype, \
...) \
static const ::pten::KernelRegistrar PT_CONCATENATE( \
__reg_pt_op_kernel_##func_id##_, registrar_id)( \
kernel_name, \
__reg_pt_kernel_##kernel_name##_, registrar_id)( \
#kernel_name, \
BACKEND(backend), \
DATALAYOUT(layout), \
::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
......@@ -689,7 +652,6 @@ struct KernelRegistrar {
args_def_fn, \
PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_12(kernel_name, \
func_id, \
PT_ID, \
backend, \
layout, \
......@@ -697,7 +659,6 @@ struct KernelRegistrar {
meta_kernel_fn, \
__VA_ARGS__))
#define _PT_KERNEL_REGISTRAR_INIT_14(kernel_name, \
func_id, \
registrar_id, \
backend, \
layout, \
......@@ -706,8 +667,8 @@ struct KernelRegistrar {
cpp_dtype, \
...) \
static const ::pten::KernelRegistrar PT_CONCATENATE( \
__reg_pt_op_kernel_##func_id##_, registrar_id)( \
kernel_name, \
__reg_pt_kernel_##kernel_name##_, registrar_id)( \
#kernel_name, \
BACKEND(backend), \
DATALAYOUT(layout), \
::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
......@@ -716,7 +677,6 @@ struct KernelRegistrar {
args_def_fn, \
PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_13(kernel_name, \
func_id, \
PT_ID, \
backend, \
layout, \
......@@ -724,7 +684,6 @@ struct KernelRegistrar {
meta_kernel_fn, \
__VA_ARGS__))
#define _PT_KERNEL_REGISTRAR_INIT_15(kernel_name, \
func_id, \
registrar_id, \
backend, \
layout, \
......@@ -733,8 +692,8 @@ struct KernelRegistrar {
cpp_dtype, \
...) \
static const ::pten::KernelRegistrar PT_CONCATENATE( \
__reg_pt_op_kernel_##func_id##_, registrar_id)( \
kernel_name, \
__reg_pt_kernel_##kernel_name##_, registrar_id)( \
#kernel_name, \
BACKEND(backend), \
DATALAYOUT(layout), \
::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
......@@ -743,7 +702,6 @@ struct KernelRegistrar {
args_def_fn, \
PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_14(kernel_name, \
func_id, \
PT_ID, \
backend, \
layout, \
......@@ -751,90 +709,59 @@ struct KernelRegistrar {
meta_kernel_fn, \
__VA_ARGS__))
#define PT_REGISTER_KERNEL_STANDARD( \
kernel_name, backend, layout, dtype, kernel_fn) \
_PT_REGISTER_KERNEL_STANDARD( \
kernel_name, PT_ID, backend, layout, dtype, kernel_fn)
#define _PT_REGISTER_KERNEL_STANDARD( \
kernel_name, func_id, backend, layout, dtype, kernel_fn) \
PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \
PT_CONCATENATE(pt_op_kernel_ns_check_, func_id), \
"_PT_REGISTER_KERNEL_STANDARD must be called in global namespace."); \
template decltype(kernel_fn) kernel_fn; \
static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \
func_id)(::pten::Kernel*); \
static const ::pten::KernelRegistrar PT_CONCATENATE(__reg_pt_op_kernel_, \
func_id)( \
kernel_name, \
BACKEND(backend), \
DATALAYOUT(layout), \
DATATYPE(dtype), \
::pten::KernelArgsParseFunctor<decltype(&kernel_fn)>::Parse, \
args_def_fn, \
PT_KERNEL(kernel_fn)); \
void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id)(::pten::Kernel*)
// use to declare symbol
#define PT_REGISTER_MODULE(name) \
int RegisterSymbolsFor##name() { return 0; }
#define PT_DECLARE_MODULE(name) \
extern int RegisterSymbolsFor##name(); \
UNUSED static int use_kernel_module_##name = RegisterSymbolsFor##name()
// only used in cpp tests
#define PT_REGISTER_KERNEL_FOR_TEST( \
kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \
_PT_REGISTER_KERNEL_FOR_TEST(kernel_name, \
PT_ID, \
backend, \
layout, \
meta_kernel_fn, \
cpp_dtype, \
__VA_ARGS__)
#define _PT_REGISTER_KERNEL_FOR_TEST( \
kernel_name, func_id, backend, layout, meta_kernel_fn, cpp_dtype, ...) \
PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \
PT_CONCATENATE(pt_op_kernel_for_test_ns_check_, func_id), \
"PT_REGISTER_KERNEL must be called in global namespace."); \
static void PT_CONCATENATE(__PT_KERNEL_for_test_args_def_FN_, \
func_id)(::pten::Kernel*); \
PT_KERNEL_REGISTRAR_INIT( \
kernel_name, \
func_id, \
backend, \
layout, \
&PT_CONCATENATE(__PT_KERNEL_for_test_args_def_FN_, func_id), \
meta_kernel_fn, \
cpp_dtype, \
__VA_ARGS__); \
void PT_CONCATENATE(__PT_KERNEL_for_test_args_def_FN_, \
func_id)(::pten::Kernel * kernel)
#define PT_REGISTER_KERNEL_WITH_NO_TYPE( \
kernel_name, backend, layout, meta_kernel_fn) \
_PT_REGISTER_KERNEL_WITH_NO_TYPE( \
kernel_name, PT_ID, backend, layout, meta_kernel_fn)
#define _PT_REGISTER_KERNEL_WITH_NO_TYPE( \
kernel_name, func_id, backend, layout, meta_kernel_fn) \
PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \
PT_CONCATENATE(pt_op_kernel_ns_check_, func_id), \
"PT_REGISTER_KERNEL must be called in global namespace."); \
decltype(meta_kernel_fn) meta_kernel_fn; \
static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \
func_id)(::pten::Kernel*); \
static const ::pten::KernelRegistrar PT_CONCATENATE(__reg_pt_op_kernel_, \
func_id)( \
kernel_name, \
BACKEND(backend), \
DATALAYOUT(layout), \
::pten::KernelArgsParseFunctor<decltype(&meta_kernel_fn)>::Parse, \
&PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id), \
PT_KERNEL(meta_kernel_fn)); \
void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \
func_id)(::pten::Kernel * kernel)
/** PT_REGISTER_SINGLE_KERNEL
*
* Used to register a single kernel. Pass in the complete function pointer
* of the kernel; this registration macro does not do automatic template
* instantiation.
*/
#define PT_REGISTER_SINGLE_KERNEL( \
kernel_name, backend, layout, dtype, kernel_fn) \
PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \
pt_register_single_kernel_ns_check_##kernel_name, \
"PT_REGISTER_SINGLE_KERNEL must be called in global namespace."); \
static void __PT_SINGLE_KERNEL_args_def_FN_##kernel_name(::pten::Kernel*); \
static const ::pten::KernelRegistrar __reg_pt_single_kernel_##kernel_name( \
#kernel_name, \
BACKEND(backend), \
DATALAYOUT(layout), \
DATATYPE(dtype), \
::pten::KernelArgsParseFunctor<decltype(&kernel_fn)>::Parse, \
args_def_fn, \
PT_KERNEL(kernel_fn)); \
int TouchKernelSymbolFor_##kernel_name##_##backend() { return 0; } \
void __PT_SINGLE_KERNEL_args_def_FN_##kernel_name(::pten::Kernel*)
/** PT_REGISTER_KERNEL_ALL_DTYPE
*
* Used to register a kernel that supports all data types, such as copy and
* reshape, which are not sensitive to data types.
*/
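// For example, the copy kernels later in this change are registered with:
//   PT_REGISTER_KERNEL_ALL_DTYPE(copy, CPU, ANY, pten::Copy) {}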
#define PT_REGISTER_KERNEL_ALL_DTYPE(kernel_name, backend, layout, kernel_fn) \
PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \
pt_register_kernel_all_dtype_ns_check_##kernel_name, \
"PT_REGISTER_KERNEL_ALL_DTYPE must be called in global namespace."); \
static void __PT_KERNEL_ALL_DTYPE_args_def_FN_##kernel_name( \
::pten::Kernel*); \
static const ::pten::KernelRegistrar \
__reg_pt_kernel_all_dtype_##kernel_name( \
#kernel_name, \
BACKEND(backend), \
DATALAYOUT(layout), \
::pten::KernelArgsParseFunctor<decltype(&kernel_fn)>::Parse, \
&__PT_KERNEL_ALL_DTYPE_args_def_FN_##kernel_name, \
PT_KERNEL(kernel_fn)); \
int TouchKernelSymbolFor_##kernel_name##_##backend() { return 0; } \
void __PT_KERNEL_ALL_DTYPE_args_def_FN_##kernel_name(::pten::Kernel* kernel)
/** PT_DECLARE_KERNEL
*
* Used to export the symbols of the file where the kernel is located,
* so that they are not removed by the linker.
*/
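// For example, the declarations earlier in this change use:
//   PT_DECLARE_KERNEL(full_like, CPU);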
#define PT_DECLARE_KERNEL(kernel_name, backend) \
extern int TouchKernelSymbolFor_##kernel_name##_##backend(); \
UNUSED static int __declare_kernel_symbol_for_##kernel_name##_##backend = \
TouchKernelSymbolFor_##kernel_name##_##backend()
} // namespace pten
......@@ -61,9 +61,7 @@ void FillConstant(const CPUContext& dev_ctx,
} // namespace pten
PT_REGISTER_MODULE(CreationCPU);
PT_REGISTER_KERNEL("full_like",
PT_REGISTER_KERNEL(full_like,
CPU,
ANY,
pten::FillAnyLike,
......@@ -74,7 +72,7 @@ PT_REGISTER_KERNEL("full_like",
bool,
paddle::platform::float16) {}
PT_REGISTER_KERNEL("full",
PT_REGISTER_KERNEL(full,
CPU,
ANY,
pten::FillConstant,
......
......@@ -70,12 +70,10 @@ void Matmul(const CPUContext& dev_ctx,
} // namespace pten
PT_REGISTER_MODULE(LinalgCPU);
using complex64 = ::paddle::platform::complex<float>;
using complex128 = ::paddle::platform::complex<double>;
PT_REGISTER_KERNEL("dot",
PT_REGISTER_KERNEL(dot,
CPU,
ANY,
pten::Dot,
......@@ -87,5 +85,4 @@ PT_REGISTER_KERNEL("dot",
complex128) {}
PT_REGISTER_KERNEL(
"matmul_v2", CPU, ANY, pten::Matmul, float, double, complex64, complex128) {
}
matmul, CPU, ANY, pten::Matmul, float, double, complex64, complex128) {}
......@@ -130,12 +130,9 @@ void Cast(const CPUContext& dev_ctx,
} // namespace pten
// TODO(chenweihang): replace by better impl
PT_REGISTER_MODULE(ManipulationCPU);
// TODO(yuanrisheng): "flatten_contiguous_range" is compatible with old kernel
// architecture, kernel_name should be "flatten".
PT_REGISTER_KERNEL("flatten",
PT_REGISTER_KERNEL(flatten,
CPU,
ANY,
pten::Flatten,
......@@ -145,8 +142,7 @@ PT_REGISTER_KERNEL("flatten",
int8_t,
int,
int64_t) {}
PT_REGISTER_KERNEL("flatten.mid",
PT_REGISTER_KERNEL(flatten_mid,
CPU,
ANY,
pten::FlattenWithXShape,
......@@ -156,7 +152,8 @@ PT_REGISTER_KERNEL("flatten.mid",
int8_t,
int,
int64_t) {}
PT_REGISTER_KERNEL("cast",
PT_REGISTER_KERNEL(cast,
CPU,
ANY,
pten::Cast,
......@@ -174,42 +171,33 @@ PT_REGISTER_KERNEL("cast",
kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
}
// TODO(yuanrisheng): "reshape2" is compatible with old kernel
// architecture, kernel_name should be "reshape".
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape",
CPU,
ANY,
pten::ReshapeFromVectorVal) {}
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mid",
CPU,
ANY,
pten::ReshapeFromVectorValWithXShape) {}
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.host", CPU, ANY, pten::ReshapeFromDT) {
PT_REGISTER_KERNEL_ALL_DTYPE(reshape, CPU, ANY, pten::ReshapeFromVectorVal) {}
PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mid,
CPU,
ANY,
pten::ReshapeFromVectorValWithXShape) {}
PT_REGISTER_KERNEL_ALL_DTYPE(reshape_host, CPU, ANY, pten::ReshapeFromDT) {
kernel->InputAt(1).SetBackend(pten::Backend::CPU);
kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
}
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.host.mid",
CPU,
ANY,
pten::ReshapeFromDTWithXShape) {
PT_REGISTER_KERNEL_ALL_DTYPE(reshape_host_mid,
CPU,
ANY,
pten::ReshapeFromDTWithXShape) {
kernel->InputAt(1).SetBackend(pten::Backend::CPU);
kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
}
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mulhost",
CPU,
ANY,
pten::ReshapeFromVectorDT) {
PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mulhost,
CPU,
ANY,
pten::ReshapeFromVectorDT) {
kernel->InputAt(1).SetBackend(pten::Backend::CPU);
kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
}
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mulhost.mid",
CPU,
ANY,
pten::ReshapeFromVectorDTWithXShape) {
PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mulhost_mid,
CPU,
ANY,
pten::ReshapeFromVectorDTWithXShape) {
kernel->InputAt(1).SetBackend(pten::Backend::CPU);
kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
}
......@@ -106,18 +106,14 @@ DEFINE_CPU_ELEMENTWISE_OP(Mul)
} // namespace pten
// TODO(chenweihang): replace by better impl
PT_REGISTER_MODULE(MathCPU);
using complex64 = ::paddle::platform::complex<float>;
using complex128 = ::paddle::platform::complex<double>;
// NOTE(chenweihang): using bfloat16 here would cause a redefinition conflict with the xpu bfloat16
// using bfloat16 = ::paddle::platform::bfloat16;
PT_REGISTER_KERNEL("sign", CPU, ANY, pten::Sign, float, double) {}
PT_REGISTER_KERNEL("mean", CPU, ANY, pten::Mean, float, double, bool) {}
PT_REGISTER_KERNEL("scale",
PT_REGISTER_KERNEL(sign, CPU, ANY, pten::Sign, float, double) {}
PT_REGISTER_KERNEL(mean, CPU, ANY, pten::Mean, float, double, bool) {}
PT_REGISTER_KERNEL(scale,
CPU,
ANY,
pten::Scale,
......@@ -129,8 +125,7 @@ PT_REGISTER_KERNEL("scale",
int16_t,
int,
int64_t) {}
PT_REGISTER_KERNEL("add",
PT_REGISTER_KERNEL(add,
CPU,
ANY,
pten::ElementwiseAdd,
......@@ -140,7 +135,7 @@ PT_REGISTER_KERNEL("add",
int64_t,
complex64,
complex128) {}
PT_REGISTER_KERNEL("subtract",
PT_REGISTER_KERNEL(subtract,
CPU,
ANY,
pten::ElementwiseSub,
......@@ -150,7 +145,7 @@ PT_REGISTER_KERNEL("subtract",
int64_t,
complex64,
complex128) {}
PT_REGISTER_KERNEL("divide",
PT_REGISTER_KERNEL(divide,
CPU,
ANY,
pten::ElementwiseDiv,
......@@ -160,7 +155,7 @@ PT_REGISTER_KERNEL("divide",
int64_t,
complex64,
complex128) {}
PT_REGISTER_KERNEL("multiply",
PT_REGISTER_KERNEL(multiply,
CPU,
ANY,
pten::ElementwiseMul,
......@@ -171,8 +166,7 @@ PT_REGISTER_KERNEL("multiply",
bool,
complex64,
complex128) {}
PT_REGISTER_KERNEL("sum",
PT_REGISTER_KERNEL(sum,
CPU,
ANY,
pten::Sum,
......
......@@ -57,7 +57,4 @@ void Copy(const CPUContext& dev_ctx,
} // namespace pten
// TODO(chenweihang): replace by better impl
PT_REGISTER_MODULE(UtilsCPU);
PT_REGISTER_KERNEL_WITH_NO_TYPE("copy", CPU, ANY, pten::Copy) {}
PT_REGISTER_KERNEL_ALL_DTYPE(copy, CPU, ANY, pten::Copy) {}
......@@ -62,9 +62,7 @@ void FillConstant(const CUDAContext& dev_ctx,
} // namespace pten
PT_REGISTER_MODULE(CreationCUDA);
PT_REGISTER_KERNEL("full_like",
PT_REGISTER_KERNEL(full_like,
CUDA,
ANY,
pten::FillAnyLike,
......@@ -75,7 +73,7 @@ PT_REGISTER_KERNEL("full_like",
bool,
paddle::platform::float16) {}
PT_REGISTER_KERNEL("full",
PT_REGISTER_KERNEL(full,
CUDA,
ANY,
pten::FillConstant,
......
......@@ -54,13 +54,11 @@ void Matmul(const CUDAContext& dev_ctx,
} // namespace pten
PT_REGISTER_MODULE(LinalgCUDA);
using float16 = paddle::platform::float16;
using complex64 = ::paddle::platform::complex<float>;
using complex128 = ::paddle::platform::complex<double>;
PT_REGISTER_KERNEL("dot",
PT_REGISTER_KERNEL(dot,
CUDA,
ANY,
pten::Dot,
......@@ -71,7 +69,7 @@ PT_REGISTER_KERNEL("dot",
complex64,
complex128) {}
PT_REGISTER_KERNEL("matmul_v2",
PT_REGISTER_KERNEL(matmul,
CUDA,
ANY,
pten::Matmul,
......
......@@ -129,13 +129,9 @@ void Cast(const CUDAContext& dev_ctx,
} // namespace pten
// TODO(chenweihang): replace by better impl
PT_REGISTER_MODULE(ManipulationCUDA);
using float16 = paddle::platform::float16;
// TODO(yuanrisheng): "flatten_contiguous_range" is compatible with old kernel
// architecture, kernel_name should be "flatten".
PT_REGISTER_KERNEL("flatten",
PT_REGISTER_KERNEL(flatten,
CUDA,
ANY,
pten::Flatten,
......@@ -146,8 +142,7 @@ PT_REGISTER_KERNEL("flatten",
int8_t,
int,
int64_t) {}
PT_REGISTER_KERNEL("flatten.mid",
PT_REGISTER_KERNEL(flatten_mid,
CUDA,
ANY,
pten::FlattenWithXShape,
......@@ -159,7 +154,7 @@ PT_REGISTER_KERNEL("flatten.mid",
int64_t) {}
#define PTEN_REGISTER_CAST_CUDA_BASE_TYPE(op_name, ...) \
PT_REGISTER_KERNEL("cast", \
PT_REGISTER_KERNEL(cast, \
CUDA, \
ANY, \
pten::Cast, \
......@@ -184,44 +179,33 @@ PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast, paddle::platform::bfloat16)
PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast)
#endif
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape",
CUDA,
ANY,
pten::ReshapeFromVectorVal) {}
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mid",
CUDA,
ANY,
pten::ReshapeFromVectorValWithXShape) {}
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.host",
CUDA,
ANY,
pten::ReshapeFromDT) {
PT_REGISTER_KERNEL_ALL_DTYPE(reshape, CUDA, ANY, pten::ReshapeFromVectorVal) {}
PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mid,
CUDA,
ANY,
pten::ReshapeFromVectorValWithXShape) {}
PT_REGISTER_KERNEL_ALL_DTYPE(reshape_host, CUDA, ANY, pten::ReshapeFromDT) {
kernel->InputAt(1).SetBackend(pten::Backend::CPU);
kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
}
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.host.mid",
CUDA,
ANY,
pten::ReshapeFromDTWithXShape) {
PT_REGISTER_KERNEL_ALL_DTYPE(reshape_host_mid,
CUDA,
ANY,
pten::ReshapeFromDTWithXShape) {
kernel->InputAt(1).SetBackend(pten::Backend::CPU);
kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
}
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mulhost",
CUDA,
ANY,
pten::ReshapeFromVectorDT) {
PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mulhost,
CUDA,
ANY,
pten::ReshapeFromVectorDT) {
kernel->InputAt(1).SetBackend(pten::Backend::CPU);
kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
}
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mulhost.mid",
CUDA,
ANY,
pten::ReshapeFromVectorDTWithXShape) {
PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mulhost_mid,
CUDA,
ANY,
pten::ReshapeFromVectorDTWithXShape) {
kernel->InputAt(1).SetBackend(pten::Backend::CPU);
kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
}
......@@ -111,16 +111,13 @@ void Sum(const CUDAContext& dev_ctx,
} // namespace pten
// TODO(chenweihang): replace by better impl
PT_REGISTER_MODULE(MathCUDA);
using float16 = paddle::platform::float16;
using complex64 = ::paddle::platform::complex<float>;
using complex128 = ::paddle::platform::complex<double>;
PT_REGISTER_KERNEL("sign", CUDA, ANY, pten::Sign, float, double, float16) {}
PT_REGISTER_KERNEL("mean", CUDA, ANY, pten::Mean, float, double, bool) {}
PT_REGISTER_KERNEL("scale",
PT_REGISTER_KERNEL(sign, CUDA, ANY, pten::Sign, float, double, float16) {}
PT_REGISTER_KERNEL(mean, CUDA, ANY, pten::Mean, float, double, bool) {}
PT_REGISTER_KERNEL(scale,
CUDA,
ANY,
pten::Scale,
......@@ -132,7 +129,7 @@ PT_REGISTER_KERNEL("scale",
int16_t,
int,
int64_t) {}
PT_REGISTER_KERNEL("add",
PT_REGISTER_KERNEL(add,
CUDA,
ANY,
pten::ElementwiseAdd,
......@@ -143,7 +140,7 @@ PT_REGISTER_KERNEL("add",
float16,
complex64,
complex128) {}
PT_REGISTER_KERNEL("subtract",
PT_REGISTER_KERNEL(subtract,
CUDA,
ANY,
pten::ElementwiseSub,
......@@ -154,7 +151,7 @@ PT_REGISTER_KERNEL("subtract",
float16,
complex64,
complex128) {}
PT_REGISTER_KERNEL("divide",
PT_REGISTER_KERNEL(divide,
CUDA,
ANY,
pten::ElementwiseDiv,
......@@ -165,7 +162,7 @@ PT_REGISTER_KERNEL("divide",
float16,
complex64,
complex128) {}
PT_REGISTER_KERNEL("multiply",
PT_REGISTER_KERNEL(multiply,
CUDA,
ANY,
pten::ElementwiseMul,
......@@ -177,7 +174,7 @@ PT_REGISTER_KERNEL("multiply",
float16,
complex64,
complex128) {}
PT_REGISTER_KERNEL("sum",
PT_REGISTER_KERNEL(sum,
CUDA,
ANY,
pten::Sum,
......
......@@ -234,7 +234,4 @@ void Copy(const CUDAContext& dev_ctx,
}
} // namespace pten
// TODO(chenweihang): replace by better impl
PT_REGISTER_MODULE(UtilsCUDA);
PT_REGISTER_KERNEL_WITH_NO_TYPE("copy", CUDA, ANY, pten::Copy) {}
PT_REGISTER_KERNEL_ALL_DTYPE(copy, CUDA, ANY, pten::Copy) {}
......@@ -769,6 +769,23 @@ static void LaunchReduceKernel(const Tx* x_data,
}
}
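// Helper for the reduce kernel below: choose the CUDA device context of
// whichever side already lives on the device (preferring dst, falling back to
// src) and forward the copy to pten::Copy.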
void TensorCopy(const DenseTensor& src, DenseTensor* dst) {
paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance();
const paddle::platform::CUDADeviceContext* dev_ctx;
if (paddle::platform::is_gpu_place(dst->place()) ||
paddle::platform::is_npu_place(dst->place())) {
dev_ctx = static_cast<paddle::platform::CUDADeviceContext*>(
pool.Get(dst->place()));
} else {
dev_ctx = static_cast<paddle::platform::CUDADeviceContext*>(
pool.Get(src.place()));
}
pten::Copy(*dev_ctx, src, false, dst);
}
template <typename Tx,
typename Ty,
template <typename, typename> class ReduceOp>
......@@ -800,7 +817,7 @@ void TensorReduceFunctorImpl(const pten::DenseTensor& x,
if (config.reduce_num == 1) {
auto out_dims = y->dims();
if (x.dtype() == y->dtype()) {
pten::Copy(*dev_ctx, x, true, y);
TensorCopy(x, y);
y->Resize(out_dims);
} else {
PD_VISIT_ALL_TYPES(y->dtype(), "CastKernelImpl", ([&] {
......
......@@ -95,12 +95,7 @@ void ReshapeFromVectorDT(const XPUContext& dev_ctx,
} // namespace pten
// TODO(chenweihang): replace by better impl
PT_REGISTER_MODULE(ManipulationXPU);
// TODO(yuanrisheng): "flatten_contiguous_range" is compatible with old kernel
// architecture, kernel_name should be "flatten".
PT_REGISTER_KERNEL("flatten_contiguous_range",
PT_REGISTER_KERNEL(flatten,
XPU,
ANY,
pten::Flatten,
......@@ -112,7 +107,7 @@ PT_REGISTER_KERNEL("flatten_contiguous_range",
int,
int64_t) {}
PT_REGISTER_KERNEL("flatten_contiguous_range.mid",
PT_REGISTER_KERNEL(flatten_mid,
XPU,
ANY,
pten::FlattenWithXShape,
......@@ -124,9 +119,4 @@ PT_REGISTER_KERNEL("flatten_contiguous_range.mid",
int,
int64_t) {}
// TODO(yuanrisheng): "reshape2" is compatible with old kernel
// architecture, kernel_name should be "reshape".
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2",
XPU,
ANY,
pten::ReshapeFromVectorVal) {}
PT_REGISTER_KERNEL_ALL_DTYPE(reshape, XPU, ANY, pten::ReshapeFromVectorVal) {}
......@@ -76,7 +76,4 @@ void Copy(const XPUDeviceContext& dev_ctx,
} // namespace pten
// TODO(chenweihang): replace by better impl
PT_REGISTER_MODULE(UtilsXPU);
PT_REGISTER_KERNEL_WITH_NO_TYPE("copy", XPU, ANY, pten::Copy) {}
PT_REGISTER_KERNEL_ALL_DTYPE(copy, XPU, ANY, pten::Copy) {}
......@@ -21,12 +21,6 @@ limitations under the License. */
#include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/core/kernel_registry.h"
PT_DECLARE_MODULE(ManipulationCPU);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PT_DECLARE_MODULE(ManipulationCUDA);
#endif
namespace paddle {
namespace tests {
......
......@@ -156,6 +156,9 @@ from .tensor.manipulation import roll # noqa: F401
from .tensor.manipulation import chunk # noqa: F401
from .tensor.manipulation import tolist # noqa: F401
from .tensor.manipulation import tensordot # noqa: F401
from .tensor.manipulation import as_complex # noqa: F401
from .tensor.manipulation import as_real # noqa: F401
from .tensor.math import abs # noqa: F401
from .tensor.math import acos # noqa: F401
from .tensor.math import asin # noqa: F401
......@@ -227,6 +230,8 @@ from .tensor.math import lgamma # noqa: F401
from .tensor.math import lerp # noqa: F401
from .tensor.math import rad2deg # noqa: F401
from .tensor.math import deg2rad # noqa: F401
from .tensor.math import gcd # noqa: F401
from .tensor.math import lcm # noqa: F401
from .tensor.math import diff # noqa: F401
from .tensor.math import angle # noqa: F401
......@@ -260,6 +265,7 @@ from .framework.random import set_cuda_rng_state # noqa: F401
from .framework import ParamAttr # noqa: F401
from .framework import create_parameter # noqa: F401
from .framework import CPUPlace # noqa: F401
from .framework import IPUPlace # noqa: F401
from .framework import CUDAPlace # noqa: F401
from .framework import NPUPlace # noqa: F401
from .framework import CUDAPinnedPlace # noqa: F401
......@@ -291,6 +297,7 @@ from .fluid.framework import get_flags # noqa: F401
from .fluid.framework import set_flags # noqa: F401
from .device import is_compiled_with_xpu # noqa: F401
from .device import is_compiled_with_npu # noqa: F401
from .device import is_compiled_with_ipu # noqa: F401
from .device import XPUPlace # noqa: F401
from .fluid.dygraph.base import enable_dygraph as disable_static # noqa: F401
......@@ -478,6 +485,8 @@ __all__ = [ # noqa
'atan2',
'rad2deg',
'deg2rad',
'gcd',
'lcm',
'expand',
'broadcast_to',
'ones_like',
......@@ -553,6 +562,8 @@ __all__ = [ # noqa
'einsum',
'set_flags',
'get_flags',
'as_complex',
'as_real',
'diff',
'angle',
]
......@@ -28,7 +28,9 @@ __all__ = [ # noqa
'set_device',
'get_device',
'XPUPlace',
'IPUPlace',
'is_compiled_with_xpu',
'is_compiled_with_ipu',
'is_compiled_with_cinn',
'is_compiled_with_cuda',
'is_compiled_with_rocm',
......@@ -55,6 +57,36 @@ def is_compiled_with_npu():
return core.is_compiled_with_npu()
def is_compiled_with_ipu():
"""
Whether paddle was built with WITH_IPU=ON to support Graphcore IPU.
Returns (bool): `True` if IPU is supported, otherwise `False`.
Examples:
.. code-block:: python
import paddle
support_ipu = paddle.is_compiled_with_ipu()
"""
return core.is_compiled_with_ipu()
def IPUPlace():
"""
Return a Graphcore IPU Place
Examples:
.. code-block:: python
# required: ipu
import paddle
place = paddle.device.IPUPlace()
"""
return core.IPUPlace()
def is_compiled_with_xpu():
"""
Whether paddle was built with WITH_XPU=ON to support Baidu Kunlun
......@@ -143,13 +175,19 @@ def _convert_to_place(device):
selected_npus = os.getenv("FLAGS_selected_npus", "0").split(",")
device_id = int(selected_npus[0])
place = core.NPUPlace(device_id)
elif lower_device == 'ipu':
if not core.is_compiled_with_ipu():
raise ValueError(
"The device should not be 'ipu', " \
"since PaddlePaddle is not compiled with IPU")
place = core.IPUPlace()
else:
avaliable_gpu_device = re.match(r'gpu:\d+', lower_device)
avaliable_xpu_device = re.match(r'xpu:\d+', lower_device)
avaliable_npu_device = re.match(r'npu:\d+', lower_device)
if not avaliable_gpu_device and not avaliable_xpu_device and not avaliable_npu_device:
raise ValueError(
"The device must be a string which is like 'cpu', 'gpu', 'gpu:x', 'xpu', 'xpu:x', 'npu' or 'npu:x'"
"The device must be a string which is like 'cpu', 'gpu', 'gpu:x', 'xpu', 'xpu:x', 'npu', 'npu:x' or ipu"
)
if avaliable_gpu_device:
if not core.is_compiled_with_cuda():
......@@ -183,13 +221,13 @@ def _convert_to_place(device):
def set_device(device):
"""
Paddle supports running calculations on various types of devices, including CPU, GPU, XPU and NPU.
Paddle supports running calculations on various types of devices, including CPU, GPU, XPU, NPU and IPU.
They are represented by string identifiers. This function can specify the global device
which the OP will run.
Parameters:
device(str): This parameter determines the specific running device.
It can be ``cpu``, ``gpu``, ``xpu``, ``npu``, ``gpu:x``, ``xpu:x`` and ``npu:x``,
It can be ``cpu``, ``gpu``, ``xpu``, ``npu``, ``gpu:x``, ``xpu:x``, ``npu:x`` and ``ipu``,
where ``x`` is the index of the GPUs, XPUs or NPUs.
Examples:
......@@ -236,5 +274,10 @@ def get_device():
elif isinstance(place, core.NPUPlace):
device_id = place.get_device_id()
device = 'npu:' + str(device_id)
elif isinstance(place, core.IPUPlace):
num_devices = core.get_ipu_device_count()
device = "ipus:{{0-{}}}".format(num_devices - 1)
else:
raise ValueError("The device specification {} is invalid".format(place))
return device
......@@ -296,6 +296,83 @@ class DistributedMatmulImpl0(DistributedOperatorImpl):
return False
return True
def is_auto_compatible(self, dist_op):
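# Reject any dims mappings annotated on X, Y and Out that this matmul
# implementation cannot shard consistently.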
op_desc = dist_op.serial_op.desc
op_dist_attr = dist_op.dist_attr
x_name = op_desc.input('X')[0]
y_name = op_desc.input('Y')[0]
out_name = op_desc.output('Out')[0]
out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name)
assert len(x_dims_mapping) >= len(
y_dims_mapping), "now just support x dims > y dims"
if len(x_dims_mapping) == len(y_dims_mapping) and len(
x_dims_mapping) == 4:
if x_dims_mapping[:2] != y_dims_mapping[:2]:
return False
if x_dims_mapping[:2] != out_dims_mapping[:2]:
return False
x_dims_mapping = x_dims_mapping[-2:]
y_dims_mapping = y_dims_mapping[-2:]
out_dims_mapping = out_dims_mapping[-2:]
elif len(x_dims_mapping) != len(y_dims_mapping) and len(
x_dims_mapping) == 3:
if x_dims_mapping[0] != out_dims_mapping[0]:
return False
x_dims_mapping = x_dims_mapping[-2:]
y_dims_mapping = y_dims_mapping[-2:]
out_dims_mapping = out_dims_mapping[-2:]
if is_dim_replicate(out_dims_mapping[-1]):
return False
for mapping in out_dims_mapping[1:-1]:
if is_dim_shard(mapping):
return False
input_dims_mapping = []
ordered_input_shard_dims_mapping = []
for dim in (x_dims_mapping + y_dims_mapping):
input_dims_mapping.append(dim)
for item in input_dims_mapping:
if item not in ordered_input_shard_dims_mapping and item != -1:
ordered_input_shard_dims_mapping.append(item)
for mapping in out_dims_mapping:
if mapping not in input_dims_mapping:
return False
if is_dim_shard(x_dims_mapping[0]):
order_index = 0
for idx, item in enumerate(out_dims_mapping):
if item != -1:
if item != ordered_input_shard_dims_mapping[order_index]:
return False
else:
order_index += 1
if order_index != len(ordered_input_shard_dims_mapping):
return False
if is_dim_shard(x_dims_mapping[-1]):
return False
if is_dim_shard(y_dims_mapping[0]) or is_dim_replicate(y_dims_mapping[
1]):
return False
for mapping in x_dims_mapping[1:-1]:
if is_dim_shard(mapping):
return False
if is_dim_shard(x_dims_mapping[0]):
for mapping in y_dims_mapping[1:]:
if is_dim_shard(mapping) and mapping == x_dims_mapping[0]:
return False
return True
def update_dims_mapping(self, dist_op):
changed = False
dim_changed = _update_dims_mapping_for_matmul(dist_op)
......@@ -510,6 +587,95 @@ class DistributedMatmulImpl1(DistributedOperatorImpl):
return False
return True
def is_auto_compatible(self, dist_op):
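# As above, validate that the annotated dims mappings of X, Y and Out fit
# this implementation's sharding pattern.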
op_desc = dist_op.serial_op.desc
op_dist_attr = dist_op.dist_attr
x_name = op_desc.input('X')[0]
y_name = op_desc.input('Y')[0]
x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name)
if op_desc.attr('transpose_X') or op_desc.attr('transpose_Y'):
return False
out_name = op_desc.output('Out')[0]
out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
# for gpt2, x dims > y dims, this is a temporary solution
assert len(x_dims_mapping) >= len(
y_dims_mapping), "now just support x dims > y dims"
if len(x_dims_mapping) == len(y_dims_mapping) and len(
x_dims_mapping) == 4:
if x_dims_mapping[:2] != y_dims_mapping[:2]:
return False
if x_dims_mapping[:2] != out_dims_mapping[:2]:
return False
x_dims_mapping = x_dims_mapping[-2:]
y_dims_mapping = y_dims_mapping[-2:]
out_dims_mapping = out_dims_mapping[-2:]
elif len(x_dims_mapping) != len(y_dims_mapping) and len(
x_dims_mapping) == 3:
if x_dims_mapping[0] != out_dims_mapping[0]:
return False
x_dims_mapping = x_dims_mapping[-2:]
y_dims_mapping = y_dims_mapping[-2:]
out_dims_mapping = out_dims_mapping[-2:]
if is_dim_shard(out_dims_mapping[-1]):
return False
# Other dimensions must be replicated except the batch dimension
for mapping in out_dims_mapping[1:-1]:
if is_dim_shard(mapping):
return False
if is_dim_replicate(x_dims_mapping[-1]):
return False
if is_dim_replicate(y_dims_mapping[-2]) or is_dim_shard(y_dims_mapping[
-1]):
return False
# Other dimensions must be replicated except the batch dimension
for mapping in x_dims_mapping[1:-1]:
if is_dim_shard(mapping):
return False
x_shard_dim_count = 0
x_shard_dims = []
y_shard_dim_count = 0
y_shard_dims = []
for dim in x_dims_mapping:
if is_dim_shard(dim):
x_shard_dim_count += 1
x_shard_dims.append(dim)
for dim in y_dims_mapping:
if is_dim_shard(dim):
y_shard_dim_count += 1
y_shard_dims.append(dim)
if not x_shard_dims and not y_shard_dims:
return False
if x_shard_dims[-1] != y_shard_dims[0]:
return False
if x_shard_dim_count == y_shard_dim_count:
for dim in out_dims_mapping:
if is_dim_shard(dim):
return False
if x_shard_dims != y_shard_dims:
return False
else:
if x_shard_dim_count < y_shard_dim_count:
return False
output_shard_dims = []
for dim in out_dims_mapping:
if is_dim_shard(dim):
output_shard_dims.append(dim)
if not output_shard_dims or output_shard_dims[0] != x_shard_dims[0]:
return False
return True
def update_dims_mapping(self, dist_op):
changed = False
dim_changed = _update_dims_mapping_for_matmul(dist_op)
......@@ -710,6 +876,59 @@ class DistributedMatmulImpl2(DistributedOperatorImpl):
return True
def is_auto_compatible(self, dist_op):
op_desc = dist_op.serial_op.desc
op_dist_attr = dist_op.dist_attr
x_name = op_desc.input('X')[0]
y_name = op_desc.input('Y')[0]
out_name = op_desc.output('Out')[0]
out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name)
assert len(x_dims_mapping) >= len(
y_dims_mapping
), "now just support x dims > y dims,but x:{0} and y:{1}".format(
x_dims_mapping, y_dims_mapping)
if len(x_dims_mapping) == len(y_dims_mapping) and len(
x_dims_mapping) == 4:
if x_dims_mapping[:2] != y_dims_mapping[:2]:
return False
if x_dims_mapping[:2] != out_dims_mapping[:2]:
return False
x_dims_mapping = x_dims_mapping[-2:]
y_dims_mapping = y_dims_mapping[-2:]
out_dims_mapping = out_dims_mapping[-2:]
elif len(x_dims_mapping) != len(y_dims_mapping) and len(
x_dims_mapping) == 3:
if x_dims_mapping[0] != out_dims_mapping[0]:
return False
x_dims_mapping = x_dims_mapping[-2:]
y_dims_mapping = y_dims_mapping[-2:]
out_dims_mapping = out_dims_mapping[-2:]
if is_dim_shard(out_dims_mapping[-1]):
return False
if is_valid_list_index(out_dims_mapping,
-2) and is_dim_shard(out_dims_mapping[-2]):
return False
if is_dim_shard(x_dims_mapping[-1]):
return False
if is_valid_list_index(x_dims_mapping,
-2) and is_dim_shard(x_dims_mapping[-2]):
return False
if is_dim_shard(y_dims_mapping[-1]):
return False
if is_valid_list_index(y_dims_mapping,
-2) and is_dim_shard(y_dims_mapping[-2]):
return False
return True
def update_dims_mapping(self, dist_op):
changed = False
dim_changed = _update_dims_mapping_for_matmul(dist_op)
......@@ -777,6 +996,86 @@ class DistributedMatmulV2Impl0(DistributedOperatorImpl):
return False
return True
def is_auto_compatible(self, dist_op):
op_desc = dist_op.serial_op.desc
op_dist_attr = dist_op.dist_attr
x_name = op_desc.input('X')[0]
y_name = op_desc.input('Y')[0]
out_name = op_desc.output('Out')[0]
out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name)
if op_desc.attr('trans_x') or op_desc.attr('trans_y'):
return False
assert len(x_dims_mapping) >= len(
y_dims_mapping), "now just support x dims > y dims"
if len(x_dims_mapping) == len(y_dims_mapping) and len(
x_dims_mapping) == 4:
if x_dims_mapping[:2] != y_dims_mapping[:2]:
return False
if x_dims_mapping[:2] != out_dims_mapping[:2]:
return False
x_dims_mapping = x_dims_mapping[-2:]
y_dims_mapping = y_dims_mapping[-2:]
out_dims_mapping = out_dims_mapping[-2:]
elif len(x_dims_mapping) != len(y_dims_mapping) and len(
x_dims_mapping) == 3:
if x_dims_mapping[0] != out_dims_mapping[0]:
return False
x_dims_mapping = x_dims_mapping[-2:]
y_dims_mapping = y_dims_mapping[-2:]
out_dims_mapping = out_dims_mapping[-2:]
if is_dim_replicate(out_dims_mapping[-1]):
return False
for mapping in out_dims_mapping[1:-1]:
if is_dim_shard(mapping):
return False
input_dims_mapping = []
ordered_input_shard_dims_mapping = []
for dim in (x_dims_mapping + y_dims_mapping):
input_dims_mapping.append(dim)
for item in input_dims_mapping:
if item not in ordered_input_shard_dims_mapping and item != -1:
ordered_input_shard_dims_mapping.append(item)
for mapping in out_dims_mapping:
if mapping not in input_dims_mapping:
return False
if is_dim_shard(x_dims_mapping[0]):
order_index = 0
for idx, item in enumerate(out_dims_mapping):
if item != -1:
if item != ordered_input_shard_dims_mapping[order_index]:
return False
else:
order_index += 1
if order_index != len(ordered_input_shard_dims_mapping):
return False
if is_dim_shard(x_dims_mapping[-1]):
return False
if is_dim_shard(y_dims_mapping[0]) or is_dim_replicate(y_dims_mapping[
1]):
return False
for mapping in x_dims_mapping[1:-1]:
if is_dim_shard(mapping):
return False
if is_dim_shard(x_dims_mapping[0]):
for mapping in y_dims_mapping[1:]:
if is_dim_shard(mapping) and mapping == x_dims_mapping[0]:
return False
return True
def update_dims_mapping(self, dist_op):
changed = False
dim_changed = _update_dims_mapping_for_matmul(dist_op)
......@@ -985,6 +1284,94 @@ class DistributedMatmulV2Impl1(DistributedOperatorImpl):
return False
return True
def is_auto_compatible(self, dist_op):
op_desc = dist_op.serial_op.desc
op_dist_attr = dist_op.dist_attr
x_name = op_desc.input('X')[0]
y_name = op_desc.input('Y')[0]
x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name)
if op_desc.attr('trans_x') or op_desc.attr('trans_y'):
return False
out_name = op_desc.output('Out')[0]
out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
assert len(x_dims_mapping) >= len(
y_dims_mapping), "now just support x dims > y dims"
if len(x_dims_mapping) == len(y_dims_mapping) and len(
x_dims_mapping) == 4:
if x_dims_mapping[:2] != y_dims_mapping[:2]:
return False
if x_dims_mapping[:2] != out_dims_mapping[:2]:
return False
x_dims_mapping = x_dims_mapping[-2:]
y_dims_mapping = y_dims_mapping[-2:]
out_dims_mapping = out_dims_mapping[-2:]
elif len(x_dims_mapping) != len(y_dims_mapping) and len(
x_dims_mapping) == 3:
if x_dims_mapping[0] != out_dims_mapping[0]:
return False
x_dims_mapping = x_dims_mapping[-2:]
y_dims_mapping = y_dims_mapping[-2:]
out_dims_mapping = out_dims_mapping[-2:]
if is_dim_shard(out_dims_mapping[-1]):
return False
# Other dimensions must be replicated except the batch dimension
for mapping in out_dims_mapping[1:-1]:
if is_dim_shard(mapping):
return False
if is_dim_replicate(x_dims_mapping[-1]):
return False
if is_dim_replicate(y_dims_mapping[-2]) or is_dim_shard(y_dims_mapping[
-1]):
return False
# Other dimensions must be replicated except the batch dimension
for mapping in x_dims_mapping[1:-1]:
if is_dim_shard(mapping):
return False
x_shard_dim_count = 0
x_shard_dims = []
y_shard_dim_count = 0
y_shard_dims = []
for dim in x_dims_mapping:
if is_dim_shard(dim):
x_shard_dim_count += 1
x_shard_dims.append(dim)
for dim in y_dims_mapping:
if is_dim_shard(dim):
y_shard_dim_count += 1
y_shard_dims.append(dim)
if not x_shard_dims and not y_shard_dims:
return False
if x_shard_dims[-1] != y_shard_dims[0]:
return False
if x_shard_dim_count == y_shard_dim_count:
for dim in out_dims_mapping:
if is_dim_shard(dim):
return False
if x_shard_dims != y_shard_dims:
return False
else:
if x_shard_dim_count < y_shard_dim_count:
return False
output_shard_dims = []
for dim in out_dims_mapping:
if is_dim_shard(dim):
output_shard_dims.append(dim)
if not output_shard_dims or output_shard_dims[0] != x_shard_dims[0]:
return False
return True
def update_dims_mapping(self, dist_op):
changed = False
dim_changed = _update_dims_mapping_for_matmul(dist_op)
......@@ -1183,6 +1570,61 @@ class DistributedMatmulV2Impl2(DistributedOperatorImpl):
return True
def is_auto_compatible(self, dist_op):
op_desc = dist_op.serial_op.desc
op_dist_attr = dist_op.dist_attr
x_name = op_desc.input('X')[0]
y_name = op_desc.input('Y')[0]
out_name = op_desc.output('Out')[0]
out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name)
assert len(x_dims_mapping) >= len(
y_dims_mapping
), "now just support x dims > y dims,but x:{0} and y:{1}".format(
x_dims_mapping, y_dims_mapping)
if len(x_dims_mapping) == len(y_dims_mapping) and len(
x_dims_mapping) == 4:
if x_dims_mapping[:2] != y_dims_mapping[:2]:
return False
if x_dims_mapping[:2] != out_dims_mapping[:2]:
return False
x_dims_mapping = x_dims_mapping[-2:]
y_dims_mapping = y_dims_mapping[-2:]
out_dims_mapping = out_dims_mapping[-2:]
elif len(x_dims_mapping) != len(y_dims_mapping) and len(
x_dims_mapping) == 3:
if x_dims_mapping[0] != out_dims_mapping[0]:
return False
x_dims_mapping = x_dims_mapping[-2:]
y_dims_mapping = y_dims_mapping[-2:]
out_dims_mapping = out_dims_mapping[-2:]
if is_dim_shard(out_dims_mapping[-1]):
return False
if is_valid_list_index(out_dims_mapping,
-2) and is_dim_shard(out_dims_mapping[-2]):
return False
if is_dim_shard(x_dims_mapping[-1]):
return False
if is_valid_list_index(x_dims_mapping,
-2) and is_dim_shard(x_dims_mapping[-2]):
return False
if is_dim_shard(y_dims_mapping[-1]):
return False
if is_valid_list_index(y_dims_mapping,
-2) and is_dim_shard(y_dims_mapping[-2]):
return False
return True
def update_dims_mapping(self, dist_op):
changed = False
dim_changed = _update_dims_mapping_for_matmul(dist_op)
......
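Note for readers of the is_auto_compatible checks above: they reason purely over dims_mapping lists. Below is a minimal, hedged sketch of that convention, assuming, as the helpers referenced here do, that -1 marks a replicated tensor dimension and a non-negative integer names the process-mesh axis along which that dimension is sharded; the concrete mappings are illustrative and not taken from this change.

# Minimal sketch of the dims_mapping convention assumed by is_auto_compatible.
def is_dim_shard(mapping):
    # a dimension is sharded when it is mapped onto some mesh axis
    return mapping != -1

def is_dim_replicate(mapping):
    # -1 means the dimension is replicated across the mesh
    return mapping == -1

# Column-parallel case (the Impl0-style checks): X replicated, Y split on its
# last dim over mesh axis 0, so Out carries the same split on its last dim.
x_dims_mapping = [-1, -1]
y_dims_mapping = [-1, 0]
out_dims_mapping = [-1, 0]
assert is_dim_replicate(x_dims_mapping[-1])
assert is_dim_shard(y_dims_mapping[-1])
assert out_dims_mapping[-1] == y_dims_mapping[-1]

# Row-parallel case (the Impl1-style checks): the contracted dim is split on
# the same mesh axis in X and Y, and Out stays replicated, so an all-reduce
# is needed after the local matmul.
x_dims_mapping = [-1, 0]
y_dims_mapping = [0, -1]
out_dims_mapping = [-1, -1]
assert x_dims_mapping[-1] == y_dims_mapping[0]
assert all(is_dim_replicate(m) for m in out_dims_mapping)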
......@@ -27,11 +27,13 @@ from collections import OrderedDict
import paddle
import paddle.fluid as fluid
from paddle import framework
from paddle.fluid import core
import paddle.distributed as dist
from paddle.optimizer import Optimizer
from paddle.fluid.clip import ClipGradByGlobalNorm
from ...utils.internal_storage import ParamStorage
from ...meta_parallel.sharding.sharding_utils import Type
from ...meta_parallel.sharding.sharding_utils import Type, device_guard, ShardingClipGrad
# CUDA alignment 256 bytes
alignment = {"gpu": 256, }
......@@ -99,16 +101,41 @@ class ShardingOptimizerStage2(Optimizer):
self.broadcast_fp16 = broadcast_fp16
self.param_storages = {} # {dtype: {rank: InternalStorage}}
if isinstance(self._optim._grad_clip, ClipGradByGlobalNorm):
logging.warning(
"While using ClipGradByGlobalNorm in ShardingOptimizer, the grad clip of original optimizer will be changed."
)
self._optim._grad_clip = ShardingClipGrad(self._optim._grad_clip,
group,
paddle.get_device())
if offload:
assert self._pfp16, "The offload strategy is only supported with the \'Adam\', \'AdamW\' and \'Momentum\' optimizers under AMP/Pure FP16"
self.offload = offload  # whether to offload the optimizer state to CPU
self.offload_device = "cpu"
self._master_params = {}
# Update the optimizer's parameters, and adjust parameter storage and usage according to rank.
self.update_opt_status()
def _generate_master_params(self, trainable_params):
for param in trainable_params:
if param.dtype == Type.fp16.value:
self._optim._master_weights[param.name] = paddle.cast(
param, Type.fp32.value)
if self.offload:
for param in trainable_params:
if param.name not in self._master_params.keys():
self._master_params[param.name] = core.VarBase(
name=param.name,
value=param.cast(dtype=Type.fp32.value).numpy(),
place=core.CPUPlace(),
stop_gradient=param.stop_gradient)
self._optim._master_weights = self._master_params
else:
for param in trainable_params:
if param.dtype == Type.fp16.value:
self._optim._master_weights[param.name] = paddle.cast(
param, Type.fp32.value)
def update_opt_status(self):
"""Update optimizer status and parameter storage information, and special functions to be developed.
......@@ -243,22 +270,43 @@ class ShardingOptimizerStage2(Optimizer):
A wrapper for Optimizer's step function to finish the update operation of the optimizer.
"""
# Synchronize optimizer parameters for the current rank
if len(self.dtype_rank_params.keys(
)) == 1 and Type.fp32.value in self.dtype_rank_params.keys():
self._optim._parameter_list = self.dtype_rank_params[
Type.fp32.value][self.rank]
elif len(self.dtype_rank_params.keys(
)) == 1 and Type.fp16.value in self.dtype_rank_params.keys():
self._optim._parameter_list = self.dtype_rank_params[
Type.fp16.value][self.rank]
if self.offload:
self._optim._parameter_list = [
param for name, param in self._master_params.items()
]
else:
self._optim._parameter_list = self.dtype_rank_params[
Type.fp16.value][self.rank] + self.dtype_rank_params[
# Synchronize optimizer parameters for the current rank
if len(self.dtype_rank_params.keys(
)) == 1 and Type.fp32.value in self.dtype_rank_params.keys():
self._optim._parameter_list = self.dtype_rank_params[
Type.fp32.value][self.rank]
elif len(self.dtype_rank_params.keys(
)) == 1 and Type.fp16.value in self.dtype_rank_params.keys():
self._optim._parameter_list = self.dtype_rank_params[
Type.fp16.value][self.rank]
else:
self._optim._parameter_list = self.dtype_rank_params[
Type.fp16.value][self.rank] + self.dtype_rank_params[
Type.fp32.value][self.rank]
# Run the optimizer of the current rank step
self._optim.step()
if self.offload:
with device_guard(self.rank, self.offload_device):
self._optim.step()
for param in self._optim._parameter_list:
self._master_params[param.name].set_value(param)
dev_id = 0 if paddle.get_device() == "cpu" else int(
paddle.get_device().split(":")[1])
for param in self._local_params:
if param.name in self._master_params.keys():
param.set_value(self._master_params[param.name].cuda(dev_id)
.cast(dtype=param.dtype))
self._master_params[param.name].clear_gradient(False)
else:
self._optim.step()
# Synchronize all the updated shards in between the ranks
self._broadcast_params()
......
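The offload branch above keeps an fp32 master copy of every fp16 parameter in host memory, runs the optimizer step there, and then writes the result back into the device parameter. Below is a minimal, hypothetical sketch of that round trip using the same VarBase/cast/set_value calls as the code above; the tensor name, shape and dev_id are illustrative, and a CUDA build with one visible GPU is assumed.

import paddle
from paddle.fluid import core

param = paddle.ones([4], dtype='float16')  # device-side fp16 parameter (illustrative)

# fp32 master copy kept in host memory while offload is enabled
master = core.VarBase(
    name='master_w',
    value=param.cast(dtype=paddle.float32).numpy(),
    place=core.CPUPlace(),
    stop_gradient=False)

# ... the CPU optimizer step updates `master` in place here ...

# write the updated master weight back into the fp16 parameter on the device
dev_id = 0  # illustrative device id
param.set_value(master.cuda(dev_id).cast(dtype=param.dtype))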
......@@ -112,6 +112,18 @@ class ShardingStage2(nn.Layer):
self._has_grad_storage = []
self._grad_storage_list = []
# offload
# TODO(haohongxiang): The offload strategy is not yet supported with multiple optimizers
self._offload_optims = list(
filter(lambda optim: optim.offload, self._sharding_optimizers))
if len(self._offload_optims) > 0:
assert len(
self._sharding_optimizers
) == 1, "Only support offload strategy for single optimizer"
self._offload = self._sharding_optimizers[0].offload
self._offload_device = "cpu"
# Set backward pass hooks
self._bw_hooks = []
......@@ -156,7 +168,8 @@ class ShardingStage2(nn.Layer):
# Release grad storages
for dtype in self._grad_storages.keys():
if self._rank in self._grad_storages[dtype].keys():
self._grad_storages[dtype][self._rank].buffer.zero_()
if not self._offload:
self._grad_storages[dtype][self._rank].buffer.zero_()
# Release params
for param in self._trainable_params:
......@@ -167,17 +180,24 @@ class ShardingStage2(nn.Layer):
"""
Before the gradient accumulation, scale the gradient.
"""
# Scale grad storages
for dtype in self._grad_storages.keys():
if self._rank in self._grad_storages[dtype].keys():
self._grad_storages[dtype][self._rank].buffer.scale_(
scale=self._world_size_scaling)
# Scale params
for param in self._trainable_params:
if param.name in self._param_grads and param.grad is not None:
param.grad.scale_(scale=self._world_size_scaling)
param._reset_grad_inplace_version(True)
if self._offload:
for param in self._trainable_params:
if param.name in self._sharding_optimizers[
0]._master_params.keys():
self._sharding_optimizers[0]._master_params[
param.name].grad.scale_(scale=self._world_size_scaling)
else:
# Scale grad storages
for dtype in self._grad_storages.keys():
if self._rank in self._grad_storages[dtype].keys():
self._grad_storages[dtype][self._rank].buffer.scale_(
scale=self._world_size_scaling)
# Scale params
for param in self._trainable_params:
if param.name in self._param_grads and param.grad is not None:
param.grad.scale_(scale=self._world_size_scaling)
param._reset_grad_inplace_version(True)
def _init_internal_storage(self, needs_fresh):
"""
......@@ -195,8 +215,14 @@ class ShardingStage2(nn.Layer):
"""
Synchronously or asynchronously convert the data type of the layer; changing the device is not supported yet.
"""
assert isinstance(device, str), "Device must be type str"
assert device == self._default_device, "New devices are not supported, because the optimizer state is not synchronized"
self._layer.to(device=device, dtype=dtype, blocking=blocking)
# Re-build the buckets, hooks, etc.
self._fresh_trainable()
def _fresh_trainable(self):
""" Whether to update training parameters. """
......@@ -283,12 +309,17 @@ class ShardingStage2(nn.Layer):
self._grad_reduced[index] = False
if not self._accumulate_grads:
param.grad.scale_(scale=self._world_size_scaling)
param._reset_grad_inplace_version(True)
param._reset_grad_inplace_version(True)
# Clear the gradient that does not belong to the current rank through the callback function
def cleanup():
if dst_rank != self._rank:
param.clear_gradient(False)
elif self._offload:
self._sharding_optimizers[0]._master_params[
param.name]._copy_gradient_from(param.grad.cpu(
).cast(dtype=Type.fp32.value))
param.clear_gradient(False)
# Synchronize the reduce of the parameter gradient
self._tasks_flow.append(
......@@ -339,6 +370,15 @@ class ShardingStage2(nn.Layer):
grad_storage.buffer.value().get_tensor()._clear(
)
elif self._offload:
grad_storage.to(device=self._offload_device)
for param in grad_storage._params:
self._sharding_optimizers[0]._master_params[
param.name]._copy_gradient_from(
param.grad.cast(
dtype=Type.fp32.value))
grad_storage.buffer.value().get_tensor()._clear(
)
# Reduce the bucket
grad_storage.sent = True
......@@ -478,7 +518,7 @@ class ShardingStage2(nn.Layer):
# Rebuild fp16/fp32 grad storages
for dtype in self._grad_storages.keys():
for dst_rank, grad_storage in self._grad_storages[dtype].items():
if dst_rank != self._rank:
if self._offload or dst_rank != self._rank:
grad_storage.manumal_relase()
grad_storage.rebuild()
......
......@@ -17,10 +17,17 @@ import contextlib
from collections import abc
from enum import Enum
from math import inf
import numpy as np
from types import MethodType
import paddle
import paddle.distributed as dist
from paddle import _C_ops
from paddle.fluid import core
from paddle.fluid import layers
from paddle.fluid.dygraph import to_variable
from paddle.fluid.framework import dygraph_only
from paddle.fluid.dygraph import base as imperative_base
class Taskflow:
......@@ -41,6 +48,88 @@ class Type(Enum):
fp32 = paddle.float32
class ShardingClipGrad:
def __init__(self, clip, group, device):
self._clip = clip
self._group = group
self._device = device
@imperative_base.no_grad
def _dygraph_clip(self, params_grads):
params_and_grads = []
sum_square_fp16 = []
sum_square_fp32 = []
for p, g in params_grads:
if g is None or getattr(p, 'need_clip', True) is False:
continue
merge_grad = g
if g.type == core.VarDesc.VarType.SELECTED_ROWS:
merge_grad = layers.get_tensor_from_selected_rows(
layers.merge_selected_rows(g))
square = layers.square(merge_grad)
sum_square = layers.reduce_sum(square)
if p.dtype == paddle.float16:
sum_square_fp16.append(sum_square)
elif p.dtype == paddle.float32:
sum_square_fp32.append(sum_square)
# global norm of non-distributed FP16 params_and_grads
if len(sum_square_fp16) == 0:
global_norm_fp16 = paddle.to_tensor([0.], dtype=paddle.float32)
else:
global_norm_fp16 = layers.concat(sum_square_fp16)
global_norm_fp16 = layers.reduce_sum(global_norm_fp16)
global_norm_fp16 = paddle.cast(
global_norm_fp16, dtype=paddle.float32)
# global norm of non-distributed FP32 params_and_grads
global_norm_fp32 = layers.concat(sum_square_fp32) if len(
sum_square_fp32) != 0 else paddle.to_tensor(
[0.], dtype=paddle.float32)
global_norm_fp32 = layers.reduce_sum(global_norm_fp32)
global_norm_var = global_norm_fp16 + global_norm_fp32
# add all reduce to get global norm of distributed params_and_grads
dev_id = int(self._device.split(":")[1])
with device_guard(dev_id, "gpu"):
paddle.distributed.all_reduce(global_norm_var, group=self._group)
global_norm_var = layers.sqrt(global_norm_var)
max_global_norm = layers.fill_constant(
shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm)
clip_var = layers.elementwise_div(
x=max_global_norm,
y=layers.elementwise_max(
x=global_norm_var, y=max_global_norm))
clip_var_fp16 = paddle.cast(clip_var, paddle.float16)
for p, g in params_grads:
if g is None:
continue
if getattr(p, 'need_clip', True) is False:
params_and_grads.append((p, g))
continue
if p.dtype == paddle.float16:
new_grad = layers.elementwise_mul(x=g, y=clip_var_fp16)
else:
new_grad = layers.elementwise_mul(x=g, y=clip_var)
params_and_grads.append((p, new_grad))
return params_and_grads
def __getattr__(self, item):
return getattr(self._clip, item)
def __call__(self, params_grads):
return self._dygraph_clip(params_grads)
@contextlib.contextmanager
def device_guard(dev_id, device="cpu"):
origin_device = paddle.device.get_device()
......@@ -52,3 +141,65 @@ def device_guard(dev_id, device="cpu"):
yield
finally:
paddle.set_device(origin_device)
@dygraph_only
def ShardingScaler(scaler, sharding_group):
def unscale_method(self, optimizer):
if not self._enable:
return
param_grads = []
param_grads_fp16 = []
param_grads_fp32 = []
if getattr(optimizer, '_param_groups', None) and isinstance(
optimizer._param_groups[0], dict):
for group in optimizer._param_groups:
for param in group['params']:
if param._grad_ivar() is not None:
param_grads.append(param._grad_ivar())
if param._grad_ivar(
).dtype == core.VarDesc.VarType.FP16:
param_grads_fp16.append(param._grad_ivar())
else:
param_grads_fp32.append(param._grad_ivar())
else:
param_grads = [
param._grad_ivar() for param in optimizer._parameter_list
if param._grad_ivar() is not None
]
param_grads_fp16 = [
param._grad_ivar() for param in optimizer._parameter_list
if (param._grad_ivar() is not None
) and (param._grad_ivar().dtype == core.VarDesc.VarType.FP16
)
]
param_grads_fp32 = [
param._grad_ivar() for param in optimizer._parameter_list
if (param._grad_ivar() is not None
) and (param._grad_ivar().dtype == core.VarDesc.VarType.FP32
)
]
temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool))
temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool))
if len(param_grads_fp16):
_C_ops.check_finite_and_unscale(param_grads_fp16, self._scale,
param_grads_fp16,
temp_found_inf_fp16)
if len(param_grads_fp32):
_C_ops.check_finite_and_unscale(param_grads_fp32, self._scale,
param_grads_fp32,
temp_found_inf_fp32)
self._found_inf = 1 if temp_found_inf_fp16 or temp_found_inf_fp32 else 0
is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32")
paddle.distributed.all_reduce(
is_found_inf,
op=paddle.distributed.ReduceOp.MAX,
group=sharding_group)
self._found_inf = is_found_inf.numpy()[0]
scaler._unscale = MethodType(unscale_method, scaler)
return scaler
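A short, hedged usage sketch of the device_guard helper defined above, which both ShardingClipGrad and the offload path rely on; the import path is the one used elsewhere in this change, and the tensor and device id are illustrative.

import paddle
from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import device_guard

# Ops created inside the context run on the requested device; the previous
# default device is restored on exit.
with device_guard(0, "cpu"):
    host_tensor = paddle.zeros([8], dtype='float32')

# For reference, ShardingClipGrad above rescales every gradient by
#     clip_norm / max(global_norm, clip_norm)
# e.g. clip_norm = 1.0 with global_norm = 4.0 scales each gradient by 0.25.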
......@@ -50,6 +50,29 @@ class InternalStorage:
else:
self.buffer = paddle.zeros(size, dtype=dtype)
def to(self, device, dtype=None, keep_alignment=True):
"""
Move the underlying buffer
"""
assert self.buffer is not None, "Cannot move a collapsed bucket, please rebuild it"
# dtype=None means "no cast"; otherwise only fp32/fp16 buffers are supported
assert (dtype is None or dtype == Type.fp32.value or
dtype == Type.fp16.value), "Conversion type is not supported now"
dev_id = 0 if paddle.get_device() == "cpu" else int(paddle.get_device()
.split(":")[1])
if self._device != device:
tmp_buffer = self.buffer.cuda(
dev_id) if device == "gpu" else self.buffer.cpu()
for param in self._params:
param.clear_gradient(False)
param._gradient_set_empty(False)
self.buffer.value().get_tensor()._clear()
self.buffer = tmp_buffer
if dtype is not None:
self.buffer = self.buffer.cast(dtype=dtype)
class ParamStorage(InternalStorage):
"""
......@@ -60,6 +83,16 @@ class ParamStorage(InternalStorage):
super().__init__(size, dtype, device, convert_cpu=True)
self.param2align = None
def to(self, device, dtype=None, keep_alignment=True):
"""
Move the underlying buffer
"""
super().to(device, dtype)
if keep_alignment:
self._array_params()
@fluid.dygraph.no_grad
def add_rank_params(self, trainable_params, param2align):
"""
......@@ -78,7 +111,7 @@ class ParamStorage(InternalStorage):
p_shape = self._add_param_as_view(param, param2align[param.name])
cpu_param_shape.append(p_shape)
# buffer covert from cpu to cuda
# buffer convert from cpu to cuda
dev_id = int(paddle.get_device().split(":")[1])
self.buffer = self.buffer.cuda(dev_id)
self._fill = 0
......@@ -109,7 +142,8 @@ class ParamStorage(InternalStorage):
param.stop_gradient = origin_state
# Copy the current param value
dev_id = int(paddle.get_device().split(":")[1])
dev_id = 0 if paddle.get_device() == "cpu" else int(paddle.get_device()
.split(":")[1])
with device_guard(dev_id, "cpu"):
tmp_var = core.VarBase(tensor=self.buffer._slice(self._fill,
var_end))
......@@ -134,6 +168,18 @@ class ParamStorage(InternalStorage):
self._fill = offset
@fluid.dygraph.no_grad
def _array_params(self):
"""
Given the parameters which have been registered previously, rebuild the whole InternalStorage.
"""
assert len(self._params) > 0
assert self.param2align is not None
self._fill = 0
for p in self._params:
self._convert_buffer(p, p.shape, self.param2align[p.name])
class GradStorage(InternalStorage):
"""
......@@ -171,6 +217,18 @@ class GradStorage(InternalStorage):
param.shape) + align <= self._max_size and id(
param) not in self._param_ids
def to(self, device, dtype=None, keep_alignment=True):
"""
Move the underlying buffer
"""
if self._release:
self.rebuild()
super().to(device, dtype)
if keep_alignment:
self._array_grads()
@fluid.dygraph.no_grad
def add_grad(self, param, align):
"""
......@@ -206,17 +264,25 @@ class GradStorage(InternalStorage):
"""
Given the parameter gradients which have been registered previously, rebuild the whole InternalStorage.
"""
assert len(self._params) > 0
if self._release:
self.buffer = paddle.zeros(
[self._max_size], dtype=self._params[0].dtype)
self.buffer = paddle.zeros([self._max_size], dtype=self._dtype)
for p in self._params:
self._add_grad_as_view(p, self._parm2align[p.name])
self._release = False
@fluid.dygraph.no_grad
def _array_grads(self):
"""
Given the parameter gradients which have been registered previously, rebuild the whole InternalStorage.
"""
if len(self._params) > 0:
self._fill = 0
for p in self._params:
self._add_grad_as_view(p, self._parm2align[p.name])
@fluid.dygraph.no_grad
def _add_grad_as_view(self, param, align):
assert np.prod(
......@@ -229,8 +295,17 @@ class GradStorage(InternalStorage):
assert offset <= np.prod(self.buffer.shape)
# Copy the current grad value to InternalStorage
assert self._device == "gpu"
tmp_var = core.VarBase(self.buffer._slice(self._fill, grad_end))
param._copy_gradient_from(tmp_var)
tmp_var.value().get_tensor()._clear()
dev_id = 0 if paddle.get_device() == "cpu" else int(paddle.get_device()
.split(":")[1])
if self._device == "cpu":
with device_guard(dev_id, self._device):
tmp_var = core.VarBase(self.buffer._slice(self._fill, grad_end))
param._copy_gradient_from(tmp_var)
tmp_var.value().get_tensor()._clear()
elif self._device == "gpu":
tmp_var = core.VarBase(self.buffer._slice(self._fill, grad_end))
param._copy_gradient_from(tmp_var)
tmp_var.value().get_tensor()._clear()
self._fill = offset
......@@ -305,7 +305,8 @@ class Uniform(Distribution):
else:
output_shape = shape + batch_shape
output = nn.uniform_random(
output_shape, seed=seed, dtype=self.dtype) * (tensor.zeros(
output_shape, dtype=self.dtype, min=0., max=1.,
seed=seed) * (tensor.zeros(
output_shape, dtype=self.dtype) + (self.high - self.low))
output = elementwise_add(output, self.low, name=name)
if self.all_arg_is_float:
......
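For reference, the rewritten sample above is the usual affine transform of a unit uniform draw; a tiny NumPy sketch of the same formula, with illustrative bounds:

import numpy as np

low, high, shape = -2.0, 3.0, (4,)
u = np.random.uniform(0.0, 1.0, shape)  # U(0, 1) draw
sample = low + u * (high - low)         # distributed as U(low, high)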
......@@ -71,7 +71,7 @@ from . import distribute_lookup_table
from .param_attr import ParamAttr, WeightNormParamAttr
from .data_feeder import DataFeeder
from .core import LoDTensor, LoDTensorArray, Scope, _Scope
from .core import CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, NPUPlace
from .core import CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, NPUPlace, IPUPlace
from .incubate import fleet
from .transpiler import DistributeTranspiler, \
memory_optimize, release_memory, DistributeTranspilerConfig
......@@ -132,6 +132,7 @@ __all__ = framework.__all__ + executor.__all__ + \
'CUDAPlace',
'CUDAPinnedPlace',
'NPUPlace',
'IPUPlace',
'Tensor',
'ParamAttr',
'WeightNormParamAttr',
......@@ -197,6 +198,11 @@ def __bootstrap__():
if os.name == 'nt':
remove_flag_if_exists('cpu_deterministic')
if core.is_compiled_with_ipu():
# Currently we request all available IPUs for training and testing;
# finer-grained control of IPU pods will be added later.
read_env_flags += []
core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)])
# Note(zhouwei25): sys may not have argv in some cases,
# Such as: use Python/C API to call Python from C++
......
......@@ -484,7 +484,7 @@ class ImperativeQuantizeOutputs(object):
model_filename=model_filename,
params_filename=params_filename))
self._gather_scales(infer_program, scope)
self._gather_scales(infer_program, scope, fetch_targets)
self._set_skip_quant_attr(infer_program)
......@@ -520,10 +520,10 @@ class ImperativeQuantizeOutputs(object):
return flag
def _gather_scales(self, program, scope):
def _gather_scales(self, program, scope, fetch_targets):
"""
Get all scales from fake ops, save them into the corresponding ops
and delete all moving_average_abs_max_scale ops.
and delete all moving_average_abs_max_scale ops.
"""
def _gather_input_scale():
......@@ -580,6 +580,11 @@ class ImperativeQuantizeOutputs(object):
for next_op in next_ops:
next_op._rename_input(out_var_name, in_var_name)
# If next_op is `fetch` and out_var_name is in fetch_targets,
# the corresponding fetch target must be updated to in_var_name when the input is renamed.
for i in range(len(fetch_targets)):
if fetch_targets[i].name == out_var_name:
fetch_targets[i] = block.var(in_var_name)
_gather_input_scale()
_gather_output_scale()
......
......@@ -410,6 +410,23 @@ class PostTrainingQuantization(object):
for op_type in self._dynamic_quantize_op_type):
self._collect_dynamic_quantize_op_threshold(
self._dynamic_quantize_op_type)
# Move the sub-blocks' persistable vars to the global block
global_block = self._program.global_block()
for _op in global_block.ops:
if _op.type == "while":
_block_id = _op.attr("sub_block").id
_block = self._program.block(_block_id)
persistables = []
for _name, _var in _block.vars.items():
if _var.persistable:
global_block._clone_variable(_var)
persistables.append(_name)
for _name in persistables:
_block._remove_var(_name)
persistables.extend(_op.input('X'))
_op.desc.set_input("X", persistables)
return self._program
def save_quantized_model(self,
......@@ -451,10 +468,6 @@ class PostTrainingQuantization(object):
model_filename=self._model_filename,
params_filename=self._params_filename)
if self._program.num_blocks > 1:
_logger.error("The post training quantization requires that the "
"program only has one block.")
if self._optimize_model:
self._optimize_fp32_model()
......@@ -505,23 +518,26 @@ class PostTrainingQuantization(object):
self._quantized_act_var_name.add(var_name)
persistable_var_names = _all_persistable_var_names(self._program)
for op in self._program.global_block().ops:
op_type = op.type
if self._is_full_quantize and \
op_type not in self._quantizable_op_type:
_logger.warning(op_type + " is not supported for quantization.")
# For quantized ops, sample inputs and outputs
if op_type in self._quantizable_op_type:
collect_var_name(
_get_op_input_var_names(op), persistable_var_names, op_type)
collect_var_name(
_get_op_output_var_names(op), persistable_var_names,
op_type)
# For other op, only sample output scale
elif op_type in self._out_scale_op_list:
collect_var_name(
_get_op_output_var_names(op), persistable_var_names,
op_type)
for block_id in range(len(self._program.blocks)):
for op in self._program.blocks[block_id].ops:
op_type = op.type
if self._is_full_quantize and \
op_type not in self._quantizable_op_type:
_logger.warning(op_type +
" is not supported for quantization.")
# For quantized ops, sample inputs and outputs
if op_type in self._quantizable_op_type:
collect_var_name(
_get_op_input_var_names(op), persistable_var_names,
op_type)
collect_var_name(
_get_op_output_var_names(op), persistable_var_names,
op_type)
# For other op, only sample output scale
elif op_type in self._out_scale_op_list:
collect_var_name(
_get_op_output_var_names(op), persistable_var_names,
op_type)
def _set_activation_persistable(self):
'''
......@@ -696,16 +712,17 @@ class PostTrainingQuantization(object):
'''
assert self._algo == "min_max", \
"The algo should be min_max to save input threshold."
for op in self._program.global_block().ops:
if op.type in self._quantizable_op_type:
for var_name in _get_op_input_var_names(op):
assert var_name in self._quantized_var_min
assert var_name in self._quantized_var_max
op._set_attr(var_name + ".min",
self._quantized_var_min[var_name])
op._set_attr(var_name + ".max",
self._quantized_var_max[var_name])
op._set_attr("with_quant_attr", True)
for block_id in range(len(self._program.blocks)):
for op in self._program.blocks[block_id].ops:
if op.type in self._quantizable_op_type:
for var_name in _get_op_input_var_names(op):
assert var_name in self._quantized_var_min
assert var_name in self._quantized_var_max
op._set_attr(var_name + ".min",
self._quantized_var_min[var_name])
op._set_attr(var_name + ".max",
self._quantized_var_max[var_name])
op._set_attr("with_quant_attr", True)
def _collect_activation_abs_min_max(self):
'''
......@@ -795,7 +812,12 @@ class PostTrainingQuantization(object):
activation_quantize_type=self._activation_quantize_type,
weight_quantize_type=self._weight_quantize_type,
quantizable_op_type=major_quantizable_op_types)
transform_pass.apply(graph)
for sub_graph in graph.all_sub_graphs():
# fake_quant/fake_dequantize ops must be inserted into a test graph,
# so set each sub-graph's _for_test to True.
sub_graph._for_test = True
transform_pass.apply(sub_graph)
# use AddQuantDequantPass to insert fake_quant_dequant op
minor_quantizable_op_types = []
......@@ -806,7 +828,10 @@ class PostTrainingQuantization(object):
scope=self._scope,
place=self._place,
quantizable_op_type=minor_quantizable_op_types)
add_quant_dequant_pass.apply(graph)
for sub_graph in graph.all_sub_graphs():
sub_graph._for_test = True
add_quant_dequant_pass.apply(sub_graph)
# save threshold to scale var node
if self._algo in ["KL", "hist"]:
......@@ -836,7 +861,11 @@ class PostTrainingQuantization(object):
activation_bits=self._activation_bits,
weight_quantize_type=self._weight_quantize_type,
quantizable_op_type=major_quantizable_op_types)
freeze_pass.apply(graph)
for sub_graph in graph.all_sub_graphs():
sub_graph._for_test = True
freeze_pass.apply(sub_graph)
self._program = graph.to_program()
def _save_output_threshold(self):
......@@ -888,13 +917,15 @@ class PostTrainingQuantization(object):
save_info(op_node, out_var_name, self._quantized_var_max,
"out_max", "post_min_max")
for op in self._program.global_block().ops:
if op.type in (self._quantizable_op_type + self._out_scale_op_list):
out_var_names = _get_op_output_var_names(op)
assert len(out_var_names) == 1, "Post training " + \
"quantization only support one output for " + op.type
for var_name in out_var_names:
analysis_and_save_info(op, var_name)
for block_id in range(len(self._program.blocks)):
for op in self._program.blocks[block_id].ops:
if op.type in (
self._quantizable_op_type + self._out_scale_op_list):
out_var_names = _get_op_output_var_names(op)
assert len(out_var_names) == 1, "Post training " + \
"quantization only support one output for " + op.type
for var_name in out_var_names:
analysis_and_save_info(op, var_name)
def _collect_dynamic_quantize_op_threshold(self, target_ops_type):
"""
......
......@@ -139,6 +139,7 @@ endfunction()
if(WIN32)
list(REMOVE_ITEM TEST_OPS test_light_nas)
list(REMOVE_ITEM TEST_OPS test_post_training_quantization_mnist)
list(REMOVE_ITEM TEST_OPS test_post_training_quantization_while)
list(REMOVE_ITEM TEST_OPS test_post_training_quantization_mobilenetv1)
list(REMOVE_ITEM TEST_OPS test_post_training_quantization_resnet50)
list(REMOVE_ITEM TEST_OPS test_post_training_quantization_lstm_model)
......@@ -336,6 +337,7 @@ if(NOT WIN32)
set_tests_properties(test_post_training_quantization_mobilenetv1 PROPERTIES TIMEOUT 600 LABELS "RUN_TYPE=NIGHTLY")
set_tests_properties(test_post_training_quantization_resnet50 PROPERTIES TIMEOUT 600 LABELS "RUN_TYPE=NIGHTLY")
set_tests_properties(test_post_training_quantization_mnist PROPERTIES TIMEOUT 120)
set_tests_properties(test_post_training_quantization_while PROPERTIES TIMEOUT 120)
set_tests_properties(test_imperative_ptq PROPERTIES TIMEOUT 120)
set_tests_properties(test_weight_quantization_mobilenetv1 PROPERTIES TIMEOUT 120)
endif()
......
# copyright (c) 2021 paddlepaddle authors. all rights reserved.
#
# licensed under the apache license, version 2.0 (the "license");
# you may not use this file except in compliance with the license.
# you may obtain a copy of the license at
#
# http://www.apache.org/licenses/license-2.0
#
# unless required by applicable law or agreed to in writing, software
# distributed under the license is distributed on an "as is" basis,
# without warranties or conditions of any kind, either express or implied.
# see the license for the specific language governing permissions and
# limitations under the license.
import unittest
import os
import time
import sys
import random
import math
import functools
import contextlib
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.dataset.common import download
from paddle.fluid.contrib.slim.quantization import PostTrainingQuantization
paddle.enable_static()
random.seed(0)
np.random.seed(0)
class TestPostTrainingQuantization(unittest.TestCase):
def setUp(self):
self.download_path = 'int8/download'
self.cache_folder = os.path.expanduser('~/.cache/paddle/dataset/' +
self.download_path)
self.timestamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime())
self.int8_model_path = os.path.join(os.getcwd(),
"post_training_" + self.timestamp)
try:
os.system("mkdir -p " + self.int8_model_path)
except Exception as e:
print("Failed to create {} due to {}".format(self.int8_model_path,
str(e)))
sys.exit(-1)
def tearDown(self):
try:
os.system("rm -rf {}".format(self.int8_model_path))
except Exception as e:
print("Failed to delete {} due to {}".format(self.int8_model_path,
str(e)))
def cache_unzipping(self, target_folder, zip_path):
cmd = 'tar xf {0} -C {1}'.format(zip_path, target_folder)
os.system(cmd)
def download_model(self, data_url, data_md5, folder_name):
download(data_url, self.download_path, data_md5)
file_name = data_url.split('/')[-1]
zip_path = os.path.join(self.cache_folder, file_name)
print('Data is downloaded at {0}'.format(zip_path))
data_cache_folder = os.path.join(self.cache_folder, folder_name)
self.cache_unzipping(self.cache_folder, zip_path)
return data_cache_folder
def run_program(self, model_path, batch_size, infer_iterations):
print("test model path:" + model_path)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
[infer_program, feed_dict, fetch_targets] = \
fluid.io.load_inference_model(model_path,
model_filename='model.pdmodel',
params_filename='model.pdiparams', executor=exe)
val_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size)
img_shape = [1, 28, 28]
test_info = []
cnt = 0
periods = []
for batch_id, data in enumerate(val_reader()):
image = np.array(
[x[0].reshape(img_shape) for x in data]).astype("float32")
input_label = np.array([x[1] for x in data]).astype("int64")
t1 = time.time()
out = exe.run(infer_program,
feed={feed_dict[0]: image},
fetch_list=fetch_targets)
t2 = time.time()
period = t2 - t1
periods.append(period)
out_label = np.argmax(np.array(out[0]), axis=1)
top1_num = sum(input_label == out_label)
test_info.append(top1_num)
cnt += len(data)
if (batch_id + 1) == infer_iterations:
break
throughput = cnt / np.sum(periods)
latency = np.average(periods)
acc1 = np.sum(test_info) / cnt
return (throughput, latency, acc1)
def generate_quantized_model(self,
model_path,
algo="KL",
quantizable_op_type=["conv2d"],
is_full_quantize=False,
is_use_cache_file=False,
is_optimize_model=False,
batch_size=10,
batch_nums=10):
place = fluid.CPUPlace()
exe = fluid.Executor(place)
scope = fluid.global_scope()
val_reader = paddle.dataset.mnist.train()
ptq = PostTrainingQuantization(
executor=exe,
model_dir=model_path,
model_filename='model.pdmodel',
params_filename='model.pdiparams',
sample_generator=val_reader,
batch_size=batch_size,
batch_nums=batch_nums,
algo=algo,
quantizable_op_type=quantizable_op_type,
is_full_quantize=is_full_quantize,
optimize_model=is_optimize_model,
is_use_cache_file=is_use_cache_file)
ptq.quantize()
ptq.save_quantized_model(
self.int8_model_path,
model_filename='model.pdmodel',
params_filename='model.pdiparams')
def run_test(self,
model_name,
data_url,
data_md5,
algo,
quantizable_op_type,
is_full_quantize,
is_use_cache_file,
is_optimize_model,
diff_threshold,
batch_size=10,
infer_iterations=10,
quant_iterations=5):
origin_model_path = self.download_model(data_url, data_md5, model_name)
#origin_model_path = os.path.join(origin_model_path, model_name)
print("Start FP32 inference for {0} on {1} images ...".format(
model_name, infer_iterations * batch_size))
(fp32_throughput, fp32_latency, fp32_acc1) = self.run_program(
origin_model_path, batch_size, infer_iterations)
print("Start INT8 post training quantization for {0} on {1} images ...".
format(model_name, quant_iterations * batch_size))
self.generate_quantized_model(
origin_model_path, algo, quantizable_op_type, is_full_quantize,
is_use_cache_file, is_optimize_model, batch_size, quant_iterations)
print("Start INT8 inference for {0} on {1} images ...".format(
model_name, infer_iterations * batch_size))
(int8_throughput, int8_latency, int8_acc1) = self.run_program(
self.int8_model_path, batch_size, infer_iterations)
print("---Post training quantization of {} method---".format(algo))
print(
"FP32 {0}: batch_size {1}, throughput {2} img/s, latency {3} s, acc1 {4}.".
format(model_name, batch_size, fp32_throughput, fp32_latency,
fp32_acc1))
print(
"INT8 {0}: batch_size {1}, throughput {2} img/s, latency {3} s, acc1 {4}.\n".
format(model_name, batch_size, int8_throughput, int8_latency,
int8_acc1))
sys.stdout.flush()
delta_value = fp32_acc1 - int8_acc1
self.assertLess(delta_value, diff_threshold)
class TestPostTrainingKLForWhile(TestPostTrainingQuantization):
def test_post_training_kl(self):
model_name = "mnist_while"
data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz"
data_md5 = "2387390beeb37b51dec041c27b8a681f"
algo = "KL"
quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"]
is_full_quantize = False
is_use_cache_file = False
is_optimize_model = True
diff_threshold = 0.01
batch_size = 10
infer_iterations = 50
quant_iterations = 5
self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type,
is_full_quantize, is_use_cache_file, is_optimize_model,
diff_threshold, batch_size, infer_iterations,
quant_iterations)
class TestPostTraininghistForWhile(TestPostTrainingQuantization):
def test_post_training_hist(self):
model_name = "mnist_while"
data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz"
data_md5 = "2387390beeb37b51dec041c27b8a681f"
algo = "hist"
quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"]
is_full_quantize = False
is_use_cache_file = False
is_optimize_model = True
diff_threshold = 0.01
batch_size = 10
infer_iterations = 50
quant_iterations = 5
self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type,
is_full_quantize, is_use_cache_file, is_optimize_model,
diff_threshold, batch_size, infer_iterations,
quant_iterations)
class TestPostTrainingmseForWhile(TestPostTrainingQuantization):
def test_post_training_mse(self):
model_name = "mnist_while"
data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz"
data_md5 = "2387390beeb37b51dec041c27b8a681f"
algo = "mse"
quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"]
is_full_quantize = False
is_use_cache_file = False
is_optimize_model = True
diff_threshold = 0.01
batch_size = 10
infer_iterations = 50
quant_iterations = 5
self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type,
is_full_quantize, is_use_cache_file, is_optimize_model,
diff_threshold, batch_size, infer_iterations,
quant_iterations)
class TestPostTrainingavgForWhile(TestPostTrainingQuantization):
def test_post_training_avg(self):
model_name = "mnist_while"
data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz"
data_md5 = "2387390beeb37b51dec041c27b8a681f"
algo = "avg"
quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"]
is_full_quantize = False
is_use_cache_file = False
is_optimize_model = True
diff_threshold = 0.01
batch_size = 10
infer_iterations = 50
quant_iterations = 5
self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type,
is_full_quantize, is_use_cache_file, is_optimize_model,
diff_threshold, batch_size, infer_iterations,
quant_iterations)
class TestPostTrainingMinMaxForWhile(TestPostTrainingQuantization):
def test_post_training_min_max(self):
model_name = "mnist_while"
data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz"
data_md5 = "2387390beeb37b51dec041c27b8a681f"
algo = "min_max"
quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"]
is_full_quantize = False
is_use_cache_file = False
is_optimize_model = True
diff_threshold = 0.01
batch_size = 10
infer_iterations = 50
quant_iterations = 5
self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type,
is_full_quantize, is_use_cache_file, is_optimize_model,
diff_threshold, batch_size, infer_iterations,
quant_iterations)
class TestPostTrainingAbsMaxForWhile(TestPostTrainingQuantization):
def test_post_training_abs_max(self):
model_name = "mnist_while"
data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz"
data_md5 = "2387390beeb37b51dec041c27b8a681f"
algo = "abs_max"
quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"]
is_full_quantize = False
is_use_cache_file = False
is_optimize_model = True
diff_threshold = 0.01
batch_size = 10
infer_iterations = 50
quant_iterations = 5
self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type,
is_full_quantize, is_use_cache_file, is_optimize_model,
diff_threshold, batch_size, infer_iterations,
quant_iterations)
if __name__ == '__main__':
unittest.main()
......@@ -273,6 +273,8 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase):
else:
if self._return_list:
data = self._reader.read_next_list()
for i in range(len(data)):
data[i] = data[i]._move_to_list()
data = [
_restore_batch(d, s)
for d, s in zip(data, self._structure_infos[:len(
......@@ -718,6 +720,8 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
else:
if self._return_list:
data = self._reader.read_next_list()
for i in range(len(data)):
data[i] = data[i]._move_to_list()
data = [
_restore_batch(d, s)
for d, s in zip(data, self._structure_infos[:len(
......
......@@ -547,7 +547,11 @@ def func_to_source_code(function, dedent=True):
raise TypeError(
"The type of 'function' should be a function or method, but received {}.".
format(type(function).__name__))
source_code = inspect.getsource(function)
source_code_list, _ = inspect.getsourcelines(function)
source_code_list = [
line for line in source_code_list if not line.lstrip().startswith('#')
]
source_code = ''.join(source_code_list)
if dedent:
source_code = textwrap.dedent(source_code)
......
......@@ -238,7 +238,7 @@ def monkey_patch_varbase():
"Tensor shape not match, Tensor of grad_tensor [ {} ] with shape {} mismatch Tensor [ {} ] with shape {}".format(
grad_tensor.name, grad_tensor.shape, self.name, self.shape)
if paddle.is_compiled_with_xpu():
if paddle.is_compiled_with_xpu() or paddle.is_compiled_with_npu():
# TODO(liuyuhui): Currently only for xpu. Will be removed in the future.
scaled_loss = scale_loss(self)
core.dygraph_run_backward([scaled_loss], [grad_tensor],
......
......@@ -1999,6 +1999,14 @@ class Executor(object):
fetch_list=fetch_list,
feed_var_name=feed_var_name,
fetch_var_name=fetch_var_name)
main_block = cached_program.block(0)
for op in main_block.ops:
# Set the op_role of the fetch op to Optimize so that GC does not
# erase the fetched vars in pipeline mode.
if op.type == 'fetch':
op._set_attr(
'op_role',
core.op_proto_and_checker_maker.OpRole.Optimize)
self._add_program_cache(cache_key, cached_program)
if cached_ctx is None:
fleet_opt = program._pipeline_opt["fleet_opt"]
......@@ -2007,6 +2015,18 @@ class Executor(object):
self._add_ctx_cache(cache_key, cached_ctx)
if feed:
self._feed_data(cached_program, feed, feed_var_name, cached_scope)
from paddle.optimizer.lr import LRScheduler
if hasattr(program, 'lr_sheduler'):
lr_sheduler = program.lr_sheduler
assert isinstance(lr_sheduler, LRScheduler), "must be LRScheduler"
lr_value = lr_sheduler()
lr_var = program.global_block().vars[lr_sheduler._var_name]
data = np.array([lr_value]).astype(convert_dtype(lr_var.dtype))
tensor = core.get_variable_tensor(cached_scope,
lr_sheduler._var_name)
tensor.set(data, self.place)
cached_ctx.run()
if fetch_list:
arr = cached_scope.find_var(fetch_var_name).get_fetch_list()
......
......@@ -1254,7 +1254,10 @@ class GeneratorLoader(DataLoaderBase):
def __next__(self):
try:
if self._return_list:
return self._reader.read_next_list()
data = self._reader.read_next_list()
for i in range(len(data)):
data[i] = data[i]._move_to_list()
return data
else:
return self._reader.read_next()
except StopIteration:
......
......@@ -30,6 +30,7 @@ from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import Shar
seed = 2021
epoch = 2
batch_size = 32
linear_size = 10000
strategy = fleet.DistributedStrategy()
strategy.hybrid_configs = {
......@@ -45,12 +46,12 @@ paddle.seed(seed)
class MLP(fluid.Layer):
def __init__(self, param_attr=None, bias_attr=None):
def __init__(self, linear_size=10000, param_attr=None, bias_attr=None):
super(MLP, self).__init__()
self._linear1 = Linear(10000, 10000)
self._linear2 = Linear(10000, 10000)
self._linear3 = Linear(10000, 10)
self._linear1 = Linear(linear_size, linear_size)
self._linear2 = Linear(linear_size, linear_size)
self._linear3 = Linear(linear_size, 10)
def forward(self, inputs):
y = self._linear1(inputs)
......@@ -59,10 +60,10 @@ class MLP(fluid.Layer):
return y
def reader_decorator():
def reader_decorator(linear_size=10000):
def __reader__():
for _ in range(100):
img = np.random.rand(10000).astype('float32')
img = np.random.rand(linear_size).astype('float32')
label = np.ones(1).astype('int64')
yield img, label
......@@ -120,6 +121,9 @@ def train_mlp(model,
use_multiprocess=True)
train_loader.set_sample_list_generator(train_reader)
if sharding_stage == 2:
model.to(device="gpu")
for eop in range(epoch):
model.train()
......@@ -153,9 +157,6 @@ def train_mlp(model,
if all_test and batch_id == 2:
return model.parameters()
if sharding_stage == 2:
model.to(device="gpu")
return model.parameters()
......
# -*- coding: UTF-8 -*-
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import argparse
import ast
import time
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph.nn import Linear
from paddle.distributed import fleet
from paddle.fluid.dygraph import nn
from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2
from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import ShardingStage2
from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import ShardingScaler
from dygraph_sharding_stage2 import MLP, reader_decorator, optimizer_setting
seed = 2021
epoch = 2
batch_size = 32
linear_size = 8000
np.random.seed(seed)
paddle.seed(seed)
def train_mlp(model, offload=False):
group = paddle.distributed.new_group([0, 1])
optimizer = optimizer_setting(model=model, use_pure_fp16=True)
model = paddle.amp.decorate(models=model, level='O2', save_dtype='float32')
scaler = paddle.amp.GradScaler(init_loss_scaling=32768)
scaler = ShardingScaler(scaler, group)
optimizer = ShardingOptimizerStage2(
params=model.parameters(),
optim=optimizer,
group=group,
offload=offload)
model = ShardingStage2(model, optimizer, group=group, accumulate_grads=True)
train_reader = paddle.batch(
reader_decorator(linear_size), batch_size=batch_size, drop_last=True)
train_loader = paddle.io.DataLoader.from_generator(
capacity=32,
use_double_buffer=True,
iterable=True,
return_list=True,
use_multiprocess=True)
train_loader.set_sample_list_generator(train_reader)
for eop in range(epoch):
model.train()
for batch_id, data in enumerate(train_loader()):
img, label = data
label.stop_gradient = True
img.stop_gradient = True
with paddle.amp.auto_cast(True, level='O2'):
out = model(img)
loss = paddle.nn.functional.cross_entropy(
input=out, label=label)
avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32))
scaler.scale(avg_loss).backward()
model.grad_scale()
scaler.step(optimizer)
scaler.update()
model.clear_gradients()
for dtype in optimizer.param_storages:
for dst_rank, param_storage in optimizer.param_storages[dtype].items():
param_storage.to(device="gpu", dtype=dtype)
return model.parameters()
def test_sharding_stage2_offload():
mlp = MLP(linear_size)
mlp_offload = MLP(linear_size)
mlp_offload.set_state_dict(mlp.state_dict())
mlp_params = train_mlp(mlp, offload=False)
mlp_offload_params = train_mlp(mlp_offload, offload=True)
for i in range(len(mlp_params)):
for j in range(len(mlp_offload_params)):
if mlp_params[i].name == mlp_offload_params[j].name:
np.testing.assert_allclose(
mlp_params[i].numpy(),
mlp_offload_params[j].numpy(),
rtol=1e-6)
return
if __name__ == '__main__':
test_sharding_stage2_offload()
......@@ -26,6 +26,7 @@ import paddle.fluid as fluid
from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator
from paddle.fluid.dygraph.jit import declarative
from paddle.fluid.dygraph.nn import Linear
from paddle.fluid.dygraph.dygraph_to_static.utils import func_to_source_code
from ifelse_simple_func import dyfunc_with_if_else
......@@ -344,5 +345,18 @@ class TestFunctionTrainEvalMode(unittest.TestCase):
net.foo.train()
class TestRemoveCommentInDy2St(unittest.TestCase):
def func_with_comment(self):
# Comment1
x = paddle.to_tensor([1, 2, 3])
# Comment2
# Comment3
y = paddle.to_tensor([4, 5, 6])
def test_remove_comment(self):
code_string = func_to_source_code(self.func_with_comment)
self.assertEqual('#' not in code_string, True)
if __name__ == '__main__':
unittest.main()
......@@ -322,14 +322,14 @@ class PassAutoScanTest(AutoScanTest):
"Expected operator list after fusion is {}, but now it's {}".format(
op_list_after_fusion, after_op_list), )
def run_and_statis(
self,
quant=False,
max_examples=100,
reproduce=None,
min_success_num=25,
max_duration=180,
passes=None, ):
def run_and_statis(self,
quant=False,
max_examples=100,
reproduce=None,
min_success_num=25,
max_duration=180,
passes=None,
use_gpu_run_baseline=False):
if os.getenv('HYPOTHESIS_TEST_PROFILE', 'ci') == "dev":
max_examples *= 10
min_success_num *= 10
......@@ -354,7 +354,10 @@ class PassAutoScanTest(AutoScanTest):
return self.sample_program_config(draw)
def run_test(prog_config):
return self.run_test(quant=quant, prog_configs=[prog_config])
return self.run_test(
quant=quant,
prog_configs=[prog_config],
use_gpu_run_baseline=use_gpu_run_baseline)
generator = st.composite(program_generator)
loop_func = given(generator())(run_test)
......@@ -371,8 +374,8 @@ class PassAutoScanTest(AutoScanTest):
logging.info("Number of Ran Programs: {}".format(self.num_ran_programs))
logging.info("Number of Ignore Tests: {}".format(self.num_ignore_tests))
successful_ran_programs = int(self.num_ran_programs -
self.num_ignore_tests /
self.num_predictor_kinds)
self.num_ignore_tests / max(
self.num_predictor_kinds, 1))
logging.info(
"Number of successfully ran programs approximately equal to {}".
format(successful_ran_programs))
......@@ -391,7 +394,10 @@ class PassAutoScanTest(AutoScanTest):
format(max_duration))
assert False
def run_test(self, quant=False, prog_configs=None):
def run_test(self,
quant=False,
prog_configs=None,
use_gpu_run_baseline=False):
status = True
for prog_config in prog_configs:
......@@ -413,7 +419,9 @@ class PassAutoScanTest(AutoScanTest):
results: List[Dict[str, np.ndarray]] = []
# baseline: cpu no ir_optim run
base_config = self.create_inference_config(ir_optim=False)
base_config = self.create_inference_config(
ir_optim=False, use_gpu=use_gpu_run_baseline)
logging.info('RUN program_config: ' + str(prog_config))
results.append(
self.run_test_config(model, params, prog_config, base_config,
......
......@@ -109,7 +109,7 @@ class TestAdaptivePool2dConvertGlobalPass(PassAutoScanTest):
def test(self):
self.run_and_statis(
quant=False,
max_examples=100,
max_examples=300,
passes=["adaptive_pool2d_convert_global_pass"],
min_success_num=40)
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from auto_scan_test import PassAutoScanTest, IgnoreReasons
from program_config import TensorConfig, ProgramConfig, OpConfig
import numpy as np
import paddle.inference as paddle_infer
from functools import partial
from typing import Optional, List, Callable, Dict, Any, Set
import unittest
import hypothesis
from hypothesis import given, settings, seed, example, assume, reproduce_failure
import hypothesis.strategies as st
class TestFCElementwiseLayerNormFusePass(PassAutoScanTest):
"""
x_var w(persistable) bias_var(persistable)
\ | /
fc
|
fc_out_var bias_var(persistable)
\ /
elementwise_add bias_var(persistable) scale_var(persistable)
\ | /
layer_norm
/ | \
Y mean_var variance_var
"""
def sample_predictor_configs(self, program_config):
# for gpu
config = self.create_inference_config(use_gpu=True)
yield config, ["fused_fc_elementwise_layernorm"], (1e-5, 1e-5)
def sample_program_config(self, draw):
# 1. Generate shape of input:X of fc
x_shape = draw(
st.lists(
st.integers(
min_value=1, max_value=8), min_size=2, max_size=5))
x_shape = [2, 1]
x_rank = len(x_shape)
# 2. Generate attr:in_num_col_dims of fc
in_num_col_dims = draw(st.integers(min_value=1, max_value=x_rank - 1))
# 3. Generate legal shape of input:W/bias of fc
w_shape = draw(
st.lists(
st.integers(
min_value=1, max_value=8), min_size=2, max_size=2))
w_shape[0] = int(np.prod(x_shape[in_num_col_dims:]))
w_shape = [1, 2]
fc_bias_shape = [w_shape[1], ]
if draw(st.booleans()):
fc_bias_shape.insert(0, 1)
fc_bias_shape = [2, ]
fc_out_shape = x_shape[:in_num_col_dims] + w_shape[1:]
# 4. Generate legal attr:axis/shape of elementwise_add
add_bias_shape = fc_out_shape[:]
axis = draw(st.integers(min_value=-1, max_value=0))
# 5. Generate legal shape of layer_norm
begin_norm_axis = draw(
st.integers(
min_value=1, max_value=len(fc_out_shape) - 1))
layer_norm_shape = [int(np.prod(fc_out_shape[begin_norm_axis:]))]
epsilon = 1e-5
fc_op = OpConfig(
"fc",
inputs={"Input": ["fc_x"],
"W": ["fc_w"],
"Bias": ["fc_bias"]},
outputs={"Out": ["fc_out"]},
in_num_col_dims=in_num_col_dims,
padding_weights=False,
activation_type="",
use_quantizer=False,
use_mkldnn=False, )
add_op = OpConfig(
"elementwise_add",
inputs={"X": ["fc_out"],
"Y": ["add_bias"]},
outputs={"Out": ["add_out"]},
axis=axis, )
layer_norm_op = OpConfig(
"layer_norm",
inputs={
"X": ["add_out"],
"Scale": ["scale"],
"Bias": ["layer_norm_bias"]
},
outputs={
"Y": ["layer_norm_out"],
"Mean": ["layer_norm_mean"],
"Variance": ["layer_norm_var"]
},
begin_norm_axis=begin_norm_axis,
epsilon=epsilon)
ops = [fc_op, add_op, layer_norm_op]
program_config = ProgramConfig(
ops=ops,
weights={
"fc_w": TensorConfig(shape=w_shape),
"fc_bias": TensorConfig(shape=fc_bias_shape),
"add_bias": TensorConfig(shape=add_bias_shape),
"scale": TensorConfig(shape=layer_norm_shape),
"layer_norm_bias": TensorConfig(shape=layer_norm_shape),
},
inputs={"fc_x": TensorConfig(shape=x_shape), },
outputs=ops[-1].outputs["Y"], )
return program_config
def test(self):
self.run_and_statis(
quant=False,
max_examples=300,
passes=["fc_elementwise_layernorm_fuse_pass"],
use_gpu_run_baseline=True)
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -12,72 +12,147 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from auto_scan_test import PassAutoScanTest, IgnoreReasons
from program_config import TensorConfig, ProgramConfig, OpConfig
from functools import partial
from typing import Optional, List, Callable, Dict, Any, Set
import unittest
import numpy as np
from inference_pass_test import InferencePassTest
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.core import PassVersionChecker
import hypothesis
from hypothesis import given, settings, seed, example, assume, reproduce_failure
import hypothesis.strategies as st
class TransposeFlattenConcatFusePassTest(InferencePassTest):
def setUp(self):
with fluid.program_guard(self.main_program, self.startup_program):
data1 = fluid.data(name="data1", shape=[5, 5, 5], dtype="float32")
data2 = fluid.data(name="data2", shape=[5, 5, 5], dtype="float32")
trans1 = fluid.layers.transpose(data1, perm=[2, 1, 0])
trans2 = fluid.layers.transpose(data2, perm=[2, 1, 0])
flatt1 = fluid.layers.flatten(trans1)
flatt2 = fluid.layers.flatten(trans2)
concat_out = fluid.layers.concat([flatt1, flatt2])
            # There are no parameters for the above structure.
            # Hence, append a batch_norm to avoid a failure caused by load_combined.
out = fluid.layers.batch_norm(concat_out, is_test=True)
self.feeds = {
"data1": np.random.random([5, 5, 5]).astype("float32"),
"data2": np.random.random([5, 5, 5]).astype("float32")
}
self.fetch_list = [out]
class TestTransposeFlattenConcatFusePass(PassAutoScanTest):
"""
x_1_var x_2_var
| |
transpose2 transpose2
| |
flatten2 flatten2
\ /
flatten2_out_var flatten2_out_var
\ /
concat
"""
def test_check_output(self):
# There is no cpu pass for transpose_flatten_concat_fuse
if core.is_compiled_with_cuda():
use_gpu = True
self.check_output_with_option(use_gpu)
def sample_predictor_configs(self, program_config):
# TRT
        # After tensorrt_subgraph_pass, this pass needs to be deleted on TRT
PassVersionChecker.IsCompatible('transpose_flatten_concat_fuse_pass')
# for gpu
config = self.create_inference_config(use_gpu=True)
yield config, ["fusion_transpose_flatten_concat", ], (1e-5, 1e-5)
def is_program_valid(self, prog_config):
concat_axis = prog_config.ops[-1].attrs["axis"]
ops_num = len(prog_config.ops) - 1
if ops_num % 2 != 0:
return False
input_num = ops_num // 2
flatten_shape = 0
x_trans_axis = prog_config.ops[0].attrs["axis"]
x_flatten_axis = prog_config.ops[1].attrs["axis"]
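        # Every branch must share the same transpose perm and flatten axis, and
        # the dimension that is not concatenated must match across branches.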
for i in range(input_num):
input_name = "transpose2_x" + str(i)
input_shape = prog_config.inputs[input_name].shape
trans_axis = prog_config.ops[i * 2].attrs["axis"]
if x_trans_axis != trans_axis:
return False
# calculate shape after transpose
input_shape = [input_shape[j] for j in trans_axis]
            # calculate shape after flatten
flatten_axis = prog_config.ops[i * 2 + 1].attrs["axis"]
if x_flatten_axis != flatten_axis:
return False
flatten_shape1 = flatten_shape2 = 1
for j in range(len(input_shape)):
if j < flatten_axis:
flatten_shape1 *= input_shape[j]
else:
flatten_shape2 *= input_shape[j]
if concat_axis == 0:
if i == 0:
flatten_shape = flatten_shape2
elif flatten_shape != flatten_shape2:
return False
else:
if i == 0:
flatten_shape = flatten_shape1
elif flatten_shape != flatten_shape1:
return False
return True
class TransposeFlattenConcatFusePassWithAxisTest(InferencePassTest):
def setUp(self):
with fluid.program_guard(self.main_program, self.startup_program):
data1 = fluid.data(name="data1", shape=[5, 5, 5], dtype="float32")
data2 = fluid.data(name="data2", shape=[5, 5, 5], dtype="float32")
trans1 = fluid.layers.transpose(data1, perm=[2, 1, 0])
trans2 = fluid.layers.transpose(data2, perm=[2, 1, 0])
flatt1 = fluid.layers.flatten(trans1, axis=2)
flatt2 = fluid.layers.flatten(trans2, axis=2)
concat_out = fluid.layers.concat([flatt1, flatt2], axis=1)
            # There are no parameters for the above structure.
            # Hence, append a batch_norm to avoid a failure caused by load_combined.
out = fluid.layers.batch_norm(concat_out, is_test=True)
def sample_program_config(self, draw):
times = draw(st.integers(min_value=1, max_value=6))
concat_axis = draw(st.integers(min_value=0, max_value=1))
ops = []
concat_input = []
inputs = {}
x_shape_rank = draw(st.integers(min_value=2, max_value=5))
# Generate axis of transpose
trans_axis = [j for j in range(x_shape_rank)]
for j in range(x_shape_rank - 1):
if draw(st.booleans()):
trans_axis[j], trans_axis[-1] = trans_axis[-1], trans_axis[j]
# Generate axis of flatten
flatten_axis = draw(
st.integers(
min_value=0, max_value=x_shape_rank - 1))
for i in range(times):
# Generate x_shape of transpose
x_shape = draw(
st.lists(
st.integers(
min_value=1, max_value=10),
min_size=x_shape_rank,
max_size=x_shape_rank))
self.feeds = {
"data1": np.random.random([5, 5, 5]).astype("float32"),
"data2": np.random.random([5, 5, 5]).astype("float32")
}
self.fetch_list = [out]
str_i = str(i)
transpose_op = OpConfig(
"transpose2",
inputs={"X": ["transpose2_x" + str_i], },
axis=trans_axis,
outputs={
"Out": ["trans_out" + str_i],
"XShape": ["trans_shape" + str_i]
}, )
ops.append(transpose_op)
flatten_op = OpConfig(
"flatten2",
inputs={"X": ["trans_out" + str_i], },
axis=flatten_axis,
outputs={
"Out": ["flatten2_out" + str_i],
"XShape": ["xshape" + str_i]
}, )
concat_input.append("flatten2_out" + str_i)
ops.append(flatten_op)
inputs["transpose2_x" + str_i] = TensorConfig(shape=x_shape)
def test_check_output(self):
# There is no cpu pass for transpose_flatten_concat_fuse
if core.is_compiled_with_cuda():
use_gpu = True
self.check_output_with_option(use_gpu)
concat_op = OpConfig(
"concat",
inputs={
"X": concat_input,
"AxisTensor": [],
},
outputs={"Out": ["concat_out"]},
axis=concat_axis, )
self.assertTrue(
PassVersionChecker.IsCompatible(
'transpose_flatten_concat_fuse_pass'))
ops.append(concat_op)
program_config = ProgramConfig(
ops=ops,
weights={},
inputs=inputs,
outputs=ops[-1].outputs["Out"], )
return program_config
def test(self):
self.run_and_statis(
quant=False,
max_examples=300,
passes=["transpose_flatten_concat_fuse_pass"])
if __name__ == "__main__":
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import copy
import numpy as np
import paddle
import paddle.nn as nn
import paddle.static as static
import paddle.nn.functional as F
import paddle.utils as utils
import paddle.fluid.core as core
from paddle.fluid import layers
from paddle.distributed.auto_parallel.operators.common import DistributedOperatorImplContainer
from paddle.distributed.auto_parallel.operators.common import DistributedOperatorImpl
from paddle.distributed.auto_parallel.operators.common import get_distributed_operator_impl_container
from paddle.distributed.auto_parallel.dist_context import DistributedContext, DistributedOperatorContext
from paddle.distributed.auto_parallel.dist_attribute import OperatorDistributedAttribute, TensorDistributedAttribute
from paddle.distributed.auto_parallel.dist_op import DistributedOperator
paddle.enable_static()
device = "gpu" if core.is_compiled_with_cuda() else "cpu"
class MLPLayer(nn.Layer):
def __init__(self,
hidden_size=1024,
intermediate_size=4 * 1024,
initializer_range=0.02):
super(MLPLayer, self).__init__()
d_model = hidden_size
dim_feedforward = intermediate_size
weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal(
mean=0.0, std=initializer_range))
bias_attr = None
self.linear0 = nn.Linear(
d_model, dim_feedforward, weight_attr, bias_attr=bias_attr)
self.linear1 = nn.Linear(
dim_feedforward, d_model, weight_attr, bias_attr=bias_attr)
self.norm = nn.LayerNorm(d_model, epsilon=1e-5)
def forward(self, input):
out = self.norm(input)
out = self.linear0(out)
out = F.gelu(out, approximate=True)
out = self.linear1(out)
return out
def mlp_forward(train_program, start_program):
with static.program_guard(train_program,
start_program), utils.unique_name.guard():
batch_size = 4
hidden_size = 1024
sqrt_hidden_size = 32
double_hidden_size = 64
input = static.data(name="input", shape=[8, 8, 16], dtype='int32')
input = paddle.reshape(input, [hidden_size])
input = paddle.reshape(input, [sqrt_hidden_size, sqrt_hidden_size])
embedding = paddle.nn.Embedding(2, batch_size, sparse=True)
input = embedding(input)
input = paddle.reshape(input, [hidden_size, batch_size])
input = paddle.transpose(input, perm=[1, 0])
matmulinput = static.data(
name="matmulinput",
shape=[hidden_size, hidden_size],
dtype='float32')
input = layers.matmul(x=input, y=matmulinput)
label = static.data(
name="label", shape=[batch_size, 1], dtype='float32')
mlp = MLPLayer(
hidden_size=hidden_size,
intermediate_size=4 * hidden_size,
initializer_range=0.02)
predict = mlp(input)
error_cost = paddle.nn.functional.square_error_cost(predict, label)
loss = paddle.mean(error_cost)
m = paddle.nn.Softmax()
loss = m(loss)
return loss, train_program, start_program
class Testcompatible(unittest.TestCase):
def test_matmulv2_matmul_2_compatible(self):
valid_op_dist_attr_list = []
program = paddle.static.Program()
startup_program = paddle.static.Program()
loss, program, start_program = mlp_forward(program, startup_program)
with static.program_guard(program,
start_program), utils.unique_name.guard():
matmulx3 = static.data(
name="matmulx3", shape=[6, 2, 6], dtype='float32')
matmuly3 = static.data(
name="matmuly3", shape=[6, 6], dtype='float32')
output1 = paddle.matmul(x=matmulx3, y=matmuly3)
output_1 = layers.matmul(x=matmulx3, y=matmuly3)
matmulx4 = static.data(
name="matmulx4", shape=[6, 6, 2, 6], dtype='float32')
matmuly4 = static.data(
name="matmuly4", shape=[6, 6, 6, 6], dtype='float32')
output2 = paddle.matmul(x=matmulx4, y=matmuly4)
output_2 = layers.matmul(x=matmulx4, y=matmuly4)
ops = program.global_block().ops
vars = program.global_block().vars
for idx, op in enumerate(ops):
if op.type == 'matmul_v2' or op.type == 'matmul':
dist_op_impl_container = get_distributed_operator_impl_container(
op.type)
impls = dist_op_impl_container.get_impls()
op_dist_attr = OperatorDistributedAttribute()
X = op.input_arg_names[0]
Y = op.input_arg_names[1]
out = op.output_arg_names[0]
if len(vars[X].shape) == 2 and len(vars[Y].shape) == 2:
op_dist_attr.set_input_dims_mapping(X, [-1, -1])
op_dist_attr.set_input_dims_mapping(Y, [-1, -1])
op_dist_attr.set_output_dims_mapping(out, [-1, -1])
self.assertTrue(impls[2].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(X, [1, -1])
self.assertFalse(impls[2].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(X, [-1, 1])
self.assertFalse(impls[2].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(Y, [1, -1])
self.assertFalse(impls[2].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(Y, [-1, 1])
self.assertFalse(impls[2].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_output_dims_mapping(out, [-1, 1])
self.assertFalse(impls[2].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_output_dims_mapping(out, [1, -1])
self.assertFalse(impls[2].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
if len(vars[X].shape) == 3 and len(vars[Y].shape) == 2:
op_dist_attr.set_input_dims_mapping(X, [-1, -1, -1])
op_dist_attr.set_input_dims_mapping(Y, [-1, -1])
op_dist_attr.set_output_dims_mapping(out, [-1, -1, -1])
self.assertTrue(impls[2].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_output_dims_mapping(out, [1, -1, -1])
op_dist_attr.set_input_dims_mapping(X, [-1, -1, 1])
self.assertFalse(impls[2].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(Y, [1, -1])
self.assertFalse(impls[2].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
self.assertFalse(impls[2].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_output_dims_mapping(out, [-1, 1, -1])
self.assertFalse(impls[2].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
if len(vars[X].shape) == 4 and len(vars[Y].shape) == 4:
op_dist_attr.set_input_dims_mapping(X, [-1, -1, -1, -1])
op_dist_attr.set_input_dims_mapping(Y, [-1, -1, -1, -1])
op_dist_attr.set_output_dims_mapping(out, [-1, -1, -1, -1])
self.assertTrue(impls[2].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(Y, [0, -1, -1, -1])
self.assertFalse(impls[2].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_output_dims_mapping(out, [0, -1, -1, -1])
self.assertFalse(impls[2].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(Y, [-1, -1, 0, -1])
self.assertFalse(impls[2].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_output_dims_mapping(out, [-1, -1, 0, -1])
self.assertFalse(impls[2].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(Y, [-1, -1, -1, 1])
self.assertFalse(impls[2].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_output_dims_mapping(out, [-1, -1, 0, -1])
self.assertFalse(impls[2].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
def test_matmulv2_matmul_1_compatible(self):
valid_op_dist_attr_list = []
program = paddle.static.Program()
startup_program = paddle.static.Program()
loss, program, start_program = mlp_forward(program, startup_program)
with static.program_guard(program,
start_program), utils.unique_name.guard():
matmulx3 = static.data(
name="matmulx3", shape=[6, 2, 6], dtype='float32')
matmuly3 = static.data(
name="matmuly3", shape=[6, 6], dtype='float32')
output1 = paddle.matmul(x=matmulx3, y=matmuly3)
output_1 = layers.matmul(x=matmulx3, y=matmuly3)
matmulx4 = static.data(
name="matmulx4", shape=[6, 6, 6, 6], dtype='float32')
matmuly4 = static.data(
name="matmuly4", shape=[6, 6, 6, 6], dtype='float32')
output2 = paddle.matmul(x=matmulx4, y=matmuly4)
output_2 = layers.matmul(x=matmulx4, y=matmuly4)
ops = program.global_block().ops
vars = program.global_block().vars
for idx, op in enumerate(ops):
if op.type == 'matmul_v2' or op.type == 'matmul':
dist_op_impl_container = get_distributed_operator_impl_container(
op.type)
impls = dist_op_impl_container.get_impls()
op_dist_attr = OperatorDistributedAttribute()
X = op.input_arg_names[0]
Y = op.input_arg_names[1]
out = op.output_arg_names[0]
if len(vars[X].shape) == 2 and len(vars[Y].shape) == 2:
op_dist_attr.set_input_dims_mapping(X, [-1, 1])
op_dist_attr.set_input_dims_mapping(Y, [1, -1])
op_dist_attr.set_output_dims_mapping(out, [-1, -1])
dist_op = DistributedOperator(op, op_dist_attr)
op_dist_attr.set_output_dims_mapping(out, [1, -1])
self.assertFalse(impls[1].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(X, [-1, -1])
self.assertFalse(impls[1].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(Y, [-1, -1])
self.assertFalse(impls[1].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
if len(vars[X].shape) == 3 and len(vars[Y].shape) == 2:
op_dist_attr.set_input_dims_mapping(X, [-1, -1, 1])
op_dist_attr.set_input_dims_mapping(Y, [1, -1])
op_dist_attr.set_output_dims_mapping(out, [-1, -1, -1])
self.assertTrue(impls[1].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_output_dims_mapping(out, [1, -1, 1])
self.assertFalse(impls[1].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(out, [-1, -1, -1])
self.assertFalse(impls[1].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_output_dims_mapping(out, [-1, 0, -1])
self.assertFalse(impls[1].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(X, [-1, -1, -1])
self.assertFalse(impls[1].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
if len(vars[X].shape) == 4 and len(vars[Y].shape) == 4:
op_dist_attr.set_input_dims_mapping(X, [-1, -1, -1, 1])
op_dist_attr.set_input_dims_mapping(Y, [-1, -1, 1, -1])
op_dist_attr.set_output_dims_mapping(out, [-1, -1, -1, -1])
self.assertTrue(impls[1].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(Y, [0, -1, -1, -1])
self.assertFalse(impls[1].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_output_dims_mapping(out, [0, -1, -1, -1])
self.assertFalse(impls[1].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(Y, [-1, -1, 0, -1])
self.assertFalse(impls[1].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_output_dims_mapping(out, [-1, -1, 0, -1])
self.assertFalse(impls[1].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(Y, [-1, -1, -1, 1])
self.assertFalse(impls[1].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_output_dims_mapping(out, [-1, -1, 0, -1])
self.assertFalse(impls[1].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
def test_matmulv2_matmul_0_compatible(self):
valid_op_dist_attr_list = []
program = paddle.static.Program()
startup_program = paddle.static.Program()
loss, program, start_program = mlp_forward(program, startup_program)
with static.program_guard(program,
start_program), utils.unique_name.guard():
matmulx3 = static.data(
name="matmulx3", shape=[6, 2, 6], dtype='float32')
matmuly3 = static.data(
name="matmuly3", shape=[6, 6], dtype='float32')
output1 = paddle.matmul(x=matmulx3, y=matmuly3)
output_1 = layers.matmul(x=matmulx3, y=matmuly3)
matmulx4 = static.data(
name="matmulx4", shape=[6, 6, 2, 6], dtype='float32')
matmuly4 = static.data(
name="matmuly4", shape=[6, 6, 6, 6], dtype='float32')
output2 = paddle.matmul(x=matmulx4, y=matmuly4)
output_2 = layers.matmul(x=matmulx4, y=matmuly4)
ops = program.global_block().ops
vars = program.global_block().vars
for idx, op in enumerate(ops):
if op.type == 'matmul_v2' or op.type == 'matmul':
dist_op_impl_container = get_distributed_operator_impl_container(
op.type)
impls = dist_op_impl_container.get_impls()
op_dist_attr = OperatorDistributedAttribute()
X = op.input_arg_names[0]
Y = op.input_arg_names[1]
out = op.output_arg_names[0]
if len(vars[X].shape) == 2 and len(vars[Y].shape) == 2:
op_dist_attr.set_input_dims_mapping(X, [-1, -1])
op_dist_attr.set_input_dims_mapping(Y, [-1, 1])
op_dist_attr.set_output_dims_mapping(out, [-1, 1])
self.assertTrue(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(X, [-1, 1])
self.assertFalse(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(Y, [1, 1])
self.assertFalse(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_output_dims_mapping(out, [0, 0])
self.assertFalse(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(X, [0, -1])
op_dist_attr.set_output_dims_mapping(out, [1, 1])
self.assertFalse(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(Y, [1, -1])
self.assertFalse(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
if len(vars[X].shape) == 3 and len(vars[Y].shape) == 2:
op_dist_attr.set_input_dims_mapping(X, [-1, -1, -1])
op_dist_attr.set_input_dims_mapping(Y, [-1, 1])
op_dist_attr.set_output_dims_mapping(out, [-1, -1, 1])
self.assertTrue(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(X, [-1, 0, -1])
self.assertFalse(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(X, [-1, 1, -1])
self.assertFalse(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(Y, [-1, -1])
self.assertFalse(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_output_dims_mapping(out, [1, -1, 1])
self.assertFalse(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_output_dims_mapping(out, [-1, -1, -1])
self.assertFalse(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_output_dims_mapping(out, [-1, 1, -1])
self.assertFalse(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
if len(vars[X].shape) == 4 and len(vars[Y].shape) == 4:
op_dist_attr.set_input_dims_mapping(X, [-1, -1, -1, -1])
op_dist_attr.set_input_dims_mapping(Y, [-1, -1, -1, 1])
op_dist_attr.set_output_dims_mapping(out, [-1, -1, -1, 1])
self.assertTrue(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_output_dims_mapping(out, [0, -1, -1, 1])
self.assertFalse(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(X, [-1, 1, 1, -1])
self.assertFalse(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(X, [-1, 1, -1, -1])
self.assertFalse(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(X, [-1, -1, 1, -1])
self.assertFalse(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(Y, [0, -1, -1, 1])
self.assertFalse(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_output_dims_mapping(out, [-1, 1, 1, 1])
self.assertFalse(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(Y, [-1, -1, -1, -1])
self.assertFalse(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_output_dims_mapping(out, [-1, -1, 1, -1])
self.assertFalse(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
op_dist_attr.set_input_dims_mapping(Y, [-1, -1, 1, -1])
self.assertFalse(impls[0].is_auto_compatible(
DistributedOperator(op, op_dist_attr)))
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
import paddle
from paddle.fluid import dygraph
from paddle import static
paddle.enable_static()
def ref_view_as_complex(x):
real, imag = np.take(x, 0, axis=-1), np.take(x, 1, axis=-1)
return real + 1j * imag
def ref_view_as_real(x):
return np.stack([x.real, x.imag], -1)
class TestViewAsComplexOp(OpTest):
def setUp(self):
self.op_type = "as_complex"
x = np.random.randn(10, 10, 2).astype("float64")
out_ref = ref_view_as_complex(x)
self.out_grad = np.ones(
[10, 10], dtype="float64") + 1j * np.ones(
[10, 10], dtype="float64")
self.inputs = {'X': x}
self.outputs = {'Out': out_ref}
def test_check_output(self):
self.check_output()
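    # The expected gradient of as_complex is the upstream complex gradient viewed
    # as a real tensor (real and imaginary parts stacked on the last axis).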
def test_check_grad(self):
self.check_grad(
['X'],
'Out',
user_defined_grads=[ref_view_as_real(self.out_grad)],
user_defined_grad_outputs=[self.out_grad])
class TestViewAsRealOp(OpTest):
def setUp(self):
self.op_type = "as_real"
real = np.random.randn(10, 10).astype("float64")
imag = np.random.randn(10, 10).astype("float64")
x = real + 1j * imag
out_ref = ref_view_as_real(x)
self.inputs = {'X': x}
self.outputs = {'Out': out_ref}
self.out_grad = np.ones([10, 10, 2], dtype="float64")
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(
['X'],
'Out',
user_defined_grads=[ref_view_as_complex(self.out_grad)],
user_defined_grad_outputs=[self.out_grad])
class TestViewAsComplexAPI(unittest.TestCase):
def setUp(self):
self.x = np.random.randn(10, 10, 2)
self.out = ref_view_as_complex(self.x)
def test_dygraph(self):
with dygraph.guard():
x = paddle.to_tensor(self.x)
out_np = paddle.as_complex(x).numpy()
self.assertTrue(np.allclose(self.out, out_np))
def test_static(self):
mp, sp = static.Program(), static.Program()
with static.program_guard(mp, sp):
x = static.data("x", shape=[10, 10, 2], dtype="float64")
out = paddle.as_complex(x)
exe = static.Executor()
exe.run(sp)
[out_np] = exe.run(mp, feed={"x": self.x}, fetch_list=[out])
self.assertTrue(np.allclose(self.out, out_np))
class TestViewAsRealAPI(unittest.TestCase):
def setUp(self):
self.x = np.random.randn(10, 10) + 1j * np.random.randn(10, 10)
self.out = ref_view_as_real(self.x)
def test_dygraph(self):
with dygraph.guard():
x = paddle.to_tensor(self.x)
out_np = paddle.as_real(x).numpy()
self.assertTrue(np.allclose(self.out, out_np))
def test_static(self):
mp, sp = static.Program(), static.Program()
with static.program_guard(mp, sp):
x = static.data("x", shape=[10, 10], dtype="complex128")
out = paddle.as_real(x)
exe = static.Executor()
exe.run(sp)
[out_np] = exe.run(mp, feed={"x": self.x}, fetch_list=[out])
self.assertTrue(np.allclose(self.out, out_np))
if __name__ == "__main__":
unittest.main()
......@@ -34,7 +34,8 @@ class TestCUDAGraph(unittest.TestCase):
paddle.set_flags({
'FLAGS_allocator_strategy': 'auto_growth',
'FLAGS_sync_nccl_allreduce': False,
'FLAGS_cudnn_deterministic': True
'FLAGS_cudnn_deterministic': True,
'FLAGS_use_stream_safe_cuda_allocator': False,
})
def random_tensor(self, shape):
......@@ -187,6 +188,48 @@ class TestCUDAGraph(unittest.TestCase):
finally:
graph.reset()
def test_dataloader(self):
if not can_use_cuda_graph():
return
class AutoIncDataset(paddle.io.Dataset):
def __init__(self, n, dtype):
self.n = n
self.dtype = dtype
def __len__(self):
return self.n
def __getitem__(self, idx):
return np.array([idx]).astype(self.dtype)
n = 100
dtype = 'int64'
dataset = AutoIncDataset(n, dtype)
data_loader = paddle.io.DataLoader(
dataset, batch_size=1, num_workers=2, use_buffer_reader=True)
x = None
y = None
graph = None
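        # Capture y = x * x on the first batch only; later batches are fed by
        # copying their data into the captured input tensor and replaying the graph.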
for i, data in enumerate(data_loader):
if graph is None:
x = data
x = x.cuda()
graph = CUDAGraph()
graph.capture_begin()
y = x * x
graph.capture_end()
else:
x.copy_(data, False)
x = x.cuda()
graph.replay()
actual_x = np.array([[i]]).astype(dtype)
actual_y = np.array([[i * i]]).astype(dtype)
self.assertTrue(np.array_equal(actual_x, x.numpy()))
self.assertTrue(np.array_equal(actual_y, y.numpy()))
if __name__ == "__main__":
unittest.main()
......@@ -336,6 +336,29 @@ class UniformTest11(UniformTest):
name='values', shape=[dims], dtype='float32')
class UniformTestSample(unittest.TestCase):
def setUp(self):
self.init_param()
def init_param(self):
self.low = 3.0
self.high = 4.0
def test_uniform_sample(self):
paddle.disable_static()
uniform = Uniform(low=self.low, high=self.high)
s = uniform.sample([100])
self.assertTrue((s >= self.low).all())
self.assertTrue((s < self.high).all())
paddle.enable_static()
class UniformTestSample2(UniformTestSample):
def init_param(self):
self.low = -5.0
self.high = 2.0
class NormalNumpy(DistributionNumpy):
def __init__(self, loc, scale):
self.loc = np.array(loc)
......
......@@ -26,6 +26,9 @@ class TestDygraphShardingStage2(TestMultipleGpus):
def test_dygraph_sharding_optimizer_stage2(self):
self.run_mnist_2gpu('dygraph_sharding_stage2.py')
def test_dygraph_sharding_optimizer_stage2_offload(self):
self.run_mnist_2gpu('dygraph_sharding_stage2_offload.py')
if __name__ == "__main__":
unittest.main()
......@@ -47,6 +47,18 @@ class TestFleetExecutor(unittest.TestCase):
name='y', shape=y_data.shape, dtype=y_data.dtype)
z = x + y
a = 2 * x + 3 * y
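            # Attach a piecewise-decay learning rate schedule and an AdamW
            # optimizer with global-norm gradient clipping to the loss.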
loss = paddle.mean(a)
base_lr = 0.1
passes = [30, 60, 80, 90]
steps_per_pass = 10
bd = [steps_per_pass * p for p in passes]
lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
lr_val = paddle.optimizer.lr.PiecewiseDecay(
boundaries=bd, values=lr)
opt = paddle.optimizer.AdamW(
learning_rate=lr_val,
grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))
opt.minimize(loss)
# TODO: section_program will be removed in the future
empty_program._pipeline_opt = {
"fleet_opt": self.fake_fleet_opt(),
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid import Program, program_guard
from op_test import OpTest
paddle.enable_static()
class TestGcdAPI(unittest.TestCase):
def setUp(self):
self.x_np = 12
self.y_np = 20
self.x_shape = [1]
self.y_shape = [1]
def test_static_graph(self):
startup_program = fluid.Program()
train_program = fluid.Program()
with fluid.program_guard(startup_program, train_program):
x = fluid.data(name='input1', dtype='int32', shape=self.x_shape)
y = fluid.data(name='input2', dtype='int32', shape=self.y_shape)
out = paddle.gcd(x, y)
place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
) else fluid.CPUPlace()
exe = fluid.Executor(place)
res = exe.run(fluid.default_main_program(),
feed={'input1': self.x_np,
'input2': self.y_np},
fetch_list=[out])
self.assertTrue((np.array(res[0]) == np.gcd(self.x_np, self.y_np)
).all())
def test_dygraph(self):
paddle.disable_static()
x = paddle.to_tensor(self.x_np)
y = paddle.to_tensor(self.y_np)
result = paddle.gcd(x, y)
self.assertEqual(
np.allclose(np.gcd(self.x_np, self.y_np), result.numpy()), True)
paddle.enable_static()
class TestGcdAPI2(TestGcdAPI):
def setUp(self):
self.x_np = np.arange(6).astype(np.int32)
self.y_np = np.array([20]).astype(np.int32)
self.x_shape = [6]
self.y_shape = [1]
class TestGcdAPI3(TestGcdAPI):
def setUp(self):
self.x_np = 0
self.y_np = 20
self.x_shape = [1]
self.y_shape = [1]
class TestGcdAPI4(TestGcdAPI):
def setUp(self):
self.x_np = 0
self.y_np = 0
self.x_shape = [1]
self.y_shape = [1]
class TestGcdAPI5(TestGcdAPI):
def setUp(self):
self.x_np = 12
self.y_np = -20
self.x_shape = [1]
self.y_shape = [1]
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid import Program, program_guard
from op_test import OpTest
paddle.enable_static()
class TestLcmAPI(unittest.TestCase):
def setUp(self):
self.x_np = 12
self.y_np = 20
self.x_shape = [1]
self.y_shape = [1]
def test_static_graph(self):
startup_program = fluid.Program()
train_program = fluid.Program()
with fluid.program_guard(startup_program, train_program):
x1 = fluid.data(name='input1', dtype='int32', shape=self.x_shape)
x2 = fluid.data(name='input2', dtype='int32', shape=self.y_shape)
out = paddle.lcm(x1, x2)
place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
) else fluid.CPUPlace()
exe = fluid.Executor(place)
res = exe.run(fluid.default_main_program(),
feed={'input1': self.x_np,
'input2': self.y_np},
fetch_list=[out])
self.assertTrue((np.array(res[0]) == np.lcm(self.x_np, self.y_np)
).all())
def test_dygraph(self):
paddle.disable_static()
x1 = paddle.to_tensor(self.x_np)
x2 = paddle.to_tensor(self.y_np)
result = paddle.lcm(x1, x2)
self.assertEqual(
np.allclose(np.lcm(self.x_np, self.y_np), result.numpy()), True)
paddle.enable_static()
class TestLcmAPI2(TestLcmAPI):
def setUp(self):
self.x_np = np.arange(6).astype(np.int32)
self.y_np = np.array([20]).astype(np.int32)
self.x_shape = [6]
self.y_shape = [1]
class TestLcmAPI3(TestLcmAPI):
def setUp(self):
self.x_np = 0
self.y_np = 20
self.x_shape = [1]
self.y_shape = [1]
class TestLcmAPI4(TestLcmAPI):
def setUp(self):
self.x_np = 0
self.y_np = 0
self.x_shape = [1]
self.y_shape = [1]
class TestLcmAPI5(TestLcmAPI):
def setUp(self):
self.x_np = 12
self.y_np = -20
self.x_shape = [1]
self.y_shape = [1]
......@@ -23,6 +23,7 @@ from .framework import set_grad_enabled # noqa: F401
from ..fluid.param_attr import ParamAttr # noqa: F401
from ..fluid.layers.tensor import create_parameter # noqa: F401
from ..fluid.core import CPUPlace # noqa: F401
from ..fluid.core import IPUPlace # noqa: F401
from ..fluid.core import CUDAPlace # noqa: F401
from ..fluid.core import CUDAPinnedPlace # noqa: F401
from ..fluid.core import NPUPlace # noqa: F401
......
......@@ -111,6 +111,9 @@ from .manipulation import unbind # noqa: F401
from .manipulation import roll # noqa: F401
from .manipulation import chunk # noqa: F401
from .manipulation import tensordot # noqa: F401
from .manipulation import as_complex # noqa: F401
from .manipulation import as_real # noqa: F401
from .math import abs # noqa: F401
from .math import acos # noqa: F401
from .math import asin # noqa: F401
......@@ -194,6 +197,8 @@ from .math import lerp # noqa: F401
from .math import lerp_ # noqa: F401
from .math import rad2deg # noqa: F401
from .math import deg2rad # noqa: F401
from .math import gcd # noqa: F401
from .math import lcm # noqa: F401
from .math import diff # noqa: F401
from .math import angle # noqa: F401
......@@ -409,6 +414,12 @@ tensor_method_func = [ #noqa
'multi_dot',
'solve',
'triangular_solve',
'as_complex',
'as_real',
'rad2deg',
'deg2rad',
'gcd',
'lcm',
'diff',
'lerp',
'lerp_',
......
......@@ -34,6 +34,7 @@ from ..fluid import layers
from ..fluid.dygraph.inplace_utils import inplace_apis_in_dygraph_only
import paddle
from paddle import _C_ops
from paddle.tensor.attribute import _complex_to_real_dtype, _real_to_complex_dtype
__all__ = []
......@@ -2488,3 +2489,94 @@ def tensordot(x, y, axes=2, name=None):
[contraction_size, not_contraction_size_y])
out = x.matmul(y).reshape(shape_out)
return out
def as_complex(x, name=None):
"""Transform a real tensor to a complex tensor.
The data type of the input tensor is 'float32' or 'float64', and the data
type of the returned tensor is 'complex64' or 'complex128', respectively.
    The shape of the input tensor is ``(*, 2)`` (``*`` means arbitrary shape), i.e.
    the size of the last axis should be 2, which represents the real and imaginary
    parts of a complex number. The shape of the returned tensor is ``(*,)``.
Args:
x (Tensor): The input tensor. Data type is 'float32' or 'float64'.
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
Returns:
Tensor: The output. Data type is 'complex64' or 'complex128', with the same precision as the input.
Examples:
.. code-block:: python
import paddle
x = paddle.arange(12, dtype=paddle.float32).reshape([2, 3, 2])
y = paddle.as_complex(x)
print(y.numpy())
# [[ 0. +1.j 2. +3.j 4. +5.j]
# [ 6. +7.j 8. +9.j 10.+11.j]]
"""
if in_dygraph_mode():
return paddle._C_ops.as_complex(x)
check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'as_complex')
op_type = "as_complex"
helper = LayerHelper(op_type, **locals())
inputs = {"X": x}
out = helper.create_variable_for_type_inference(
dtype=_real_to_complex_dtype(x.dtype))
outputs = {"Out": out}
attrs = {}
helper.append_op(type=op_type, inputs=inputs, attrs=attrs, outputs=outputs)
return out
def as_real(x, name=None):
"""Transform a complex tensor to a real tensor.
The data type of the input tensor is 'complex64' or 'complex128', and the data
type of the returned tensor is 'float32' or 'float64', respectively.
    When the shape of the input tensor is ``(*, )`` (``*`` means arbitrary shape),
    the shape of the output tensor is ``(*, 2)``, i.e. the shape of the output is
    the shape of the input with an extra ``2`` appended.
Args:
x (Tensor): The input tensor. Data type is 'complex64' or 'complex128'.
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
Returns:
Tensor: The output. Data type is 'float32' or 'float64', with the same precision as the input.
Examples:
.. code-block:: python
import paddle
x = paddle.arange(12, dtype=paddle.float32).reshape([2, 3, 2])
y = paddle.as_complex(x)
z = paddle.as_real(y)
print(z.numpy())
# [[[ 0. 1.]
# [ 2. 3.]
# [ 4. 5.]]
# [[ 6. 7.]
# [ 8. 9.]
# [10. 11.]]]
"""
if in_dygraph_mode():
return paddle._C_ops.as_real(x)
check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], 'as_real')
op_type = "as_real"
helper = LayerHelper(op_type, **locals())
inputs = {"X": x}
out = helper.create_variable_for_type_inference(
dtype=_complex_to_real_dtype(x.dtype))
outputs = {"Out": out}
helper.append_op(type=op_type, inputs=inputs, outputs=outputs)
return out
......@@ -2624,9 +2624,9 @@ def lerp(x, y, weight, name=None):
lerp(x, y, weight) = x + weight * (y - x).
Args:
x (Tensor): An N-D Tensor, the data type is float32, float64.
y (Tensor): An N-D Tensor, the data type is float32, float64.
weight (float|Tensor): the weight for the interpolation formula.
x (Tensor): An N-D Tensor with starting points, the data type is float32, float64.
y (Tensor): An N-D Tensor with ending points, the data type is float32, float64.
weight (float|Tensor): The weight for the interpolation formula. When weight is Tensor, the data type is float32, float64.
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
Returns:
......@@ -2788,6 +2788,139 @@ def deg2rad(x, name=None):
type='scale', inputs={'X':out_cast}, outputs={'Out': out}, attrs={'scale': deg2rad_scale})
return out
def gcd(x, y, name=None):
"""
Computes the element-wise greatest common divisor (GCD) of input |x| and |y|.
Both x and y must have integer types.
Note:
gcd(0,0)=0, gcd(0, y)=|y|
Args:
        x, y (Tensor): An N-D Tensor, the data type is int8, int16, int32, int64 or uint8.
If x.shape != y.shape, they must be broadcastable to a common shape (which becomes the shape of the output).
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
Returns:
out (Tensor): An N-D Tensor, the data type is the same with input.
Examples:
.. code-block:: python
import paddle
import numpy as np
x1 = paddle.to_tensor(12)
x2 = paddle.to_tensor(20)
paddle.gcd(x1, x2)
# Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
# [4])
x3 = paddle.to_tensor(np.arange(6))
paddle.gcd(x3, x2)
# Tensor(shape=[6], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
# [20, 1 , 2 , 1 , 4 , 5])
x4 = paddle.to_tensor(0)
paddle.gcd(x4, x2)
# Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
# [20])
paddle.gcd(x4, x4)
# Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
# [0])
x5 = paddle.to_tensor(-20)
paddle.gcd(x1, x5)
# Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
# [4])
"""
shape = paddle.broadcast_shape(x.shape, y.shape)
x = paddle.broadcast_to(x, shape)
y = paddle.broadcast_to(y, shape)
x = paddle.abs(x)
y = paddle.abs(y)
def _gcd_cond_fn(x, y):
return paddle.any(y != 0)
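    # _gcd_body_fn performs one element-wise Euclidean step, e.g. for
    # (x, y) = (12, 20): (12, 20) -> (20, 12) -> (12, 8) -> (8, 4) -> (4, 0), giving 4.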
def _gcd_body_fn(x, y):
# paddle.mod will raise an error when any element of y is 0. To avoid
# that, we change those zeros to ones. Their values don't matter because
# they won't be used.
y_not_equal_0 = (y != 0)
y_safe = paddle.where(y_not_equal_0, y, paddle.ones(y.shape, y.dtype))
x, y = (paddle.where(y_not_equal_0, y, x),
                paddle.where(y_not_equal_0, paddle.mod(x, y_safe), paddle.zeros(y.shape, y.dtype)))
return (paddle.where(x < y, y, x), paddle.where(x < y, x, y))
if in_dygraph_mode():
while _gcd_cond_fn(x, y):
x, y = _gcd_body_fn(x, y)
return x
else:
check_variable_and_dtype(x, 'x', ['int32', 'int64', 'int8', 'int16', 'uint8'], 'gcd')
check_variable_and_dtype(y, 'y', ['int32', 'int64', 'int8', 'int16', 'uint8'], 'gcd')
out, _ = paddle.static.nn.while_loop(_gcd_cond_fn, _gcd_body_fn, [x, y])
return out
def lcm(x, y, name=None):
"""
Computes the element-wise least common multiple (LCM) of input |x| and |y|.
Both x and y must have integer types.
Note:
lcm(0,0)=0, lcm(0, y)=0
Args:
        x, y (Tensor): An N-D Tensor, the data type is int8, int16, int32, int64 or uint8.
If x.shape != y.shape, they must be broadcastable to a common shape (which becomes the shape of the output).
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
Returns:
out (Tensor): An N-D Tensor, the data type is the same with input.
Examples:
.. code-block:: python
import paddle
import numpy as np
x1 = paddle.to_tensor(12)
x2 = paddle.to_tensor(20)
paddle.lcm(x1, x2)
# Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
# [60])
x3 = paddle.to_tensor(np.arange(6))
paddle.lcm(x3, x2)
# Tensor(shape=[6], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
# [0, 20, 20, 60, 20, 20])
x4 = paddle.to_tensor(0)
paddle.lcm(x4, x2)
# Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
# [0])
paddle.lcm(x4, x4)
# Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
# [0])
x5 = paddle.to_tensor(-20)
paddle.lcm(x1, x5)
# Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
# [60])
"""
d = paddle.gcd(x, y)
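    # lcm(x, y) = |x * y| / gcd(x, y), with lcm defined as 0 wherever gcd is 0.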
    # To avoid dividing by zero, replace zeros in d with ones. Their values
    # don't matter because the corresponding outputs are set to 0 anyway.
d_equal_0 = paddle.equal(d, 0)
d_safe = paddle.where(d_equal_0, paddle.ones(d.shape, d.dtype), d)
out = paddle.where(d_equal_0, paddle.zeros(d.shape, d.dtype), paddle.abs(x * y) // d_safe)
return out
def diff(x, n=1, axis=-1, prepend=None, append=None, name=None):
r"""
Computes the n-th forward difference along the given axis.
......@@ -2949,7 +3082,6 @@ def diff(x, n=1, axis=-1, prepend=None, append=None, name=None):
return out
def angle(x, name=None):
r"""
Element-wise angle of complex numbers. For non-negative real numbers, the angle is 0 while
......@@ -2965,7 +3097,7 @@ def angle(x, name=None):
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
Returns:
out (Tensor): y (Tensor): An N-D Tensor of real data type with the same precision as that of x's data type.
Tensor: An N-D Tensor of real data type with the same precision as that of x's data type.
Examples:
.. code-block:: python
......
......@@ -76,7 +76,7 @@
infer_meta :
func : MatmulInferMeta
kernel :
func : matmul_v2
func : matmul
- api : mean
args : (const Tensor& x, const std::vector<int64_t>& axis, bool keep_dim)
......
......@@ -345,6 +345,7 @@ def source_include(header_file_path):
#include "glog/logging.h"
#include "paddle/pten/api/lib/api_registry.h"
#include "paddle/pten/api/lib/kernel_declare.h"
#include "paddle/pten/api/lib/kernel_dispatch.h"
#include "paddle/pten/api/lib/utils/allocator.h"
#include "paddle/pten/core/kernel_registry.h"
......@@ -353,22 +354,6 @@ def source_include(header_file_path):
"""
def module_declare():
return """
PT_DECLARE_MODULE(CreationCPU);
PT_DECLARE_MODULE(LinalgCPU);
PT_DECLARE_MODULE(ManipulationCPU);
PT_DECLARE_MODULE(MathCPU);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PT_DECLARE_MODULE(CreationCUDA);
PT_DECLARE_MODULE(LinalgCUDA);
PT_DECLARE_MODULE(ManipulationCUDA);
PT_DECLARE_MODULE(MathCUDA);
#endif
"""
def api_register():
return """
PT_REGISTER_API(Creation);
......@@ -405,7 +390,6 @@ def generate_api(api_yaml_path, header_file_path, source_file_path):
include_header_file = "paddle/pten/api/include/api.h"
source_file.write(source_include(include_header_file))
source_file.write(module_declare())
source_file.write(namespace[0])
for api in apis:
......
......@@ -202,7 +202,7 @@ HIGH_PARALLEL_JOB_NEW = [
'test_fleet_runtime',
'test_rnn_cudnn_params_packing',
'test_mkldnn_placement_pass',
'test_fc_elementwise_layernorm_fuse_pass',
'test_fc_elementwise_layernorm_fuse_pass_cc',
'program_desc_test',
'test_simplify_with_basic_ops_pass',
'test_dygraph_mode_of_unittest',
......@@ -1417,7 +1417,7 @@ CPU_PARALLEL_JOB = [
'test_fc_mkldnn_op',
'test_fc_lstm_fuse_pass',
'test_fc_gru_fuse_pass',
'test_fc_elementwise_layernorm_fuse_pass',
'test_fc_elementwise_layernorm_fuse_pass_cc',
'test_fc_bf16_mkldnn_op',
'test_executor_feed_non_tensor',
'test_executor_check_feed',
......