Unverified commit 883ee1a3, authored by W wanghuancoder, committed by GitHub

Merge branch 'develop' into revert-37926-eager_coreops_500

......@@ -75,6 +75,11 @@ class Carrier final {
bool IsInit() const;
// NOTE: This mutex will be used in interceptor's RunOps function.
// This mutex is used to avoid forward ops and backward ops running
// simultaneously, which can lead to a random hang for some sync ops.
std::mutex run;
DISABLE_COPY_AND_ASSIGN(Carrier);
private:
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/distributed/fleet_executor/compute_interceptor.h"
#include "paddle/fluid/distributed/fleet_executor/carrier.h"
#include "paddle/fluid/distributed/fleet_executor/task_node.h"
#include "paddle/fluid/framework/executor_gc_helper.h"
......@@ -169,6 +170,8 @@ void ComputeInterceptor::ReplyCompletedToUpStream() {
}
void ComputeInterceptor::RunOps() {
Carrier& carrier_instance = Carrier::Instance();
std::unique_lock<std::mutex> lock(carrier_instance.run);
VLOG(3) << "ComputeInterceptor " << interceptor_id_ << " running ops for the "
<< step_ + 1 << " time.";
for (auto op : node_->ops()) {
......
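Note: the lock above serializes every interceptor's RunOps through the single Carrier::run mutex. A minimal standalone sketch of that pattern, with plain threads standing in for interceptors (names are illustrative, not from this commit):

#include <iostream>
#include <mutex>
#include <thread>

std::mutex run_mutex;  // plays the role of Carrier::run

void RunOps(const char* who) {
  // Same pattern as ComputeInterceptor::RunOps: take the shared lock so
  // forward ops and backward ops can never execute simultaneously.
  std::unique_lock<std::mutex> lock(run_mutex);
  std::cout << who << " running ops" << std::endl;
}

int main() {
  std::thread forward(RunOps, "forward interceptor");
  std::thread backward(RunOps, "backward interceptor");
  forward.join();
  backward.join();
  return 0;
}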
......@@ -116,6 +116,22 @@ class TensorAddFunctor : public boost::static_visitor<> {
}
#endif
#ifdef PADDLE_WITH_IPU
void operator()(const paddle::platform::IPUPlace& place) {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#else
void operator()(const paddle::platform::IPUPlace& place) {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#endif
void operator()(const paddle::platform::NPUPinnedPlace& place) {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
......
......@@ -81,6 +81,11 @@ struct DLDeviceVisitor : public boost::static_visitor<::DLDevice> {
return device;
}
inline ::DLDevice operator()(const platform::IPUPlace &place) const {
PADDLE_THROW(
platform::errors::Unimplemented("platform::IPUPlace is not supported"));
}
inline ::DLDevice operator()(const platform::XPUPlace &place) const {
PADDLE_THROW(
platform::errors::Unimplemented("platform::XPUPlace is not supported"));
......
......@@ -463,6 +463,14 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
#else
PADDLE_THROW(
platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle"));
#endif
} else if (platform::is_ipu_place(place_)) {
#ifdef PADDLE_WITH_IPU
gc.reset(new IPUGarbageCollector(
BOOST_GET_CONST(platform::IPUPlace, place_), max_memory_size));
#else
PADDLE_THROW(
platform::errors::Unimplemented("No IPU gc found in CPU/IPU paddle"));
#endif
} else if (platform::is_npu_place(place_)) {
#ifdef PADDLE_WITH_ASCEND_CL
......
......@@ -156,7 +156,7 @@ cc_test(test_seqpool_cvm_concat_fuse_pass SRCS seqpool_cvm_concat_fuse_pass_test
cc_test(test_repeated_fc_relu_fuse_pass_cc SRCS repeated_fc_relu_fuse_pass_tester.cc DEPS repeated_fc_relu_fuse_pass framework_proto)
cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
cc_test(test_simplify_with_basic_ops_pass SRCS simplify_with_basic_ops_pass_tester.cc DEPS simplify_with_basic_ops_pass)
cc_test(test_fc_elementwise_layernorm_fuse_pass SRCS fc_elementwise_layernorm_fuse_pass_tester.cc DEPS fc_elementwise_layernorm_fuse_pass)
cc_test(test_fc_elementwise_layernorm_fuse_pass_cc SRCS fc_elementwise_layernorm_fuse_pass_tester.cc DEPS fc_elementwise_layernorm_fuse_pass)
cc_test(test_skip_layernorm_fuse_pass SRCS skip_layernorm_fuse_pass_tester.cc DEPS skip_layernorm_fuse_pass)
cc_test(test_multihead_matmul_fuse_pass SRCS multihead_matmul_fuse_pass_tester.cc DEPS multihead_matmul_fuse_pass)
cc_test(test_conv_bn_fuse_pass_cc SRCS conv_bn_fuse_pass_tester.cc DEPS conv_bn_fuse_pass)
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle {
namespace framework {
......@@ -338,3 +339,9 @@ void FCElementwiseLayerNormFusePass::ApplyImpl(ir::Graph *graph) const {
REGISTER_PASS(fc_elementwise_layernorm_fuse_pass,
paddle::framework::ir::FCElementwiseLayerNormFusePass);
REGISTER_PASS_CAPABILITY(fc_elementwise_layernorm_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("fc", 0)
.LE("elementwise_add", 1)
.EQ("layer_norm", 0));
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/ipu/avg_shard_pass.h"
#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
namespace paddle {
namespace framework {
namespace ir {
void AvgShardPass::ApplyImpl(ir::Graph* graph) const {
VLOG(10) << "enter AvgShardPass::ApplyImpl";
std::shared_ptr<platform::ipu::IpuBackend> ipu_backend =
platform::ipu::IpuBackend::GetInstance();
if (ipu_backend->GetIpuStrategy()->need_avg_shard) {
VLOG(10) << "start AvgShardPass";
auto nodes = ir::TopologySortOperations(*graph);
auto num_ipus = ipu_backend->GetIpuStrategy()->num_ipus;
int shard_position = nodes.size() / num_ipus;
int index_and_stage = -1;
for (int i = 0; i < nodes.size(); i++) {
if ((i % shard_position) == 0 && index_and_stage < num_ipus - 1) {
index_and_stage++;
}
nodes[i]->Op()->SetAttr("ipu_index", index_and_stage);
nodes[i]->Op()->SetAttr("ipu_stage", index_and_stage);
}
VLOG(10) << "end AvgShardPass";
}
VLOG(10) << "leave AvgShardPass::ApplyImpl";
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(avg_shard_pass, paddle::framework::ir::AvgShardPass);
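Note: a standalone sketch of the sharding arithmetic in AvgShardPass above, assuming 10 ops on 4 IPUs (example values, not from this commit). Each block of shard_position = nodes.size() / num_ipus consecutive ops advances the ipu_index/ipu_stage, and the remainder lands on the last IPU:

#include <cstdio>
#include <vector>

int main() {
  // Assumes num_ops >= num_ipus so shard_position stays non-zero.
  const int num_ops = 10;
  const int num_ipus = 4;
  const int shard_position = num_ops / num_ipus;  // 2
  int index_and_stage = -1;
  std::vector<int> assignment(num_ops);
  for (int i = 0; i < num_ops; ++i) {
    // Same condition as the pass: advance every shard_position ops,
    // but never beyond the last IPU.
    if ((i % shard_position) == 0 && index_and_stage < num_ipus - 1) {
      ++index_and_stage;
    }
    assignment[i] = index_and_stage;
  }
  for (int i = 0; i < num_ops; ++i) {
    std::printf("op %d -> ipu %d\n", i, assignment[i]);
  }
  // Ops 0-1 -> ipu 0, 2-3 -> ipu 1, 4-5 -> ipu 2, and the remaining
  // ops 6-9 all stay on ipu 3.
  return 0;
}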
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
class AvgShardPass : public IPUPassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
namespace paddle {
namespace framework {
namespace ir {
void ForwardGraphExtractPass::ApplyImpl(ir::Graph* graph) const {
VLOG(10) << "enter ForwardGraphExtractPass::ApplyImpl";
std::unordered_map<OpRole, std::unordered_set<ir::Node*>> all_ops{
{OpRole::kForward, {}}, {OpRole::kBackward, {}},
{OpRole::kOptimize, {}}, {OpRole::kRPC, {}},
{OpRole::kDist, {}}, {OpRole::kLRSched, {}},
{OpRole::kLoss, {}}, {OpRole::kNotSpecified, {}}};
for (auto* node : graph->Nodes()) {
if (!node->IsOp()) {
continue;
}
auto op_role = BOOST_GET_MUTABLE(int, node->Op()->GetAttr("op_role"));
if (op_role == static_cast<int>(OpRole::kForward)) {
all_ops[OpRole::kForward].insert(node);
} else if (op_role == static_cast<int>(OpRole::kBackward)) {
all_ops[OpRole::kBackward].insert(node);
} else if (op_role == static_cast<int>(OpRole::kOptimize)) {
all_ops[OpRole::kOptimize].insert(node);
} else if (op_role == static_cast<int>(OpRole::kRPC)) {
} else if (op_role == static_cast<int>(OpRole::kDist)) {
} else if (op_role == static_cast<int>(OpRole::kLRSched)) {
} else if (op_role == static_cast<int>(OpRole::kLoss)) {
all_ops[OpRole::kLoss].insert(node);
} else if (op_role == static_cast<int>(OpRole::kNotSpecified)) {
LOG(WARNING) << "Op: " << node->Name() << " OpRole is NotSpecified ";
}
}
std::unordered_set<ir::Node*> forward_vars;
std::unordered_set<ir::Node*> backward_vars;
std::unordered_set<ir::Node*> control_vars;
// forward_vars
for (auto& nodes : std::array<std::unordered_set<ir::Node*>, 2>{
all_ops[OpRole::kForward], all_ops[OpRole::kLoss]}) {
for (auto* node : nodes) {
for (auto* in_node : node->inputs) {
forward_vars.insert(in_node);
}
for (auto* out_node : node->outputs) {
forward_vars.insert(out_node);
}
}
}
// control_vars & backward_vars
for (auto* node : graph->Nodes()) {
if (!node->IsVar()) {
continue;
}
if (node->IsCtrlVar()) {
control_vars.insert(node);
}
for (auto* in_node : node->inputs) {
if (all_ops[OpRole::kOptimize].count(in_node)) {
backward_vars.insert(node);
}
}
}
// all removed node
std::unordered_set<ir::Node*> rm_nodes;
for (auto* node : graph->Nodes()) {
if (backward_vars.count(node)) {
rm_nodes.insert(node);
} else if (control_vars.count(node)) {
rm_nodes.insert(node);
} else if (all_ops[OpRole::kBackward].count(node)) {
rm_nodes.insert(node);
} else if (all_ops[OpRole::kForward].count(node) == 0 &&
all_ops[OpRole::kLoss].count(node) == 0 &&
forward_vars.count(node) == 0) {
rm_nodes.insert(node);
} else if (node->Name() == "feed" || node->Name() == "fetch") {
rm_nodes.insert(node);
}
}
VLOG(10) << "Remove Node: ";
for (auto* node : rm_nodes) {
// remove node relations
for (auto* node_in : node->inputs) {
for (size_t i = 0; i < node_in->outputs.size(); ++i) {
if (node_in->outputs[i] == node) {
node_in->outputs.erase(node_in->outputs.begin() + i);
break;
}
}
}
for (auto* node_out : node->outputs) {
for (size_t i = 0; i < node_out->inputs.size(); ++i) {
if (node_out->inputs[i] == node) {
node_out->inputs.erase(node_out->inputs.begin() + i);
break;
}
}
}
VLOG(10) << "\t" << node->Name();
graph->RemoveNode(node);
}
VLOG(10) << "Post Graph: ";
VLOG(10) << DebugString(graph);
VLOG(10) << "leave ForwardGraphExtractPass::ApplyImpl";
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(forward_graph_extract_pass,
paddle::framework::ir::ForwardGraphExtractPass);
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
class ForwardGraphExtractPass : public IPUPassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/ipu/infer_shape_pass.h"
#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/variable_helper.h"
namespace paddle {
namespace framework {
namespace ir {
void InferShapePass::ApplyImpl(ir::Graph* graph) const {
VLOG(10) << "enter InferShapePass::ApplyImpl";
VLOG(10) << "Raw Graph: ";
VLOG(10) << DebugString(graph);
std::shared_ptr<platform::ipu::IpuBackend> ipu_backend =
platform::ipu::IpuBackend::GetInstance();
auto batch_size = ipu_backend->GetIpuStrategy()->batch_size;
auto feed_list = Get<std::vector<std::string>>("feed_list");
for (auto node : graph->Nodes()) {
if (!node->IsVar()) {
continue;
}
bool is_feed = std::find(feed_list.begin(), feed_list.end(),
node->Name()) != feed_list.end();
if (is_feed) {
auto input_shape = node->Var()->GetShape();
if (input_shape[0] <= -1) {
input_shape[0] = batch_size;
node->Var()->SetShape(input_shape);
}
// int64->int32
if (node->Var()->GetDataType() == proto::VarType::INT64) {
node->Var()->SetDataType(proto::VarType::INT32);
}
}
}
// temp scope for shape inference
std::shared_ptr<paddle::framework::Scope> scope(
new paddle::framework::Scope());
for (auto node : graph->Nodes()) {
if (!node->IsVar()) {
continue;
}
auto var_desc = node->Var();
auto* ptr = scope->Var(var_desc->Name());
paddle::framework::InitializeVariable(ptr, var_desc->GetType());
auto tensor = ptr->GetMutable<paddle::framework::LoDTensor>();
tensor->Resize(paddle::framework::make_ddim(var_desc->GetShape()));
}
// infer shape
auto nodes = ir::TopologySortOperations(*graph);
for (auto node : nodes) {
auto op_desc = node->Op();
auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
paddle::framework::RuntimeContext ctx(op->Inputs(), op->Outputs(), *scope);
op->RuntimeInferShape(*scope, paddle::platform::CPUPlace(), ctx);
for (auto it = ctx.outputs.begin(); it != ctx.outputs.end(); it++) {
for (int i = 0; i < it->second.size(); i++) {
auto output_name = op_desc->Output(it->first)[i];
auto dim =
it->second[i]->GetMutable<paddle::framework::LoDTensor>()->dims();
auto new_shape = paddle::framework::vectorize(dim);
for (auto output_node : node->outputs) {
if (output_node->Name() == output_name) {
output_node->Var()->SetShape(new_shape);
}
}
}
}
}
// release the temp scope
scope.reset();
VLOG(10) << "Post Graph: ";
VLOG(10) << DebugString(graph);
VLOG(10) << "leave InferShapePass::ApplyImpl";
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(infer_shape_pass, paddle::framework::ir::InferShapePass)
.RequirePassAttr("feed_list");
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
class InferShapePass : public IPUPassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/ipu/inference_postprocess_pass.h"
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
#include "paddle/fluid/platform/device/ipu/ipu_strategy.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace ir {
void InferencePostprocessPass::ApplyImpl(ir::Graph *graph) const {
VLOG(10) << "enter InferencePostprocessPass::ApplyImpl";
std::vector<std::string> feed_list;
feed_list = Get<std::vector<std::string>>("feed_list");
std::vector<std::string> fetch_list;
fetch_list = Get<std::vector<std::string>>("fetch_list");
auto *feed_var = new paddle::framework::VarDesc("feed");
feed_var->SetType(proto::VarType::FEED_MINIBATCH);
auto *feed_var_node = graph->CreateVarNode(feed_var);
auto *fetch_var = new paddle::framework::VarDesc("fetch");
fetch_var->SetType(proto::VarType::FETCH_LIST);
auto *fetch_var_node = graph->CreateVarNode(fetch_var);
for (int i = 0; i < feed_list.size(); i++) {
for (auto node : graph->Nodes()) {
if (node->Name() == feed_list[i]) {
auto *op = new paddle::framework::OpDesc();
op->SetType("feed");
op->SetInput("X", {"feed"});
op->SetOutput("Out", {node->Name()});
op->SetAttr("col", i);
auto *op_node = graph->CreateOpNode(op);
node->inputs.push_back(op_node);
op_node->outputs.push_back(node);
feed_var_node->outputs.push_back(op_node);
op_node->inputs.push_back(feed_var_node);
break;
}
}
}
for (int i = 0; i < fetch_list.size(); i++) {
for (auto node : graph->Nodes()) {
if (node->Name() == fetch_list[i]) {
auto *op = new paddle::framework::OpDesc();
op->SetType("fetch");
op->SetInput("X", {node->Name()});
op->SetOutput("Out", {"fetch"});
op->SetAttr("col", i);
auto *op_node = graph->CreateOpNode(op);
node->outputs.push_back(op_node);
op_node->inputs.push_back(node);
fetch_var_node->inputs.push_back(op_node);
op_node->outputs.push_back(fetch_var_node);
break;
}
}
}
VLOG(10) << "leave InferencePostprocessPass::ApplyImpl";
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(inference_postprocess_pass,
paddle::framework::ir::InferencePostprocessPass)
.RequirePassAttr("feed_list")
.RequirePassAttr("fetch_list");
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
class InferencePostprocessPass : public IPUPassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/ipu/inference_process_pass.h"
#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
#include "paddle/fluid/platform/device/ipu/ipu_strategy.h"
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace ir {
void InferenceProcessPass::ApplyImpl(ir::Graph* graph) const {
VLOG(10) << "enter InferenceProcessPass::ApplyImpl";
// Get a new instance of ipu_backend
std::shared_ptr<platform::ipu::IpuBackend> ipu_backend =
platform::ipu::IpuBackend::GetNewInstance();
// Set scope
auto& scope = graph->Get<Scope>(kParamScopeAttr);
ipu_backend->SetScope(scope);
// Set ipu_strategy
static std::shared_ptr<platform::ipu::IpuStrategy> ipu_strategy_instance_(
new platform::ipu::IpuStrategy());
ipu_strategy_instance_->is_training = false;
auto num_ipus = graph->Get<int>("num_ipus");
ipu_strategy_instance_->num_ipus = num_ipus;
if (num_ipus > 1) {
ipu_strategy_instance_->popart_options_.virtualGraphMode =
platform::ipu::VirtualGraphMode::Manual;
} else {
ipu_strategy_instance_->popart_options_.virtualGraphMode =
platform::ipu::VirtualGraphMode::Off;
}
auto enable_pipelining = graph->Get<bool>("enable_pipelining");
ipu_strategy_instance_->popart_options_.enablePipelining = enable_pipelining;
if (enable_pipelining) {
auto batches_per_step = graph->Get<int>("batches_per_step");
PADDLE_ENFORCE_GE(
batches_per_step, num_ipus,
platform::errors::InvalidArgument("Batched per step should be equal or "
"greater than the number of IPUs"));
ipu_strategy_instance_->batches_per_step = batches_per_step;
}
ipu_strategy_instance_->batch_size = graph->Get<int>("batch_size");
ipu_strategy_instance_->need_avg_shard = graph->Get<bool>("need_avg_shard");
ipu_backend->SetIpuStrategy(*(ipu_strategy_instance_.get()));
// Get feed_list and fetch list
std::vector<std::string> feed_list = {};
std::vector<std::string> fetch_list = {};
for (auto node : graph->Nodes()) {
if (node->Name() == "feed") {
if (node->IsOp()) {
feed_list.push_back("");
}
} else if (node->Name() == "fetch") {
if (node->IsOp()) {
fetch_list.push_back("");
}
}
}
for (auto node : graph->Nodes()) {
if (node->Name() == "feed") {
if (node->IsOp()) {
feed_list[BOOST_GET_CONST(int, node->Op()->GetAttr("col"))] =
node->outputs[0]->Name();
}
} else if (node->Name() == "fetch") {
if (node->IsOp()) {
fetch_list[BOOST_GET_CONST(int, node->Op()->GetAttr("col"))] =
node->inputs[0]->Name();
}
}
}
// Run passes
std::vector<std::string> graph_pass = {"forward_graph_extract_pass",
"infer_shape_pass", "avg_shard_pass",
"popart_canonicalization_pass"};
std::vector<std::string> compile_pass = {
"ipu_inplace_pass", "ipu_graph_builder_pass", "ipu_runtime_replacer_pass",
"inference_postprocess_pass"};
for (auto pass_name : graph_pass) {
auto pass = PassRegistry::Instance().Get(pass_name);
if (pass_name == "infer_shape_pass") {
pass->Set("feed_list", new std::vector<std::string>(feed_list.begin(),
feed_list.end()));
}
pass->Apply(graph);
}
for (auto pass_name : compile_pass) {
auto pass = PassRegistry::Instance().Get(pass_name);
pass->Set("feed_list",
new std::vector<std::string>(feed_list.begin(), feed_list.end()));
pass->Set("fetch_list", new std::vector<std::string>(fetch_list.begin(),
fetch_list.end()));
pass->Apply(graph);
}
VLOG(10) << "leave InferenceProcessPass::ApplyImpl";
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(inference_process_pass,
paddle::framework::ir::InferenceProcessPass);
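Note: a standalone sketch of the two-pass feed-list construction used in InferenceProcessPass above; the list is first sized by counting feed ops, then slot "col" is filled with each op's output name (the data is an assumed example, since graph->Nodes() is unordered):

#include <iostream>
#include <string>
#include <vector>

struct FeedOp {
  int col;
  std::string out_name;
};

int main() {
  // Feed ops may be visited in any order, as with graph->Nodes().
  std::vector<FeedOp> feed_ops = {{1, "y"}, {0, "x"}};
  // Pass 1 sized the list; pass 2 places each name at its "col" index.
  std::vector<std::string> feed_list(feed_ops.size());
  for (const auto& op : feed_ops) {
    feed_list[op.col] = op.out_name;
  }
  for (const auto& name : feed_list) {
    std::cout << name << std::endl;  // prints: x then y
  }
  return 0;
}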
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
class InferenceProcessPass : public IPUPassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
namespace paddle {
namespace framework {
namespace ir {
void IpuGraphBuilderPass::ApplyImpl(ir::Graph* graph) const {
VLOG(10) << "enter IpuGraphBuilderPass::ApplyImpl";
VLOG(10) << "Raw Graph: ";
VLOG(10) << DebugString(graph);
std::vector<std::string> feed_list;
feed_list = Get<std::vector<std::string>>("feed_list");
std::vector<std::string> fetch_list;
fetch_list = Get<std::vector<std::string>>("fetch_list");
std::shared_ptr<platform::ipu::IpuBackend> ipu_backend =
platform::ipu::IpuBackend::GetInstance();
ipu_backend->Compile(graph, feed_list, fetch_list);
VLOG(10) << "Post Graph: ";
VLOG(10) << DebugString(graph);
VLOG(10) << "leave IpuGraphBuilderPass::ApplyImpl";
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(ipu_graph_builder_pass,
paddle::framework::ir::IpuGraphBuilderPass)
.RequirePassAttr("feed_list")
.RequirePassAttr("fetch_list");
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
class IpuGraphBuilderPass : public IPUPassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/ipu/ipu_inplace_pass.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
namespace paddle {
namespace framework {
namespace ir {
std::string GenerateVarName(Node *node) {
return node->Name() + "_" + std::to_string(node->id());
}
void IpuInplacePass::ApplyImpl(ir::Graph *graph) const {
// use this pass after forward_graph_extract_pass
// raise error if the inplaced var both in feed_list & fetch_list
VLOG(10) << "enter IpuInplacePass::ApplyImpl";
VLOG(10) << "Raw Graph: ";
VLOG(10) << DebugString(graph);
std::vector<std::string> feed_list;
feed_list = Get<std::vector<std::string>>("feed_list");
std::vector<std::string> fetch_list;
fetch_list = Get<std::vector<std::string>>("fetch_list");
std::map<std::string, int> var_name;
for (auto *node : graph->Nodes()) {
if (node->IsVar()) {
if (var_name.find(node->Name()) == var_name.end()) {
var_name.emplace(node->Name(), 1);
} else {
var_name[node->Name()]++;
}
}
}
for (auto *node : graph->Nodes()) {
if (node->IsVar()) {
if (var_name[node->Name()] > 1) {
auto is_feed = (std::find(feed_list.begin(), feed_list.end(),
node->Name()) != feed_list.end()) &&
(node->inputs.size() == 0);
auto is_fetch = (std::find(fetch_list.begin(), fetch_list.end(),
node->Name()) != fetch_list.end()) &&
(node->outputs.size() == 0);
if (!is_feed && !is_fetch && !node->Var()->Persistable()) {
auto old_name = node->Name();
auto new_name = GenerateVarName(node);
node->RenameVar(new_name);
for (auto *op_in : node->inputs) {
op_in->Op()->RenameOutput(old_name, new_name);
}
for (auto *op_out : node->outputs) {
op_out->Op()->RenameInput(old_name, new_name);
}
}
}
}
}
VLOG(10) << "Post Graph: ";
VLOG(10) << DebugString(graph);
VLOG(10) << "leave IpuInplacePass::ApplyImpl";
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(ipu_inplace_pass, paddle::framework::ir::IpuInplacePass)
.RequirePassAttr("feed_list")
.RequirePassAttr("fetch_list");
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
class IpuInplacePass : public IPUPassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
void IPUPassBase::Init(const std::string& repr, Graph* graph) const {
repr_ = repr;
graph_ = graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
namespace ir {
class IPUPassBase : public Pass {
public:
void Init(const std::string& repr, Graph* graph) const;
virtual ~IPUPassBase() {}
protected:
mutable Graph* graph_;
mutable std::string repr_;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
namespace paddle {
namespace framework {
namespace ir {
void IpuRuntimeReplacerPass::ApplyImpl(ir::Graph* graph) const {
VLOG(10) << "enter IpuRuntimeReplacerPass::ApplyImpl";
VLOG(10) << "Raw Graph: ";
VLOG(10) << DebugString(graph);
std::vector<std::string> feed_list;
feed_list = Get<std::vector<std::string>>("feed_list");
std::vector<std::string> fetch_list;
fetch_list = Get<std::vector<std::string>>("fetch_list");
framework::OpDesc ipu_rt_op_desc;
ipu_rt_op_desc.SetType("ipu_runtime");
ipu_rt_op_desc.SetInput("FeedList", feed_list);
ipu_rt_op_desc.SetOutput("FetchList", fetch_list);
ipu_rt_op_desc.Flush();
// Create a new node for the ipu_runtime_op.
auto* ipu_rt_node = graph->CreateOpNode(&ipu_rt_op_desc);
for (auto* node : graph->Nodes()) {
if (node->IsVar()) {
for (auto feed : feed_list) {
if (node->Name() == feed) {
IR_NODE_LINK_TO(node, ipu_rt_node);
}
}
for (auto fetch : fetch_list) {
if (node->Name() == fetch) {
IR_NODE_LINK_TO(ipu_rt_node, node);
}
}
}
}
// set ipu_runtime_op dtype attr
if (fetch_list.size() == 1) {
for (auto* node : graph->Nodes()) {
if (node->IsVar()) {
for (auto fetch : fetch_list) {
if (node->Name() == fetch) {
ipu_rt_node->Op()->SetAttr("dtype", node->Var()->GetDataType());
}
}
}
}
}
// Remove unneeded nodes.
std::unordered_set<const Node*> marked_nodes;
for (auto* node : graph->Nodes()) {
if (node->IsOp()) {
auto* op_desc = node->Op();
if (op_desc->Type() != "ipu_runtime") {
marked_nodes.insert(node);
}
}
}
GraphSafeRemoveNodes(graph, marked_nodes);
VLOG(10) << "Post Graph: ";
VLOG(10) << DebugString(graph);
VLOG(10) << "leave IpuRuntimeReplacerPass::ApplyImpl";
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(ipu_runtime_replacer_pass,
paddle::framework::ir::IpuRuntimeReplacerPass)
.RequirePassAttr("feed_list")
.RequirePassAttr("fetch_list");
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
class IpuRuntimeReplacerPass : public IPUPassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/ipu/optimizer_extract_pass.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
namespace paddle {
namespace framework {
namespace ir {
void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const {
VLOG(10) << "enter IpuOptimizerExtractPass::ApplyImpl";
VLOG(10) << "Raw Graph: ";
VLOG(10) << DebugString(graph);
auto ipu_backend = paddle::platform::ipu::IpuBackend::GetInstance();
for (auto* node : graph->Nodes()) {
if (node->IsOp() && node->Op()) {
int op_role = BOOST_GET_CONST(
int, node->Op()->GetAttr(
framework::OpProtoAndCheckerMaker::OpRoleAttrName()));
// a graph usually has multiple optimizer nodes for different parameters,
// and these nodes usually have the same type and attribute values
if ((op_role == static_cast<int>(framework::OpRole::kOptimize))) {
ipu_backend->GetExecutor().SetOptimizerType(node->Op()->Type());
VLOG(10) << "found optimizer type: " << node->Op()->Type();
for (const std::string& attr_name : node->Op()->AttrNames()) {
auto attr_type = node->Op()->GetAttrType(attr_name);
// with adam, the attrs are floats
if (attr_type == proto::AttrType::FLOAT) {
auto attr_value =
BOOST_GET_CONST(float, node->Op()->GetAttr(attr_name));
ipu_backend->GetExecutor().SetOptimizerAttr(attr_name, attr_value);
} else {
VLOG(10) << "Skip " << attr_type;
}
}
auto lr_var_name = node->Op()->Input("LearningRate");
PADDLE_ENFORCE_EQ(lr_var_name.size(), 1u,
platform::errors::InvalidArgument(
"In op(%s), find input(LearningRate) failed.",
node->Op()->Type()));
ipu_backend->GetExecutor().SetLRVarName(lr_var_name[0]);
}
if ((op_role == static_cast<int>(framework::OpRole::kLoss))) {
VLOG(10) << "found loss op type: " << node->Op()->Type();
auto outputs = node->Op()->Outputs();
PADDLE_ENFORCE_EQ(
outputs.size(), 1,
platform::errors::InvalidArgument("Can only support one loss key"));
auto losses_name = outputs.begin()->second;
PADDLE_ENFORCE_EQ(losses_name.size(), 1,
platform::errors::InvalidArgument(
"Can only support one loss name"));
ipu_backend->GetExecutor().SetLoss(losses_name[0]);
}
}
}
VLOG(10) << "Post Graph: ";
VLOG(10) << DebugString(graph);
VLOG(10) << "leave IpuOptimizerExtractPass::ApplyImpl";
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(optimizer_extract_pass,
paddle::framework::ir::IpuOptimizerExtractPass);
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
class IpuOptimizerExtractPass : public IPUPassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
#include "paddle/fluid/platform/device/ipu/common.h"
#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
namespace paddle {
namespace framework {
namespace ir {
using paddle::platform::ipu::IpuBackend;
using framework::ir::Graph;
using framework::ir::Node;
void IpuOptimizerStateAlignPass::ApplyImpl(ir::Graph* graph) const {
VLOG(10) << "enter IpuOptimizerStateAlignPass::ApplyImpl";
VLOG(10) << "Raw Graph: ";
VLOG(10) << DebugString(graph);
auto ipu_backend = IpuBackend::GetInstance();
const auto* scope_ = ipu_backend->GetScope();
for (auto* node : graph->Nodes()) {
if (node->IsOp() && node->Op()) {
int op_role = BOOST_GET_CONST(
int, node->Op()->GetAttr(
framework::OpProtoAndCheckerMaker::OpRoleAttrName()));
if ((op_role == static_cast<int>(framework::OpRole::kOptimize))) {
auto inputs = node->Op()->Inputs();
if (inputs.count(platform::ipu::sBeta1Pow)) {
auto var = scope_->GetVar(inputs.at(platform::ipu::sBeta1Pow)[0]);
auto data = var->GetMutable<framework::LoDTensor>()->data<float>();
auto beta = BOOST_GET_CONST(
float, node->Op()->GetAttr(platform::ipu::sBeta1));
// Ensure the current value was saved as beta1pow rather than step.
// beta1pow = beta1 ^ (step + 1). Convert it back to step here because
// popart only supports a single Step__ state.
bool save_with_beta1pow = (data[0] < 1.0f) && (data[0] > 0.0f);
float step = 0;
float beta_acc = beta;
while (beta_acc > data[0] && save_with_beta1pow) {
beta_acc *= beta;
step += 1;
}
if (save_with_beta1pow) {
data[0] = step;
}
}
}
}
}
VLOG(10) << "Post Graph: ";
VLOG(10) << DebugString(graph);
VLOG(10) << "leave IpuOptimizerStateAlignPass::ApplyImpl";
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(optimizer_state_align_pass,
paddle::framework::ir::IpuOptimizerStateAlignPass);
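Note: a standalone sketch of the beta1pow-to-step recovery loop above, assuming Adam saved beta1pow = beta1 ^ (step + 1) with beta1 = 0.9 and step = 3 (example values, not from this commit):

#include <cstdio>

int main() {
  const float beta1 = 0.9f;  // assumed example value
  // Pretend the checkpoint saved beta1pow = beta1^(step + 1) with step = 3.
  float saved_beta1pow = 1.0f;
  for (int k = 0; k < 4; ++k) saved_beta1pow *= beta1;
  // Same recovery loop as the pass: count how many extra multiplications
  // by beta1 are needed to reach the saved beta1pow.
  float step = 0;
  float beta_acc = beta1;
  while (beta_acc > saved_beta1pow) {
    beta_acc *= beta1;
    step += 1;
  }
  std::printf("recovered step = %g\n", step);  // prints: recovered step = 3
  return 0;
}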
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
/*
* This pass should only affect optimizers that need bias correction,
* such as Adam/Lamb.
*/
class IpuOptimizerStateAlignPass : public IPUPassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h"
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/post_canonicalization.h"
namespace paddle {
namespace framework {
namespace ir {
using framework::ir::Graph;
using framework::ir::Node;
using platform::ipu::SymbolHandler;
void PopartCanonicalizationPass::ApplyImpl(ir::Graph* graph) const {
VLOG(10) << "enter PopartCanonicalizationPass::ApplyImpl";
VLOG(10) << "Raw Graph: ";
VLOG(10) << DebugString(graph);
auto nodes = graph->Nodes();
for (auto* node : nodes) {
if (!node->IsOp()) {
continue;
}
auto* op = node->Op();
auto op_type = op->Type();
ir::Node* new_node = nullptr;
SymbolHandler handler = platform::ipu::GetHandler(op_type);
if (handler) {
VLOG(11) << "Raw Paddle Node:";
VLOG(11) << node->Op()->Proto()->DebugString();
new_node = handler(graph, node);
VLOG(11) << "Post Popart Node:";
VLOG(11) << new_node->Op()->Proto()->DebugString();
platform::ipu::ClearNode(node);
graph->RemoveNode(node);
} else {
LOG(ERROR) << "Can not find OpHandler for op_type: " << op_type;
}
}
VLOG(10) << "Post Graph: ";
VLOG(10) << DebugString(graph);
VLOG(10) << "leave PopartCanonicalizationPass::ApplyImpl";
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(popart_canonicalization_pass,
paddle::framework::ir::PopartCanonicalizationPass);
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
class PopartCanonicalizationPass : public IPUPassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -84,13 +84,16 @@ void TransposeFlattenConcatFusePass::RunTransposeFlattenConcatFuse(
LOG(WARNING) << "Pass in op compat failed.";
return;
}
const int kNumFields = 5;
const int kTransOffset = 1;
const int kTransOutOffset = 2;
const int kFlattenOffset = 3;
const int kFlattenOutOffset = 4;
std::vector<Node *> nodes;
std::vector<int> trans_axis0;
int flatten_axis0;
for (int i = 0; i < times; i++) {
PADDLE_ENFORCE_NOT_NULL(
subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i))),
......@@ -112,6 +115,33 @@ void TransposeFlattenConcatFusePass::RunTransposeFlattenConcatFuse(
platform::errors::NotFound("Can not find %s in subgraph.",
input_nodes[i]->name()));
if (i == 0) {
trans_axis0 = BOOST_GET_CONST(
std::vector<int>,
subgraph.at(pattern.GetPDNode("transpose" + std::to_string(0)))
->Op()
->GetAttr("axis"));
flatten_axis0 = BOOST_GET_CONST(
int, subgraph.at(pattern.GetPDNode("flatten" + std::to_string(0)))
->Op()
->GetAttr("axis"));
} else {
std::vector<int> trans_axis = BOOST_GET_CONST(
std::vector<int>,
subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i)))
->Op()
->GetAttr("axis"));
// The axis attribute of every transpose op should be the same
if (trans_axis0 != trans_axis) return;
int flatten_axis = BOOST_GET_CONST(
int, subgraph.at(pattern.GetPDNode("flatten" + std::to_string(i)))
->Op()
->GetAttr("axis"));
// The axis attribute of every flatten op should be the same
if (flatten_axis0 != flatten_axis) return;
}
nodes.push_back(subgraph.at(input_nodes[i]));
nodes.push_back(
subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i))));
......
......@@ -327,6 +327,9 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
#define REGISTER_OP_CPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
#define REGISTER_OP_IPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, IPU, ::paddle::platform::IPUPlace, __VA_ARGS__)
#define REGISTER_OP_XPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, XPU, ::paddle::platform::XPUPlace, __VA_ARGS__)
......
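Note: a hedged usage sketch of the new REGISTER_OP_IPU_KERNEL macro, mirroring how REGISTER_OP_CPU_KERNEL is used elsewhere in the codebase; the op name and kernel class below are illustrative, not from this commit:

// Hypothetical registration; my_op and MyOpIpuKernel are assumed names.
REGISTER_OP_IPU_KERNEL(my_op,
                       paddle::operators::MyOpIpuKernel<float>,
                       paddle::operators::MyOpIpuKernel<int>);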
......@@ -2,7 +2,7 @@ cc_library(cinn_cache_key SRCS cinn_cache_key.cc DEPS boost graph graph_helper l
cc_library(build_cinn_pass SRCS build_cinn_pass.cc DEPS pass subgraph_detector graph_pattern_detector cinn_compiler errors enforce)
cc_library(transform_desc SRCS transform_desc.cc DEPS proto_desc cinn)
cc_library(cinn_graph_symbolization SRCS cinn_graph_symbolization.cc DEPS lod_tensor graph transform_desc cinn)
cc_library(cinn_compiler SRCS cinn_compiler.cc DEPS framework_proto graph lod_tensor cinn_cache_key cinn_graph_symbolization cinn)
cc_library(cinn_compiler SRCS cinn_compiler.cc DEPS framework_proto graph lod_tensor cinn_cache_key cinn_graph_symbolization cinn cinn_launch_context)
if (WITH_TESTING)
cc_test(cinn_lib_test SRCS cinn_lib_test.cc DEPS cinn)
......
......@@ -29,55 +29,32 @@ namespace paddle {
namespace framework {
namespace paddle2cinn {
using GraphHashStrategy = CinnCacheKey::GraphHashStrategy;
CinnCacheKey::CinnCacheKey(GraphHashStrategy graph_hash)
: graph_hash_(graph_hash) {}
CinnCacheKey::CinnCacheKey(
const ir::Graph& graph,
const std::map<std::string, const LoDTensor*>& input_tensors,
const std::string& arch_str) {
const std::string& arch_str, GraphHashStrategy graph_hash)
: graph_hash_(graph_hash) {
this->SetKey(graph, input_tensors, arch_str);
}
CinnCacheKey::CinnCacheKey(const ir::Graph& graph,
const std::map<std::string, DDim>& input_shapes,
const std::string& arch_str) {
const std::string& arch_str,
GraphHashStrategy graph_hash)
: graph_hash_(graph_hash) {
this->SetKey(graph, input_shapes, arch_str);
}
size_t CinnCacheKey::HashGraph(const ir::Graph& graph) {
// use Dot to uniquely serialize the graph
inference::analysis::Dot dot;
std::unordered_map<const ir::Node*, std::string> node2dot;
int id = 0;
// Create nodes
// graph.Nodes() returns an unordered_set, so the same graph may
// serialize to different results
for (const ir::Node* n : graph.Nodes()) {
std::string node_id = std::to_string(id++);
dot.AddNode(node_id, {}, n->Name(), true);
node2dot[n] = node_id;
}
// Create edges
for (const ir::Node* n : graph.Nodes()) {
const auto& src_id = node2dot.at(n);
for (auto* out : n->outputs) {
const auto& dest_id = node2dot.at(out);
dot.AddEdge(src_id, dest_id, {});
}
}
const std::string& viz_graph = dot.Build();
VLOG(1) << "The hash graph:\n" << viz_graph;
size_t hash_val = std::hash<std::string>()(viz_graph);
VLOG(4) << "The graph's hash value is: " << hash_val;
return hash_val;
}
void CinnCacheKey::SetKey(
const ir::Graph& graph,
const std::map<std::string, const LoDTensor*>& input_tensors,
const std::string& arch_str) {
graph_serialize_str_ = std::to_string(HashGraph(graph));
graph_hash_val_ = graph_hash_(graph);
for (const auto& name_tensor : input_tensors) {
input_shapes_[name_tensor.first] = name_tensor.second->dims();
}
......@@ -87,7 +64,7 @@ void CinnCacheKey::SetKey(
void CinnCacheKey::SetKey(const ir::Graph& graph,
const std::map<std::string, DDim>& input_shapes,
const std::string& arch_str) {
graph_serialize_str_ = std::to_string(HashGraph(graph));
graph_hash_val_ = graph_hash_(graph);
input_shapes_ = input_shapes;
arch_str_ = arch_str;
}
......@@ -97,7 +74,7 @@ bool CinnCacheKey::operator!=(const CinnCacheKey& other) const {
}
bool CinnCacheKey::operator==(const CinnCacheKey& other) const {
return graph_serialize_str_ == other.graph_serialize_str_ &&
return graph_hash_val_ == other.graph_hash_val_ &&
input_shapes_ == other.input_shapes_ && arch_str_ == other.arch_str_;
}
......@@ -114,11 +91,48 @@ size_t CinnCacheKey::Hash::operator()(const CinnCacheKey& key) const {
ret = hash_combine(ret, string_hasher(name_shape.second.to_str()));
}
ret = hash_combine(ret, string_hasher(key.graph_serialize_str_));
ret = hash_combine(ret, key.graph_hash_val_);
ret = hash_combine(ret, string_hasher(key.arch_str_));
return ret;
}
size_t CinnCacheKeyByStructure::HashGraph(const ir::Graph& graph) {
// sort grad node by name and id.
auto compare = [](ir::Node* n1, ir::Node* n2) {
return (n1->Name() == n2->Name()) ? (n1->id() < n2->id())
: (n1->Name() < n2->Name());
};
// graph.Nodes() returns an unordered_set; use an ordered set here so the
// same graph cannot produce different results
std::set<ir::Node *, bool (*)(ir::Node *, ir::Node *)> node_set(compare),
output_set(compare);
node_set.insert(graph.Nodes().begin(), graph.Nodes().end());
std::string hash_str;
for (ir::Node* n : node_set) {
hash_str.append(n->Name());
output_set.clear();
output_set.insert(n->outputs.begin(), n->outputs.end());
for (auto* out : output_set) {
hash_str.append(out->Name());
}
}
VLOG(1) << "The hash graph:\n" << hash_str;
size_t hash_val = std::hash<std::string>()(hash_str);
VLOG(4) << "The graph's hash value by graph structure is: " << hash_val;
return hash_val;
}
size_t CinnCacheKeyByAddress::HashGraph(const ir::Graph& graph) {
size_t hash_val = reinterpret_cast<size_t>(&graph);
VLOG(4) << "The graph's hash value by graph address is: " << hash_val;
return hash_val;
}
} // namespace paddle2cinn
} // namespace framework
} // namespace paddle
......@@ -14,6 +14,7 @@
#pragma once
#include <functional>
#include <map>
#include "paddle/fluid/framework/ddim.h"
......@@ -33,14 +34,18 @@ namespace paddle2cinn {
// shapes.
class CinnCacheKey {
public:
using GraphHashStrategy = std::function<size_t(const ir::Graph&)>;
explicit CinnCacheKey(GraphHashStrategy graph_hash);
CinnCacheKey(const ir::Graph& graph,
const std::map<std::string, const LoDTensor*>& input_tensors,
const std::string& arch_str);
const std::string& arch_str, GraphHashStrategy graph_hash);
CinnCacheKey(const ir::Graph& graph,
const std::map<std::string, DDim>& input_shapes,
const std::string& arch_str);
const std::string& arch_str, GraphHashStrategy graph_hash);
~CinnCacheKey() {}
~CinnCacheKey() = default;
void SetKey(const ir::Graph& graph,
const std::map<std::string, const LoDTensor*>& input_tensors,
......@@ -58,13 +63,38 @@ class CinnCacheKey {
};
private:
size_t HashGraph(const ir::Graph& graph);
std::string graph_serialize_str_;
GraphHashStrategy graph_hash_;
size_t graph_hash_val_;
std::map<std::string, DDim> input_shapes_;
std::string arch_str_;
};
#define CINN_CACHE_KEY_CREATE(NAME) \
class NAME : public CinnCacheKey { \
public: \
NAME() : CinnCacheKey(HashGraph) {} \
\
NAME(const ir::Graph& graph, \
const std::map<std::string, const LoDTensor*>& input_tensors, \
const std::string& arch_str) \
: CinnCacheKey(graph, input_tensors, arch_str, HashGraph) {} \
\
NAME(const ir::Graph& graph, \
const std::map<std::string, DDim>& input_shapes, \
const std::string& arch_str) \
: CinnCacheKey(graph, input_shapes, arch_str, HashGraph) {} \
\
private: \
static size_t HashGraph(const ir::Graph& graph); \
};
// Class to store the keys by graph address for compiling CINN.
CINN_CACHE_KEY_CREATE(CinnCacheKeyByAddress)
// Class to store the keys by graph structure for compiling CINN.
CINN_CACHE_KEY_CREATE(CinnCacheKeyByStructure)
#undef CINN_CACHE_KEY_CREATE
} // namespace paddle2cinn
} // namespace framework
} // namespace paddle
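Note: expanding CINN_CACHE_KEY_CREATE(CinnCacheKeyByAddress) by hand gives roughly the following class (a sketch of the preprocessor output, reformatted for readability; CinnCacheKeyByStructure expands identically except for the name):

class CinnCacheKeyByAddress : public CinnCacheKey {
 public:
  CinnCacheKeyByAddress() : CinnCacheKey(HashGraph) {}

  CinnCacheKeyByAddress(
      const ir::Graph& graph,
      const std::map<std::string, const LoDTensor*>& input_tensors,
      const std::string& arch_str)
      : CinnCacheKey(graph, input_tensors, arch_str, HashGraph) {}

  CinnCacheKeyByAddress(const ir::Graph& graph,
                        const std::map<std::string, DDim>& input_shapes,
                        const std::string& arch_str)
      : CinnCacheKey(graph, input_shapes, arch_str, HashGraph) {}

 private:
  // Each generated class supplies its own hash strategy; for this one it
  // hashes the graph's address, for ByStructure it hashes the topology.
  static size_t HashGraph(const ir::Graph& graph);
};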
......@@ -26,8 +26,8 @@ namespace paddle {
namespace framework {
namespace paddle2cinn {
TEST(CinnCacheKeyTest, TestAsUnorderedKey) {
std::unordered_set<CinnCacheKey, CinnCacheKey::Hash> test_set;
TEST(CinnCacheKeyTest, TestAsUnorderedKeyByStructure) {
std::unordered_set<CinnCacheKeyByStructure, CinnCacheKey::Hash> test_set;
ProgramDesc empty_program;
ir::Graph empty_graph(empty_program);
......@@ -47,19 +47,20 @@ TEST(CinnCacheKeyTest, TestAsUnorderedKey) {
DDim ddim = paddle::framework::make_ddim({1, 2, 3});
std::map<std::string, DDim> feed_shapes = {{"X", ddim}};
CinnCacheKey cache_key0(empty_graph, feed_tensors, "x86");
CinnCacheKey cache_key1(empty_graph, feed_shapes, "x86");
CinnCacheKeyByStructure cache_key0(empty_graph, feed_tensors, "x86");
CinnCacheKeyByStructure cache_key1(empty_graph, feed_shapes, "x86");
EXPECT_EQ(cache_key0, cache_key1);
CinnCacheKey cache_key2(graph, feed_shapes, "x86");
CinnCacheKey cache_key3(graph, feed_shapes, "nvgpu");
CinnCacheKey cache_key4(graph, feed_tensors, "nvgpu");
CinnCacheKeyByStructure cache_key2(graph, feed_shapes, "x86");
CinnCacheKeyByStructure cache_key3(graph, feed_shapes, "nvgpu");
CinnCacheKeyByStructure cache_key4(graph, feed_tensors, "nvgpu");
EXPECT_NE(cache_key2, cache_key3);
EXPECT_EQ(cache_key3, cache_key4);
CinnCacheKey cache_key5(empty_graph,
std::map<std::string, const LoDTensor *>(), "unk");
CinnCacheKey cache_key6(empty_graph, std::map<std::string, DDim>(), "unk");
CinnCacheKeyByStructure cache_key5(
empty_graph, std::map<std::string, const LoDTensor *>(), "unk");
CinnCacheKeyByStructure cache_key6(empty_graph, std::map<std::string, DDim>(),
"unk");
EXPECT_EQ(cache_key5, cache_key6);
EXPECT_NE(cache_key1, cache_key3);
......@@ -98,6 +99,107 @@ TEST(CinnCacheKeyTest, TestAsUnorderedKey) {
EXPECT_EQ(test_set.find(cache_key6), test_set.end());
}
TEST(CinnCacheKeyTest, TestAsUnorderedKeyByAddress) {
std::unordered_set<CinnCacheKeyByAddress, CinnCacheKey::Hash> test_set;
ProgramDesc empty_program;
ir::Graph empty_graph(empty_program);
ProgramDesc program;
auto *global_block = program.MutableBlock(0);
auto *x = global_block->Var("X");
x->SetType(proto::VarType::LOD_TENSOR);
ir::Graph graph(program);
LoDTensor tensor;
tensor.Resize({1, 2, 3});
const LoDTensor *tensor_pointer = &tensor;
std::map<std::string, const LoDTensor *> feed_tensors = {
{"X", tensor_pointer}};
DDim ddim = paddle::framework::make_ddim({1, 2, 3});
std::map<std::string, DDim> feed_shapes = {{"X", ddim}};
CinnCacheKeyByAddress cache_key0(empty_graph, feed_tensors, "x86");
CinnCacheKeyByAddress cache_key1(empty_graph, feed_shapes, "x86");
EXPECT_EQ(cache_key0, cache_key1);
CinnCacheKeyByAddress cache_key2(graph, feed_shapes, "x86");
CinnCacheKeyByAddress cache_key3(graph, feed_shapes, "nvgpu");
CinnCacheKeyByAddress cache_key4(graph, feed_tensors, "nvgpu");
EXPECT_NE(cache_key2, cache_key3);
EXPECT_EQ(cache_key3, cache_key4);
CinnCacheKeyByAddress cache_key5(
empty_graph, std::map<std::string, const LoDTensor *>(), "unk");
CinnCacheKeyByAddress cache_key6(empty_graph, std::map<std::string, DDim>(),
"unk");
EXPECT_EQ(cache_key5, cache_key6);
EXPECT_NE(cache_key1, cache_key3);
EXPECT_NE(cache_key4, cache_key2);
EXPECT_NE(cache_key3, cache_key5);
EXPECT_NE(cache_key6, cache_key4);
EXPECT_NE(cache_key5, cache_key1);
EXPECT_NE(cache_key2, cache_key6);
test_set.insert(cache_key0);
test_set.insert(cache_key1);
test_set.insert(cache_key3);
test_set.insert(cache_key4);
test_set.insert(cache_key5);
test_set.insert(cache_key6);
EXPECT_EQ(test_set.size(), 3U);
auto iter = test_set.find(cache_key0);
EXPECT_NE(iter, test_set.end());
test_set.erase(iter);
EXPECT_EQ(test_set.size(), 2U);
EXPECT_EQ(test_set.find(cache_key1), test_set.end());
iter = test_set.find(cache_key3);
EXPECT_NE(iter, test_set.end());
test_set.erase(iter);
EXPECT_EQ(test_set.size(), 1U);
EXPECT_EQ(test_set.find(cache_key4), test_set.end());
iter = test_set.find(cache_key5);
EXPECT_NE(iter, test_set.end());
test_set.erase(iter);
EXPECT_EQ(test_set.size(), 0U);
EXPECT_EQ(test_set.find(cache_key6), test_set.end());
}
TEST(CinnCacheKeyTest, TestSameGraph) {
ProgramDesc program1;
auto *global_block1 = program1.MutableBlock(0);
auto *x1 = global_block1->Var("X");
x1->SetType(proto::VarType::LOD_TENSOR);
ir::Graph graph1(program1);
ProgramDesc program2;
auto *global_block2 = program2.MutableBlock(0);
auto *x2 = global_block2->Var("X");
x2->SetType(proto::VarType::LOD_TENSOR);
ir::Graph graph2(program2);
LoDTensor tensor;
tensor.Resize({1, 2, 3});
const LoDTensor *tensor_pointer = &tensor;
std::map<std::string, const LoDTensor *> feed_tensors = {
{"X", tensor_pointer}};
CinnCacheKeyByAddress cache_key_by_address1(graph1, feed_tensors, "x86");
CinnCacheKeyByAddress cache_key_by_address2(graph2, feed_tensors, "x86");
EXPECT_NE(cache_key_by_address1, cache_key_by_address2);
CinnCacheKeyByStructure cache_key_by_struct1(graph1, feed_tensors, "x86");
CinnCacheKeyByStructure cache_key_by_struct2(graph2, feed_tensors, "x86");
EXPECT_EQ(cache_key_by_struct1, cache_key_by_struct2);
}
} // namespace paddle2cinn
} // namespace framework
} // namespace paddle
......@@ -41,6 +41,7 @@
#include "paddle/fluid/framework/rw_lock.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/inference/analysis/dot.h"
#include "paddle/fluid/operators/cinn/cinn_launch_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/string/string_helper.h"
......@@ -68,23 +69,41 @@ const CinnCompiledObject& CinnCompiler::Compile(
const std::map<std::string, const LoDTensor*>& input_tensors,
const Target& target, void* stream) {
VLOG(1) << "-- The graph to be compiled is:\n" << VizGraph(graph);
CinnCacheKey cur_key(graph, input_tensors, target.arch_str());
CinnCacheKeyByAddress cur_key_by_address(graph, input_tensors,
target.arch_str());
CinnCacheKeyByStructure cur_key_by_struct;
bool exist = false;
{
AutoRDLock r_guard{&rwlock_};
exist = cache_.count(cur_key) != 0;
exist = cache_by_address_.count(cur_key_by_address) != 0;
// if the graph cannot be found by address, check whether the graph
// structure has been stored in the cache.
if (!exist) {
// generate the structure cache key
cur_key_by_struct.SetKey(graph, input_tensors, target.arch_str());
// if the graph structure is found, store the graph address in the
// cache to speed up the next query.
if (cache_by_struct_.count(cur_key_by_struct) != 0) {
exist = true;
cache_by_address_[cur_key_by_address] =
cache_by_struct_.at(cur_key_by_struct).get();
}
}
}
if (!exist) {
std::int64_t compiled_num = real_compiled_num_.fetch_add(1);
auto compiled_res =
CompileGraph(graph, input_tensors, target, compiled_num, stream);
AutoWRLock w_guard{&rwlock_};
if (!cache_.count(cur_key)) {
cache_[cur_key] = std::move(compiled_res);
if (!cache_by_struct_.count(cur_key_by_struct)) {
cache_by_address_[cur_key_by_address] = compiled_res.get();
cache_by_struct_[cur_key_by_struct] = std::move(compiled_res);
}
}
AutoRDLock guard{&rwlock_};
const auto& cached_boj = *cache_[cur_key];
const auto& cached_boj = *cache_by_address_[cur_key_by_address];
return cached_boj;
}
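// A condensed sketch of the two-level lookup implemented above (hypothetical
// helper, not part of the diff; member names follow CinnCompiler):
//
// CinnCompiledObject* Lookup(const CinnCacheKeyByAddress& key_by_address,
//                            const CinnCacheKeyByStructure& key_by_struct) {
//   auto addr_it = cache_by_address_.find(key_by_address);
//   if (addr_it != cache_by_address_.end()) {
//     return addr_it->second;  // fast path: the same graph object was seen
//   }
//   auto struct_it = cache_by_struct_.find(key_by_struct);
//   if (struct_it != cache_by_struct_.end()) {
//     // memoize the address key so the next query takes the fast path
//     cache_by_address_[key_by_address] = struct_it->second.get();
//     return struct_it->second.get();
//   }
//   return nullptr;  // miss: CompileGraph() and insert into both caches
// }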
......@@ -181,7 +200,8 @@ void CinnCompiler::Clear() {
{
AutoWRLock guard{&rwlock_};
graphs_.clear();
cache_.clear();
cache_by_address_.clear();
cache_by_struct_.clear();
}
real_compiled_num_.store(0);
}
......@@ -217,6 +237,9 @@ std::unique_ptr<CinnCompiledObject> CinnCompiler::CompileGraph(
*compiled_obj = {std::move(graph_compiler),
std::move(compiled_res.runtime_program), scope,
symbol.var_model_to_program_map()};
compiled_obj->launch_context =
std::make_unique<operators::details::CinnLaunchContext>(
compiled_obj->paddle2cinn_varmap, compiled_obj->scope);
return compiled_obj;
}
......
......@@ -31,6 +31,13 @@
#include "paddle/fluid/platform/macros.h"
namespace paddle {
namespace operators {
namespace details {
class CinnLaunchContext;
} // namespace details
} // namespace operators
namespace framework {
namespace paddle2cinn {
......@@ -39,6 +46,7 @@ struct CinnCompiledObject {
std::unique_ptr<::cinn::hlir::framework::Program> runtime_program;
std::shared_ptr<::cinn::hlir::framework::Scope> scope;
std::unordered_map<std::string, std::string> paddle2cinn_varmap;
std::unique_ptr<operators::details::CinnLaunchContext> launch_context;
};
// Entrance to use CINN.
......@@ -87,9 +95,12 @@ class CinnCompiler {
void* stream = nullptr) const;
std::unordered_map<std::string, std::unique_ptr<ir::Graph>> graphs_;
std::unordered_map<CinnCacheKey, std::unique_ptr<CinnCompiledObject>,
std::unordered_map<CinnCacheKeyByAddress, CinnCompiledObject*,
CinnCacheKey::Hash>
cache_;
cache_by_address_;
std::unordered_map<CinnCacheKeyByStructure,
std::unique_ptr<CinnCompiledObject>, CinnCacheKey::Hash>
cache_by_struct_;
std::atomic_int64_t real_compiled_num_{0};
mutable RWLock rwlock_;
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#include <sstream>
#include "paddle/fluid/framework/pten_utils.h"
#include "paddle/pten/core/convert_utils.h"
#include "paddle/pten/core/kernel_factory.h"
#include "paddle/fluid/framework/lod_tensor.h"
......@@ -190,8 +191,9 @@ KernelArgsNameMakerByOpProto::GetAttrsArgsNames() {
}
KernelSignature KernelArgsNameMakerByOpProto::GetKernelSignature() {
return KernelSignature(op_proto_->type(), GetInputArgsNames(),
GetAttrsArgsNames(), GetOutputArgsNames());
return KernelSignature(pten::TransToPtenKernelName(op_proto_->type()),
GetInputArgsNames(), GetAttrsArgsNames(),
GetOutputArgsNames());
}
std::string KernelSignatureToString(const KernelSignature& signature) {
......
......@@ -76,6 +76,22 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
}
#ifdef PADDLE_WITH_IPU
else if (platform::is_ipu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::IPUPlace, src_place), src_ptr, size);
} else if (platform::is_cpu_place(src_place) &&
platform::is_ipu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::IPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
} else if (platform::is_ipu_place(src_place) &&
platform::is_ipu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::IPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::IPUPlace, src_place), src_ptr, size);
}
#endif
#ifdef PADDLE_WITH_XPU
else if (platform::is_xpu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
......@@ -386,17 +402,33 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
}
#ifdef PADDLE_WITH_IPU
else if (platform::is_ipu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::IPUPlace, src_place), src_ptr, size);
} else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_ipu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::IPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
} else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_XPU
else if (platform::is_xpu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size);
} else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_xpu_place(dst_place)) {
}
else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_xpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
} else if (platform::is_xpu_place(src_place) && // NOLINT
platform::is_xpu_place(dst_place)) {
}
else if (platform::is_xpu_place(src_place) && // NOLINT
platform::is_xpu_place(dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
......@@ -404,7 +436,8 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
}
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size);
} else { // NOLINT
}
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"Copy from %s to %s is not supported.", src_place, dst_place));
}
......@@ -571,6 +604,11 @@ class AnyVisitor : public boost::static_visitor<bool> {
platform::errors::Unimplemented("Not supported on place (%s) ", npu));
// return GetResultHelper(out, npu);
}
bool GetResult(const framework::Tensor& out,
const platform::IPUPlace& ipu) const {
PADDLE_THROW(
platform::errors::Unimplemented("Not supported on place (%s) ", ipu));
}
bool GetResult(const framework::Tensor& out,
const platform::NPUPinnedPlace& cpu) const {
......@@ -762,6 +800,9 @@ struct BothFalseVisitor : public boost::static_visitor<> {
void VisitorImpl(const platform::XPUPlace& xpu) const {
PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
}
void VisitorImpl(const platform::IPUPlace& ipu) const {
PADDLE_THROW(platform::errors::Unimplemented("IPUPlace is not supported"));
}
void VisitorImpl(const platform::CUDAPlace& gpu) const {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
......
......@@ -155,6 +155,13 @@ class TensorAddFunctor : public boost::static_visitor<> {
"is not supported in imperative mode",
place));
}
// there is NO support for IPUPlace
void operator()(const platform::IPUPlace& place) {
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
private:
int64_t numel_;
......
......@@ -487,6 +487,14 @@ static void PreparedOpRunImpl(
op.Type(), outs, dev_ctx->GetPlace());
}
if (FLAGS_benchmark) {
dev_ctx->Wait();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError());
VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error";
#endif
}
/**
* [ Why need handle complex gradient to real gradient? ]
*
......
......@@ -211,70 +211,6 @@ void SplitTensorsWithType<platform::XPUDeviceContext>(
}
#endif
// NOTE(liubo48): Only implement operators::math::SplitFunctor for npu now.
// If later the operators::StridedMemcpyWithAxis0 is supported,
// then this specific SplitTensorsForAllReduce can be removed.
#ifdef PADDLE_WITH_ASCEND_CL
template <>
void SplitTensorsForAllReduce<platform::NPUDeviceContext, float>(
const platform::NPUDeviceContext &context,
framework::Variable *p_dense_contents,
std::vector<framework::Tensor> *p_dense_tensors) {
auto *in = p_dense_contents->GetMutable<framework::LoDTensor>();
std::vector<framework::Tensor *> outs;
std::vector<const framework::Tensor *> shape_refer;
outs.reserve(p_dense_tensors->size());
shape_refer.reserve(p_dense_tensors->size());
for (auto &tensor : *p_dense_tensors) {
outs.emplace_back(&tensor);
shape_refer.emplace_back(&tensor);
}
operators::math::SplitFunctor<platform::NPUDeviceContext, float>
split_functor_;
split_functor_(context, *in, shape_refer, 0, &outs);
}
template <>
void ConcatTensorsWithType<platform::NPUDeviceContext>(
const platform::NPUDeviceContext &context,
const std::vector<framework::Tensor> &dense_tensors_,
framework::Variable *p_dense_contents,
framework::proto::VarType::Type type) {
switch (type) {
case framework::proto::VarType::FP32:
ConcatTensorsForAllReduce<platform::NPUDeviceContext, float>(
context, dense_tensors_, p_dense_contents);
break;
default:
PADDLE_THROW(platform::errors::Unimplemented(
"Data type (%s) is not supported when it concats tensors for "
"allreduce.",
framework::DataTypeToString(type)));
}
}
template <>
void SplitTensorsWithType<platform::NPUDeviceContext>(
const platform::NPUDeviceContext &context,
framework::Variable *p_dense_contents,
std::vector<framework::Tensor> *p_dense_tensors,
framework::proto::VarType::Type type) {
switch (type) {
case framework::proto::VarType::FP32:
SplitTensorsForAllReduce<platform::NPUDeviceContext, float>(
context, p_dense_contents, p_dense_tensors);
break;
default:
PADDLE_THROW(platform::errors::Unimplemented(
"Data type (%s) is not supported when it splits tensors for "
"allreduce.",
framework::DataTypeToString(type)));
}
}
#endif
void Group::ConcatTensors(const platform::DeviceContext &context) {
auto place = context.GetPlace();
if (platform::is_gpu_place(place)) {
......
......@@ -348,13 +348,14 @@ class AllocatorFacadePrivate {
const AllocatorMap& GetAllocatorMap() {
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
auto id = platform::CUDAGraph::CapturingID();
auto iter = cuda_graph_allocator_map_.find(id);
PADDLE_ENFORCE_NE(
iter, cuda_graph_allocator_map_.end(),
platform::errors::PermissionDenied(
"No memory pool is prepared for CUDA Graph capturing."));
VLOG(10) << "Choose CUDA Graph memory pool to allocate memory";
return iter->second->allocators_;
} else {
return allocators_;
......@@ -405,7 +406,7 @@ class AllocatorFacadePrivate {
#if defined(PADDLE_WITH_HIP)
auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
cuda_allocators_[p][stream] = std::make_shared<AutoGrowthBestFitAllocator>(
cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk_);
cuda_allocator, platform::GpuMinChunkSize(), 0, allow_free_idle_chunk_);
#endif
#if defined(PADDLE_WITH_CUDA)
......
......@@ -116,6 +116,34 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
return GetCPUBuddyAllocator()->Used();
}
// For Graphcore IPU
template <>
void *Alloc<platform::IPUPlace>(const platform::IPUPlace &place, size_t size) {
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
VLOG(10) << "IPUPlace, Allocate on cpu.";
void *p = GetCPUBuddyAllocator()->Alloc(size);
if (FLAGS_init_allocated_mem) {
memset(p, 0xEF, size);
}
VLOG(10) << " pointer=" << p;
return p;
}
template <>
void Free<platform::IPUPlace>(const platform::IPUPlace &place, void *p,
size_t size) {
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
GetCPUBuddyAllocator()->Free(p);
}
template <>
uint64_t Release<platform::IPUPlace>(const platform::IPUPlace &place) {
return GetCPUBuddyAllocator()->Release();
}
template <>
size_t Used<platform::IPUPlace>(const platform::IPUPlace &place) {
return GetCPUBuddyAllocator()->Used();
}
// For kunlun XPU
template <>
void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
......
......@@ -32,9 +32,34 @@ CinnLaunchContext::CinnLaunchContext(
[](const auto& name_view) { return std::string(name_view.data()); });
}
bool CinnLaunchContext::IsVariableUsed(const std::string& paddle_name) {
return paddle2cinn_varmap_.count(paddle_name) > 0 &&
cinn_variable_names_.count(paddle2cinn_varmap_.at(paddle_name)) > 0;
void CinnLaunchContext::UpdateCapturedEnv(const framework::Scope& scope,
const platform::Place& place) {
if (std::addressof(scope) == cached_scope_ &&
std::addressof(place) == cached_place_) {
VLOG(4) << "Captured scope:" << cached_scope_ << ", place:" << cached_place_
<< " are not changed";
return;
}
cached_scope_ = std::addressof(scope);
cached_place_ = std::addressof(place);
cached_temp_scope_ = scope.NewTmpScope();
VLOG(4) << "Captured env is update, scope:" << cached_scope_ << "->"
<< std::addressof(scope) << ", place:" << cached_place_ << "->"
<< std::addressof(place);
}
bool CinnLaunchContext::IsArgumentsInitialized() const {
if (hold_buffers_.empty() || name2argument_.empty()) {
return false;
}
return true;
}
bool CinnLaunchContext::IsVariableUsed(
const std::string& paddle_var_name) const {
return paddle2cinn_varmap_.count(paddle_var_name) > 0 &&
cinn_variable_names_.count(paddle2cinn_varmap_.at(paddle_var_name)) >
0;
}
CinnTensor CinnLaunchContext::GetCinnTensor(const std::string& var_name) {
......@@ -53,99 +78,101 @@ std::unordered_set<std::string> CinnLaunchContext::GetInternalVariableNames() {
return all_parameters;
}
void CinnLaunchContext::CheckTensorEquivalent(const std::string& paddle_name,
const LoDTensor& paddle_tensor,
const CinnTensor& cinn_tensor) {
void CinnLaunchContext::CheckTensorEquivalent(
const std::string& paddle_var_name, const LoDTensor& paddle_tensor,
const CinnTensor& cinn_tensor) {
// check dimension
auto cinn_dims = framework::make_ddim(cinn_tensor->shape().data());
PADDLE_ENFORCE_EQ(paddle_tensor.dims(), cinn_dims,
platform::errors::PreconditionNotMet(
"Tensors' shape in variable(%s) are not equivalent, "
"paddle's shape = [%s], but cinn's shape = [%s].",
paddle_name, paddle_tensor.dims(), cinn_dims));
paddle_var_name, paddle_tensor.dims(), cinn_dims));
// TODO(CtfGo): check the underlying data type after CINN ready
}
void CinnLaunchContext::AssignExternalVariable(const std::string& paddle_name,
const platform::Place& place,
LoDTensor* paddle_tensor) {
PADDLE_ENFORCE_EQ(IsVariableUsed(paddle_name), true,
platform::errors::InvalidArgument(
"Paddle variable(%s) not used by cinn", paddle_name));
const auto& cinn_name = paddle2cinn_varmap_.at(paddle_name);
CinnTensor cinn_tensor = GetCinnTensor(cinn_name);
if (!paddle_tensor->IsInitialized()) {
paddle_tensor->Resize(framework::make_ddim(cinn_tensor->shape().data()));
void CinnLaunchContext::AssignExternalVariable(
const std::string& paddle_var_name) {
PADDLE_ENFORCE_EQ(
IsVariableUsed(paddle_var_name), true,
platform::errors::InvalidArgument("Paddle variable(%s) not used by cinn",
paddle_var_name));
const auto& cinn_var_name = paddle2cinn_varmap_.at(paddle_var_name);
const auto& paddle_tensor =
cached_scope_->GetVar(paddle_var_name)->Get<LoDTensor>();
CinnTensor cinn_tensor = GetCinnTensor(cinn_var_name);
if (paddle_tensor.IsInitialized()) {
CheckTensorEquivalent(paddle_var_name, paddle_tensor, cinn_tensor);
}
CheckTensorEquivalent(paddle_name, *paddle_tensor, cinn_tensor);
return SetArgument(cinn_name, place, /* free_mem_callback = */ false,
paddle_tensor);
}
void CinnLaunchContext::AssignInternalVariable(const std::string& cinn_name,
const platform::Place& place,
LoDTensor* paddle_tensor) {
PADDLE_ENFORCE_GT(cinn_variable_names_.count(cinn_name), 0,
platform::errors::InvalidArgument(
"Variable(%s) not found in cinn socpe.", cinn_name));
CinnTensor cinn_tensor = GetCinnTensor(cinn_name);
if (!paddle_tensor->IsInitialized()) {
paddle_tensor->Resize(framework::make_ddim(cinn_tensor->shape().data()));
}
CheckTensorEquivalent(cinn_name, *paddle_tensor, cinn_tensor);
return SetArgument(cinn_name, place, /* free_mem_callback = */ true,
paddle_tensor);
}
auto cinn_buffer = std::make_unique<cinn_buffer_t>();
// assign dimensions and alloc/free callback of cinn_buffer_t
cinn_buffer->resize(cinn_tensor->shape().data().data(),
cinn_tensor->shape().data().size());
cinn_buffer->external_malloc = new std::function<int(void*, cinn_buffer_t*)>(
[this, paddle_var_name](void* ctx, cinn_buffer_t* buffer) {
auto* tensor =
cached_scope_->GetVar(paddle_var_name)->GetMutable<LoDTensor>();
tensor->Resize(framework::DDim(buffer->dims, buffer->dimensions));
buffer->memory = reinterpret_cast<uint8_t*>(
tensor->mutable_data<float>(*cached_place_));
return 0;
});
std::unique_ptr<cinn_buffer_t> CinnLaunchContext::ShareTensorWithCinnBuffer(
const platform::Place& place, bool free_mem_callback, LoDTensor* tensor) {
// convert paddle dimensions array to cinn format
std::vector<cinn_dimension_t> cinn_dims(tensor->dims().size());
for (auto i = 0; i < tensor->dims().size(); ++i) {
cinn_dims[i] = static_cast<cinn_dimension_t>(tensor->dims().at(i));
}
// external variables will be recycled by global gc, so do nothing here
cinn_buffer->external_free = new std::function<int(void*, cinn_buffer_t*)>(
[](void* ctx, cinn_buffer_t* buffer) {
// Do nothing
return 0;
});
return SetArgument(cinn_var_name, std::move(cinn_buffer));
}
void CinnLaunchContext::AssignInternalVariable(
const std::string& cinn_var_name) {
PADDLE_ENFORCE_GT(
cinn_variable_names_.count(cinn_var_name), 0,
platform::errors::InvalidArgument("Variable(%s) not found in cinn socpe.",
cinn_var_name));
CinnTensor cinn_tensor = GetCinnTensor(cinn_var_name);
auto cinn_buffer = std::make_unique<cinn_buffer_t>();
// assign size and memory
cinn_buffer->resize(cinn_dims.data(), cinn_dims.size());
// assign dimensions and alloc/free callback of cinn_buffer_t
cinn_buffer->resize(cinn_tensor->shape().data().data(),
cinn_tensor->shape().data().size());
cinn_buffer->external_malloc = new std::function<int(void*, cinn_buffer_t*)>(
[place, tensor](void* ctx, cinn_buffer_t* buffer) {
buffer->memory =
reinterpret_cast<uint8_t*>(tensor->mutable_data<float>(place));
[this, cinn_var_name](void* ctx, cinn_buffer_t* buffer) {
auto* tensor =
cached_temp_scope_->Var(cinn_var_name)->GetMutable<LoDTensor>();
tensor->Resize(framework::DDim(buffer->dims, buffer->dimensions));
buffer->memory = reinterpret_cast<uint8_t*>(
tensor->mutable_data<float>(*cached_place_));
return 0;
});
if (free_mem_callback) {
cinn_buffer->external_free = new std::function<int(void*, cinn_buffer_t*)>(
[tensor](void* ctx, cinn_buffer_t* buffer) {
tensor->clear();
return 0;
});
return cinn_buffer;
}
// internal variables should release their buffers immediately
// if no instruction uses them
cinn_buffer->external_free = new std::function<int(void*, cinn_buffer_t*)>(
[](void* ctx, cinn_buffer_t* buffer) {
// Do nothing
[this, cinn_var_name](void* ctx, cinn_buffer_t* buffer) {
auto* tensor =
cached_temp_scope_->GetVar(cinn_var_name)->GetMutable<LoDTensor>();
tensor->clear();
return 0;
});
return cinn_buffer;
return SetArgument(cinn_var_name, std::move(cinn_buffer));
}
void CinnLaunchContext::SetArgument(const std::string& cinn_name,
const platform::Place& place,
bool free_mem_callback,
LoDTensor* paddle_tensor) {
auto buffer =
ShareTensorWithCinnBuffer(place, free_mem_callback, paddle_tensor);
name2argument_.emplace(cinn_name, buffer.get());
void CinnLaunchContext::SetArgument(const std::string& cinn_var_name,
std::unique_ptr<cinn_buffer_t>&& buffer) {
VLOG(4) << "SetArgument-" << name2argument_.size() << ": name("
<< cinn_var_name << "), dims("
<< framework::DDim(buffer->dims, buffer->dimensions) << ").";
name2argument_.emplace(cinn_var_name, buffer.get());
hold_buffers_.emplace_back(std::move(buffer));
VLOG(4) << "SetArgument-" << name2argument_.size() << ": "
<< "name(" << cinn_name << "), dims(" << paddle_tensor->dims()
<< ").";
}
const std::map<std::string, cinn_pod_value_t>&
......
......@@ -24,7 +24,7 @@
#include "cinn/runtime/cinn_runtime.h"
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace operators {
......@@ -40,16 +40,22 @@ class CinnLaunchContext {
const std::unordered_map<std::string, std::string>& paddle2cinn_varmap,
const std::shared_ptr<CinnScope>& cinn_scope);
// Explicitly update the environment (scope and place) captured
// by the callbacks of the execution arguments
void UpdateCapturedEnv(const framework::Scope& scope,
const platform::Place& place);
// Return whether the execution arguments have been initialized
bool IsArgumentsInitialized() const;
// Return whether a Paddle variable is used by the compiled kernels
bool IsVariableUsed(const std::string& var_name);
bool IsVariableUsed(const std::string& paddle_var_name) const;
// Assign tensor buffer to input or output variables
void AssignExternalVariable(const std::string& var_name,
const platform::Place& place, LoDTensor* tensor);
void AssignExternalVariable(const std::string& paddle_var_name);
// Assign tensor buffer to internal variables
void AssignInternalVariable(const std::string& var_name,
const platform::Place& place, LoDTensor* tensor);
void AssignInternalVariable(const std::string& cinn_var_name);
// Extract internal variable names from CinnScope
// by excluding used input and output variables
......@@ -58,10 +64,6 @@ class CinnLaunchContext {
// Finalize all execution arguments and return them
const std::map<std::string, cinn_pod_value_t>& FinalizeArguments() const;
std::vector<std::unique_ptr<cinn_buffer_t>> HandoverBuffers() {
return std::move(hold_buffers_);
}
private:
// Get CinnTensor with CINN variable name
CinnTensor GetCinnTensor(const std::string& var_name);
......@@ -72,16 +74,15 @@ class CinnLaunchContext {
const LoDTensor& paddle_tensor,
const CinnTensor& cinn_tensor);
// Share the buffer of a Paddle tensor to CINN by delivering memory address
// to a cinn_buffer_t object
std::unique_ptr<cinn_buffer_t> ShareTensorWithCinnBuffer(
const platform::Place& place, bool free_mem_callback, LoDTensor* tensor);
// Set an argument with (cinn name)->(paddle tensor) pair
void SetArgument(const std::string& cinn_name, const platform::Place& place,
bool free_mem_callback, LoDTensor* paddle_tensor);
// Set an argument with (cinn name)->(cinn_buffer_t) pair
void SetArgument(const std::string& cinn_var_name,
std::unique_ptr<cinn_buffer_t>&& buffer);
private:
const framework::Scope* cached_scope_ = nullptr;
const platform::Place* cached_place_ = nullptr;
std::unique_ptr<framework::Scope> cached_temp_scope_ = nullptr;
// a variable name map from paddle to cinn
const std::unordered_map<std::string, std::string>& paddle2cinn_varmap_;
// the variable scope of cinn
......
......@@ -45,81 +45,86 @@ std::unique_ptr<CinnLaunchContext> CreateDefaultLaunchContext() {
return std::make_unique<CinnLaunchContext>(paddle2cinn_varmap, cinn_scope);
}
TEST(CinnLaunchContextTest, TestIsVariableUsed) {
TEST(CinnLaunchContextTest, TestBasic) {
auto launch_context = CreateDefaultLaunchContext();
// test IsVariableUsed
ASSERT_EQ(launch_context->IsVariableUsed("var1"), true);
ASSERT_EQ(launch_context->IsVariableUsed("var4"), false);
}
TEST(CinnLaunchContextTest, TestGetInternalVariableNames) {
auto launch_context = CreateDefaultLaunchContext();
auto internal_variable_names = launch_context->GetInternalVariableNames();
ASSERT_EQ(internal_variable_names.size(), 3);
EXPECT_NE(internal_variable_names.find("cinn_var2"),
internal_variable_names.end());
// test UpdateCapturedEnv
platform::CPUPlace place;
framework::Scope scope;
ASSERT_NO_THROW(launch_context->UpdateCapturedEnv(scope, place));
// test IsArgumentsInitialized
ASSERT_FALSE(launch_context->IsArgumentsInitialized());
}
TEST(CinnLaunchContextTest, TestCheckTensorEquivalent) {
auto launch_context = CreateDefaultLaunchContext();
platform::CPUPlace place;
framework::Scope scope;
auto launch_context = CreateDefaultLaunchContext();
launch_context->UpdateCapturedEnv(scope, place);
auto* tensor1 = scope.Var("var1")->GetMutable<LoDTensor>();
// CheckTensorEquivalent: tensor dimension not equivalent
tensor1->mutable_data<float>(framework::make_ddim({3, 5}), place);
ASSERT_THROW(launch_context->AssignExternalVariable("var1", place, tensor1),
ASSERT_THROW(launch_context->AssignExternalVariable("var1"),
paddle::platform::EnforceNotMet);
}
TEST(CinnLaunchContextTest, TestAssignVariablePreCondition) {
auto launch_context = CreateDefaultLaunchContext();
platform::CPUPlace place;
framework::Scope scope;
auto launch_context = CreateDefaultLaunchContext();
launch_context->UpdateCapturedEnv(scope, place);
auto* tensor4 = scope.Var("var4")->GetMutable<LoDTensor>();
// not used
ASSERT_THROW(launch_context->AssignExternalVariable("var4", place, tensor4),
ASSERT_THROW(launch_context->AssignExternalVariable("var4"),
paddle::platform::EnforceNotMet);
// not found
ASSERT_THROW(
launch_context->AssignExternalVariable("cinn_var4", place, tensor4),
paddle::platform::EnforceNotMet);
ASSERT_THROW(launch_context->AssignInternalVariable("cinn_var4"),
paddle::platform::EnforceNotMet);
}
TEST(CinnLaunchContextTest, TestSetArgument) {
platform::CPUPlace cpu_place;
platform::Place place(cpu_place);
framework::Scope scope;
auto launch_context = CreateDefaultLaunchContext();
launch_context->UpdateCapturedEnv(scope, place);
platform::CPUPlace place;
framework::Scope scope;
// assign external variables
auto* tensor1 = scope.Var("var1")->GetMutable<LoDTensor>();
float* data1 =
tensor1->mutable_data<float>(framework::make_ddim({3, 4}), place);
data1[0] = 9.99f;
data1[10] = 19.99f;
ASSERT_NO_THROW(launch_context->AssignExternalVariable("var1"));
// assign external variable
ASSERT_NO_THROW(
launch_context->AssignExternalVariable("var1", place, tensor1));
auto* tensor2 = scope.Var("var2")->GetMutable<LoDTensor>();
tensor2->mutable_data<float>(framework::make_ddim({6, 7, 8}), place);
ASSERT_NO_THROW(
launch_context->AssignInternalVariable("cinn_var2", place, tensor2));
// FinalizeArguments not missed check
ASSERT_THROW(launch_context->FinalizeArguments(),
paddle::platform::EnforceNotMet);
auto* tensor3 = scope.Var("var3")->GetMutable<LoDTensor>();
tensor3->mutable_data<float>(framework::make_ddim({10, 16}), place);
ASSERT_NO_THROW(
launch_context->AssignExternalVariable("var3", place, tensor3));
ASSERT_NO_THROW(launch_context->AssignExternalVariable("var3"));
// FinalizeArguments missed check
ASSERT_THROW(launch_context->FinalizeArguments(),
paddle::platform::EnforceNotMet);
// test get internal variables
auto internal_variable_names = launch_context->GetInternalVariableNames();
ASSERT_EQ(internal_variable_names.size(), 1);
EXPECT_EQ(*internal_variable_names.begin(), "cinn_var2");
auto* tensor2 = scope.Var("var2")->GetMutable<LoDTensor>();
tensor2->mutable_data<float>(framework::make_ddim({6, 7, 8}), place);
ASSERT_NO_THROW(launch_context->AssignInternalVariable("cinn_var2"));
// check argument is set correctly and alloc/free callbacks work well
auto name2argument = launch_context->FinalizeArguments();
ASSERT_EQ(name2argument.size(), 3);
ASSERT_EQ(name2argument.count("cinn_var1"), 1);
// check ShareTensorWithCinnBuffer
ASSERT_TRUE(launch_context->IsArgumentsInitialized());
auto* cinn_buffer =
static_cast<cinn_buffer_t*>(name2argument.at("cinn_var1"));
ASSERT_EQ(cinn_buffer->memory, nullptr);
cinn_buffer->external_malloc->operator()(nullptr, cinn_buffer);
ASSERT_NE(cinn_buffer->memory, nullptr);
......
......@@ -31,26 +31,6 @@ namespace operators {
namespace details {
#ifdef PADDLE_WITH_CUDA
void CUDART_CB ReleaseScope(void* data) {
auto* temp_scope = static_cast<framework::Scope*>(data);
delete temp_scope;
}
void CUDART_CB ReleaseBuffers(void* data) {
auto* buffers =
static_cast<std::vector<std::unique_ptr<cinn_buffer_t>>*>(data);
delete buffers;
}
template <>
void ReleaseResource<platform::CUDADeviceContext>(
const std::vector<void*>& resources, void* stream) {
PADDLE_ENFORCE_GPU_SUCCESS(cudaLaunchHostFunc(
static_cast<gpuStream_t>(stream), ReleaseScope, resources[0]));
PADDLE_ENFORCE_GPU_SUCCESS(cudaLaunchHostFunc(
static_cast<gpuStream_t>(stream), ReleaseBuffers, resources[1]));
}
template <>
void* GetStream<platform::CUDADeviceContext>(
const framework::ExecutionContext& ctx) {
......
......@@ -56,25 +56,12 @@ void LaunchCinnExecution(const CinnCompiledObject& compiled_obj,
// Set cinn FLAGS (such as FLAGS_cinn_cudnn_deterministic) with paddle's FLAGS.
void SetCinnRuntimeFlags();
template <typename DeviceContext>
void ReleaseResource(const std::vector<void*>& resources, void* stream) {
auto* temp_scope = static_cast<framework::Scope*>(resources[0]);
auto* buffers =
static_cast<std::vector<std::unique_ptr<cinn_buffer_t>>*>(resources[1]);
delete temp_scope;
delete buffers;
}
template <typename DeviceContext>
void* GetStream(const framework::ExecutionContext& ctx) {
return nullptr;
}
#ifdef PADDLE_WITH_CUDA
template <>
void ReleaseResource<platform::CUDADeviceContext>(
const std::vector<void*>& resources, void* stream);
template <>
void* GetStream<platform::CUDADeviceContext>(
const framework::ExecutionContext& ctx);
......@@ -116,56 +103,54 @@ class CinnLaunchOpKernel : public framework::OpKernel<T> {
compilation_key, inputs_name2tensor, target, stream);
details::DebugCinnCompiledResult(cinn_compiled_object);
auto launch_context = std::make_unique<details::CinnLaunchContext>(
cinn_compiled_object.paddle2cinn_varmap, cinn_compiled_object.scope);
auto* launch_context = cinn_compiled_object.launch_context.get();
// Step 3. Prepare arguments needed for the compiled executable program.
VLOG(4) << "CinnLaunchOp prepare arguments";
// 3.1 Prepare input variables: tensors of input variables have
// been initialized before the graph is compiled, so just check the
// equality between the tensors of paddle and cinn.
for (const auto& var_name : input_variable_names) {
if (!launch_context->IsVariableUsed(var_name)) {
// some input variables are not needed by cinn because they are
// eliminated by optimization passes or some cinn operators use
// fewer variables
VLOG(4) << "Input variable(" << var_name << ") not used by cinn";
continue;
launch_context->UpdateCapturedEnv(scope, place);
if (!launch_context->IsArgumentsInitialized()) {
VLOG(4) << "CinnLaunchOp prepare arguments";
// 3.1 Prepare input variables: tensors of input variables have
// been initialized before the graph is compiled, so just check the
// equality between the tensors of paddle and cinn.
for (const auto& var_name : input_variable_names) {
if (!launch_context->IsVariableUsed(var_name)) {
// some input variables are not needed by cinn because they are
// eliminated by optimization passes or some cinn operators use
// fewer variables
VLOG(4) << "Input variable(" << var_name << ") not used by cinn";
continue;
}
launch_context->AssignExternalVariable(var_name);
}
launch_context->AssignExternalVariable(
var_name, place, scope.GetVar(var_name)->GetMutable<LoDTensor>());
}
// 3.2 Prepare output variables: all output variables should
// be initialized and have buffers allocated before
// the runtime program starts execution; the compilation result
// includes details of their buffer assignment and we use that to
// allocate space in Paddle. For those variables already allocated,
// like persistable parameters, just check the equality between
// the Paddle allocation and the CINN buffer assignment.
auto output_variable_names = ctx.OutputNames(kOutputs);
for (const auto var_name : output_variable_names) {
PADDLE_ENFORCE_EQ(launch_context->IsVariableUsed(var_name), true,
platform::errors::InvalidArgument(
"Output variable(%s) not used by cinn", var_name));
auto* tensor = scope.GetVar(var_name)->GetMutable<LoDTensor>();
launch_context->AssignExternalVariable(var_name, place, tensor);
}
// 3.2 Prepare output variables: all output variables should
// be initialized and have buffers allocated before
// the runtime program starts execution; the compilation result
// includes details of their buffer assignment and we use that to
// allocate space in Paddle. For those variables already allocated,
// like persistable parameters, just check the equality between
// the Paddle allocation and the CINN buffer assignment.
auto output_variable_names = ctx.OutputNames(kOutputs);
for (const auto& var_name : output_variable_names) {
PADDLE_ENFORCE_EQ(
launch_context->IsVariableUsed(var_name), true,
platform::errors::InvalidArgument(
"Output variable(%s) not used by cinn", var_name));
launch_context->AssignExternalVariable(var_name);
}
// 3.3 Prepare internal or temporary variables: Create a temporary
// scope to keep internal variables within graph or temporary
// variables needed by the compiled runtime program in addition.
// Here we directly use the names from CinnScope as Paddle variable
// names, because they will not be used outside the graph
// and should be destructed after the computation finishes.
auto internal_variable_names = launch_context->GetInternalVariableNames();
framework::Scope* temp_scope = scope.NewTmpScope().release();
for (const auto& var_name : internal_variable_names) {
auto* tensor = temp_scope->Var(var_name)->GetMutable<LoDTensor>();
launch_context->AssignInternalVariable(var_name, place, tensor);
// 3.3 Prepare internal or temporary variables: Create a temporary
// scope to keep internal variables within graph or temporary
// variables needed by the compiled runtime program in addition.
// Here we directly use the names from CinnScope as Paddle variable
// names, because they will not be used outside the graph
// and should be destructed after the computation finishes.
auto internal_variable_names = launch_context->GetInternalVariableNames();
for (const auto& var_name : internal_variable_names) {
launch_context->AssignInternalVariable(var_name);
}
}
// Step 4. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic.
......@@ -175,12 +160,6 @@ class CinnLaunchOpKernel : public framework::OpKernel<T> {
VLOG(4) << "Run Cinn compiled executable program with stream: " << stream;
details::LaunchCinnExecution(cinn_compiled_object, *launch_context, stream);
VLOG(4) << "CinnLaunchOp launch execution done.";
// Step 6. Release some resources, such as `temp_scope` and cinn_buffers.
auto* buffers_holder = new std::vector<std::unique_ptr<cinn_buffer_t>>{
launch_context->HandoverBuffers()};
details::ReleaseResource<DeviceContext>({temp_scope, buffers_holder},
stream);
}
};
......
......@@ -130,8 +130,9 @@ TEST(CinnLaunchOpTest, TestElementwiseAddPass) {
scope.Var(test_out_name)->GetMutable<LoDTensor>();
scope.Var(expected_out_name)->GetMutable<LoDTensor>();
cinn_launch_op->Run(scope, place);
elementwise_add_op->Run(scope, place);
platform::Place run_place(place);
cinn_launch_op->Run(scope, run_place);
elementwise_add_op->Run(scope, run_place);
LoDTensor test_out, expected_out;
TensorCopySync(scope.Var(test_out_name)->Get<LoDTensor>(),
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/complex_view_op.h"
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
class AsComplexOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "as_complex");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "as_complex");
auto in_dims = ctx->GetInputDim("X");
const int input_rank = in_dims.size();
PADDLE_ENFORCE_GE(
input_rank, 1,
platform::errors::InvalidArgument(
"The rank of input(X) is less than 1. "
"Expected the rank of input(X) to be equal to or greater than 1."
"But received rank of input(X) = %d",
input_rank));
const int last_dim_size = in_dims[input_rank - 1];
PADDLE_ENFORCE_EQ(
last_dim_size, 2,
platform::errors::InvalidArgument(
"The size of the last dimension of input(X)"
"does not equals 2."
"Expected the size of last dimension of input(X) to be 2."
"But received %d",
last_dim_size));
const framework::DDim out_dims(in_dims.Get(), input_rank - 1);
ctx->SetOutputDim("Out", out_dims);
ctx->ShareLoD("X", /*->*/ "Out");
}
};
class AsComplexOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "(Tensor), The input tensor of view_as_complex op.");
AddOutput("Out", "(Tensor), The output tensor of view_as_complex op.");
AddComment(R"DOC(
As_complex Operator.
This operator is used to return a complex tensor represented
by an old-fashioned real tensor. The size of the last dimension of
the input tensor should be 2, which corresponds to the real and
imaginary parts, respectively.
)DOC");
}
};
template <typename T>
class AsComplexGradMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
void Apply(GradOpPtr<T> retv) const override {
retv->SetType("as_real");
retv->SetInput("X", this->OutputGrad("Out"));
retv->SetAttrMap(this->Attrs());
retv->SetOutput("Out", this->InputGrad("X"));
}
};
class AsRealOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "as_real");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "as_real");
auto out_dims_v = framework::vectorize(ctx->GetInputDim("X"));
out_dims_v.push_back(2);
const framework::DDim out_dims = framework::make_ddim(out_dims_v);
ctx->SetOutputDim("Out", out_dims);
ctx->ShareLoD("X", /*->*/ "Out");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
auto input_data_type =
framework::OperatorWithKernel::IndicateVarDataType(ctx, "X");
return framework::OpKernelType(framework::ToRealType(input_data_type),
ctx.GetPlace());
}
};
class AsRealOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "(Tensor), The input tensor of as_real op.");
AddOutput("Out", "(Tensor), The output tensor of as_real op.");
AddComment(R"DOC(
AsReal Operator.
This operator is used to return an old-fashioned real tensor from a
complex tensor. The size of the last dimension of the output tensor is 2,
which corresponds to the real and imaginary parts, respectively.
)DOC");
}
};
template <typename T>
class AsRealGradMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
void Apply(GradOpPtr<T> retv) const override {
retv->SetType("as_complex");
retv->SetInput("X", this->OutputGrad("Out"));
retv->SetAttrMap(this->Attrs());
retv->SetOutput("Out", this->InputGrad("X"));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(as_complex, ops::AsComplexOp, ops::AsComplexOpMaker,
ops::AsComplexGradMaker<paddle::framework::OpDesc>,
ops::AsComplexGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(as_real, ops::AsRealOp, ops::AsRealOpMaker,
ops::AsRealGradMaker<paddle::framework::OpDesc>,
ops::AsRealGradMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(
as_complex, ops::AsComplexKernel<paddle::platform::CPUDeviceContext, float>,
ops::AsComplexKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
as_real, ops::AsRealKernel<paddle::platform::CPUDeviceContext, float>,
ops::AsRealKernel<paddle::platform::CPUDeviceContext, double>);
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/complex_view_op.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/enforce.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
as_complex,
ops::AsComplexKernel<paddle::platform::CUDADeviceContext, float>,
ops::AsComplexKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
as_real, ops::AsRealKernel<paddle::platform::CUDADeviceContext, float>,
ops::AsRealKernel<paddle::platform::CUDADeviceContext, double>);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/math/complex_functors.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/for_range.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class AsComplexKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const auto* x = context.Input<framework::LoDTensor>("X");
auto* out = context.Output<framework::LoDTensor>("Out");
out->mutable_data<platform::complex<T>>(context.GetPlace());
// TensorCopy also changes output's shape & dtype
const framework::DDim out_dims_original = out->dims();
framework::TensorCopy(*x, context.GetPlace(), out);
out->Resize(out_dims_original); // restore the shape
out->mutable_data<platform::complex<T>>(
context.GetPlace()); // restore the dtype
}
};
template <typename DeviceContext, typename T>
class AsRealKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const auto* x = context.Input<framework::LoDTensor>("X");
auto* out = context.Output<framework::LoDTensor>("Out");
out->mutable_data<T>(context.GetPlace());
const framework::DDim out_dims_original = out->dims();
framework::TensorCopy(*x, context.GetPlace(), out);
out->Resize(out_dims_original); // restore the shape
out->mutable_data<T>(context.GetPlace()); // restore the dtype
}
};
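// Shape and dtype relation implied by the InferShape rules and kernels above
// (illustrative):
//   as_complex: float32 [d0, ..., dn, 2] -> complex64 [d0, ..., dn]
//   as_real:    complex64 [d0, ..., dn]  -> float32   [d0, ..., dn, 2]
// e.g. a float32 tensor of shape [3, 2] viewed as_complex yields a complex64
// tensor of shape [3].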
} // namespace operators
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/ipu_runtime_op.h"
namespace paddle {
namespace operators {
class IpuRuntimeOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::proto::VarType::Type(ctx.Attr<int>("dtype")),
ctx.device_context());
}
};
class IpuRuntimeOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("FeedList", "FeedList of Graph").AsDuplicable();
AddOutput("FetchList", "FetchList of Graph").AsDuplicable();
AddAttr<int>("dtype",
"(int, default 5 (FP32)) "
"Output data type")
.SetDefault(framework::proto::VarType::FP32);
AddComment(R"DOC(
Run graph by PopART runtime.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(ipu_runtime, ops::IpuRuntimeOp, ops::IpuRuntimeOpMaker);
REGISTER_OP_IPU_KERNEL(ipu_runtime, ops::IpuRuntimeKernel<float>,
ops::IpuRuntimeKernel<double>,
ops::IpuRuntimeKernel<int>,
ops::IpuRuntimeKernel<int64_t>,
ops::IpuRuntimeKernel<bool>,
ops::IpuRuntimeKernel<int8_t>,
ops::IpuRuntimeKernel<paddle::platform::float16>);
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#ifdef PADDLE_WITH_IPU
#include "paddle/fluid/framework/ipu/ipu_backend.h"
#include "paddle/fluid/framework/tensor.h"
#endif
namespace paddle {
namespace operators {
template <typename T>
class IpuRuntimeKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#ifdef PADDLE_WITH_IPU
auto ipu_backend = framework::ipu::IpuBackend::GetInstance();
if (!ipu_backend->DeviceIsAttached()) {
const platform::IPUDeviceContext& ipu_ctx =
reinterpret_cast<const platform::IPUDeviceContext&>(
ctx.device_context());
ipu_backend->AttachDevice(ipu_ctx.DeviceId());
}
auto inputs = ctx.MultiInput<framework::Tensor>("FeedList");
auto outputs = ctx.MultiOutput<framework::Tensor>("FetchList");
auto output_names = ctx.OutputNames("FetchList");
VLOG(4) << "IpuRuntime Kernel, begin to run graph";
ipu_backend->Run(inputs, outputs, ctx);
// post-run
// resize tensor when tensor.dims() is empty
for (size_t i = 0; i < outputs.size(); ++i) {
auto* out = outputs[i];
if (out->dims().size() == 0) {
auto tensor_dtype = out->type();
auto sizeof_dtype = framework::SizeOfType(tensor_dtype);
int64_t dim = out->memory_size() / sizeof_dtype;
out->Resize({dim});
VLOG(10) << "set ipu_runtime_op output: " << output_names[i]
<< " dims from () to: "
<< "(" << dim << ")";
}
}
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Please compile WITH_IPU option to enable ipu_runtime op"));
#endif
}
};
} // namespace operators
} // namespace paddle
......@@ -173,6 +173,13 @@ void set_constant_with_place<platform::NPUPinnedPlace>(
platform::errors::Unimplemented("NPUPinnedPlace is not supported"));
}
template <>
void set_constant_with_place<platform::IPUPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
PADDLE_THROW(platform::errors::Unimplemented("IPUPlace is not supported"));
}
template <>
void set_constant_with_place<platform::CPUPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
......
......@@ -54,7 +54,7 @@ class PyLayerOp : public framework::OperatorWithKernel {
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
auto data_type = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
return framework::OpKernelType(data_type, ctx.device_context());
}
......
......@@ -555,10 +555,10 @@ class Reshape2Op : public ReshapeOp {
const framework::ExecutionContext &ctx) const override {
auto multi_inputs = ctx.MultiInput<framework::Tensor>("ShapeTensor");
if (multi_inputs.size() > 0) {
return framework::KernelSignature("reshape.mulhost", {"X", "ShapeTensor"},
return framework::KernelSignature("reshape_mulhost", {"X", "ShapeTensor"},
{}, {"Out"});
} else if (ctx.HasInput("Shape")) {
return framework::KernelSignature("reshape.host", {"X", "Shape"}, {},
return framework::KernelSignature("reshape_host", {"X", "Shape"}, {},
{"Out"});
} else {
return framework::KernelSignature("reshape", {"X"}, {"shape"}, {"Out"});
......
......@@ -18,6 +18,7 @@ namespace paddle {
namespace platform {
std::unique_ptr<CUDAGraph> CUDAGraph::capturing_graph_{nullptr};
paddle::optional<std::thread::id> CUDAGraph::capturing_thread_id_{paddle::none};
void CUDAGraph::Reset() {
if (is_reset_) return;
......@@ -58,6 +59,13 @@ void CUDAGraph::BeginSegmentCapture() {
IsCapturing(), true,
errors::PermissionDenied("BeginSegmentCapture should be called when CUDA "
"Graph is capturing."));
if (IsThreadLocalCapturing()) {
PADDLE_ENFORCE_EQ(IsThisThreadCapturing(), true,
platform::errors::PermissionDenied(
"When capturing CUDA Graph in the thread local mode, "
"you cannot begin segmented capturing in the thread "
"which is not the one that starts the capturing."));
}
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamBeginCapture(
capturing_graph_->stream_, capturing_graph_->capture_mode_));
PADDLE_ENFORCE_EQ(IsValidCapturing(), true,
......@@ -82,6 +90,11 @@ void CUDAGraph::BeginCapture(platform::CUDAPlace place, cudaStream_t stream,
capturing_graph_->place_ = place;
capturing_graph_->stream_ = stream;
capturing_graph_->capture_mode_ = mode;
if (mode == cudaStreamCaptureModeThreadLocal) {
capturing_thread_id_ = std::this_thread::get_id();
VLOG(10) << "Capturing CUDA Graph in thread local mode, thread id: "
<< capturing_thread_id_;
}
BeginSegmentCapture();
#endif
}
......@@ -115,6 +128,7 @@ void CUDAGraph::EndSegmentCapture() {
std::unique_ptr<CUDAGraph> CUDAGraph::EndCapture() {
EndSegmentCapture();
capturing_thread_id_ = paddle::none;
return std::move(capturing_graph_);
}
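// A minimal usage sketch of thread-local capture (editor's addition; the
// stream and the kernel launches are assumed to exist elsewhere):
//
//   platform::CUDAGraph::BeginCapture(platform::CUDAPlace(0), stream,
//                                     cudaStreamCaptureModeThreadLocal);
//   // ... launch kernels on `stream` from this same thread only ...
//   auto graph = platform::CUDAGraph::EndCapture();
//   graph->Replay();  // Replay() is assumed to be exposed by CUDAGraph
//
// With cudaStreamCaptureModeThreadLocal, BeginSegmentCapture enforces that
// only the thread that started the capture may add further segments.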
......
......@@ -18,6 +18,7 @@
#include <functional>
#include <memory>
#include <mutex>
#include <thread>
#include <vector>
#include "cuda.h" // NOLINT
#include "cuda_runtime.h" // NOLINT
......@@ -26,6 +27,7 @@
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/utils/optional.h"
namespace paddle {
namespace platform {
......@@ -99,6 +101,25 @@ class CUDAGraph {
// supported during capturing CUDA Graph.
static bool IsValidCapturing();
static bool IsThreadLocalCapturing() {
#if CUDA_VERSION >= 10010
return IsCapturing() &&
capturing_graph_->capture_mode_ == cudaStreamCaptureModeThreadLocal;
#else
return false;
#endif
}
static bool IsThisThreadCapturing() {
if (UNLIKELY(IsCapturing())) {
return IsThreadLocalCapturing()
? capturing_thread_id_.get() == std::this_thread::get_id()
: true;
} else {
return false;
}
}
private:
static CUDAGraphID UniqueID() {
static std::atomic<CUDAGraphID> id;
......@@ -118,6 +139,7 @@ class CUDAGraph {
bool is_reset_{false};
std::mutex mtx_;
static paddle::optional<std::thread::id> capturing_thread_id_;
static std::unique_ptr<CUDAGraph> capturing_graph_;
};
......
......@@ -101,6 +101,20 @@ inline static __device__ uint32_t add_to_high_half(uint32_t val, float x) {
return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
}
#if CUDA_VERSION >= 10000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
static __device__ __forceinline__ float16 CUDAFP16ToPDFP16(__half x) {
return *reinterpret_cast<float16 *>(&x);
}
static __device__ __forceinline__ __half PDFP16ToCUDAFP16(float16 x) {
return *reinterpret_cast<__half *>(&x);
}
CUDA_ATOMIC_WRAPPER(Add, float16) {
return CUDAFP16ToPDFP16(
atomicAdd(reinterpret_cast<__half *>(address), PDFP16ToCUDAFP16(val)));
}
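// Note (editor's addition): this fast path relies on paddle's float16 and
// CUDA's __half both being plain 16-bit storage types, which makes the
// reinterpret_casts above layout-safe; the native __half atomicAdd intrinsic
// only exists for CUDA >= 10.0 on devices of compute capability 7.0 or
// higher, hence the guard.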
#else
CUDA_ATOMIC_WRAPPER(Add, float16) {
  // The packed float16 value may exist in the lower or the higher 16 bits
  // of the 32-bit aligned address.
......@@ -133,6 +147,7 @@ CUDA_ATOMIC_WRAPPER(Add, float16) {
}
}
#endif
#endif
CUDA_ATOMIC_WRAPPER(Add, complex<float>) {
float *real = reinterpret_cast<float *>(address);
......
# IPU
IF(WITH_IPU)
FILE(GLOB POPART_CANONICALIZATION_SRC ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/device/ipu/popart_canonicalization/*.cc)
cc_library(ipu_device SRCS device.cc DEPS enforce popart)
cc_library(ipu_utils SRCS ipu_utils.cc DEPS memory framework_proto popart)
cc_library(ipu_strategy SRCS ipu_strategy.cc DEPS popart graph framework_proto enforce)
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h"
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
namespace ipu {
namespace {
Node *activation_op_handler(Graph *graph, Node *node, const std::string &type) {
auto new_node = CreateBaseOp(graph, node, type, {GetInputVarNode("X", node)},
node->outputs);
return new_node;
}
Node *relu_handler(Graph *graph, Node *node) {
return activation_op_handler(graph, node, "popart_relu");
}
Node *tanh_handler(Graph *graph, Node *node) {
return activation_op_handler(graph, node, "popart_tanh");
}
Node *log_handler(Graph *graph, Node *node) {
return activation_op_handler(graph, node, "popart_log");
}
Node *sigmoid_handler(Graph *graph, Node *node) {
return activation_op_handler(graph, node, "popart_sigmoid");
}
Node *sqrt_handler(Graph *graph, Node *node) {
return activation_op_handler(graph, node, "popart_sqrt");
}
Node *gelu_handler(Graph *graph, Node *node) {
return activation_op_handler(graph, node, "popart_gelu_v2");
}
Node *log_softmax_handler(Graph *graph, Node *node) {
auto axis = BOOST_GET_CONST(int, node->Op()->GetAttr("axis"));
auto new_softmax = CreateSoftmaxOpset11(graph, node, node->inputs, {}, axis);
return CreateBaseOp(graph, node, "popart_log", new_softmax->outputs,
node->outputs);
}
REGISTER_HANDLER(relu, relu_handler);
REGISTER_HANDLER(tanh, tanh_handler);
REGISTER_HANDLER(log, log_handler);
REGISTER_HANDLER(sigmoid, sigmoid_handler);
REGISTER_HANDLER(sqrt, sqrt_handler);
REGISTER_HANDLER(gelu, gelu_handler);
REGISTER_HANDLER(log_softmax, log_softmax_handler);
} // namespace
} // namespace ipu
} // namespace platform
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h"
namespace paddle {
namespace platform {
namespace ipu {
// This avoids the static initialisation order fiasco: the handler map lives
// in a function-local static and is constructed on first use.
std::unordered_map<std::string, SymbolHandler> &SymbolHandlers() {
static std::unordered_map<std::string, SymbolHandler> symbol_handlers;
return symbol_handlers;
}
bool RegisterHandler(const std::string &symbol, const SymbolHandler &handler) {
if (SymbolHandlers().count(symbol) != 0) {
LOG(WARNING) << "Trying to register popart handler twice for operator: "
<< symbol;
return false;
}
bool new_handler = SymbolHandlers().emplace(symbol, handler).second;
return new_handler;
}
// Return the handler registered for this kind of node, or an empty
// std::function otherwise.
SymbolHandler GetHandler(const std::string &kind) {
auto it = SymbolHandlers().find(kind);
if (it != SymbolHandlers().end()) {
return it->second;
}
return {};
}
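// Registration sketch (editor's addition, hypothetical op type): because the
// map is built lazily, REGISTER_HANDLER is safe from any translation unit.
//
//   Node *my_op_handler(Graph *graph, Node *node) {
//     return CreateBaseOp(graph, node, "popart_identity", node->inputs,
//                         node->outputs);
//   }
//   REGISTER_HANDLER(my_op, my_op_handler);
//
//   // Lookup side:
//   if (auto handler = GetHandler("my_op")) {
//     handler(graph, node);
//   }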
void ConnectNodes(Node *first_node, Node *next_node) {
first_node->outputs.push_back(next_node);
next_node->inputs.push_back(first_node);
}
void DisConnectNodes(Node *first_node, Node *next_node) {
auto rm_by_value = [&](std::vector<Node *> &vec, Node *n) {
vec.erase(std::remove(vec.begin(), vec.end(), n), vec.end());
};
rm_by_value(first_node->outputs, next_node);
rm_by_value(next_node->inputs, first_node);
rm_by_value(first_node->inputs, next_node);
rm_by_value(next_node->outputs, first_node);
}
void ClearNode(Node *node) {
auto rm_by_value = [&](std::vector<Node *> &vec, Node *n) {
vec.erase(std::remove(vec.begin(), vec.end(), n), vec.end());
};
for (auto *node_in : node->inputs) {
rm_by_value(node_in->outputs, node);
}
for (auto *node_out : node->outputs) {
rm_by_value(node_out->inputs, node);
}
}
void CopyOpAttr(const std::string &attr_name, OpDesc *op, OpDesc *new_op,
bool override) {
if (new_op->HasAttr(attr_name) && !override) {
return;
}
if (op->HasAttr(attr_name)) {
VLOG(10) << "Copying attr: " << attr_name << " from " << op->Type()
<< " to " << new_op->Type();
new_op->SetAttr(attr_name, op->GetAttr(attr_name));
new_op->Flush();
}
}
const int VarType2OnnxDtype(const int type) {
auto dtype = static_cast<framework::proto::VarType::Type>(type);
switch (dtype) {
case framework::proto::VarType::BOOL:
return static_cast<int>(ONNXDataType::BOOL);
case framework::proto::VarType::INT16:
return static_cast<int>(ONNXDataType::INT16);
case framework::proto::VarType::INT32:
return static_cast<int>(ONNXDataType::INT32);
case framework::proto::VarType::INT64:
return static_cast<int>(ONNXDataType::INT64);
case framework::proto::VarType::FP16:
return static_cast<int>(ONNXDataType::FLOAT16);
case framework::proto::VarType::FP32:
return static_cast<int>(ONNXDataType::FLOAT);
case framework::proto::VarType::FP64:
return static_cast<int>(ONNXDataType::DOUBLE);
case framework::proto::VarType::UINT8:
return static_cast<int>(ONNXDataType::UINT8);
case framework::proto::VarType::INT8:
return static_cast<int>(ONNXDataType::INT8);
case framework::proto::VarType::BF16:
return static_cast<int>(ONNXDataType::BFLOAT16);
case framework::proto::VarType::COMPLEX64:
return static_cast<int>(ONNXDataType::COMPLEX64);
case framework::proto::VarType::COMPLEX128:
return static_cast<int>(ONNXDataType::COMPLEX128);
default:
PADDLE_THROW(
platform::errors::Unimplemented("Unsupported data type: %d.", dtype));
}
}
const std::string VarType2PopStr(const int type) {
auto dtype = static_cast<framework::proto::VarType::Type>(type);
switch (dtype) {
case framework::proto::VarType::UINT8:
return "UINT8";
case framework::proto::VarType::INT8:
return "INT8";
case framework::proto::VarType::INT16:
return "INT16";
case framework::proto::VarType::INT32:
return "INT32";
case framework::proto::VarType::INT64:
return "INT64";
case framework::proto::VarType::BOOL:
return "BOOL";
case framework::proto::VarType::FP64:
return "DOUBLE";
case framework::proto::VarType::FP32:
return "FLOAT";
case framework::proto::VarType::FP16:
return "FLOAT16";
default:
PADDLE_THROW(
paddle::platform::errors::Unavailable("Unsupported data type."));
}
}
Node *GetInputVarNode(const std::string &input_name, const Node *op_node,
const int id) {
auto var_name = op_node->Op()->Input(input_name).at(id);
return GetInputVarNodeByVarName(var_name, op_node);
}
Node *GetOutputVarNode(const std::string &output_name, const Node *op_node,
const int id) {
auto var_name = op_node->Op()->Output(output_name).at(id);
return GetOutputVarNodeByVarName(var_name, op_node);
}
Node *GetInputVarNodeByVarName(const std::string &var_name,
const Node *op_node) {
for (auto *var : op_node->inputs) {
if (var->Name() == var_name) {
return var;
}
}
return nullptr;
}
Node *GetOutputVarNodeByVarName(const std::string &var_name,
const Node *op_node) {
for (auto *var : op_node->outputs) {
if (var->Name() == var_name) {
return var;
}
}
return nullptr;
}
const bool is_float_equal(float a, float b, float eps) {
return std::fabs(a - b) <= eps;
}
} // namespace ipu
} // namespace platform
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/platform/device/ipu/ipu_utils.h"
namespace paddle {
namespace platform {
namespace ipu {
using framework::ir::Graph;
using framework::ir::Node;
using framework::OpDesc;
#define REGISTER_HANDLER(name, func) \
static bool __UNUSED_##name = \
paddle::platform::ipu::RegisterHandler(#name, func)
using SymbolHandler = std::function<Node *(Graph *, Node *)>;
std::unordered_map<std::string, SymbolHandler> &SymbolHandlers();
bool RegisterHandler(const std::string &, const SymbolHandler &);
SymbolHandler GetHandler(const std::string &);
void ConnectNodes(Node *first_node, Node *next_node);
void DisConnectNodes(Node *first_node, Node *next_node);
void ClearNode(Node *node);
void CopyOpAttr(const std::string &attr_name, OpDesc *op, OpDesc *new_op,
bool override = false);
const int VarType2OnnxDtype(const int type);
const std::string VarType2PopStr(const int type);
Node *GetInputVarNode(const std::string &input_name, const Node *op_node,
const int id = 0);
Node *GetOutputVarNode(const std::string &output_name, const Node *op_node,
const int id = 0);
Node *GetInputVarNodeByVarName(const std::string &var_name,
const Node *op_node);
Node *GetOutputVarNodeByVarName(const std::string &var_name,
const Node *op_node);
const bool is_float_equal(float a, float b, float eps = 1e-8);
} // namespace ipu
} // namespace platform
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h"
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
namespace ipu {
namespace {
Node *elementwise_op_handler(Graph *graph, Node *node,
const std::string &type) {
auto *op = node->Op();
auto x_shape = GetInputVarNode("X", node)->Var()->GetShape();
int64_t x_rank = x_shape.size();
auto y_shape = GetInputVarNode("Y", node)->Var()->GetShape();
int64_t y_rank = y_shape.size();
auto axis = BOOST_GET_CONST(int, op->GetAttr("axis"));
if (axis == -1 || axis == x_rank - 1 || x_rank == y_rank) {
auto new_node =
CreateBaseOp(graph, node, type,
{GetInputVarNode("X", node), GetInputVarNode("Y", node)},
node->outputs);
return new_node;
} else {
auto y_new_shape = std::vector<int64_t>(x_rank, 1);
for (int i = axis; i < axis + y_rank; ++i) {
y_new_shape[i] = y_shape[i - axis];
}
auto attrs = AttributeMap{
{"value", y_new_shape},
{"dims", std::vector<int64_t>{x_rank}},
{"dtype", ONNXDataType::INT64},
};
// constant
auto new_node_const = CreateConst(graph, node, {}, {}, attrs);
// reshape
auto new_node_reshape = CreateBaseOp(
graph, node, "popart_reshape",
{GetInputVarNode("Y", node), new_node_const->outputs[0]}, {});
// elementwise_op
auto new_node =
CreateBaseOp(graph, node, type,
{GetInputVarNode("X", node), new_node_reshape->outputs[0]},
node->outputs);
return new_node;
}
}
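// Worked example (editor's note): X with shape [2, 3, 4, 5], Y with shape
// [3, 4] and axis = 1 takes the else-branch; y_new_shape starts as
// [1, 1, 1, 1] and becomes [1, 3, 4, 1], so Y is reshaped to broadcast
// against X before the binary popart op is created.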
Node *elementwise_add_handler(Graph *graph, Node *node) {
return elementwise_op_handler(graph, node, "popart_add");
}
Node *elementwise_sub_handler(Graph *graph, Node *node) {
return elementwise_op_handler(graph, node, "popart_sub");
}
Node *elementwise_div_handler(Graph *graph, Node *node) {
return elementwise_op_handler(graph, node, "popart_div");
}
Node *elementwise_mul_handler(Graph *graph, Node *node) {
return elementwise_op_handler(graph, node, "popart_mul");
}
Node *elementwise_min_handler(Graph *graph, Node *node) {
return elementwise_op_handler(graph, node, "popart_min");
}
Node *elementwise_max_handler(Graph *graph, Node *node) {
return elementwise_op_handler(graph, node, "popart_max");
}
Node *elementwise_pow_handler(Graph *graph, Node *node) {
return elementwise_op_handler(graph, node, "popart_pow");
}
Node *elementwise_mod_handler(Graph *graph, Node *node) {
return elementwise_op_handler(graph, node, "popart_mod");
}
REGISTER_HANDLER(elementwise_add, elementwise_add_handler);
REGISTER_HANDLER(elementwise_sub, elementwise_sub_handler);
REGISTER_HANDLER(elementwise_div, elementwise_div_handler);
REGISTER_HANDLER(elementwise_mul, elementwise_mul_handler);
REGISTER_HANDLER(elementwise_min, elementwise_min_handler);
REGISTER_HANDLER(elementwise_max, elementwise_max_handler);
REGISTER_HANDLER(elementwise_pow, elementwise_pow_handler);
REGISTER_HANDLER(elementwise_mod, elementwise_mod_handler);
} // namespace
} // namespace ipu
} // namespace platform
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h"
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
namespace ipu {
namespace {
Node *equal_handler(Graph *graph, Node *node) {
auto new_node = CreateBaseOp(
graph, node, "popart_equal",
{GetInputVarNode("X", node), GetInputVarNode("Y", node)}, node->outputs);
return new_node;
}
REGISTER_HANDLER(equal, equal_handler);
} // namespace
} // namespace ipu
} // namespace platform
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h"
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
namespace ipu {
namespace {
Node *mean_handler(Graph *graph, Node *node) {
return CreateBaseOp(graph, node, "popart_reducemean",
{GetInputVarNode("X", node)},
{GetOutputVarNode("Out", node)},
{
{"keepdims", int64_t{0}},
});
}
Node *pow_handler(Graph *graph, Node *node) {
auto *op = node->Op();
if (op->HasInput("FactorTensor") && !op->Input("FactorTensor").empty()) {
return CreateBaseOp(
graph, node, "popart_pow",
{GetInputVarNode("X", node), GetInputVarNode("FactorTensor", node)},
node->outputs);
} else {
// Op(pow) -> Op(Constant)->Var(const_out)->Op(Pow)
auto value_ = BOOST_GET_CONST(float, op->GetAttr("factor"));
auto attrs =
MakeConstAttrMapFromValue<float>(value_, {1}, ONNXDataType::FLOAT);
auto new_node_const = CreateConst(graph, node, {}, {}, attrs);
return CreateBaseOp(graph, node, "popart_pow", {GetInputVarNode("X", node),
new_node_const->outputs[0]},
node->outputs);
}
}
Node *mul_handler(Graph *graph, Node *node) {
auto *op = node->Op();
auto x_num_col_dims = BOOST_GET_CONST(int, op->GetAttr("x_num_col_dims"));
auto y_num_col_dims = BOOST_GET_CONST(int, op->GetAttr("y_num_col_dims"));
auto x_shape_ = GetInputVarNode("X", node)->Var()->GetShape();
auto y_shape_ = GetInputVarNode("Y", node)->Var()->GetShape();
// build the shape for reshape
std::vector<int64_t> reshape_shape_{};
for (int left = 0; left < x_num_col_dims; left++) {
reshape_shape_.push_back(int64_t(x_shape_[left]));
}
for (int right = y_num_col_dims; right < y_shape_.size(); right++) {
reshape_shape_.push_back(int64_t(y_shape_[right]));
}
auto x_flatten =
CreateBaseOp(graph, node, "popart_flatten", {GetInputVarNode("X", node)},
{}, {{"axis", int64_t(x_num_col_dims)}});
auto y_flatten =
CreateBaseOp(graph, node, "popart_flatten", {GetInputVarNode("Y", node)},
{}, {{"axis", int64_t(y_num_col_dims)}});
auto matmul =
CreateBaseOp(graph, node, "popart_matmul",
{x_flatten->outputs[0], y_flatten->outputs[0]}, {}, {});
auto reshape_const = CreateConst(
graph, node, {}, {},
{{"value", reshape_shape_},
{"dims", std::vector<int64_t>{int64_t(reshape_shape_.size())}},
{"dtype", ONNXDataType::INT64}});
return CreateBaseOp(graph, node, "popart_reshape",
{matmul->outputs[0], reshape_const->outputs[0]},
node->outputs, {});
}
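// Worked example (editor's note): X with shape [2, 3, 4], Y with shape
// [12, 5], x_num_col_dims = 1 and y_num_col_dims = 1. X is flattened to
// [2, 12], Y to [12, 5], the matmul yields [2, 5], and reshape_shape_
// ([2] from X's leading dims plus [5] from Y's trailing dims) restores the
// expected output shape.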
Node *matmul_handler(Graph *graph, Node *node) {
auto *op = node->Op();
auto transpose_x = BOOST_GET_CONST(bool, op->GetAttr("transpose_X"));
auto transpose_y = BOOST_GET_CONST(bool, op->GetAttr("transpose_Y"));
auto alpha = BOOST_GET_CONST(float, op->GetAttr("alpha"));
auto x_shape = GetInputVarNode("X", node)->Var()->GetShape();
auto y_shape = GetInputVarNode("Y", node)->Var()->GetShape();
int x_rank = x_shape.size();
std::vector<int64_t> perm;
if (x_rank == 1) {
perm = std::vector<int64_t>{0};
} else if (x_rank == 2) {
return CreateGemm(graph, node,
{GetInputVarNode("X", node), GetInputVarNode("Y", node)},
node->outputs, transpose_x, transpose_y, alpha);
} else if (x_rank == 3) {
perm = std::vector<int64_t>{0, 2, 1};
} else if (x_rank == 4) {
perm = std::vector<int64_t>{0, 1, 3, 2};
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"op matmul with input rank == %d", x_rank));
}
Node *x_node = GetInputVarNode("X", node);
Node *y_node = GetInputVarNode("Y", node);
if (transpose_x) {
x_node = CreateBaseOp(graph, node, "popart_transpose",
{GetInputVarNode("X", node)}, {}, {{"perm", perm}});
x_node = x_node->outputs[0];
}
if (transpose_y) {
y_node = CreateBaseOp(graph, node, "popart_transpose",
{GetInputVarNode("Y", node)}, {}, {{"perm", perm}});
y_node = y_node->outputs[0];
}
  if (is_float_equal(alpha, 1.0)) {
    return CreateBaseOp(graph, node, "popart_matmul", {x_node, y_node},
                        node->outputs);
  } else {
    // alpha != 1.0: scale the matmul result by a constant.
    auto o_node =
        CreateBaseOp(graph, node, "popart_matmul", {x_node, y_node}, {});
    auto attr = MakeConstAttrMapFromValue(alpha, {1}, ONNXDataType::FLOAT);
    auto const_node = CreateConst(graph, node, {}, {}, attr);
    return CreateBaseOp(graph, node, "popart_mul",
                        {o_node->outputs[0], const_node->outputs[0]},
                        node->outputs);
  }
}
Node *sum_handler(Graph *graph, Node *node) {
return CreateBaseOp(graph, node, "popart_sum", node->inputs, node->outputs);
}
Node *softmax_handler(Graph *graph, Node *node) {
auto *op = node->Op();
auto axis = BOOST_GET_CONST(int, op->GetAttr("axis"));
return CreateSoftmaxOpset11(graph, node, node->inputs, node->outputs, axis);
}
Node *scale_handler(Graph *graph, Node *node) {
auto *op = node->Op();
auto scale_ = BOOST_GET_CONST(float, op->GetAttr("scale"));
auto bias_ = BOOST_GET_CONST(float, op->GetAttr("bias"));
auto bias_after_scale_ =
BOOST_GET_CONST(bool, op->GetAttr("bias_after_scale"));
auto data_type_ = GetInputVarNode("X", node)->Var()->GetDataType();
auto new_node_bias_var =
CreateConst(graph, node, {}, {}, {{"value", std::vector<float>{bias_}},
{"dims", std::vector<int64_t>{1}},
{"dtype", ONNXDataType::FLOAT}});
new_node_bias_var = new_node_bias_var->outputs[0];
Node *new_node_scale_var = nullptr;
if (op->HasInput("ScaleTensor") && !op->Input("ScaleTensor").empty()) {
new_node_scale_var = GetInputVarNode("ScaleTensor", node);
} else {
new_node_scale_var =
CreateConst(graph, node, {}, {}, {{"value", std::vector<float>{scale_}},
{"dims", std::vector<int64_t>{1}},
{"dtype", ONNXDataType::FLOAT}});
new_node_scale_var = new_node_scale_var->outputs[0];
}
// convert to float32
auto new_node_cast =
CreateCast(graph, node, {GetInputVarNode("X", node)}, {},
static_cast<int>(framework::proto::VarType::FP32));
Node *result = nullptr;
if (bias_after_scale_) {
auto new_node_mul =
CreateBaseOp(graph, node, "popart_mul",
{new_node_cast->outputs[0], new_node_scale_var}, {}, {});
result =
CreateBaseOp(graph, node, "popart_add",
{new_node_mul->outputs[0], new_node_bias_var}, {}, {});
} else {
auto new_node_add =
CreateBaseOp(graph, node, "popart_add",
{new_node_cast->outputs[0], new_node_bias_var}, {}, {});
result =
CreateBaseOp(graph, node, "popart_mul",
{new_node_add->outputs[0], new_node_scale_var}, {}, {});
}
auto result_after_cast =
CreateCast(graph, node, result->outputs, node->outputs,
static_cast<int>(data_type_));
return result_after_cast;
}
Node *cross_entropy2_handler(Graph *graph, Node *node) {
auto *op = node->Op();
auto ignoreIndex = BOOST_GET_CONST(int, op->GetAttr("ignore_index"));
auto new_cast = CreateCast(graph, node, {GetInputVarNode("Label", node)}, {},
framework::proto::VarType::INT32);
auto label_shape_ = GetInputVarNode("Label", node)->Var()->GetShape();
if (label_shape_.size() == 1) {
return CreateBaseOp(graph, node, "popart_nllloss",
{GetInputVarNode("X", node), new_cast->outputs[0]},
{GetOutputVarNode("Y", node)},
{
{"ignoreIndex", ignoreIndex},
});
} else {
std::vector<int64_t> new_shape_{label_shape_[0]};
auto const_before_loss = CreateBaseOp(
graph, node, "popart_constant", {}, {},
{{"value", new_shape_},
{"dims",
std::vector<int64_t>{static_cast<int64_t>(new_shape_.size())}},
{"dtype", ONNXDataType::INT64}});
auto reshape_before_loss = CreateBaseOp(
graph, node, "popart_reshape",
{new_cast->outputs[0], const_before_loss->outputs[0]}, {}, {});
auto nllloss = CreateBaseOp(
graph, node, "popart_nllloss",
{GetInputVarNode("X", node), reshape_before_loss->outputs[0]}, {},
{
{"ignoreIndex", ignoreIndex},
});
auto const_after_loss = CreateBaseOp(
graph, node, "popart_constant", {}, {},
{{"value", label_shape_},
{"dims",
std::vector<int64_t>{static_cast<int64_t>(label_shape_.size())}},
{"dtype", ONNXDataType::INT64}});
auto reshape_after_loss =
CreateBaseOp(graph, node, "popart_reshape",
{nllloss->outputs[0], const_after_loss->outputs[0]},
{GetOutputVarNode("Y", node)}, {});
return reshape_after_loss;
}
}
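// Shape walk-through (editor's note): for Label of shape [N, 1] the handler
// casts it to INT32, reshapes it to [N] (popart_nllloss expects a 1-D
// target), applies nllloss, then reshapes the per-sample loss back to
// [N, 1] so the output matches the Paddle op's layout.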
REGISTER_HANDLER(mean, mean_handler);
REGISTER_HANDLER(pow, pow_handler);
REGISTER_HANDLER(mul, mul_handler);
REGISTER_HANDLER(matmul, matmul_handler);
REGISTER_HANDLER(sum, sum_handler);
REGISTER_HANDLER(softmax, softmax_handler);
REGISTER_HANDLER(scale, scale_handler);
REGISTER_HANDLER(cross_entropy2, cross_entropy2_handler);
} // namespace
} // namespace ipu
} // namespace platform
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h"
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
namespace ipu {
namespace {
Node *conv2d_handler(Graph *graph, Node *node) {
OpDesc *op = node->Op();
auto dilations_ = BOOST_GET_CONST(std::vector<int>, op->GetAttr("dilations"));
auto dilations = std::vector<int64_t>{dilations_.begin(), dilations_.end()};
auto group_ = BOOST_GET_CONST(int, op->GetAttr("groups"));
auto pads_ = BOOST_GET_CONST(std::vector<int>, op->GetAttr("paddings"));
if (pads_.size() == 2) {
pads_.push_back(pads_[0]);
pads_.push_back(pads_[1]);
}
auto pads = std::vector<int64_t>{pads_.begin(), pads_.end()};
auto stride_ = BOOST_GET_CONST(std::vector<int>, op->GetAttr("strides"));
auto stride = std::vector<int64_t>{stride_.begin(), stride_.end()};
if (op->HasInput("Bias") && !op->Input("Bias").empty()) {
return CreateConv(
graph, node,
{
GetInputVarNode("Input", node), GetInputVarNode("Filter", node),
GetInputVarNode("Bias", node),
},
node->outputs, dilations, group_, {}, pads, stride);
} else {
return CreateConv(
graph, node,
{
GetInputVarNode("Input", node), GetInputVarNode("Filter", node),
},
node->outputs, dilations, group_, {}, pads, stride);
}
}
Node *batch_norm_handler(Graph *graph, Node *node) {
auto *op = node->Op();
std::vector<Node *> inputs;
inputs.push_back(GetInputVarNode("X", node));
inputs.push_back(GetInputVarNode("Scale", node));
inputs.push_back(GetInputVarNode("Bias", node));
inputs.push_back(GetInputVarNode("Mean", node));
inputs.push_back(GetInputVarNode("Variance", node));
int64_t num_outputs = 1;
std::vector<Node *> outputs;
auto is_test_type = op->GetAttrType("is_test");
bool is_test;
if (is_test_type == 0) {
// int
is_test = BOOST_GET_CONST(int, op->GetAttr("is_test"));
} else {
// bool
is_test = BOOST_GET_CONST(bool, op->GetAttr("is_test"));
}
outputs.push_back(GetOutputVarNode("Y", node));
if (!is_test) {
outputs.push_back(GetOutputVarNode("MeanOut", node));
outputs.push_back(GetOutputVarNode("VarianceOut", node));
outputs.push_back(GetOutputVarNode("SavedMean", node));
outputs.push_back(GetOutputVarNode("SavedVariance", node));
num_outputs = 5;
}
// outputs.push_back(GetOutputVarNode("ReserveSpace", node));
auto momentum = BOOST_GET_CONST(float, op->GetAttr("momentum"));
auto epsilon = BOOST_GET_CONST(float, op->GetAttr("epsilon"));
// data_layout
return CreateBaseOp(graph, node, "popart_batchnormalization", inputs, outputs,
{
{"momentum", momentum},
{"epsilon", epsilon},
{"num_outputs", num_outputs},
});
}
Node *pool2d_handler(Graph *graph, Node *node) {
auto *op = node->Op();
auto pooling_type = BOOST_GET_CONST(std::string, op->GetAttr("pooling_type"));
auto global_pooling = BOOST_GET_CONST(bool, op->GetAttr("global_pooling"));
if (global_pooling) {
if (pooling_type == "max") {
return CreateBaseOp(graph, node, "popart_globalmaxpool", node->inputs,
node->outputs);
} else if (pooling_type == "avg") {
return CreateBaseOp(graph, node, "popart_globalaveragepool", node->inputs,
node->outputs);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"op pool2d with unkonwn pooling_type: %s", pooling_type));
}
}
if (op->HasAttr("padding_algorithm")) {
auto padding_algorithm =
BOOST_GET_CONST(std::string, op->GetAttr("padding_algorithm"));
if (padding_algorithm != "EXPLICIT") {
PADDLE_THROW(platform::errors::InvalidArgument(
"op pool2d with unkonwn padding_algorithm: %s", padding_algorithm));
}
}
auto ksize = BOOST_GET_CONST(std::vector<int>, op->GetAttr("ksize"));
auto kernel_shape = std::vector<int64_t>{ksize.begin(), ksize.end()};
auto ceil_mode_ = BOOST_GET_CONST(bool, op->GetAttr("ceil_mode"));
auto ceil_mode = int64_t(ceil_mode_ ? 1 : 0);
auto paddings = BOOST_GET_CONST(std::vector<int>, op->GetAttr("paddings"));
auto pads = std::vector<int64_t>{paddings.begin(), paddings.end()};
if (pads.size() == 2) {
pads.push_back(paddings[0]);
pads.push_back(paddings[1]);
}
auto strides_ = BOOST_GET_CONST(std::vector<int>, op->GetAttr("strides"));
auto strides = std::vector<int64_t>{strides_.begin(), strides_.end()};
if (pooling_type == "max") {
int64_t num_outputs = 1;
auto dilations = std::vector<int64_t>{};
int64_t storage_order = 0;
return CreateBaseOp(graph, node, "popart_maxpool", node->inputs,
node->outputs, {
{"num_outputs", num_outputs},
{"kernel_shape", kernel_shape},
{"ceil_mode", ceil_mode},
{"dilations", dilations},
{"pads", pads},
{"storage_order", storage_order},
{"strides", strides},
});
} else if (pooling_type == "avg") {
int64_t count_include_pad = 0;
return CreateBaseOp(graph, node, "popart_averagepool", node->inputs,
node->outputs,
{
{"kernel_shape", kernel_shape},
{"ceil_mode", ceil_mode},
{"count_include_pad", count_include_pad},
{"pads", pads},
{"strides", strides},
});
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"op pool2d with unkonwn pooling_type: %s", pooling_type));
}
}
Node *group_norm_handler(Graph *graph, Node *node) {
auto *op = node->Op();
auto epsilon_ = BOOST_GET_CONST(float, op->GetAttr("epsilon"));
auto groups_ = BOOST_GET_CONST(int, op->GetAttr("groups"));
auto groups = int64_t{groups_};
auto attrs_ = AttributeMap{{"epsilon", epsilon_}, {"num_groups", groups}};
std::vector<Node *> inputs_ = {GetInputVarNode("X", node),
GetInputVarNode("Scale", node),
GetInputVarNode("Bias", node)};
std::vector<Node *> outputs_ = {GetOutputVarNode("Y", node),
GetOutputVarNode("Mean", node),
GetOutputVarNode("Variance", node)};
return CreateBaseOp(graph, node, "popart_groupnormalization_v2", inputs_,
outputs_, attrs_);
}
Node *instance_norm_handler(Graph *graph, Node *node) {
auto *op = node->Op();
auto epsilon_ = BOOST_GET_CONST(float, op->GetAttr("epsilon"));
auto attrs_ = AttributeMap{{"epsilon", epsilon_}};
std::vector<Node *> inputs_ = {GetInputVarNode("X", node),
GetInputVarNode("Scale", node),
GetInputVarNode("Bias", node)};
std::vector<Node *> outputs_ = {GetOutputVarNode("Y", node)};
return CreateBaseOp(graph, node, "popart_instancenormalization", inputs_,
outputs_, attrs_);
}
Node *layer_norm_handler(Graph *graph, Node *node) {
auto *op = node->Op();
auto begin_norm_axis_ = BOOST_GET_CONST(int, op->GetAttr("begin_norm_axis"));
auto input_shape_ = GetInputVarNode("X", node)->Var()->GetShape();
std::vector<int64_t> norm_shape_{1, 1};
for (int i = 0; i < input_shape_.size(); i++) {
if (i < begin_norm_axis_) {
norm_shape_[0] *= input_shape_[i];
} else {
norm_shape_[1] *= input_shape_[i];
}
}
auto attrs1 = AttributeMap{
{"value", norm_shape_},
{"dims", std::vector<int64_t>{static_cast<int64_t>(norm_shape_.size())}},
{"dtype", ONNXDataType::INT64}};
auto reshape1_const =
CreateBaseOp(graph, node, "popart_constant", {}, {}, attrs1);
auto new_node_reshape1 = CreateBaseOp(
graph, node, "popart_reshape",
{GetInputVarNode("X", node), reshape1_const->outputs[0]}, {}, {});
auto epsilon_ = BOOST_GET_CONST(float, op->GetAttr("epsilon"));
int64_t groups_ = 1;
auto groupnorm_attrs_ =
AttributeMap{{"epsilon", epsilon_}, {"num_groups", groups_}};
auto out_Y_ = MakeVarNode(graph, node);
CreateBaseOp(graph, node, "popart_groupnormalization_v2",
{new_node_reshape1->outputs[0], GetInputVarNode("Scale", node),
GetInputVarNode("Bias", node)},
{out_Y_, GetOutputVarNode("Mean", node),
GetOutputVarNode("Variance", node)},
groupnorm_attrs_);
auto attrs2 = AttributeMap{
{"value", input_shape_},
{"dims", std::vector<int64_t>{static_cast<int64_t>(input_shape_.size())}},
{"dtype", ONNXDataType::INT64}};
auto reshape2_const =
CreateBaseOp(graph, node, "popart_constant", {}, {}, attrs2);
auto new_node_reshape2 = CreateBaseOp(graph, node, "popart_reshape",
{out_Y_, reshape2_const->outputs[0]},
{GetOutputVarNode("Y", node)}, {});
return new_node_reshape2;
}
Node *dropout_handler(Graph *graph, Node *node) {
auto *op = node->Op();
auto dropout_prob_ = BOOST_GET_CONST(float, op->GetAttr("dropout_prob"));
auto dropout_implementation_ =
BOOST_GET_CONST(std::string, op->GetAttr("dropout_implementation"));
auto is_test_type_ = op->GetAttrType("is_test");
bool is_test_;
if (is_test_type_ == 0) {
// int
is_test_ = BOOST_GET_CONST(int, op->GetAttr("is_test"));
} else {
// bool
is_test_ = BOOST_GET_CONST(bool, op->GetAttr("is_test"));
}
if (is_test_) {
if (dropout_implementation_ == "upscale_in_train") {
return CreateBaseOp(graph, node, "popart_identity",
{GetInputVarNode("X", node)},
{GetOutputVarNode("Out", node)}, {});
} else if (dropout_implementation_ == "downgrade_in_infer") {
auto scale =
CreateConst(graph, node, {}, {},
{{"value", std::vector<float>{1 - dropout_prob_}},
{"dims", std::vector<int64_t>{1}},
{"dtype", ONNXDataType::FLOAT}});
return CreateBaseOp(graph, node, "popart_mul",
{GetInputVarNode("X", node), scale->outputs[0]},
{GetOutputVarNode("Out", node)}, {});
} else {
PADDLE_THROW(
platform::errors::InvalidArgument("Invalid dropout_implementation"));
}
} else {
if (dropout_implementation_ == "upscale_in_train") {
auto attrs_ =
AttributeMap{{"num_outputs", (int64_t)1}, {"ratio", dropout_prob_}};
return CreateBaseOp(graph, node, "popart_dropout",
{GetInputVarNode("X", node)},
{GetOutputVarNode("Out", node)}, attrs_);
} else if (dropout_implementation_ == "downgrade_in_infer") {
PADDLE_THROW(platform::errors::InvalidArgument(
"Do not support downgrade_in_infer with training"));
} else {
PADDLE_THROW(
platform::errors::InvalidArgument("Invalid dropout_implementation"));
}
}
}
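// Semantics being matched (editor's note), with dropout probability p:
//   upscale_in_train:    train y = x * mask / (1 - p),  infer y = x
//   downgrade_in_infer:  train y = x * mask,            infer y = x * (1 - p)
// which is why inference maps to popart_identity or a constant multiply,
// and training with downgrade_in_infer is rejected above.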
REGISTER_HANDLER(pool2d, pool2d_handler);
REGISTER_HANDLER(batch_norm, batch_norm_handler);
REGISTER_HANDLER(group_norm, group_norm_handler);
REGISTER_HANDLER(instance_norm, instance_norm_handler);
REGISTER_HANDLER(layer_norm, layer_norm_handler);
REGISTER_HANDLER(conv2d, conv2d_handler);
REGISTER_HANDLER(dropout, dropout_handler);
} // namespace
} // namespace ipu
} // namespace platform
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h"
namespace paddle {
namespace platform {
namespace ipu {
// File-local counters used to generate unique var/op names.
static int var_count = 0;
static int op_count = 0;
const std::string GenerateVarName() {
return std::string("_gen_var_") + std::to_string(var_count++);
}
const std::string GenerateOpName() {
return std::string("_gen_op_") + std::to_string(op_count++);
}
const std::string CreateOpIdentifyId(Node *node) {
// format: op_type|out_var0|out_var1|...|_gen_*
// this name will be used as op name when exporting onnx model from popart
auto op_type = node->Name();
std::string op_out = "";
for (auto *out_node : node->outputs) {
op_out += "|";
op_out += out_node->Name();
}
return {op_type + op_out + "|" + GenerateOpName()};
}
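// Example (editor's note, hypothetical names): a "relu" node whose only
// output var is "fc_0.tmp_1" yields an identify id like
// "relu|fc_0.tmp_1|_gen_op_7", where the trailing counter comes from
// GenerateOpName().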
Node *MakeVarNode(Graph *graph, Node *node) {
auto var_name = GenerateVarName();
auto var_desc = std::make_unique<framework::VarDesc>(var_name);
auto var = graph->CreateVarNode(var_desc.get());
return var;
}
Node *MakeOpNode(Graph *graph, Node *node, const std::string &type,
const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs) {
auto op_desc = std::make_unique<framework::OpDesc>();
op_desc->SetType(type);
auto op = graph->CreateOpNode(op_desc.get());
for (auto *in : inputs) {
ConnectNodes(in, op);
}
if (outputs.empty()) {
auto var = MakeVarNode(graph, node);
ConnectNodes(op, var);
} else {
for (auto *out : outputs) {
ConnectNodes(op, out);
}
}
// i/o
std::vector<std::string> input_names;
for (auto node : op->inputs) {
input_names.push_back(node->Name());
}
op->Op()->SetInput("__inputs__", input_names);
std::vector<std::string> output_names;
for (auto node : op->outputs) {
output_names.push_back(node->Name());
}
op->Op()->SetOutput("__outputs__", output_names);
op->Op()->Flush();
return op;
}
Node *CreateBaseOp(Graph *graph, Node *node, const std::string &type,
const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs,
const AttributeMap &attrs) {
auto new_node = MakeOpNode(graph, node, type, inputs, outputs);
if (!attrs.empty()) {
new_node->Op()->SetAttrMap(attrs);
}
// deal special attr
if (!new_node->Op()->HasAttr(sIpuIndexAttr)) {
CopyOpAttr(sIpuIndexAttr, node->Op(), new_node->Op());
}
if (!new_node->Op()->HasAttr(sIpuStageAttr)) {
CopyOpAttr(sIpuStageAttr, node->Op(), new_node->Op());
}
{
new_node->Op()->SetAttr(sOpIdentifyIdAttr, CreateOpIdentifyId(node));
new_node->Op()->Flush();
}
return new_node;
}
Node *CreateConst(Graph *graph, Node *node, const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs,
const AttributeMap &attrs) {
return CreateBaseOp(graph, node, "popart_constant", inputs, outputs, attrs);
}
Node *CreateCast(Graph *graph, Node *node, const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs, const int otype) {
auto to = VarType2PopStr(otype);
return CreateBaseOp(graph, node, "popart_cast", inputs, outputs,
{{"to", to}});
}
Node *CreateGemm(Graph *graph, Node *node, const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs, int64_t transA,
int64_t transB, float alpha, float beta) {
return CreateBaseOp(graph, node, "popart_gemm", inputs, outputs,
{
{"alpha", alpha},
{"beta", beta},
{"transA", transA},
{"transB", transB},
});
}
Node *CreateReshape(Graph *graph, Node *node, const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs,
const std::vector<int64_t> &oshape) {
auto attr = AttributeMap{
{"value", oshape},
{"dims", std::vector<int64_t>{static_cast<int64_t>(oshape.size())}},
{"dtype", ONNXDataType::INT64}};
auto new_node_const =
CreateBaseOp(graph, node, "popart_constant", {}, {}, attr);
auto new_node_reshape =
CreateBaseOp(graph, node, "popart_reshape",
{inputs[0], new_node_const->outputs[0]}, outputs);
return new_node_reshape;
}
Node *CreateConv(Graph *graph, Node *node, const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs,
const std::vector<int64_t> &dilations, int64_t group,
const std::vector<int64_t> &kernel_shape,
const std::vector<int64_t> &pads,
const std::vector<int64_t> &strides) {
auto attrs = AttributeMap{
{"dilations", dilations}, {"group", group},
{"kernel_shape", kernel_shape}, {"pads", pads},
{"strides", strides},
};
return CreateBaseOp(graph, node, "popart_conv", inputs, outputs, attrs);
}
Node *CreateSoftmaxOpset11(Graph *graph, Node *node,
const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs, int64_t axis) {
PADDLE_ENFORCE_EQ(inputs.size(), 1, platform::errors::InvalidArgument(
"Softmax op only support one input"));
auto x_shape = inputs[0]->Var()->GetShape();
int x_rank = x_shape.size();
if (axis < 0) {
axis = axis + x_rank;
}
if (axis == x_rank - 1) {
return CreateBaseOp(graph, node, "popart_softmax", inputs, outputs,
{{"axis", int64_t{-1}}});
} else {
auto perm = std::vector<int64_t>(x_rank);
std::iota(perm.begin(), perm.end(), 0);
perm[x_rank - 1] = axis;
perm[axis] = x_rank - 1;
auto new_transpose_pre = CreateBaseOp(graph, node, "popart_transpose",
inputs, {}, {{"perm", perm}});
auto new_softmax =
CreateBaseOp(graph, node, "popart_softmax", new_transpose_pre->outputs,
{}, {{"axis", int64_t{-1}}});
return CreateBaseOp(graph, node, "popart_transpose", new_softmax->outputs,
outputs, {{"perm", perm}});
}
}
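// Worked example (editor's note): for x_rank = 4 and a normalized axis = 1,
// perm is {0, 3, 2, 1}, i.e. a single swap of dims 1 and 3. Applying the
// same perm twice is the identity, which is why one attribute map serves
// both the pre- and post-softmax transposes.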
} // namespace ipu
} // namespace platform
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/platform/device/ipu/common.h"
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h"
namespace paddle {
namespace platform {
namespace ipu {
using paddle::framework::AttributeMap;
template <typename T>
AttributeMap MakeConstAttrMap(std::vector<T> value, std::vector<int64_t> dims,
int dtype) {
return AttributeMap{{"value", value}, {"dims", dims}, {"dtype", dtype}};
}
template <typename T>
AttributeMap MakeConstAttrMapFromValue(T v, std::vector<int64_t> dims,
int dtype) {
size_t size = 1;
for (auto &dim : dims) {
size *= dim;
}
return MakeConstAttrMap<T>(std::vector<T>(size, v), dims, dtype);
}
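// Usage sketch (editor's note): MakeConstAttrMapFromValue<float>(2.0f, {2, 2},
// ONNXDataType::FLOAT) expands the scalar into value = {2, 2, 2, 2} with
// dims = {2, 2}, ready to feed CreateConst.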
const std::string GenerateVarName();
const std::string CreateOpIdentifyId(Node *node);
Node *MakeVarNode(Graph *graph, Node *node);
Node *MakeOpNode(Graph *graph, Node *node, const std::string &type,
const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs);
Node *CreateBaseOp(Graph *graph, Node *node, const std::string &type,
const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs,
const AttributeMap &attrs = {});
Node *CreateConst(Graph *graph, Node *node, const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs,
const AttributeMap &attrs);
// otype is proto::VarType::Type
Node *CreateCast(Graph *graph, Node *node, const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs, const int otype);
Node *CreateGemm(Graph *graph, Node *node, const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs, int64_t transA = 0,
int64_t transB = 0, float alpha = 1.0f, float beta = 1.0f);
Node *CreateReshape(Graph *graph, Node *node, const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs,
const std::vector<int64_t> &oshape);
Node *CreateConv(Graph *graph, Node *node, const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs,
const std::vector<int64_t> &dilations = {1, 1},
int64_t group = 1,
const std::vector<int64_t> &kernel_shape = {},
const std::vector<int64_t> &pads = {0, 0, 0, 0},
const std::vector<int64_t> &strides = {1, 1});
Node *CreateSoftmaxOpset11(Graph *graph, Node *node,
const std::vector<Node *> &inputs,
const std::vector<Node *> &outputs, int64_t axis);
} // namespace ipu
} // namespace platform
} // namespace paddle
......@@ -195,3 +195,5 @@ OP_DECL(popart_sqrt, aiOnnxOpset.sqrt, NONE) // NOLINT
OP_DECL(popart_tanh, aiOnnxOpset.tanh, NONE) // NOLINT
OP_DECL(popart_tile, aiOnnxOpset.tile, NONE) // NOLINT
OP_DECL(popart_transpose, aiOnnxOpset.transpose, ARG(INT_VEC,perm) ) // NOLINT
// clang-format on
......@@ -16,6 +16,9 @@ limitations under the License. */
#include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
#ifdef PADDLE_WITH_IPU
#include "paddle/fluid/platform/ipu/ipu_backend.h"
#endif
#include "glog/logging.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -96,8 +99,9 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
if (it == device_contexts_.end()) {
PADDLE_THROW(platform::errors::Unimplemented(
"Place %s is not supported. Please check that your paddle compiles "
"with WITH_GPU, WITH_XPU or WITH_ASCEND_CL option or check that "
"your train process set the correct device id if you use Executor.",
"with WITH_GPU, WITH_XPU, WITH_IPU or WITH_ASCEND_CL option or check "
"that your train process set the correct device id if you use "
"Executor.",
place));
}
return it->second.get().get();
......@@ -158,6 +162,14 @@ DeviceContextPool::DeviceContextPool(
PADDLE_THROW(
platform::errors::Unimplemented("XPUPlace is not supported. Please "
"re-compile with WITH_XPU option."));
#endif
} else if (platform::is_ipu_place(p)) {
#ifdef PADDLE_WITH_IPU
EmplaceDeviceContext<IPUDeviceContext, IPUPlace>(&device_contexts_, p);
#else
PADDLE_THROW(
platform::errors::Unimplemented("IPUPlace is not supported. Please "
"re-compile with WITH_IPU option."));
#endif
} else if (platform::is_npu_place(p)) {
#ifdef PADDLE_WITH_ASCEND_CL
......@@ -195,6 +207,22 @@ Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const {
Place CPUDeviceContext::GetPlace() const { return place_; }
#ifdef PADDLE_WITH_IPU
IPUDeviceContext::IPUDeviceContext(IPUPlace place) : place_(place) {
int id = place.GetDeviceId();
std::shared_ptr<platform::ipu::IpuBackend> ipu_backend =
platform::ipu::IpuBackend::GetInstance();
device_ = ipu_backend->GetDevice(id);
}
Place IPUDeviceContext::GetPlace() const { return place_; }
void IPUDeviceContext::Wait() const {
/*! \brief Wait for all operations completion in the stream. */
}
IPUDeviceContext::~IPUDeviceContext() {}
#endif
#ifdef PADDLE_WITH_XPU
XPUDeviceContext::XPUDeviceContext() {
context_ = xpu::create_context();
......
......@@ -36,6 +36,7 @@ class PlacePrinter : public boost::static_visitor<> {
void operator()(const XPUPlace &p) { os_ << "XPUPlace(" << p.device << ")"; }
void operator()(const NPUPlace &p) { os_ << "NPUPlace(" << p.device << ")"; }
void operator()(const NPUPinnedPlace &p) { os_ << "NPUPinnedPlace"; }
void operator()(const IPUPlace &p) { os_ << "IPUPlace(" << p.device << ")"; }
void operator()(const CUDAPinnedPlace &p) { os_ << "CUDAPinnedPlace"; }
private:
......@@ -56,6 +57,10 @@ bool is_npu_place(const Place &p) {
return boost::apply_visitor(IsNPUPlace(), p);
}
bool is_ipu_place(const Place &p) {
return boost::apply_visitor(IsIPUPlace(), p);
}
bool is_cpu_place(const Place &p) {
return boost::apply_visitor(IsCPUPlace(), p);
}
......@@ -80,6 +85,8 @@ bool is_same_place(const Place &p1, const Place &p2) {
return BOOST_GET_CONST(XPUPlace, p1) == BOOST_GET_CONST(XPUPlace, p2);
} else if (is_npu_place(p1)) {
return BOOST_GET_CONST(NPUPlace, p1) == BOOST_GET_CONST(NPUPlace, p2);
} else if (is_ipu_place(p1)) {
return BOOST_GET_CONST(IPUPlace, p1) == BOOST_GET_CONST(IPUPlace, p2);
} else {
return BOOST_GET_CONST(CUDAPlace, p1) == BOOST_GET_CONST(CUDAPlace, p2);
}
......
......@@ -95,12 +95,25 @@ struct NPUPinnedPlace {
inline bool operator!=(const NPUPinnedPlace &) const { return false; }
inline bool operator<(const NPUPinnedPlace &) const { return false; }
};
struct IPUPlace {
IPUPlace() : IPUPlace(0) {}
explicit IPUPlace(int d) : device(d) {}
inline int GetDeviceId() const { return device; }
// needed for variant equality comparison
inline bool operator==(const IPUPlace &o) const { return device == o.device; }
inline bool operator!=(const IPUPlace &o) const { return !(*this == o); }
inline bool operator<(const IPUPlace &o) const { return device < o.device; }
int device;
};
struct IsCUDAPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &) const { return false; }
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const NPUPlace &) const { return false; }
bool operator()(const NPUPinnedPlace &) const { return false; }
bool operator()(const IPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return true; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
};
......@@ -110,6 +123,7 @@ struct IsCPUPlace : public boost::static_visitor<bool> {
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const NPUPlace &) const { return false; }
bool operator()(const NPUPinnedPlace &) const { return false; }
bool operator()(const IPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
};
......@@ -119,6 +133,7 @@ struct IsCUDAPinnedPlace : public boost::static_visitor<bool> {
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const NPUPlace &) const { return false; }
bool operator()(const NPUPinnedPlace &) const { return false; }
bool operator()(const IPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &cuda_pinned) const { return true; }
};
......@@ -128,6 +143,7 @@ struct IsXPUPlace : public boost::static_visitor<bool> {
bool operator()(const XPUPlace &) const { return true; }
bool operator()(const NPUPlace &) const { return false; }
bool operator()(const NPUPinnedPlace &) const { return false; }
bool operator()(const IPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
};
......@@ -137,6 +153,7 @@ struct IsNPUPlace : public boost::static_visitor<bool> {
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const NPUPlace &) const { return true; }
bool operator()(const NPUPinnedPlace &) const { return false; }
bool operator()(const IPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
};
......@@ -145,22 +162,33 @@ struct IsNPUPinnedPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &) const { return false; }
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const NPUPlace &) const { return false; }
bool operator()(const IPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
bool operator()(const NPUPinnedPlace &) const { return true; }
};
struct IsIPUPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &) const { return false; }
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const NPUPlace &) const { return false; }
bool operator()(const IPUPlace &) const { return true; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
bool operator()(const NPUPinnedPlace &) const { return false; }
};
class Place : public boost::variant<CUDAPlace, XPUPlace, NPUPlace, CPUPlace,
CUDAPinnedPlace, NPUPinnedPlace> {
CUDAPinnedPlace, NPUPinnedPlace, IPUPlace> {
private:
using PlaceBase = boost::variant<CUDAPlace, XPUPlace, NPUPlace, CPUPlace,
CUDAPinnedPlace, NPUPinnedPlace>;
CUDAPinnedPlace, NPUPinnedPlace, IPUPlace>;
public:
Place() = default;
Place(const CPUPlace &cpu_place) : PlaceBase(cpu_place) {} // NOLINT
Place(const XPUPlace &xpu_place) : PlaceBase(xpu_place) {} // NOLINT
Place(const NPUPlace &npu_place) : PlaceBase(npu_place) {} // NOLINT
Place(const IPUPlace &ipu_place) : PlaceBase(ipu_place) {} // NOLINT
Place(const CUDAPlace &cuda_place) : PlaceBase(cuda_place) {} // NOLINT
Place(const CUDAPinnedPlace &cuda_pinned_place) // NOLINT
: PlaceBase(cuda_pinned_place) {}
......@@ -180,6 +208,7 @@ using PlaceList = std::vector<Place>;
bool is_gpu_place(const Place &);
bool is_xpu_place(const Place &);
bool is_npu_place(const Place &);
bool is_ipu_place(const Place &);
bool is_cpu_place(const Place &);
bool is_cuda_pinned_place(const Place &);
bool is_npu_pinned_place(const Place &);
......@@ -228,6 +257,15 @@ struct PlaceVisitorWrapper
return typename Visitor::result_type();
#endif
}
typename Visitor::result_type operator()(const IPUPlace &ipu) const {
#ifdef PADDLE_WITH_IPU
return visitor_(ipu);
#else
PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with IPU. Cannot visit ipu device"));
return typename Visitor::result_type();
#endif
}
typename Visitor::result_type operator()(const CUDAPlace &cuda) const {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
......
pybind.h
op_function_impl.h
\ No newline at end of file
op_function_impl.h
eager_op_function_impl.h
......@@ -37,6 +37,9 @@ PADDLE_DEFINE_EXPORTED_bool(
"If set true, the queue.pop will only get data from queue but not "
"remove the data from queue for speed testing");
// disable auto conversion to list in Python
PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray);
namespace paddle {
namespace pybind {
......
......@@ -313,6 +313,21 @@ void SetTensorFromPyArrayT(
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use XPUPlace in CPU/GPU version, "
"Please recompile or reinstall Paddle with XPU support."));
#endif
} else if (paddle::platform::is_ipu_place(place)) {
#ifdef PADDLE_WITH_IPU
if (zero_copy) {
auto holder = std::make_shared<details::NumpyAllocation<T>>(array);
auto type = framework::ToDataType(std::type_index(typeid(T)));
self->ResetHolderWithType(holder, type);
} else {
auto dst = self->mutable_data<T>(place);
std::memcpy(dst, array.data(), array.nbytes());
}
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use IPUPlace in CPU/GPU/XPU/NPU version, "
"Please recompile or reinstall Paddle with IPU support."));
#endif
} else if (paddle::platform::is_npu_place(place)) {
#ifdef PADDLE_WITH_ASCEND_CL
......
......@@ -22,6 +22,10 @@ set(api_source_file ${CMAKE_SOURCE_DIR}/paddle/pten/api/lib/api.cc)
set(api_header_file_tmp ${api_header_file}.tmp)
set(api_source_file_tmp ${api_source_file}.tmp)
if (NOT PYTHON_EXECUTABLE)
find_package(PythonInterp REQUIRED)
endif()
add_custom_command(
OUTPUT ${api_header_file} ${api_source_file}
COMMAND ${PYTHON_EXECUTABLE} -m pip install pyyaml
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/pten/core/kernel_registry.h"
// TODO(chenweihang): After the kernels are split into single files, the
// kernel declaration statements will be generated automatically from the
// kernel file names, and this header file will be removed.
PT_DECLARE_KERNEL(full_like, CPU);
PT_DECLARE_KERNEL(dot, CPU);
PT_DECLARE_KERNEL(flatten, CPU);
PT_DECLARE_KERNEL(sign, CPU);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PT_DECLARE_KERNEL(full_like, CUDA);
PT_DECLARE_KERNEL(dot, CUDA);
PT_DECLARE_KERNEL(flatten, CUDA);
PT_DECLARE_KERNEL(sign, CUDA);
#endif
#ifdef PADDLE_WITH_XPU
PT_DECLARE_KERNEL(flatten, XPU);
#endif
......@@ -25,10 +25,14 @@ limitations under the License. */
#include "paddle/pten/include/core.h"
#include "paddle/pten/include/infermeta.h"
PT_DECLARE_MODULE(UtilsCPU);
PT_DECLARE_KERNEL(copy, CPU);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PT_DECLARE_MODULE(UtilsCUDA);
PT_DECLARE_KERNEL(copy, CUDA);
#endif
#ifdef PADDLE_WITH_XPU
PT_DECLARE_KERNEL(copy, XPU);
#endif
namespace paddle {
......
......@@ -27,13 +27,13 @@ const std::unordered_map<std::string, std::string> kernel_alias_name_map = {
{"fill_any_like", "full_like"},
{"fill_constant", "full"},
{"flatten_contiguous_range", "flatten"},
// {"matmul_v2", "matmul"},
{"matmul_v2", "matmul"},
{"reduce_mean", "mean"},
{"reduce_sum", "sum"},
{"reshape2", "reshape"},
// fluid kernel "mean/reshape/matmul/flatten/sum" should be deprecated
{"flatten", "deprecated"},
// {"matmul", "deprecated"},
{"matmul", "deprecated"},
{"mean", "deprecated"},
{"reshape", "deprecated"},
{"sum", "deprecated"}};
......
......@@ -61,9 +61,7 @@ void FillConstant(const CPUContext& dev_ctx,
} // namespace pten
PT_REGISTER_MODULE(CreationCPU);
PT_REGISTER_KERNEL("full_like",
PT_REGISTER_KERNEL(full_like,
CPU,
ANY,
pten::FillAnyLike,
......@@ -74,7 +72,7 @@ PT_REGISTER_KERNEL("full_like",
bool,
paddle::platform::float16) {}
PT_REGISTER_KERNEL("full",
PT_REGISTER_KERNEL(full,
CPU,
ANY,
pten::FillConstant,
......