From b2aee3e3391d692fa6f639bd87f72fd14ea5b3f8 Mon Sep 17 00:00:00 2001
From: Allen Guo
Date: Mon, 17 Jan 2022 17:09:39 +0800
Subject: [PATCH] [IPU] update ipu_backend p0 (#38854)

* update ipu_backend

* sync with paddle internal

Co-authored-by: Xiaobing Wang
Co-authored-by: Allen Guo
Co-authored-by: Zhixin Yao
Co-authored-by: Haicheng Jiang
Co-authored-by: Han Zhao

* apply comments 01

* update error message

* restore ipu_executor and ipu_optimizer

* add clang-format on

Co-authored-by: Xiaobing Wang
Co-authored-by: Zhixin Yao
Co-authored-by: Haicheng Jiang
Co-authored-by: Han Zhao
---
 .../fluid/platform/device/ipu/CMakeLists.txt  |  23 +-
 paddle/fluid/platform/device/ipu/device.cc    |  39 --
 .../fluid/platform/device/ipu/ipu_backend.cc  | 176 ++-----
 .../fluid/platform/device/ipu/ipu_backend.h   |  72 ++-
 .../fluid/platform/device/ipu/ipu_compiler.cc | 477 +++++++++++++-----
 .../fluid/platform/device/ipu/ipu_compiler.h  | 109 ++--
 .../fluid/platform/device/ipu/ipu_device.cc   |  55 ++
 .../device/ipu/{device.h => ipu_device.h}     |  22 +-
 paddle/fluid/platform/device/ipu/ipu_info.cc  |  13 +-
 paddle/fluid/platform/device/ipu/ipu_info.h   |   2 +
 .../device/ipu/{common.h => ipu_names.h}      |   3 +
 .../fluid/platform/device/ipu/ipu_strategy.cc |  21 +-
 .../fluid/platform/device/ipu/ipu_strategy.h  |  68 ++-
 paddle/fluid/platform/device/ipu/ipu_utils.cc |  37 +-
 paddle/fluid/platform/device/ipu/ipu_utils.h  | 159 +++++-
 .../device/ipu/supported_ops_custom.h         |  21 +
 16 files changed, 873 insertions(+), 424 deletions(-)
 delete mode 100644 paddle/fluid/platform/device/ipu/device.cc
 create mode 100644 paddle/fluid/platform/device/ipu/ipu_device.cc
 rename paddle/fluid/platform/device/ipu/{device.h => ipu_device.h} (65%)
 rename paddle/fluid/platform/device/ipu/{common.h => ipu_names.h} (85%)
 create mode 100644 paddle/fluid/platform/device/ipu/supported_ops_custom.h

diff --git a/paddle/fluid/platform/device/ipu/CMakeLists.txt b/paddle/fluid/platform/device/ipu/CMakeLists.txt
index 9be12cbf6d4..5f711937a80 100644
--- a/paddle/fluid/platform/device/ipu/CMakeLists.txt
+++ b/paddle/fluid/platform/device/ipu/CMakeLists.txt
@@ -1,12 +1,19 @@
 IF(WITH_IPU)
   FILE(GLOB POPART_CANONICALIZATION_SRC ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/device/ipu/popart_canonicalization/*.cc)
-  cc_library(ipu_device SRCS device.cc DEPS enforce popart)
-  cc_library(ipu_utils SRCS ipu_utils.cc DEPS memory framework_proto popart)
-  cc_library(ipu_strategy SRCS ipu_strategy.cc DEPS popart graph framework_proto enforce)
-  cc_library(ipu_optimizer SRCS ipu_optimizer.cc DEPS popart enforce)
-  cc_library(ipu_executor SRCS ipu_executor.cc DEPS ipu_optimizer ipu_utils popart graph framework_proto)
-  cc_library(popart_canonicalization_utils SRCS ${POPART_CANONICALIZATION_SRC} DEPS framework_proto enforce ipu_utils)
-  cc_library(ipu_compiler SRCS ipu_compiler.cc DEPS popart graph ipu_utils graph_helper)
-  cc_library(ipu_backend SRCS ipu_backend.cc DEPS popart ipu_compiler graph framework_proto enforce ipu_utils ipu_strategy ipu_device ipu_executor graph_helper)
+  list(APPEND PADDLE_IPU_SRC ${POPART_CANONICALIZATION_SRC})
+  set(PADDLE_IPU_LIB "${CMAKE_CURRENT_BINARY_DIR}/libpaddle_ipu.so" CACHE STRING "")
+  set(PADDLE_IPU_LIB_DIR "${CMAKE_CURRENT_BINARY_DIR}" CACHE STRING "")
+  set(IPU_BACKEND_SRC
+    "ipu_device.cc"
+    "ipu_strategy.cc"
+    "ipu_executor.cc"
+    "ipu_compiler.cc"
+    "ipu_backend.cc"
+    "ipu_utils.cc"
+  )
+
+  cc_library(ipu_backend SRCS ${IPU_BACKEND_SRC} DEPS popart graph framework_proto enforce graph_helper timer)
   cc_library(ipu_info SRCS ipu_info.cc DEPS 
ipu_backend) + cc_library(paddle_ipu SHARED SRCS ${PADDLE_IPU_SRC} DEPS popart) + add_dependencies(paddle_ipu ipu_backend) ENDIF() diff --git a/paddle/fluid/platform/device/ipu/device.cc b/paddle/fluid/platform/device/ipu/device.cc deleted file mode 100644 index 47e6475089d..00000000000 --- a/paddle/fluid/platform/device/ipu/device.cc +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/platform/device/ipu/device.h" - -namespace paddle { -namespace platform { -namespace ipu { - -Device::Device(const popart::DeviceInfo& device_info) - : id_(device_info.getId()), is_attached_(device_info.isAttached()) { - popart::DeviceType popart_device_type = device_info.getType(); - switch (popart_device_type) { - case popart::DeviceType::IpuModel: - device_type_ = DeviceType::IpuModel; - break; - case popart::DeviceType::Ipu: - device_type_ = DeviceType::Ipu; - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "popart::DeviceType:Unsupported type %d", popart_device_type)); - } -} - -} // namespace ipu -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_backend.cc b/paddle/fluid/platform/device/ipu/ipu_backend.cc index cd0f5ae554c..2471e15e09e 100644 --- a/paddle/fluid/platform/device/ipu/ipu_backend.cc +++ b/paddle/fluid/platform/device/ipu/ipu_backend.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/platform/ipu/ipu_backend.h" -#include "paddle/fluid/platform/ipu/ipu_utils.h" +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" +#include "paddle/fluid/platform/device/ipu/ipu_utils.h" #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/ir/graph.h" @@ -24,170 +24,92 @@ namespace paddle { namespace platform { namespace ipu { -std::shared_ptr IpuBackend::instance_ = nullptr; +IpuBackend* IpuBackend::GetInstance() { + static IpuBackend instance; + return &instance; +} IpuBackend::IpuBackend() { - compiler_ = std::make_shared(); + compiler_ = std::make_unique(); executor_ = std::make_unique(); } -void IpuBackend::Clear() { +IpuBackend::~IpuBackend() { + compiler_.reset(); executor_.reset(); - // detach device - if (device_ != nullptr && device_->isAttached()) { - device_->detach(); - device_.reset(); - device_ = nullptr; - } -} - -IpuBackend::~IpuBackend() { Clear(); } - -std::shared_ptr IpuBackend::GetInstance() { - if (!instance_) { - instance_.reset(new IpuBackend()); - } - return instance_; -} - -// This api should only call from python, always return a new object -std::shared_ptr IpuBackend::GetNewInstance() { - instance_.reset(new IpuBackend()); - return instance_; } -void IpuBackend::Compile(framework::ir::Graph* graph, +void IpuBackend::Compile(Graph* graph, const std::vector& feed_list, const std::vector& fetch_list) { VLOG(10) << "enter IpuBackend::Compile"; + compiler_->Prepare(); + executor_->SetCompilerResources(compiler_->GetResources()); + compiler_->InitInputs(graph, feed_list); + compiler_->LowerConstants(graph, scope_); compiler_->LowerWeights(graph, scope_); compiler_->LowerBody(graph); compiler_->InitOutputs(fetch_list); - executor_->SetWeights(compiler_->GetWeights()); + if (ipu_strategy_->is_training) { + compiler_->LowerOptimier(graph, scope_); + } + is_compiled_ = true; + // when call compile, means a new graph + is_prepared_ = false; VLOG(10) << "leave IpuBackend::Compile"; } -void IpuBackend::Run(const std::vector& inputs, - const std::vector& outputs, +void IpuBackend::Run(const std::vector& inputs, + const std::vector& outputs, const framework::ExecutionContext& ctx) { Prepare(); - auto inputs_id = compiler_->GetInputs(); - auto outputs_id = compiler_->GetOutputs(); - executor_->Run(inputs_id, inputs, outputs_id, outputs, ctx); + timer_->Start(); + executor_->Run(inputs, outputs, ctx); + timer_->Pause(); + VLOG(10) << "[IPU Run]: " << timer_->ElapsedMS() << " (ms)"; } void IpuBackend::Prepare() { - if (is_prepared_) { - return; - } else { + if (!is_prepared_) { + executor_->Prepare(compiler_->GetModelProto()); + timer_.reset(new platform::Timer()); is_prepared_ = true; } - // convert Model to fp16 - if (ipu_strategy_->enable_fp16) { - compiler_->ConvertProtoToFp16(); - } - auto proto = compiler_->GetModelProto(); - auto tensors = compiler_->GetTensors(); - auto outputs = compiler_->GetOutputs(); - executor_->Prepare(proto, tensors, outputs, device_); } -void IpuBackend::SetScope(const framework::Scope& scope) { +void IpuBackend::Detach() { executor_->Detach(); } + +void IpuBackend::Reset() { + executor_->Detach(); + compiler_.reset(); + executor_.reset(); +} + +void IpuBackend::SetScope(const Scope& scope) { scope_ = &scope; executor_->SetScope(&scope); } void IpuBackend::SetIpuStrategy(const IpuStrategy& strategy) { ipu_strategy_ = &strategy; - executor_->SetIpuStrategy(strategy); compiler_->SetIpuStrategy(strategy); + executor_->SetIpuStrategy(strategy); } -size_t IpuBackend::GetNumDevices() { - // 
IpuModel - bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL"); - if (ipu_model) return 1; - // Real dev - size_t num_devices = - popart::DeviceManager::createDeviceManager().enumerateDevices().size(); - PADDLE_ENFORCE_GT( - num_devices, 0, - platform::errors::Unavailable( - "Do not found any IPU devices, please make " - "sure Poplar sdk is enabled or enable ENV \"POPLAR_IPUMODEL=1\"")); - return num_devices; -} - -std::vector IpuBackend::GetDeviceIds() { - bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL"); - if (ipu_model) { - return {0}; - } - std::vector device_ids; - auto devices = - popart::DeviceManager::createDeviceManager().enumerateDevices(); - PADDLE_ENFORCE_GT( - devices.size(), 0, - platform::errors::Unavailable("Do not found any IPU devices, please make " - "sure Poplar sdk is enabled.")); - - for (auto device : devices) { - device_ids.push_back(device->getId()); - } - - return device_ids; -} - -Device IpuBackend::GetDevice(int id) { - bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL"); - if (ipu_model) { - std::map deviceOpts{{"numIPUs", "1 "}}; - device_ = popart::DeviceManager::createDeviceManager().createIpuModelDevice( - deviceOpts); - Device device(*device_.get()); - return device; - } - size_t num_devices = GetNumDevices(); - if (id < 0 || id >= num_devices) { - PADDLE_THROW(platform::errors::InvalidArgument( - "device id %d is invalid, number devices is %d", id, num_devices)); - } - std::shared_ptr popart_device_info = - popart::DeviceManager::createDeviceManager().getDevice( - popart::SyncPattern::Full, id); - Device device(*popart_device_info.get()); - return device; -} - -void IpuBackend::AttachDevice(int id) { - // trick here - // Compiler ipu is not same as the runtime ipu. - VLOG(10) << "comile ipu id = " << id; - bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL"); - if (ipu_model) { - return; - } - device_ = popart::DeviceManager::createDeviceManager().acquireAvailableDevice( - UpperIpuNum()); - PADDLE_ENFORCE_NOT_NULL( - device_, platform::errors::Unavailable("Can't attach IPU, ipu_num = %d.", - UpperIpuNum())); +void IpuBackend::SetCustomOps( + const std::vector& custom_ops) { + compiler_->SetCustomOps(custom_ops); } -bool IpuBackend::DeviceIsAttached() { return device_ != nullptr; } - -// num_ipus must be pow(2,n); -int IpuBackend::UpperIpuNum() { - PADDLE_ENFORCE_GT(ipu_strategy_->num_ipus, 0, - platform::errors::Unavailable( - "The ipu num get is wrong, please make sure the " - "sharding or pipline parameter is right.")); - int i = 0; - while (std::pow(2, i) < ipu_strategy_->num_ipus) { - i++; +void IpuBackend::SaveMoldeProto(const std::string& path) { + if (ipu_strategy_->is_training && is_prepared_) { + executor_->SaveModelToHost(path); + } else if (is_compiled_) { + compiler_->SaveModelProtoNoCheck(path); + } else { + LOG(WARNING) << "Model is empty"; } - return std::pow(2, i); } } // namespace ipu diff --git a/paddle/fluid/platform/device/ipu/ipu_backend.h b/paddle/fluid/platform/device/ipu/ipu_backend.h index 769a1b5b52a..122a3e08370 100644 --- a/paddle/fluid/platform/device/ipu/ipu_backend.h +++ b/paddle/fluid/platform/device/ipu/ipu_backend.h @@ -14,88 +14,86 @@ limitations under the License. 
*/ #pragma once -#include #include #include +#include -#include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device/ipu/ipu_compiler.h" +#include "paddle/fluid/platform/device/ipu/ipu_device.h" +#include "paddle/fluid/platform/device/ipu/ipu_executor.h" +#include "paddle/fluid/platform/device/ipu/ipu_strategy.h" +#include "paddle/fluid/platform/device/ipu/ipu_utils.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/ipu/device.h" -#include "paddle/fluid/platform/ipu/ipu_compiler.h" -#include "paddle/fluid/platform/ipu/ipu_executor.h" -#include "paddle/fluid/platform/ipu/ipu_strategy.h" +#include "paddle/fluid/platform/timer.h" namespace paddle { namespace platform { namespace ipu { +// IpuBackend is the center of paddle-ipu, its function include: +// 1. Compile paddle model to popart model +// 2. Run popart model, inference or training +// 3. Request and release device +// 4. Other helper function class IpuBackend { - // IpuBackend is the center of paddle-ipu, its function include: - // 1. Compile paddle model to popart model - // 2. Run popart model, inference or training - // 3. Request and release device - // 4. Other helper function + public: + static IpuBackend *GetInstance(); public: IpuBackend(); ~IpuBackend(); - void Clear(); - - // return if exsits, else create and return - static std::shared_ptr GetInstance(); - - // always return a new instance_ - static std::shared_ptr GetNewInstance(); - // what compile does include(call compiler_): // 1. map paddle-op -> poart op // 2. construct popart onnx compute graph - void Compile(framework::ir::Graph *graph, - const std::vector &feed_list, + void Compile(Graph *graph, const std::vector &feed_list, const std::vector &fetch_list); // what run does include: // 1. construct forward onnx graph // 2. graph-level optimization // 3. 
autodiff - void Run(const std::vector &inputs, - const std::vector &outputs, + void Run(const std::vector &inputs, + const std::vector &outputs, const framework::ExecutionContext &ctx); - Executor &GetExecutor() { return *executor_; } + // detach IPU manually + void Detach(); + + // reset manually + // call it before destruct works + void Reset(); - void SetScope(const framework::Scope &scope); - const framework::Scope *GetScope() { return scope_; } + void SetScope(const Scope &scope); + const Scope *GetScope() { return scope_; } void SetIpuStrategy(const IpuStrategy &strategy); const IpuStrategy *GetIpuStrategy() { return ipu_strategy_; } + void SetCustomOps(const std::vector &custom_ops); - // Device - size_t GetNumDevices(); - std::vector GetDeviceIds(); - Device GetDevice(int id); - void AttachDevice(int id); - bool DeviceIsAttached(); + // save compiled model to onnx + void SaveMoldeProto(const std::string &path); private: - int UpperIpuNum(); void Prepare(); private: - std::shared_ptr compiler_; + std::unique_ptr compiler_; std::unique_ptr executor_; - std::shared_ptr device_; + bool is_compiled_ = false; bool is_prepared_ = false; // not own - const framework::Scope *scope_ = nullptr; + const Scope *scope_ = nullptr; const IpuStrategy *ipu_strategy_ = nullptr; private: - static std::shared_ptr instance_; + // time record for IpuBackend::Run + std::unique_ptr timer_; + + DISABLE_COPY_AND_ASSIGN(IpuBackend); }; } // namespace ipu diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.cc b/paddle/fluid/platform/device/ipu/ipu_compiler.cc index 58f784fdbc9..8bedca5c0b8 100644 --- a/paddle/fluid/platform/device/ipu/ipu_compiler.cc +++ b/paddle/fluid/platform/device/ipu/ipu_compiler.cc @@ -12,17 +12,66 @@ // See the License for the specific language governing permissions and // limitations under the License. 
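(Editorial aside, not part of the patch: the header above is the whole public
surface of the reworked backend. A minimal sketch of the intended call
sequence, assuming graph, scope, strategy, feed_list, fetch_list, inputs,
outputs and ctx are all prepared by the caller:

    #include "paddle/fluid/platform/device/ipu/ipu_backend.h"

    auto* backend = paddle::platform::ipu::IpuBackend::GetInstance();
    backend->SetScope(scope);           // scope owning the weights
    backend->SetIpuStrategy(strategy);  // must precede Compile
    backend->Compile(graph, feed_list, fetch_list);
    backend->Run(inputs, outputs, ctx); // first Run() triggers Prepare()
    backend->Detach();                  // release the IPU when done

The raw singleton replaces the old shared_ptr instance_, so callers no longer
manage the backend's lifetime; Reset() is provided for manual teardown.)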
-#include "paddle/fluid/platform/ipu/ipu_compiler.h" +#include "paddle/fluid/platform/device/ipu/ipu_compiler.h" +#include +#include +#include +#include #include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/platform/ipu/ipu_utils.h" +#include "paddle/fluid/platform/device/ipu/ipu_utils.h" namespace paddle { namespace platform { namespace ipu { +popart::AdamMode AdamModeFromStr(const std::string& str) { + if (str == "adam") { + return popart::AdamMode::Adam; + } else if (str == "adamax") { + return popart::AdamMode::AdaMax; + } else if (str == "lamb") { + return popart::AdamMode::Lamb; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Uknown AdamMode: %s, AdamMode must be one of these values: adam, " + "adamax or lamb", + str)); + } +} + +popart::AdaptiveMode AdaptiveModeFromStr(const std::string& str) { + if (str == "adadelta") { + return popart::AdaptiveMode::AdaDelta; + } else if (str == "adagrad") { + return popart::AdaptiveMode::AdaGrad; + } else if (str == "rmsprop") { + return popart::AdaptiveMode::RMSProp; + } else if (str == "centered_rmsprop") { + return popart::AdaptiveMode::CenteredRMSProp; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Uknown AdaptiveMode: %s, AdaptiveMode must be one of these values: " + "adadelta, adagrad, rmsprop or centered_rmsprop", + str)); + } +} + +popart::WeightDecayMode WeightDecayModeFromStr(const std::string& str) { + if (str == "decay") { + return popart::WeightDecayMode::Decay; + } else if (str == "l2_regularization") { + return popart::WeightDecayMode::L2Regularization; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Uknown WeightDecayMode: %s, WeightDecayMode must be decay or " + "l2_regularization", + str)); + } +} + template -T GetAttrAllowNull(std::string attr, framework::OpDesc* op_desc) { +T GetAttrAllowNull(std::string attr, OpDesc* op_desc) { if (op_desc->HasAttr(attr)) { return BOOST_GET_CONST(T, op_desc->GetAttr(attr)); } else { @@ -31,8 +80,7 @@ T GetAttrAllowNull(std::string attr, framework::OpDesc* op_desc) { } template -nonstd::optional GetOptAttrAllowNull(std::string attr, - framework::OpDesc* op_desc) { +nonstd::optional GetOptAttrAllowNull(std::string attr, OpDesc* op_desc) { if (op_desc->HasAttr(attr)) { return BOOST_GET_CONST(T, op_desc->GetAttr(attr)); } else { @@ -40,19 +88,36 @@ nonstd::optional GetOptAttrAllowNull(std::string attr, } } -Compiler::Compiler() { - builder_ = popart::Builder::create(); - RegisterOpFunc(); +template +TO GetCastSigAttrAllowNull(std::string attr, OpDesc* op_desc) { + if (op_desc->HasAttr(attr)) { + auto x = BOOST_GET_CONST(TI, op_desc->GetAttr(attr)); + return static_cast(x); + } else { + return {}; + } +} + +Compiler::Compiler() { RegisterOpFunc(); } + +Compiler::~Compiler() { + builder_.reset(); + resources_.reset(); } -Compiler::~Compiler() {} +void Compiler::Prepare() { + builder_ = popart::Builder::create(); + resources_ = std::make_unique(); +} void Compiler::RegisterOpFunc() { VLOG(10) << "enter Compiler::RegisterOpFunc"; #define INT_VEC std::vector +#define INT32_VEC std::vector #define FLOAT_VEC std::vector #define FLOAT float #define INT std::int64_t +#define INT32 std::int32_t #define BOOL bool #define STRING std::string #define STRING_VEC std::vector @@ -60,6 +125,7 @@ void Compiler::RegisterOpFunc() { #define ARG(Type, Name) , GetAttrAllowNull(#Name, op_desc) #define OPT_ARG(Type, Name) , GetOptAttrAllowNull(#Name, op_desc) +#define SIG_ARG(TI, TO, Name) , GetCastSigAttrAllowNull(#Name, op_desc) #define 
POPART_CONST_ARG(Name) , const PopartConstant& Name #define HOST_SIDE_CONST_ARG(Name) , const HostSideConstant& Name #define POPART_ATTRIB_VEC_ARG(Name) @@ -67,7 +133,7 @@ void Compiler::RegisterOpFunc() { name_function_ = { #define OP_DECL(FuncName, OnnxImpl, Args) \ - {#FuncName, [&](framework::OpDesc* op_desc) { \ + {#FuncName, [&](OpDesc* op_desc) { \ auto op_type = op_desc->Type(); \ VLOG(10) << "build op:" << op_type << " args " << #Args; \ auto inputs = GetOpInputs(op_desc); \ @@ -77,9 +143,12 @@ void Compiler::RegisterOpFunc() { auto aiOnnxOpset = builder_->aiOnnxOpset11(); \ auto output_ids = OnnxImpl(inputs Args, debug_context); \ SetIpuIndexStage(output_ids, op_desc); \ + SetAMPAttributes(output_ids, op_desc); \ + SetSerializeAttributes(output_ids, op_desc); \ InsertTensors(output_names, output_ids); \ }}, // NOLINT -#include "paddle/fluid/platform/ipu/supported_ops_autogen.h" +#include "paddle/fluid/platform/device/ipu/supported_ops_autogen.h" +#include "paddle/fluid/platform/device/ipu/supported_ops_custom.h" }; #undef OP_DECL @@ -87,146 +156,99 @@ void Compiler::RegisterOpFunc() { #undef POPART_ATTRIB_VEC_ARG #undef HOST_SIDE_CONST_ARG #undef POPART_CONST_ARG +#undef SIG_ARG #undef OPT_ARG #undef ARG #undef NONE #undef STRING_VEC #undef STRING #undef BOOL +#undef INT32 #undef INT #undef FLOAT #undef FLOAT_VEC +#undef INT32_VEC #undef INT_VEC } -void Compiler::LowerBody(const framework::ir::Graph* graph) { +void Compiler::LowerBody(const Graph* graph) { VLOG(10) << "enter Compiler::LowerBody"; auto nodes = framework::ir::TopologySortOperations(*graph); for (auto* node : nodes) { auto* op_desc = node->Op(); auto op_type = op_desc->Type(); - VLOG(10) << "node->type: " << op_type; + VLOG(10) << "lowering op: " << op_type; if (op_type == "popart_constant") { - auto dims = - BOOST_GET_CONST(std::vector, op_desc->GetAttr("dims")); - auto dtype_ = BOOST_GET_CONST(int, op_desc->GetAttr("dtype")); - auto dtype = OnnxDtype2PopartType(dtype_); - popart::TensorInfo tensor_info{dtype, dims}; - auto value_attr = op_desc->GetAttr("value"); - auto const_data = std::unique_ptr{}; - switch (dtype) { - case popart::DataType::FLOAT: - const_data.reset(new popart::ConstVoidData( - BOOST_GET_CONST(std::vector, value_attr).data(), - tensor_info)); - break; - case popart::DataType::INT32: - const_data.reset(new popart::ConstVoidData( - BOOST_GET_CONST(std::vector, value_attr).data(), - tensor_info)); - break; - case popart::DataType::DOUBLE: - const_data.reset(new popart::ConstVoidData( - BOOST_GET_CONST(std::vector, value_attr).data(), - tensor_info)); - break; - case popart::DataType::INT64: - const_data.reset(new popart::ConstVoidData( - BOOST_GET_CONST(std::vector, value_attr).data(), - tensor_info)); - break; - default: - PADDLE_THROW(platform::errors::Unimplemented( - "The popart datatype is not supported, popart::DataType is %d", - dtype)); - } - popart::TensorId result = builder_->aiOnnxOpset11().constant(*const_data); - SetIpuIndexStage(result, op_desc); - InsertTensors(GetOpOutputs(op_desc), result); - } else if (op_type == "popart_batchnormalization") { + // pass + } else if (op_type == "popart_optimizer") { + // pass + } else if (op_type == "popart_checkpointoutput") { auto inputs = GetOpInputs(op_desc); auto outputs = GetOpOutputs(op_desc); - auto num_outputs = outputs.size(); - auto epsilon = BOOST_GET_CONST(float, op_desc->GetAttr("epsilon")); - auto momentum = BOOST_GET_CONST(float, op_desc->GetAttr("momentum")); - auto result = builder_->aiOnnxOpset11().batchnormalization( - inputs, 
num_outputs, epsilon, momentum); - SetIpuIndexStage(result, op_desc); - InsertTensors(GetOpOutputs(op_desc), result); - } else if (op_type == "popart_nllloss") { - auto inputs = GetOpInputs(op_desc); - auto ignoreIndex = BOOST_GET_CONST(int, op_desc->GetAttr("ignoreIndex")); - auto result = builder_->aiGraphcoreOpset1().nllloss( - inputs, popart::ReductionType::NoReduction, ignoreIndex); - SetIpuIndexStage(result, op_desc); - InsertTensors(GetOpOutputs(op_desc), result); - } else if (op_type == "popart_topk") { + auto output_ids = builder_->checkpointOutput(inputs); + InsertTensors(outputs, output_ids); + } else if (op_type == "popart_custom_op") { auto inputs = GetOpInputs(op_desc); auto outputs = GetOpOutputs(op_desc); - int64_t axis = BOOST_GET_CONST(int64_t, op_desc->GetAttr("axis")); - int sorted_INT32 = BOOST_GET_CONST(int, op_desc->GetAttr("sorted")); - int64_t sorted = int64_t{sorted_INT32}; - - auto aiOnnxOpset = builder_->aiOnnxOpset11(); - - popart::ConvInputs result; - if (inputs.size() == 2) { - VLOG(10) - << "[Compiler::LowerBody] size of inputs for is 2"; - result = aiOnnxOpset.topk(inputs, axis, sorted); - } else if (inputs.size() == 1) { - VLOG(10) - << "[Compiler::LowerBody] size of inputs for is 1"; - int64_t k = BOOST_GET_CONST(int64_t, op_desc->GetAttr("k")); - popart::TensorInfo kShape{"INT64", std::vector{1}}; - popart::ConstVoidData kData = {&k, kShape}; - auto K_t = aiOnnxOpset.constant(kData); - result = aiOnnxOpset.topk({inputs[0], K_t}, axis, sorted); + auto debug_context = BuildDebugContext(op_desc); + auto attributes = std::map{}; + for (auto& attr : op_desc->GetAttrMap()) { + CustomOpAttrVisitor visitor(&attributes, attr.first); + boost::apply_visitor(visitor, attr.second); } - result[1] = aiOnnxOpset.cast({result[1]}, "INT32"); - SetIpuIndexStage(result, op_desc); - VLOG(10) << "[Compiler::LowerBody] output[1]: " << outputs[1]; - VLOG(10) << "[Compiler::LowerBody] output[1]: " - << GetOpOutputs(op_desc)[1] << " -> " << result[1]; - tensors_.emplace(GetOpOutputs(op_desc)[1], result[1]); // topk indices - VLOG(10) << "[Compiler::LowerBody] output[0]: " << outputs[0]; - VLOG(10) << "[Compiler::LowerBody] output[0]: " - << GetOpOutputs(op_desc)[0] << " -> " << result[0]; - tensors_.emplace(GetOpOutputs(op_desc)[0], result[0]); // topk values + auto __op_type = + BOOST_GET_CONST(std::string, op_desc->GetAttr("__op_type")); + VLOG(10) << "Build graph from custom op: " << __op_type; + auto it = custom_ops_.find(__op_type); + auto output_ids = + builder_->customOp(it->second.popart_op, it->second.popart_op.version, + inputs, outputs.size(), attributes, debug_context); + SetIpuIndexStage(output_ids, op_desc); + InsertTensors(outputs, output_ids); + } else if (op_type == "popart_printtensor") { + auto inputs = GetOpInputs(op_desc); + auto outputs = GetOpOutputs(op_desc); + auto debug_context = BuildDebugContext(op_desc); + auto print_gradient = + BOOST_GET_CONST(int64_t, op_desc->GetAttr("print_gradient")); + auto title = BOOST_GET_CONST(std::string, op_desc->GetAttr("title")); + auto output_ids = builder_->aiGraphcoreOpset1().printtensor( + inputs, print_gradient, debug_context, title); + SetIpuIndexStage(output_ids, op_desc); + InsertTensors(outputs, output_ids); } else { auto itr = name_function_.find(op_type); if (itr != name_function_.end()) { itr->second(node->Op()); } else { PADDLE_THROW(platform::errors::NotFound( - "Op %s is not registered in popart canonicalization", op_type)); + "%s is not registered, please check for unsupported operators for " + "running 
on IPU", + op_type)); } } } VLOG(10) << "leave Compiler::LowerBody"; } -void Compiler::InitInputs(framework::ir::Graph* graph, +void Compiler::InitInputs(Graph* graph, const std::vector& feed_list) { for (const auto& feed_name : feed_list) { feed_list_.push_back(feed_name); - for (const framework::ir::Node* n : graph->Nodes()) { + for (const Node* n : graph->Nodes()) { if (n->IsVar()) { auto* var_desc = n->Var(); if (feed_name == var_desc->Name()) { VLOG(10) << "feed_name= " << var_desc->Name(); auto data_type = VarType2PopartType(var_desc->GetDataType()); - if (ipu_strategy_->enable_fp16) { - data_type = popart::DataType::FLOAT16; - } popart::TensorInfo input_info{data_type, var_desc->GetShape()}; VLOG(10) << "popart input_info = " << input_info; popart::TensorId tensor_id = builder_->addInputTensor(input_info, feed_name); VLOG(10) << "popart input tensor id = " << tensor_id; - inputs_.push_back(tensor_id); - tensors_.emplace(var_desc->Name(), tensor_id); + resources_->inputs.push_back(tensor_id); + resources_->tensors.emplace(var_desc->Name(), tensor_id); } } } @@ -236,20 +258,58 @@ void Compiler::InitInputs(framework::ir::Graph* graph, void Compiler::InitOutputs(const std::vector& fetch_list) { for (const auto& fetch_name : fetch_list) { fetch_list_.push_back(fetch_name); - auto tensor = tensors_.find(fetch_name); - PADDLE_ENFORCE_NE(tensor, tensors_.end(), - platform::errors::NotFound( - "output tensor %s does not exist.", fetch_name)); + auto tensor = resources_->tensors.find(fetch_name); + PADDLE_ENFORCE_NE( + tensor, resources_->tensors.end(), + platform::errors::NotFound( + "Output tensor %s is not found, please check the model.", + fetch_name)); VLOG(10) << "fetch_name= " << fetch_name; VLOG(10) << "popart output tensor id = " << tensor->second; builder_->addOutputTensor(tensor->second); - outputs_.push_back(tensor->second); + resources_->outputs.push_back(tensor->second); + } +} + +void Compiler::LowerConstants(const Graph* graph, const Scope* scope) { + auto& kid_scope = scope->NewScope(); + VLOG(10) << "enter Compiler::LowerConstants"; + for (auto* node : graph->Nodes()) { + if (!node->IsOp()) { + continue; + } + + auto* op_desc = node->Op(); + auto op_type = op_desc->Type(); + if (op_type == "popart_constant") { + auto shape = + BOOST_GET_CONST(std::vector, op_desc->GetAttr("dims")); + auto dtype_ = BOOST_GET_CONST(int, op_desc->GetAttr("dtype")); + auto dtype = PopartType2VarType(OnnxDtype2PopartType(dtype_)); + auto tensor_name = op_desc->Output("__outputs__")[0]; + auto* var = kid_scope.Var(tensor_name); + VLOG(10) << "lowering constant: " << tensor_name; + auto* tensor = var->GetMutable(); + ConstantOpAttrVisitor visitor(tensor, dtype); + auto value = op_desc->GetAttr("value"); + boost::apply_visitor(visitor, value); + auto ddim = framework::make_ddim(shape); + tensor->Resize(ddim); + + auto const_data = std::unique_ptr(); + popart::TensorInfo tensor_info(VarType2PopartType(tensor->type()), shape); + const_data.reset(new popart::ConstVoidData(tensor->data(), tensor_info)); + popart::TensorId result = builder_->aiOnnxOpset11().constant(*const_data); + SetIpuIndexStage(result, op_desc); + resources_->tensors.emplace(tensor_name, result); + } } + VLOG(10) << "leave Compiler::LowerConstants"; } -void Compiler::LowerWeights(const framework::ir::Graph* graph, - const framework::Scope* scope_) { - PADDLE_ENFORCE_NOT_NULL(scope_, +void Compiler::LowerWeights(const Graph* graph, const Scope* scope) { + VLOG(10) << "enter Compiler::LowerWeights"; + PADDLE_ENFORCE_NOT_NULL(scope, 
platform::errors::PreconditionNotMet( "You should call set_scope before LowerWeights")); // at this step, the graph doesn't contains optimizer related states @@ -257,12 +317,12 @@ void Compiler::LowerWeights(const framework::ir::Graph* graph, if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { if (node->Var()->Persistable() && node->inputs.empty()) { auto var_name = node->Var()->Name(); - // workround: https://github.com/graphcore/Paddle/issues/151 - if (tensors_.count(var_name) != 0) { + if (resources_->tensors.count(var_name) != 0) { continue; } + VLOG(10) << "lowering weight: " << var_name; - auto var = scope_->FindVar(var_name); + auto var = scope->FindVar(var_name); if (var) { auto tensor = var->Get(); auto dtype = VarType2PopartType(tensor.type()); @@ -274,12 +334,113 @@ void Compiler::LowerWeights(const framework::ir::Graph* graph, popart::ConstVoidData const_data{tensor.data(), tensor_info}; popart::TensorId result = builder_->addInitializedInputTensor(const_data, var_name); - tensors_.emplace(var_name, result); - weights_.push_back(result); + resources_->tensors.emplace(var_name, result); + resources_->weights.push_back(result); } } } } + VLOG(10) << "leave Compiler::LowerWeights"; +} + +void Compiler::LowerOptimier(const Graph* graph, const Scope* scope) { + for (auto* node : graph->Nodes()) { + if (!node->IsOp()) { + continue; + } + + auto* op_desc = node->Op(); + auto op_type = op_desc->Type(); + if (op_type == "popart_optimizer") { + auto raw_type = + BOOST_GET_CONST(std::string, op_desc->GetAttr("raw_type")); + resources_->optimizer_type = raw_type; + auto loss_var = + BOOST_GET_CONST(std::string, op_desc->GetAttr("loss_var")); + resources_->loss_var = resources_->tensors[loss_var]; + resources_->with_lr_sched = + BOOST_GET_CONST(bool, op_desc->GetAttr("with_lr_sched")); + if (op_desc->HasAttr("lr_var")) { + auto lr_var = BOOST_GET_CONST(std::string, op_desc->GetAttr("lr_var")); + resources_->lr_var = lr_var; + resources_->lr = GetSingleVarFromScope(scope, lr_var); + } else { + // adadelta has no lr + resources_->lr = 0.01f; + resources_->with_lr_sched = false; + } + VLOG(10) << "Set initial lr: " << resources_->lr; + auto loss_scaling = ipu_strategy_->loss_scaling; + auto type = BOOST_GET_CONST(std::string, op_desc->GetAttr("type")); + if (type == "sgd") { + auto weight_decay = + BOOST_GET_CONST(float, op_desc->GetAttr("weight_decay")); + auto momentum = BOOST_GET_CONST(float, op_desc->GetAttr("momentum")); + resources_->optimizer_fn = [=](float lr) { + return std::make_unique( + popart::OptimizerValue(lr, false), + popart::OptimizerValue(weight_decay, true), + popart::OptimizerValue(momentum, true), + popart::SGD::getUnsetDampening(), + popart::SGD::getUnsetVelocityScaling(), + popart::OptimizerValue(loss_scaling, true)); + }; + } else if (type == "adam") { + auto weight_decay = + BOOST_GET_CONST(float, op_desc->GetAttr("weight_decay")); + auto beta1 = BOOST_GET_CONST(float, op_desc->GetAttr("beta1")); + auto beta2 = BOOST_GET_CONST(float, op_desc->GetAttr("beta2")); + auto eps = BOOST_GET_CONST(float, op_desc->GetAttr("eps")); + auto mwn = ipu_strategy_->max_weight_norm; + VLOG(10) << "set max_weight_norm: " << mwn; + auto adam_mode_ = + BOOST_GET_CONST(std::string, op_desc->GetAttr("adam_mode")); + auto adam_mode = AdamModeFromStr(adam_mode_); + auto weight_decay_mode_ = + BOOST_GET_CONST(std::string, op_desc->GetAttr("weight_decay_mode")); + auto weight_decay_mode = WeightDecayModeFromStr(weight_decay_mode_); + resources_->optimizer_fn = [=](float lr) { + return 
std::make_unique<popart::Adam>(
+            popart::OptimizerValue(lr, false),
+            popart::OptimizerValue(weight_decay, true),
+            popart::OptimizerValue(beta1, true),
+            popart::OptimizerValue(beta2, true),
+            popart::OptimizerValue(eps, true),
+            popart::OptimizerValue(loss_scaling, true),
+            popart::OptimizerValue(mwn, true), adam_mode, weight_decay_mode,
+            popart::DataType::UNDEFINED, popart::DataType::FLOAT,
+            popart::DataType::FLOAT);
+        };
+      } else if (type == "adaptive") {
+        auto alpha = BOOST_GET_CONST(float, op_desc->GetAttr("alpha"));
+        auto momentum = BOOST_GET_CONST(float, op_desc->GetAttr("momentum"));
+        auto eps = BOOST_GET_CONST(float, op_desc->GetAttr("eps"));
+        auto weight_decay =
+            BOOST_GET_CONST(float, op_desc->GetAttr("weight_decay"));
+        auto adaptive_mode_ =
+            BOOST_GET_CONST(std::string, op_desc->GetAttr("adaptive_mode"));
+        auto adaptive_mode = AdaptiveModeFromStr(adaptive_mode_);
+        auto weight_decay_mode_ =
+            BOOST_GET_CONST(std::string, op_desc->GetAttr("weight_decay_mode"));
+        auto weight_decay_mode = WeightDecayModeFromStr(weight_decay_mode_);
+        resources_->optimizer_fn = [=](float lr) {
+          return std::make_unique<popart::Adaptive>(
+              popart::OptimizerValue(lr, false),
+              popart::OptimizerValue(weight_decay, true),
+              popart::OptimizerValue(alpha, true),
+              popart::OptimizerValue(momentum, true),
+              popart::OptimizerValue(eps, true),
+              popart::OptimizerValue(loss_scaling, true), adaptive_mode,
+              weight_decay_mode, popart::DataType::UNDEFINED,
+              popart::DataType::FLOAT, popart::DataType::FLOAT,
+              popart::DataType::FLOAT);
+        };
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "optimizer %s is not implemented", type));
+      }
+    }
+  }
 }
 
 void Compiler::InsertTensors(const std::vector<std::string>& output_names,
@@ -288,7 +449,7 @@
                     platform::errors::Fatal("InsertTensors size mismatch"));
   for (int i = 0; i < tensor_ids.size(); i++) {
     std::string tensor_id = tensor_ids[i];
-    tensors_.emplace(output_names[i], tensor_ids[i]);
+    resources_->tensors.emplace(output_names[i], tensor_ids[i]);
   }
 }
 
@@ -296,11 +457,11 @@
                                const std::string& tensor_id) {
   PADDLE_ENFORCE_EQ(output_names.size(), 1,
                     platform::errors::Fatal("InsertTensors size mismatch"));
-  tensors_.emplace(output_names[0], tensor_id);
+  resources_->tensors.emplace(output_names[0], tensor_id);
 }
 
 void Compiler::SetIpuIndexStage(const std::vector<std::string>& tensor_ids,
-                                const framework::OpDesc* op_desc) {
+                                const OpDesc* op_desc) {
   VLOG(10) << "enter Compiler::SetIpuIndexStage";
   auto tensor_ids_set =
       std::set<std::string>(tensor_ids.begin(), tensor_ids.end());
@@ -321,7 +482,7 @@ void Compiler::SetIpuIndexStage(const std::vector<std::string>& tensor_ids,
 }
 
 void Compiler::SetIpuIndexStage(const std::string& tensor_id,
-                                const framework::OpDesc* op_desc) {
+                                const OpDesc* op_desc) {
   VLOG(10) << "enter Compiler::SetIpuIndexStage";
 
   if (op_desc->HasAttr(sIpuIndexAttr)) {
@@ -339,20 +500,73 @@ void Compiler::SetIpuIndexStage(const std::string& tensor_id,
   VLOG(10) << "leave Compiler::SetIpuIndexStage";
 }
 
-std::vector<popart::TensorId>& Compiler::GetWeights() { return weights_; }
+void Compiler::SetAMPAttributes(const std::vector<std::string>& tensor_ids,
+                                const OpDesc* op_desc) {
+  if (op_desc->Type() == "popart_matmul") {
+    for (const auto& tensor_id : tensor_ids) {
+      SetAMPAttributes(tensor_id, op_desc);
+    }
+  }
+}
+
+void Compiler::SetAMPAttributes(const std::string& tensor_id,
+                                const OpDesc* op_desc) {
+  VLOG(10) << "enter Compiler::SetAMPAttributes";
+  if (op_desc->Type() == "popart_matmul") {
+    auto amp = 
ipu_strategy_->available_memory_proportion; + if (amp > 0.0f && amp <= 1.0) { + builder_->setAvailableMemoryProportion(tensor_id, amp); + } + } + VLOG(10) << "leave Compiler::SetAMPAttributes"; +} + +void Compiler::SetSerializeAttributes( + const std::vector& tensor_ids, const OpDesc* op_desc) { + VLOG(10) << "enter Compiler::SetSerializeAttributes"; + auto tensor_ids_set = + std::set(tensor_ids.begin(), tensor_ids.end()); + + if (op_desc->Type() == "popart_matmul") { + if (op_desc->HasAttr(sMatmulSerializeFactor)) { + auto factor = + BOOST_GET_CONST(int, op_desc->GetAttr(sMatmulSerializeFactor)); + std::string mode = "output_channels"; + if (op_desc->HasAttr(sMatmulSerializeMode)) { + mode = BOOST_GET_CONST(std::string, + op_desc->GetAttr(sMatmulSerializeMode)); + } + builder_->setSerializeMatMul(tensor_ids_set, mode, (int64_t)factor, true); + } + } + VLOG(10) << "leave Compiler::SetSerializeAttributes"; +} + +void Compiler::SetSerializeAttributes(const std::string& tensor_id, + const OpDesc* op_desc) { + std::vector tensor_ids = {tensor_id}; + SetSerializeAttributes(tensor_ids, op_desc); +} -// convertFloatsToHalfs -void Compiler::ConvertProtoToFp16() { +void Compiler::SetCustomOps( + const std::vector& custom_ops) { + for (auto x : custom_ops) { + custom_ops_.emplace(x.paddle_op, x); + } +} + +std::string Compiler::GetFP16ModelProto() { popart::GraphTransformer graph_transformer(builder_->getModelProto()); graph_transformer.convertFloatsToHalfs(); - converted_proto_ = graph_transformer.getModelProto(); + return graph_transformer.getModelProto(); } std::string Compiler::GetModelProto() { - if (converted_proto_.length()) { - return converted_proto_; + if (ipu_strategy_->enable_fp16) { + return GetFP16ModelProto(); + } else { + return builder_->getModelProto(); } - return builder_->getModelProto(); } void Compiler::SaveModelProto(const std::string& path) { @@ -366,12 +580,12 @@ void Compiler::SaveModelProtoNoCheck(const std::string& path) { onnxfile.close(); } -std::vector Compiler::GetOpInputs(const framework::OpDesc* op) { +std::vector Compiler::GetOpInputs(const OpDesc* op) { auto ins = op->Input("__inputs__"); std::vector inputs; for (const auto& in : ins) { - if (tensors_.find(in) != tensors_.end()) { - inputs.push_back(tensors_[in]); + if (resources_->tensors.find(in) != resources_->tensors.end()) { + inputs.push_back(resources_->tensors[in]); } else { inputs.push_back(in); } @@ -379,12 +593,11 @@ std::vector Compiler::GetOpInputs(const framework::OpDesc* op) { return inputs; } -const std::vector& Compiler::GetOpOutputs( - const framework::OpDesc* op) { +const std::vector& Compiler::GetOpOutputs(const OpDesc* op) { return op->Output("__outputs__"); } -popart::DebugContext Compiler::BuildDebugContext(const framework::OpDesc* op) { +popart::DebugContext Compiler::BuildDebugContext(const OpDesc* op) { auto op_identify_id = BOOST_GET_CONST(std::string, op->GetAttr(sOpIdentifyIdAttr)); VLOG(10) << "op_identify_id of op: " << op->Type() << " is " diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.h b/paddle/fluid/platform/device/ipu/ipu_compiler.h index ecee1595bb8..5576266b1a7 100644 --- a/paddle/fluid/platform/device/ipu/ipu_compiler.h +++ b/paddle/fluid/platform/device/ipu/ipu_compiler.h @@ -16,76 +16,119 @@ #include #include +#include #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/platform/ipu/common.h" -#include "paddle/fluid/platform/ipu/ipu_strategy.h" +#include "paddle/fluid/platform/device/ipu/ipu_names.h" 
+#include "paddle/fluid/platform/device/ipu/ipu_strategy.h" +#include "paddle/fluid/platform/device/ipu/ipu_utils.h" namespace paddle { namespace platform { namespace ipu { +struct CompilerResources { + // popart input tensor_ids + std::vector inputs; + // popart output tensor_ids + std::vector outputs; + // + std::map tensors; + // popart_weight_ids + std::vector weights; + // popart loss tensor_id + popart::TensorId loss_var; + // paddle lr var_name + std::string lr_var; + // lr value + float lr; + // flag for lr is constant or scheduling + bool with_lr_sched = false; + // paddle optimizer type, eg: momentum, lamb + std::string optimizer_type; + + using OptimizerFn = + std::function(float lr)>; + OptimizerFn optimizer_fn; + + public: + popart::Optimizer *Optimizer() { return optimizer.get(); } + + popart::Optimizer *NewOptimizer() { + optimizer = optimizer_fn(lr); + return optimizer.get(); + } + + popart::Optimizer *UpdateOptimizer(float lr_new) { + optimizer = optimizer_fn(lr_new); + return optimizer.get(); + } + + private: + std::unique_ptr optimizer; +}; + class Compiler { public: Compiler(); ~Compiler(); + void RegisterOpFunc(); - void LowerBody(const framework::ir::Graph *graph); - void InitInputs(framework::ir::Graph *graph, - const std::vector &feed_list); + void Prepare(); + void LowerBody(const Graph *graph); + void InitInputs(Graph *graph, const std::vector &feed_list); void InitOutputs(const std::vector &fetch_list); - void LowerWeights(const framework::ir::Graph *graph, - const framework::Scope *scope_); + void LowerConstants(const Graph *graph, const Scope *scope); + void LowerWeights(const Graph *graph, const Scope *scope); + void LowerOptimier(const Graph *graph, const Scope *scope); void InsertTensors(const std::vector &output_names, const std::vector &tensor_ids); void InsertTensors(const std::vector &output_names, const std::string &tensor_id); void SetIpuIndexStage(const std::vector &tensor_ids, - const framework::OpDesc *op_desc); - void SetIpuIndexStage(const std::string &tensor_id, - const framework::OpDesc *op_desc); - - std::vector GetInputs() { return inputs_; } - std::vector GetOutputs() { return outputs_; } - std::map GetTensors() { return tensors_; } - std::vector &GetWeights(); + const OpDesc *op_desc); + void SetIpuIndexStage(const std::string &tensor_id, const OpDesc *op_desc); + void SetAMPAttributes(const std::vector &tensor_ids, + const OpDesc *op_desc); + void SetAMPAttributes(const std::string &tensor_id, const OpDesc *op_desc); + void SetSerializeAttributes(const std::vector &tensor_ids, + const OpDesc *op_desc); + void SetSerializeAttributes(const std::string &tensor_id, + const OpDesc *op_desc); - std::string GetModelProto(); void SetIpuStrategy(const IpuStrategy &strategy) { ipu_strategy_ = &strategy; - }; + } + + void SetCustomOps(const std::vector &custom_ops); + + CompilerResources *GetResources() { return resources_.get(); } + + std::string GetModelProto(); + std::string GetFP16ModelProto(); + void SaveModelProto(const std::string &path); void SaveModelProtoNoCheck(const std::string &path); - void ConvertProtoToFp16(); private: - std::vector GetOpInputs(const framework::OpDesc *op); - const std::vector &GetOpOutputs(const framework::OpDesc *op); - popart::DebugContext BuildDebugContext(const framework::OpDesc *op); + std::vector GetOpInputs(const OpDesc *op); + const std::vector &GetOpOutputs(const OpDesc *op); + popart::DebugContext BuildDebugContext(const OpDesc *op); private: std::unique_ptr builder_; + std::unique_ptr resources_; - using 
OpFunc = std::function; + using OpFunc = std::function; std::unordered_map name_function_; - // stateful variable - std::map tensors_; - // feed_list_ & fetch_list save paddle tensor id std::vector feed_list_; std::vector fetch_list_; - // inputs_ & outputs_ save popart tensor id - std::vector inputs_; - std::vector outputs_; - - // weights info map - std::vector weights_; - - std::string converted_proto_ = ""; const IpuStrategy *ipu_strategy_ = nullptr; + std::map custom_ops_; }; } // namespace ipu diff --git a/paddle/fluid/platform/device/ipu/ipu_device.cc b/paddle/fluid/platform/device/ipu/ipu_device.cc new file mode 100644 index 00000000000..cd2a628c9ab --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_device.cc @@ -0,0 +1,55 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/device/ipu/ipu_device.h" +#include "paddle/fluid/platform/device/ipu/ipu_utils.h" + +namespace paddle { +namespace platform { +namespace ipu { + +int GetNumDevices() { + bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL"); + if (ipu_model) { + return 1; + } + int num_devices = + popart::DeviceManager::createDeviceManager().enumerateDevices().size(); + PADDLE_ENFORCE_GT(num_devices, 0, platform::errors::Unavailable( + "Do not found any IPU devices, please " + "make sure Poplar sdk is enabled")); + return num_devices; +} + +std::vector GetDeviceIds() { + bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL"); + if (ipu_model) { + return {0}; + } + std::vector device_ids; + auto devices = + popart::DeviceManager::createDeviceManager().enumerateDevices(); + PADDLE_ENFORCE_GT( + devices.size(), 0, + platform::errors::Unavailable("Do not found any IPU devices, please make " + "sure Poplar sdk is enabled.")); + for (auto device : devices) { + device_ids.push_back(device->getId()); + } + return device_ids; +} + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/device.h b/paddle/fluid/platform/device/ipu/ipu_device.h similarity index 65% rename from paddle/fluid/platform/device/ipu/device.h rename to paddle/fluid/platform/device/ipu/ipu_device.h index 24a8bdec308..3da13a522e1 100644 --- a/paddle/fluid/platform/device/ipu/device.h +++ b/paddle/fluid/platform/device/ipu/ipu_device.h @@ -21,23 +21,11 @@ namespace paddle { namespace platform { namespace ipu { -enum class DeviceType { IpuModel = 0, Cpu, Ipu, OfflineIpu, Sim }; - -class Device { - public: - Device() {} - explicit Device(const popart::DeviceInfo& device_info); - - int getId() const { return id_; } - bool isAttached() const { return is_attached_; } - DeviceType getType() const { return device_type_; } - - private: - int id_; - bool is_attached_; - DeviceType device_type_; - /* TODO:: Add more elements in the future */ -}; +// get the number of all avaliable IPUs +int GetNumDevices(); + +// get the device id of all avaliable IPUs +std::vector GetDeviceIds(); } // namespace ipu } // namespace platform diff --git 
a/paddle/fluid/platform/device/ipu/ipu_info.cc b/paddle/fluid/platform/device/ipu/ipu_info.cc index c184149a9d3..4506bfbf972 100644 --- a/paddle/fluid/platform/device/ipu/ipu_info.cc +++ b/paddle/fluid/platform/device/ipu/ipu_info.cc @@ -10,23 +10,18 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/device/ipu/ipu_info.h" -#include "paddle/fluid/platform/device/ipu/ipu_backend.h" +#include "paddle/fluid/platform/device/ipu/ipu_device.h" namespace paddle { namespace platform { //! Get a list of device ids from environment variable or use all. std::vector GetSelectedIPUDevices() { - std::shared_ptr ipu_backend = - platform::ipu::IpuBackend::GetInstance(); - return ipu_backend->GetDeviceIds(); + return platform::ipu::GetDeviceIds(); } //! Get the total number of IPU devices in system. -int GetIPUDeviceCount() { - std::shared_ptr ipu_backend = - platform::ipu::IpuBackend::GetInstance(); - return ipu_backend->GetNumDevices(); -} +int GetIPUDeviceCount() { return platform::ipu::GetNumDevices(); } + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_info.h b/paddle/fluid/platform/device/ipu/ipu_info.h index 3d032eeb4bf..fe7076e0b50 100644 --- a/paddle/fluid/platform/device/ipu/ipu_info.h +++ b/paddle/fluid/platform/device/ipu/ipu_info.h @@ -17,8 +17,10 @@ limitations under the License. */ namespace paddle { namespace platform { + std::vector GetSelectedIPUDevices(); int GetIPUDeviceCount(); + } // namespace platform } // namespace paddle #endif diff --git a/paddle/fluid/platform/device/ipu/common.h b/paddle/fluid/platform/device/ipu/ipu_names.h similarity index 85% rename from paddle/fluid/platform/device/ipu/common.h rename to paddle/fluid/platform/device/ipu/ipu_names.h index 7d62f10abd2..a809a8c6e5b 100644 --- a/paddle/fluid/platform/device/ipu/common.h +++ b/paddle/fluid/platform/device/ipu/ipu_names.h @@ -22,6 +22,8 @@ namespace ipu { static constexpr const char *sIpuIndexAttr = "ipu_index"; static constexpr const char *sIpuStageAttr = "ipu_stage"; +static constexpr const char *sMatmulSerializeFactor = "serialize_factor"; +static constexpr const char *sMatmulSerializeMode = "serialize_mode"; static constexpr const char *sOpIdentifyIdAttr = "op_identify_id"; static constexpr const char *sDebugInfoId = "__debug_info_id"; @@ -29,6 +31,7 @@ static constexpr const char *sBeta1 = "beta1"; static constexpr const char *sBeta2 = "beta2"; static constexpr const char *sBeta1Pow = "Beta1Pow"; static constexpr const char *sBeta2Pow = "Beta2Pow"; +static constexpr const char *sLossScaling = "LossScaling"; } // namespace ipu } // namespace platform diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.cc b/paddle/fluid/platform/device/ipu/ipu_strategy.cc index 47e7e332c8f..2ddead420d3 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.cc +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.cc @@ -12,10 +12,27 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/platform/ipu/ipu_strategy.h" +#include "paddle/fluid/platform/device/ipu/ipu_strategy.h" +#include namespace paddle { namespace platform { -namespace ipu {} // namespace ipu +namespace ipu { + +void IpuStrategy::enablePattern(const std::string& t) { + VLOG(10) << "enable popart pattern: " << t; + popart_patterns.enablePattern(t, true); +} + +void IpuStrategy::disablePattern(const std::string& t) { + VLOG(10) << "disable popart pattern: " << t; + popart_patterns.enablePattern(t, false); +} + +const bool IpuStrategy::isPatternEnabled(const std::string& t) { + return popart_patterns.isPatternEnabled(t); +} + +} // namespace ipu } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.h b/paddle/fluid/platform/device/ipu/ipu_strategy.h index 7e07d517e10..08f09b96cc0 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.h +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.h @@ -14,24 +14,86 @@ limitations under the License. */ #pragma once +#include #include +#include +#include "popart/patterns/patterns.hpp" namespace paddle { namespace platform { namespace ipu { using VirtualGraphMode = popart::VirtualGraphMode; +using RecomputationType = popart::RecomputationType; struct IpuStrategy { + IpuStrategy() { + // we always save optimizer state to OffChip and enable rts for saving + // memory + auto storage = popart::TensorLocation(popart::TensorStorage::OffChip, + popart::ReplicatedTensorSharding::On); + popart_options.optimizerStateTensorLocationSettings = + popart::TensorLocationSettings(storage); + + // We divide the accumulationFactor and replicatedGraphCount after all + // reduce + popart_options.accumulationAndReplicationReductionType = + popart::ReductionType::Mean; + popart_options.meanAccumulationAndReplicationReductionStrategy = + popart::MeanReductionStrategy::Post; + + popart_options.enableFloatingPointChecks = false; + + // A directory for log traces to be written into. 
+ popart_options.logDir = "popart_log"; + } + ~IpuStrategy() {} + + // Number ipus total needed, replica * ipu_per_replica int num_ipus = 1; + + // batches per step int batches_per_step = 1; - int batch_size = 1; + + // micro batch-size + int micro_batch_size = 1; + + // training flag, true for training bool is_training = true; + + // save the onnx model lowered by paddle program description bool save_init_onnx = false; - bool save_last_onnx = true; - popart::SessionOptions popart_options_; + + // save the trained model + bool save_onnx_checkpoint = false; + + // save paddle model per n steps + int save_per_n_step = 1; + + // average sharding, debugging used bool need_avg_shard = false; + + // flag for fp16, true for pure fp16 bool enable_fp16 = false; + + // available memory proportion, 0.0f for disable + float available_memory_proportion = 0.0f; + + // loss scaling, currently we can't get loss scaling from + // optimizer_extract_pass, so we have to set it here + float loss_scaling = 1.0f; + + // defaultMaxWeightNorm for adam optimizer + float max_weight_norm = 65504.0f; + + // popart session option + popart::SessionOptions popart_options; + popart::Patterns popart_patterns; + + public: + void enablePattern(const std::string& t); + void disablePattern(const std::string& t); + const bool isPatternEnabled(const std::string& t); }; } // namespace ipu diff --git a/paddle/fluid/platform/device/ipu/ipu_utils.cc b/paddle/fluid/platform/device/ipu/ipu_utils.cc index 4dfe8c4efbe..6e221fae84e 100644 --- a/paddle/fluid/platform/device/ipu/ipu_utils.cc +++ b/paddle/fluid/platform/device/ipu/ipu_utils.cc @@ -12,22 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/platform/ipu/ipu_utils.h" +#include "paddle/fluid/platform/device/ipu/ipu_utils.h" +#include namespace paddle { namespace platform { namespace ipu { -void* PaddleIArray::data() { return tensor_->data(); } +void* PaddleIArray::data() { return tensor_.data(); } popart::DataType PaddleIArray::dataType() const { - return VarType2PopartType(tensor_->type()); + return VarType2PopartType(tensor_.type()); } -std::size_t PaddleIArray::rank() const { return tensor_->dims().size(); } +std::size_t PaddleIArray::rank() const { return tensor_.dims().size(); } int64_t PaddleIArray::dim(size_t index) const { - return tensor_->dims().at(index); + return tensor_.dims().at(index); } std::size_t PaddleIArray::nelms() const { @@ -150,6 +151,32 @@ bool GetBoolEnv(std::string str) { } } +std::vector> GetOptPrePostfix( + const std::string& opt_type) { + // format: {popart_tensor_id, paddle_tensor_id}, ... 
+ std::vector> pre_post_fix; + + if (opt_type == "adam" || opt_type == "lamb") { + pre_post_fix.push_back(std::make_pair("", "")); + pre_post_fix.push_back(std::make_pair("Accl1___", "_moment1_0")); + pre_post_fix.push_back(std::make_pair("Accl2___", "_moment2_0")); + pre_post_fix.push_back(std::make_pair("Step___", "_beta1_pow_acc_0")); + } else if (opt_type == "sgd" || opt_type == "momentum") { + // sgd + pre_post_fix.push_back(std::make_pair("", "")); + } else { + pre_post_fix.push_back(std::make_pair("", "")); + // + } + + return pre_post_fix; +} + +int RequestIpus(const int num_ipus) { + // num_ipus must be pow(2, n); + return std::pow(2, ceil(log2(num_ipus))); +} + } // namespace ipu } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_utils.h b/paddle/fluid/platform/device/ipu/ipu_utils.h index 3a3b9c8ccc2..3cd7115b5eb 100644 --- a/paddle/fluid/platform/device/ipu/ipu_utils.h +++ b/paddle/fluid/platform/device/ipu/ipu_utils.h @@ -17,14 +17,27 @@ limitations under the License. */ #include #include #include +#include -#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/float16.h" namespace paddle { namespace platform { namespace ipu { +using float16 = platform::float16; +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using Scope = framework::Scope; +using OpDesc = framework::OpDesc; +using Graph = framework::ir::Graph; +using Node = framework::ir::Node; +using BlockDesc = framework::BlockDesc; + // onnx dtype // https://github.com/onnx/onnx/blob/master/onnx/onnx-ml.proto3 enum ONNXDataType : int { @@ -49,14 +62,15 @@ enum ONNXDataType : int { class PaddleIArray final : public popart::IArray { public: - explicit PaddleIArray(framework::Tensor *tensor) : tensor_(tensor) { + explicit PaddleIArray(const Tensor* tensor) { + tensor_.ShareDataWith(*tensor); for (int i = 0; i < tensor->dims().size(); ++i) { shape_.push_back(tensor->dims().at(i)); } } public: - void *data(); + void* data(); popart::DataType dataType() const; std::size_t rank() const; int64_t dim(size_t index) const; @@ -64,7 +78,7 @@ class PaddleIArray final : public popart::IArray { const popart::Shape shape() const; private: - framework::Tensor *tensor_; + Tensor tensor_; std::vector shape_; }; @@ -74,8 +88,7 @@ popart::DataType OnnxDtype2PopartType(const int type); bool GetBoolEnv(std::string str); template -std::unique_ptr> Tensor2IArray( - const framework::Tensor &tensor) { +std::unique_ptr> Tensor2IArray(const Tensor& tensor) { auto dtype = VarType2PopartType(tensor.type()); auto shape = std::vector(); for (size_t i = 0; i < tensor.dims().size(); ++i) { @@ -84,18 +97,140 @@ std::unique_ptr> Tensor2IArray( popart::TensorInfo tensor_info(dtype, shape); return std::make_unique>( - reinterpret_cast(tensor.data()), tensor_info); + reinterpret_cast(tensor.data()), tensor_info); } template std::unique_ptr> LoDTensor2IArray( - framework::LoDTensor const &lod_tensor) { - PADDLE_ENFORCE_EQ( - lod_tensor.lod().size(), 0UL, - platform::errors::InvalidArgument("LoDTensor2IArray is Unimplemented")); - return Tensor2IArray(lod_tensor); + LoDTensor const& lod_tensor) { + if (lod_tensor.lod().size() == 0) { + return Tensor2IArray(lod_tensor); + } else { + PADDLE_THROW( + platform::errors::Unimplemented("LoDTensor2IArray is Unimplemented")); + } +} + 
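// (Editorial worked example, not part of the patch.) RequestIpus() in
// ipu_utils.cc above rounds a request up to the next power of two, since
// multi-IPU devices are acquired in power-of-two groups:
//   RequestIpus(1) == 1, RequestIpus(2) == 2,
//   RequestIpus(3) == 4, RequestIpus(5) == 8.
// GetOptPrePostfix() pairs a popart state prefix with a paddle suffix: for
// adam/lamb, "Accl1___" + weight name on the popart side corresponds to
// weight name + "_moment1_0" in the paddle scope.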
+template <typename T>
+T GetSingleVarFromScope(const Scope* scope, const std::string& var_name) {
+  auto var = scope->GetVar(var_name);
+  auto tensor = var->Get<framework::LoDTensor>();
+  // note: assumes T matches the dtype of the variable
+  return tensor.data<T>()[0];
+}
+
+struct CustomOpAttrVisitor : public boost::static_visitor<void> {
+  explicit CustomOpAttrVisitor(std::map<std::string, popart::any>* attr,
+                               const std::string& attr_name)
+      : attrs_(attr), attr_name_(attr_name) {}
+  mutable std::map<std::string, popart::any>* attrs_;
+  std::string attr_name_;
+
+  void operator()(int v) const { attrs_->emplace(attr_name_, v); }
+  void operator()(float v) const { attrs_->emplace(attr_name_, v); }
+  void operator()(const std::string& v) const {
+    attrs_->emplace(attr_name_, v);
+  }
+  void operator()(const std::vector<int>& v) const {
+    attrs_->emplace(attr_name_, v);
+  }
+  void operator()(const std::vector<float>& v) const {
+    attrs_->emplace(attr_name_, v);
+  }
+  void operator()(const std::vector<std::string>& v) const {
+    attrs_->emplace(attr_name_, v);
+  }
+  void operator()(bool v) const { attrs_->emplace(attr_name_, v); }
+  void operator()(const std::vector<bool>& v) const {
+    attrs_->emplace(attr_name_, v);
+  }
+  void operator()(BlockDesc* desc) const {
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Unsupported calling method for `BlockDesc` type."));
+  }
+  void operator()(const std::vector<BlockDesc*>& v) const {
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Unsupported calling method for `BlockDesc` type."));
+  }
+  void operator()(int64_t v) const { attrs_->emplace(attr_name_, v); }
+  void operator()(const std::vector<int64_t>& v) const {
+    attrs_->emplace(attr_name_, v);
+  }
+  void operator()(const std::vector<double>& v) const {
+    attrs_->emplace(attr_name_, v);
+  }
+  void operator()(boost::blank) const {
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Unsupported calling method for `boost::blank` type."));
+  }
+};
+
+struct IpuCustomOpIdentifier {
+  IpuCustomOpIdentifier(const std::string& _paddle_op,
+                        const std::string& _popart_op,
+                        const std::string& _domain, unsigned int _version)
+      : paddle_op(_paddle_op), popart_op(_domain, _popart_op, _version) {}
+
+  std::string repr() {
+    std::ostringstream os;
+    os << "paddle_op: " << paddle_op << ", domain: " << popart_op.domain
+       << ", type: " << popart_op.type << ", version: " << popart_op.version;
+    return os.str();
+  }
+
+  std::string paddle_op;
+  popart::OperatorIdentifier popart_op;
+};
+
+struct ConstantOpAttrVisitor : public boost::static_visitor<void> {
+  explicit ConstantOpAttrVisitor(framework::LoDTensor* tensor,
+                                 framework::proto::VarType::Type dtype)
+      : tensor_(tensor), dtype_(dtype) {}
+  framework::LoDTensor* tensor_;
+  framework::proto::VarType::Type dtype_;
+
+  void operator()(const std::vector<int>& vec) const {
+    framework::TensorFromVector(vec, tensor_);
+  }
+  void operator()(const std::vector<float>& vec) const {
+    if (dtype_ == framework::proto::VarType::FP16) {
+      std::vector<float16> vec_fp16;
+      std::transform(vec.begin(), vec.end(), std::back_inserter(vec_fp16),
+                     [](float f) -> float16 { return float16(f); });
+      framework::TensorFromVector(vec_fp16, tensor_);
+    } else {
+      framework::TensorFromVector(vec, tensor_);
+    }
+  }
+  void operator()(const std::vector<bool>& vec) const {
+    framework::TensorFromVector(vec, tensor_);
+  }
+  void operator()(const std::vector<int64_t>& vec) const {
+    framework::TensorFromVector(vec, tensor_);
+  }
+  void operator()(const std::vector<double>& vec) const {
+    framework::TensorFromVector(vec, tensor_);
+  }
+  void RaiseError() const {
+    PADDLE_THROW(
+        platform::errors::InvalidArgument("Constant value must be a vector"));
+  }
+  void operator()(int v) const { RaiseError(); }
+  void operator()(float v) const { RaiseError(); }
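+  // note: a constant tensor can only be materialized from a vector
+  // attribute; the scalar and unsupported overloads here all raise the
+  // InvalidArgument error above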
+  void operator()(const std::string& v) const { RaiseError(); }
+  void operator()(const std::vector<std::string>& v) const { RaiseError(); }
+  void operator()(bool v) const { RaiseError(); }
+  void operator()(BlockDesc* desc) const { RaiseError(); }
+  void operator()(const std::vector<BlockDesc*>& v) const { RaiseError(); }
+  void operator()(int64_t v) const { RaiseError(); }
+  void operator()(boost::blank) const { RaiseError(); }
+};
+
+std::vector<std::pair<std::string, std::string>> GetOptPrePostfix(
+    const std::string& opt_type);
+
+int RequestIpus(const int num_ipus);
+
 }  // namespace ipu
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/device/ipu/supported_ops_custom.h b/paddle/fluid/platform/device/ipu/supported_ops_custom.h
new file mode 100644
index 00000000000..02d215433c5
--- /dev/null
+++ b/paddle/fluid/platform/device/ipu/supported_ops_custom.h
@@ -0,0 +1,21 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// clang-format off
+
+#pragma once
+
+OP_DECL(popart_nllloss_v2, aiGraphcoreOpset.nllloss, SIG_ARG(INT32,popart::ReductionType,reduction) OPT_ARG(INT32,ignoreIndex) ARG(BOOL,inputIsLogProbability) ) // NOLINT
+
+// clang-format on
--
GitLab
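
A minimal sketch of how the IpuCustomOpIdentifier helper introduced in
ipu_utils.h is expected to be used; the op and domain names below are
hypothetical, chosen for illustration only:

    #include <iostream>
    #include "paddle/fluid/platform/device/ipu/ipu_utils.h"

    int main() {
      using paddle::platform::ipu::IpuCustomOpIdentifier;
      // "custom_relu", "CustomRelu" and "custom.ops" are made-up names
      IpuCustomOpIdentifier id("custom_relu", "CustomRelu", "custom.ops", 1);
      // prints: paddle_op: custom_relu, domain: custom.ops,
      //         type: CustomRelu, version: 1
      std::cout << id.repr() << std::endl;
      return 0;
    }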