Unverified commit b2aee3e3, authored by Allen Guo, committed by GitHub

[IPU] update ipu_backend p0 (#38854)

* update ipu_backend

* sync with paddle internal
Co-authored-by: Xiaobing Wang <xiaobingw@graphcore.ai>
Co-authored-by: Allen Guo <alleng@graphcore.ai>
Co-authored-by: Zhixin Yao <zhixiny@graphcore.ai>
Co-authored-by: Haicheng Jiang <haichengj@graphcore.ai>
Co-authored-by: Han Zhao <hanzhao@graphcore.ai>

* apply comments 01

* update error message

* restore ipu_executor and ipu_optimizer

* add clang-format on
Co-authored-by: Xiaobing Wang <xiaobingw@graphcore.ai>
Co-authored-by: Zhixin Yao <zhixiny@graphcore.ai>
Co-authored-by: Haicheng Jiang <haichengj@graphcore.ai>
Co-authored-by: Han Zhao <hanzhao@graphcore.ai>
Parent b4cb3589
IF(WITH_IPU)
FILE(GLOB POPART_CANONICALIZATION_SRC ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/device/ipu/popart_canonicalization/*.cc)
cc_library(ipu_device SRCS device.cc DEPS enforce popart)
cc_library(ipu_utils SRCS ipu_utils.cc DEPS memory framework_proto popart)
cc_library(ipu_strategy SRCS ipu_strategy.cc DEPS popart graph framework_proto enforce)
cc_library(ipu_optimizer SRCS ipu_optimizer.cc DEPS popart enforce)
cc_library(ipu_executor SRCS ipu_executor.cc DEPS ipu_optimizer ipu_utils popart graph framework_proto)
cc_library(popart_canonicalization_utils SRCS ${POPART_CANONICALIZATION_SRC} DEPS framework_proto enforce ipu_utils)
cc_library(ipu_compiler SRCS ipu_compiler.cc DEPS popart graph ipu_utils graph_helper)
cc_library(ipu_backend SRCS ipu_backend.cc DEPS popart ipu_compiler graph framework_proto enforce ipu_utils ipu_strategy ipu_device ipu_executor graph_helper)
list(APPEND PADDLE_IPU_SRC ${POPART_CANONICALIZATION_SRC})
set(PADDLE_IPU_LIB "${CMAKE_CURRENT_BINARY_DIR}/libpaddle_ipu.so" CACHE STRING "")
set(PADDLE_IPU_LIB_DIR "${CMAKE_CURRENT_BINARY_DIR}" CACHE STRING "")
set(IPU_BACKEND_SRC
"ipu_device.cc"
"ipu_strategy.cc"
"ipu_executor.cc"
"ipu_compiler.cc"
"ipu_backend.cc"
"ipu_utils.cc"
)
cc_library(ipu_backend SRCS ${IPU_BACKEND_SRC} DEPS popart graph framework_proto enforce graph_helper timer)
cc_library(ipu_info SRCS ipu_info.cc DEPS ipu_backend)
cc_library(paddle_ipu SHARED SRCS ${PADDLE_IPU_SRC} DEPS popart)
add_dependencies(paddle_ipu ipu_backend)
ENDIF()
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/ipu/ipu_backend.h"
#include "paddle/fluid/platform/ipu/ipu_utils.h"
#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
#include "paddle/fluid/platform/device/ipu/ipu_utils.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/ir/graph.h"
@@ -24,170 +24,92 @@ namespace paddle {
namespace platform {
namespace ipu {
std::shared_ptr<IpuBackend> IpuBackend::instance_ = nullptr;
IpuBackend* IpuBackend::GetInstance() {
static IpuBackend instance;
return &instance;
}
IpuBackend::IpuBackend() {
compiler_ = std::make_shared<Compiler>();
compiler_ = std::make_unique<Compiler>();
executor_ = std::make_unique<Executor>();
}
void IpuBackend::Clear() {
IpuBackend::~IpuBackend() {
compiler_.reset();
executor_.reset();
// detach device
if (device_ != nullptr && device_->isAttached()) {
device_->detach();
device_.reset();
device_ = nullptr;
}
}
IpuBackend::~IpuBackend() { Clear(); }
std::shared_ptr<IpuBackend> IpuBackend::GetInstance() {
if (!instance_) {
instance_.reset(new IpuBackend());
}
return instance_;
}
// This API should only be called from Python; it always returns a new object
std::shared_ptr<IpuBackend> IpuBackend::GetNewInstance() {
instance_.reset(new IpuBackend());
return instance_;
}
void IpuBackend::Compile(framework::ir::Graph* graph,
void IpuBackend::Compile(Graph* graph,
const std::vector<std::string>& feed_list,
const std::vector<std::string>& fetch_list) {
VLOG(10) << "enter IpuBackend::Compile";
compiler_->Prepare();
executor_->SetCompilerResources(compiler_->GetResources());
compiler_->InitInputs(graph, feed_list);
compiler_->LowerConstants(graph, scope_);
compiler_->LowerWeights(graph, scope_);
compiler_->LowerBody(graph);
compiler_->InitOutputs(fetch_list);
executor_->SetWeights(compiler_->GetWeights());
if (ipu_strategy_->is_training) {
compiler_->LowerOptimier(graph, scope_);
}
is_compiled_ = true;
// each call to Compile means a new graph
is_prepared_ = false;
VLOG(10) << "leave IpuBackend::Compile";
}
void IpuBackend::Run(const std::vector<const framework::Tensor*>& inputs,
const std::vector<framework::Tensor*>& outputs,
void IpuBackend::Run(const std::vector<const Tensor*>& inputs,
const std::vector<Tensor*>& outputs,
const framework::ExecutionContext& ctx) {
Prepare();
auto inputs_id = compiler_->GetInputs();
auto outputs_id = compiler_->GetOutputs();
executor_->Run(inputs_id, inputs, outputs_id, outputs, ctx);
timer_->Start();
executor_->Run(inputs, outputs, ctx);
timer_->Pause();
VLOG(10) << "[IPU Run]: " << timer_->ElapsedMS() << " (ms)";
}
void IpuBackend::Prepare() {
if (is_prepared_) {
return;
} else {
if (!is_prepared_) {
executor_->Prepare(compiler_->GetModelProto());
timer_.reset(new platform::Timer());
is_prepared_ = true;
}
// convert Model to fp16
if (ipu_strategy_->enable_fp16) {
compiler_->ConvertProtoToFp16();
}
auto proto = compiler_->GetModelProto();
auto tensors = compiler_->GetTensors();
auto outputs = compiler_->GetOutputs();
executor_->Prepare(proto, tensors, outputs, device_);
}
void IpuBackend::SetScope(const framework::Scope& scope) {
void IpuBackend::Detach() { executor_->Detach(); }
void IpuBackend::Reset() {
executor_->Detach();
compiler_.reset();
executor_.reset();
}
void IpuBackend::SetScope(const Scope& scope) {
scope_ = &scope;
executor_->SetScope(&scope);
}
void IpuBackend::SetIpuStrategy(const IpuStrategy& strategy) {
ipu_strategy_ = &strategy;
executor_->SetIpuStrategy(strategy);
compiler_->SetIpuStrategy(strategy);
executor_->SetIpuStrategy(strategy);
}
size_t IpuBackend::GetNumDevices() {
// IpuModel
bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL");
if (ipu_model) return 1;
// Real dev
size_t num_devices =
popart::DeviceManager::createDeviceManager().enumerateDevices().size();
PADDLE_ENFORCE_GT(
num_devices, 0,
platform::errors::Unavailable(
"Do not found any IPU devices, please make "
"sure Poplar sdk is enabled or enable ENV \"POPLAR_IPUMODEL=1\""));
return num_devices;
}
std::vector<int> IpuBackend::GetDeviceIds() {
bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL");
if (ipu_model) {
return {0};
}
std::vector<int> device_ids;
auto devices =
popart::DeviceManager::createDeviceManager().enumerateDevices();
PADDLE_ENFORCE_GT(
devices.size(), 0,
platform::errors::Unavailable("Do not found any IPU devices, please make "
"sure Poplar sdk is enabled."));
for (auto device : devices) {
device_ids.push_back(device->getId());
}
return device_ids;
}
Device IpuBackend::GetDevice(int id) {
bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL");
if (ipu_model) {
std::map<std::string, std::string> deviceOpts{{"numIPUs", "1 "}};
device_ = popart::DeviceManager::createDeviceManager().createIpuModelDevice(
deviceOpts);
Device device(*device_.get());
return device;
}
size_t num_devices = GetNumDevices();
if (id < 0 || id >= num_devices) {
PADDLE_THROW(platform::errors::InvalidArgument(
"device id %d is invalid, number devices is %d", id, num_devices));
}
std::shared_ptr<popart::DeviceInfo> popart_device_info =
popart::DeviceManager::createDeviceManager().getDevice(
popart::SyncPattern::Full, id);
Device device(*popart_device_info.get());
return device;
}
void IpuBackend::AttachDevice(int id) {
// trick here
// the compile-time ipu is not the same as the runtime ipu.
VLOG(10) << "compile ipu id = " << id;
bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL");
if (ipu_model) {
return;
}
device_ = popart::DeviceManager::createDeviceManager().acquireAvailableDevice(
UpperIpuNum());
PADDLE_ENFORCE_NOT_NULL(
device_, platform::errors::Unavailable("Can't attach IPU, ipu_num = %d.",
UpperIpuNum()));
void IpuBackend::SetCustomOps(
const std::vector<IpuCustomOpIdentifier>& custom_ops) {
compiler_->SetCustomOps(custom_ops);
}
bool IpuBackend::DeviceIsAttached() { return device_ != nullptr; }
// num_ipus must be pow(2,n);
int IpuBackend::UpperIpuNum() {
PADDLE_ENFORCE_GT(ipu_strategy_->num_ipus, 0,
platform::errors::Unavailable(
"The ipu num get is wrong, please make sure the "
"sharding or pipline parameter is right."));
int i = 0;
while (std::pow(2, i) < ipu_strategy_->num_ipus) {
i++;
void IpuBackend::SaveModelProto(const std::string& path) {
if (ipu_strategy_->is_training && is_prepared_) {
executor_->SaveModelToHost(path);
} else if (is_compiled_) {
compiler_->SaveModelProtoNoCheck(path);
} else {
LOG(WARNING) << "Model is empty";
}
return std::pow(2, i);
}
} // namespace ipu
......
@@ -14,88 +14,86 @@ limitations under the License. */
#pragma once
#include <cmath>
#include <popart/devicemanager.hpp>
#include <popart/names.hpp>
#include <popart/tensorinfo.hpp>
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device/ipu/ipu_compiler.h"
#include "paddle/fluid/platform/device/ipu/ipu_device.h"
#include "paddle/fluid/platform/device/ipu/ipu_executor.h"
#include "paddle/fluid/platform/device/ipu/ipu_strategy.h"
#include "paddle/fluid/platform/device/ipu/ipu_utils.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/ipu/device.h"
#include "paddle/fluid/platform/ipu/ipu_compiler.h"
#include "paddle/fluid/platform/ipu/ipu_executor.h"
#include "paddle/fluid/platform/ipu/ipu_strategy.h"
#include "paddle/fluid/platform/timer.h"
namespace paddle {
namespace platform {
namespace ipu {
// IpuBackend is the center of paddle-ipu. Its functions include:
// 1. Compile a paddle model to a popart model
// 2. Run the popart model, for inference or training
// 3. Request and release devices
// 4. Other helper functions
class IpuBackend {
// IpuBackend is the center of paddle-ipu. Its functions include:
// 1. Compile a paddle model to a popart model
// 2. Run the popart model, for inference or training
// 3. Request and release devices
// 4. Other helper functions
public:
static IpuBackend *GetInstance();
public:
IpuBackend();
~IpuBackend();
void Clear();
// return the existing instance if there is one, else create and return a new one
static std::shared_ptr<IpuBackend> GetInstance();
// always return a new instance_
static std::shared_ptr<IpuBackend> GetNewInstance();
// What Compile does (via compiler_):
// 1. map paddle ops to popart ops
// 2. construct the popart onnx compute graph
void Compile(framework::ir::Graph *graph,
const std::vector<std::string> &feed_list,
void Compile(Graph *graph, const std::vector<std::string> &feed_list,
const std::vector<std::string> &fetch_list);
// What Run does:
// 1. construct the forward onnx graph
// 2. graph-level optimization
// 3. autodiff
void Run(const std::vector<const framework::Tensor *> &inputs,
const std::vector<framework::Tensor *> &outputs,
void Run(const std::vector<const Tensor *> &inputs,
const std::vector<Tensor *> &outputs,
const framework::ExecutionContext &ctx);
Executor &GetExecutor() { return *executor_; }
// detach IPU manually
void Detach();
// reset manually
// call it before destruction
void Reset();
void SetScope(const framework::Scope &scope);
const framework::Scope *GetScope() { return scope_; }
void SetScope(const Scope &scope);
const Scope *GetScope() { return scope_; }
void SetIpuStrategy(const IpuStrategy &strategy);
const IpuStrategy *GetIpuStrategy() { return ipu_strategy_; }
void SetCustomOps(const std::vector<IpuCustomOpIdentifier> &custom_ops);
// Device
size_t GetNumDevices();
std::vector<int> GetDeviceIds();
Device GetDevice(int id);
void AttachDevice(int id);
bool DeviceIsAttached();
// save compiled model to onnx
void SaveModelProto(const std::string &path);
private:
int UpperIpuNum();
void Prepare();
private:
std::shared_ptr<Compiler> compiler_;
std::unique_ptr<Compiler> compiler_;
std::unique_ptr<Executor> executor_;
std::shared_ptr<popart::DeviceInfo> device_;
bool is_compiled_ = false;
bool is_prepared_ = false;
// not own
const framework::Scope *scope_ = nullptr;
const Scope *scope_ = nullptr;
const IpuStrategy *ipu_strategy_ = nullptr;
private:
static std::shared_ptr<IpuBackend> instance_;
// time record for IpuBackend::Run
std::unique_ptr<platform::Timer> timer_;
DISABLE_COPY_AND_ASSIGN(IpuBackend);
};
} // namespace ipu
......
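With the header in view, the pieces fit together as follows; a minimal usage sketch (illustrative only — the graph, scope, strategy, feed/fetch lists, tensors and execution context are assumed to be prepared by the caller, and RunOnIpuSketch is a hypothetical helper, not part of this commit):
// Hypothetical helper showing the intended IpuBackend lifecycle.
void RunOnIpuSketch(Graph* graph, const Scope& scope,
                    const IpuStrategy& strategy,
                    const std::vector<std::string>& feed_list,
                    const std::vector<std::string>& fetch_list,
                    const std::vector<const Tensor*>& inputs,
                    const std::vector<Tensor*>& outputs,
                    const framework::ExecutionContext& ctx) {
  auto* backend = IpuBackend::GetInstance();
  backend->SetScope(scope);           // not owned; must outlive the backend
  backend->SetIpuStrategy(strategy);  // forwarded to compiler_ and executor_
  backend->Compile(graph, feed_list, fetch_list);
  backend->Run(inputs, outputs, ctx);  // the first Run() triggers Prepare()
  backend->Detach();                   // optionally release the IPU early
}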
@@ -12,17 +12,66 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/ipu/ipu_compiler.h"
#include "paddle/fluid/platform/device/ipu/ipu_compiler.h"
#include <popart/adam.hpp>
#include <popart/adaptive.hpp>
#include <popart/optimizer.hpp>
#include <popart/sgd.hpp>
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/platform/ipu/ipu_utils.h"
#include "paddle/fluid/platform/device/ipu/ipu_utils.h"
namespace paddle {
namespace platform {
namespace ipu {
popart::AdamMode AdamModeFromStr(const std::string& str) {
if (str == "adam") {
return popart::AdamMode::Adam;
} else if (str == "adamax") {
return popart::AdamMode::AdaMax;
} else if (str == "lamb") {
return popart::AdamMode::Lamb;
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Uknown AdamMode: %s, AdamMode must be one of these values: adam, "
"adamax or lamb",
str));
}
}
popart::AdaptiveMode AdaptiveModeFromStr(const std::string& str) {
if (str == "adadelta") {
return popart::AdaptiveMode::AdaDelta;
} else if (str == "adagrad") {
return popart::AdaptiveMode::AdaGrad;
} else if (str == "rmsprop") {
return popart::AdaptiveMode::RMSProp;
} else if (str == "centered_rmsprop") {
return popart::AdaptiveMode::CenteredRMSProp;
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Uknown AdaptiveMode: %s, AdaptiveMode must be one of these values: "
"adadelta, adagrad, rmsprop or centered_rmsprop",
str));
}
}
popart::WeightDecayMode WeightDecayModeFromStr(const std::string& str) {
if (str == "decay") {
return popart::WeightDecayMode::Decay;
} else if (str == "l2_regularization") {
return popart::WeightDecayMode::L2Regularization;
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Uknown WeightDecayMode: %s, WeightDecayMode must be decay or "
"l2_regularization",
str));
}
}
template <typename T>
T GetAttrAllowNull(std::string attr, framework::OpDesc* op_desc) {
T GetAttrAllowNull(std::string attr, OpDesc* op_desc) {
if (op_desc->HasAttr(attr)) {
return BOOST_GET_CONST(T, op_desc->GetAttr(attr));
} else {
@@ -31,8 +80,7 @@ T GetAttrAllowNull(std::string attr, framework::OpDesc* op_desc) {
}
template <typename T>
nonstd::optional<T> GetOptAttrAllowNull(std::string attr,
framework::OpDesc* op_desc) {
nonstd::optional<T> GetOptAttrAllowNull(std::string attr, OpDesc* op_desc) {
if (op_desc->HasAttr(attr)) {
return BOOST_GET_CONST(T, op_desc->GetAttr(attr));
} else {
@@ -40,19 +88,36 @@ nonstd::optional<T> GetOptAttrAllowNull(std::string attr,
}
}
Compiler::Compiler() {
builder_ = popart::Builder::create();
RegisterOpFunc();
template <typename TI, typename TO>
TO GetCastSigAttrAllowNull(std::string attr, OpDesc* op_desc) {
if (op_desc->HasAttr(attr)) {
auto x = BOOST_GET_CONST(TI, op_desc->GetAttr(attr));
return static_cast<TO>(x);
} else {
return {};
}
}
Compiler::Compiler() { RegisterOpFunc(); }
Compiler::~Compiler() {
builder_.reset();
resources_.reset();
}
Compiler::~Compiler() {}
void Compiler::Prepare() {
builder_ = popart::Builder::create();
resources_ = std::make_unique<CompilerResources>();
}
void Compiler::RegisterOpFunc() {
VLOG(10) << "enter Compiler::RegisterOpFunc";
#define INT_VEC std::vector<std::int64_t>
#define INT32_VEC std::vector<std::int32_t>
#define FLOAT_VEC std::vector<float>
#define FLOAT float
#define INT std::int64_t
#define INT32 std::int32_t
#define BOOL bool
#define STRING std::string
#define STRING_VEC std::vector<std::string*>
@@ -60,6 +125,7 @@ void Compiler::RegisterOpFunc() {
#define ARG(Type, Name) , GetAttrAllowNull<Type>(#Name, op_desc)
#define OPT_ARG(Type, Name) , GetOptAttrAllowNull<Type>(#Name, op_desc)
#define SIG_ARG(TI, TO, Name) , GetCastSigAttrAllowNull<TI, TO>(#Name, op_desc)
#define POPART_CONST_ARG(Name) , const PopartConstant& Name
#define HOST_SIDE_CONST_ARG(Name) , const HostSideConstant& Name
#define POPART_ATTRIB_VEC_ARG(Name)
@@ -67,7 +133,7 @@
name_function_ = {
#define OP_DECL(FuncName, OnnxImpl, Args) \
{#FuncName, [&](framework::OpDesc* op_desc) { \
{#FuncName, [&](OpDesc* op_desc) { \
auto op_type = op_desc->Type(); \
VLOG(10) << "build op:" << op_type << " args " << #Args; \
auto inputs = GetOpInputs(op_desc); \
@@ -77,9 +143,12 @@
auto aiOnnxOpset = builder_->aiOnnxOpset11(); \
auto output_ids = OnnxImpl(inputs Args, debug_context); \
SetIpuIndexStage(output_ids, op_desc); \
SetAMPAttributes(output_ids, op_desc); \
SetSerializeAttributes(output_ids, op_desc); \
InsertTensors(output_names, output_ids); \
}}, // NOLINT
#include "paddle/fluid/platform/ipu/supported_ops_autogen.h"
#include "paddle/fluid/platform/device/ipu/supported_ops_autogen.h"
#include "paddle/fluid/platform/device/ipu/supported_ops_custom.h"
};
#undef OP_DECL
@@ -87,146 +156,99 @@
#undef POPART_ATTRIB_VEC_ARG
#undef HOST_SIDE_CONST_ARG
#undef POPART_CONST_ARG
#undef SIG_ARG
#undef OPT_ARG
#undef ARG
#undef NONE
#undef STRING_VEC
#undef STRING
#undef BOOL
#undef INT32
#undef INT
#undef FLOAT
#undef FLOAT_VEC
#undef INT32_VEC
#undef INT_VEC
}
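For reference, a hand expansion of one OP_DECL entry; popart_relu is a hypothetical stand-in for the autogenerated entries in supported_ops_autogen.h, and after macro substitution each entry contributes roughly this lambda to name_function_:
// Hypothetical expansion of: OP_DECL(popart_relu, aiOnnxOpset.relu, NONE)
{"popart_relu", [&](OpDesc* op_desc) {
  auto op_type = op_desc->Type();
  VLOG(10) << "build op:" << op_type << " args " << "NONE";
  auto inputs = GetOpInputs(op_desc);
  auto output_names = GetOpOutputs(op_desc);
  auto debug_context = BuildDebugContext(op_desc);
  auto aiOnnxOpset = builder_->aiOnnxOpset11();
  auto output_ids = aiOnnxOpset.relu(inputs, debug_context);
  SetIpuIndexStage(output_ids, op_desc);
  SetAMPAttributes(output_ids, op_desc);
  SetSerializeAttributes(output_ids, op_desc);
  InsertTensors(output_names, output_ids);
}},  // NOLINT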
void Compiler::LowerBody(const framework::ir::Graph* graph) {
void Compiler::LowerBody(const Graph* graph) {
VLOG(10) << "enter Compiler::LowerBody";
auto nodes = framework::ir::TopologySortOperations(*graph);
for (auto* node : nodes) {
auto* op_desc = node->Op();
auto op_type = op_desc->Type();
VLOG(10) << "node->type: " << op_type;
VLOG(10) << "lowering op: " << op_type;
if (op_type == "popart_constant") {
auto dims =
BOOST_GET_CONST(std::vector<int64_t>, op_desc->GetAttr("dims"));
auto dtype_ = BOOST_GET_CONST(int, op_desc->GetAttr("dtype"));
auto dtype = OnnxDtype2PopartType(dtype_);
popart::TensorInfo tensor_info{dtype, dims};
auto value_attr = op_desc->GetAttr("value");
auto const_data = std::unique_ptr<popart::ConstVoidData>{};
switch (dtype) {
case popart::DataType::FLOAT:
const_data.reset(new popart::ConstVoidData(
BOOST_GET_CONST(std::vector<float>, value_attr).data(),
tensor_info));
break;
case popart::DataType::INT32:
const_data.reset(new popart::ConstVoidData(
BOOST_GET_CONST(std::vector<int>, value_attr).data(),
tensor_info));
break;
case popart::DataType::DOUBLE:
const_data.reset(new popart::ConstVoidData(
BOOST_GET_CONST(std::vector<double>, value_attr).data(),
tensor_info));
break;
case popart::DataType::INT64:
const_data.reset(new popart::ConstVoidData(
BOOST_GET_CONST(std::vector<int64_t>, value_attr).data(),
tensor_info));
break;
default:
PADDLE_THROW(platform::errors::Unimplemented(
"The popart datatype is not supported, popart::DataType is %d",
dtype));
}
popart::TensorId result = builder_->aiOnnxOpset11().constant(*const_data);
SetIpuIndexStage(result, op_desc);
InsertTensors(GetOpOutputs(op_desc), result);
} else if (op_type == "popart_batchnormalization") {
// pass
} else if (op_type == "popart_optimizer") {
// pass
} else if (op_type == "popart_checkpointoutput") {
auto inputs = GetOpInputs(op_desc);
auto outputs = GetOpOutputs(op_desc);
auto num_outputs = outputs.size();
auto epsilon = BOOST_GET_CONST(float, op_desc->GetAttr("epsilon"));
auto momentum = BOOST_GET_CONST(float, op_desc->GetAttr("momentum"));
auto result = builder_->aiOnnxOpset11().batchnormalization(
inputs, num_outputs, epsilon, momentum);
SetIpuIndexStage(result, op_desc);
InsertTensors(GetOpOutputs(op_desc), result);
} else if (op_type == "popart_nllloss") {
auto inputs = GetOpInputs(op_desc);
auto ignoreIndex = BOOST_GET_CONST(int, op_desc->GetAttr("ignoreIndex"));
auto result = builder_->aiGraphcoreOpset1().nllloss(
inputs, popart::ReductionType::NoReduction, ignoreIndex);
SetIpuIndexStage(result, op_desc);
InsertTensors(GetOpOutputs(op_desc), result);
} else if (op_type == "popart_topk") {
auto output_ids = builder_->checkpointOutput(inputs);
InsertTensors(outputs, output_ids);
} else if (op_type == "popart_custom_op") {
auto inputs = GetOpInputs(op_desc);
auto outputs = GetOpOutputs(op_desc);
int64_t axis = BOOST_GET_CONST(int64_t, op_desc->GetAttr("axis"));
int sorted_INT32 = BOOST_GET_CONST(int, op_desc->GetAttr("sorted"));
int64_t sorted = int64_t{sorted_INT32};
auto aiOnnxOpset = builder_->aiOnnxOpset11();
popart::ConvInputs result;
if (inputs.size() == 2) {
VLOG(10)
<< "[Compiler::LowerBody] size of inputs for <popart_topk> is 2";
result = aiOnnxOpset.topk(inputs, axis, sorted);
} else if (inputs.size() == 1) {
VLOG(10)
<< "[Compiler::LowerBody] size of inputs for <popart_topk> is 1";
int64_t k = BOOST_GET_CONST(int64_t, op_desc->GetAttr("k"));
popart::TensorInfo kShape{"INT64", std::vector<int64_t>{1}};
popart::ConstVoidData kData = {&k, kShape};
auto K_t = aiOnnxOpset.constant(kData);
result = aiOnnxOpset.topk({inputs[0], K_t}, axis, sorted);
auto debug_context = BuildDebugContext(op_desc);
auto attributes = std::map<std::string, popart::any>{};
for (auto& attr : op_desc->GetAttrMap()) {
CustomOpAttrVisitor visitor(&attributes, attr.first);
boost::apply_visitor(visitor, attr.second);
}
result[1] = aiOnnxOpset.cast({result[1]}, "INT32");
SetIpuIndexStage(result, op_desc);
VLOG(10) << "[Compiler::LowerBody] output[1]: " << outputs[1];
VLOG(10) << "[Compiler::LowerBody] output[1]: "
<< GetOpOutputs(op_desc)[1] << " -> " << result[1];
tensors_.emplace(GetOpOutputs(op_desc)[1], result[1]); // topk indices
VLOG(10) << "[Compiler::LowerBody] output[0]: " << outputs[0];
VLOG(10) << "[Compiler::LowerBody] output[0]: "
<< GetOpOutputs(op_desc)[0] << " -> " << result[0];
tensors_.emplace(GetOpOutputs(op_desc)[0], result[0]); // topk values
auto __op_type =
BOOST_GET_CONST(std::string, op_desc->GetAttr("__op_type"));
VLOG(10) << "Build graph from custom op: " << __op_type;
auto it = custom_ops_.find(__op_type);
auto output_ids =
builder_->customOp(it->second.popart_op, it->second.popart_op.version,
inputs, outputs.size(), attributes, debug_context);
SetIpuIndexStage(output_ids, op_desc);
InsertTensors(outputs, output_ids);
} else if (op_type == "popart_printtensor") {
auto inputs = GetOpInputs(op_desc);
auto outputs = GetOpOutputs(op_desc);
auto debug_context = BuildDebugContext(op_desc);
auto print_gradient =
BOOST_GET_CONST(int64_t, op_desc->GetAttr("print_gradient"));
auto title = BOOST_GET_CONST(std::string, op_desc->GetAttr("title"));
auto output_ids = builder_->aiGraphcoreOpset1().printtensor(
inputs, print_gradient, debug_context, title);
SetIpuIndexStage(output_ids, op_desc);
InsertTensors(outputs, output_ids);
} else {
auto itr = name_function_.find(op_type);
if (itr != name_function_.end()) {
itr->second(node->Op());
} else {
PADDLE_THROW(platform::errors::NotFound(
"Op %s is not registered in popart canonicalization", op_type));
"%s is not registered, please check for unsupported operators for "
"running on IPU",
op_type));
}
}
}
VLOG(10) << "leave Compiler::LowerBody";
}
void Compiler::InitInputs(framework::ir::Graph* graph,
void Compiler::InitInputs(Graph* graph,
const std::vector<std::string>& feed_list) {
for (const auto& feed_name : feed_list) {
feed_list_.push_back(feed_name);
for (const framework::ir::Node* n : graph->Nodes()) {
for (const Node* n : graph->Nodes()) {
if (n->IsVar()) {
auto* var_desc = n->Var();
if (feed_name == var_desc->Name()) {
VLOG(10) << "feed_name= " << var_desc->Name();
auto data_type = VarType2PopartType(var_desc->GetDataType());
if (ipu_strategy_->enable_fp16) {
data_type = popart::DataType::FLOAT16;
}
popart::TensorInfo input_info{data_type, var_desc->GetShape()};
VLOG(10) << "popart input_info = " << input_info;
popart::TensorId tensor_id =
builder_->addInputTensor(input_info, feed_name);
VLOG(10) << "popart input tensor id = " << tensor_id;
inputs_.push_back(tensor_id);
tensors_.emplace(var_desc->Name(), tensor_id);
resources_->inputs.push_back(tensor_id);
resources_->tensors.emplace(var_desc->Name(), tensor_id);
}
}
}
@@ -236,20 +258,58 @@ void Compiler::InitInputs(framework::ir::Graph* graph,
void Compiler::InitOutputs(const std::vector<std::string>& fetch_list) {
for (const auto& fetch_name : fetch_list) {
fetch_list_.push_back(fetch_name);
auto tensor = tensors_.find(fetch_name);
PADDLE_ENFORCE_NE(tensor, tensors_.end(),
platform::errors::NotFound(
"output tensor %s does not exist.", fetch_name));
auto tensor = resources_->tensors.find(fetch_name);
PADDLE_ENFORCE_NE(
tensor, resources_->tensors.end(),
platform::errors::NotFound(
"Output tensor %s is not found, please check the model.",
fetch_name));
VLOG(10) << "fetch_name= " << fetch_name;
VLOG(10) << "popart output tensor id = " << tensor->second;
builder_->addOutputTensor(tensor->second);
outputs_.push_back(tensor->second);
resources_->outputs.push_back(tensor->second);
}
}
void Compiler::LowerConstants(const Graph* graph, const Scope* scope) {
auto& kid_scope = scope->NewScope();
VLOG(10) << "enter Compiler::LowerConstants";
for (auto* node : graph->Nodes()) {
if (!node->IsOp()) {
continue;
}
auto* op_desc = node->Op();
auto op_type = op_desc->Type();
if (op_type == "popart_constant") {
auto shape =
BOOST_GET_CONST(std::vector<int64_t>, op_desc->GetAttr("dims"));
auto dtype_ = BOOST_GET_CONST(int, op_desc->GetAttr("dtype"));
auto dtype = PopartType2VarType(OnnxDtype2PopartType(dtype_));
auto tensor_name = op_desc->Output("__outputs__")[0];
auto* var = kid_scope.Var(tensor_name);
VLOG(10) << "lowering constant: " << tensor_name;
auto* tensor = var->GetMutable<framework::LoDTensor>();
ConstantOpAttrVisitor visitor(tensor, dtype);
auto value = op_desc->GetAttr("value");
boost::apply_visitor(visitor, value);
auto ddim = framework::make_ddim(shape);
tensor->Resize(ddim);
auto const_data = std::unique_ptr<popart::ConstVoidData>();
popart::TensorInfo tensor_info(VarType2PopartType(tensor->type()), shape);
const_data.reset(new popart::ConstVoidData(tensor->data(), tensor_info));
popart::TensorId result = builder_->aiOnnxOpset11().constant(*const_data);
SetIpuIndexStage(result, op_desc);
resources_->tensors.emplace(tensor_name, result);
}
}
VLOG(10) << "leave Compiler::LowerConstants";
}
void Compiler::LowerWeights(const framework::ir::Graph* graph,
const framework::Scope* scope_) {
PADDLE_ENFORCE_NOT_NULL(scope_,
void Compiler::LowerWeights(const Graph* graph, const Scope* scope) {
VLOG(10) << "enter Compiler::LowerWeights";
PADDLE_ENFORCE_NOT_NULL(scope,
platform::errors::PreconditionNotMet(
"You should call set_scope before LowerWeights"));
// at this step, the graph doesn't contains optimizer related states
@@ -257,12 +317,12 @@ void Compiler::LowerWeights(const framework::ir::Graph* graph,
if (node->IsVar() && !node->IsCtrlVar() && node->Var()) {
if (node->Var()->Persistable() && node->inputs.empty()) {
auto var_name = node->Var()->Name();
// workaround: https://github.com/graphcore/Paddle/issues/151
if (tensors_.count(var_name) != 0) {
if (resources_->tensors.count(var_name) != 0) {
continue;
}
VLOG(10) << "lowering weight: " << var_name;
auto var = scope_->FindVar(var_name);
auto var = scope->FindVar(var_name);
if (var) {
auto tensor = var->Get<framework::LoDTensor>();
auto dtype = VarType2PopartType(tensor.type());
@@ -274,12 +334,113 @@ void Compiler::LowerWeights(const framework::ir::Graph* graph,
popart::ConstVoidData const_data{tensor.data(), tensor_info};
popart::TensorId result =
builder_->addInitializedInputTensor(const_data, var_name);
tensors_.emplace(var_name, result);
weights_.push_back(result);
resources_->tensors.emplace(var_name, result);
resources_->weights.push_back(result);
}
}
}
}
VLOG(10) << "leave Compiler::LowerWeights";
}
void Compiler::LowerOptimier(const Graph* graph, const Scope* scope) {
for (auto* node : graph->Nodes()) {
if (!node->IsOp()) {
continue;
}
auto* op_desc = node->Op();
auto op_type = op_desc->Type();
if (op_type == "popart_optimizer") {
auto raw_type =
BOOST_GET_CONST(std::string, op_desc->GetAttr("raw_type"));
resources_->optimizer_type = raw_type;
auto loss_var =
BOOST_GET_CONST(std::string, op_desc->GetAttr("loss_var"));
resources_->loss_var = resources_->tensors[loss_var];
resources_->with_lr_sched =
BOOST_GET_CONST(bool, op_desc->GetAttr("with_lr_sched"));
if (op_desc->HasAttr("lr_var")) {
auto lr_var = BOOST_GET_CONST(std::string, op_desc->GetAttr("lr_var"));
resources_->lr_var = lr_var;
resources_->lr = GetSingleVarFromScope<float>(scope, lr_var);
} else {
// adadelta has no lr
resources_->lr = 0.01f;
resources_->with_lr_sched = false;
}
VLOG(10) << "Set initial lr: " << resources_->lr;
auto loss_scaling = ipu_strategy_->loss_scaling;
auto type = BOOST_GET_CONST(std::string, op_desc->GetAttr("type"));
if (type == "sgd") {
auto weight_decay =
BOOST_GET_CONST(float, op_desc->GetAttr("weight_decay"));
auto momentum = BOOST_GET_CONST(float, op_desc->GetAttr("momentum"));
resources_->optimizer_fn = [=](float lr) {
return std::make_unique<popart::SGD>(
popart::OptimizerValue(lr, false),
popart::OptimizerValue(weight_decay, true),
popart::OptimizerValue(momentum, true),
popart::SGD::getUnsetDampening(),
popart::SGD::getUnsetVelocityScaling(),
popart::OptimizerValue(loss_scaling, true));
};
} else if (type == "adam") {
auto weight_decay =
BOOST_GET_CONST(float, op_desc->GetAttr("weight_decay"));
auto beta1 = BOOST_GET_CONST(float, op_desc->GetAttr("beta1"));
auto beta2 = BOOST_GET_CONST(float, op_desc->GetAttr("beta2"));
auto eps = BOOST_GET_CONST(float, op_desc->GetAttr("eps"));
auto mwn = ipu_strategy_->max_weight_norm;
VLOG(10) << "set max_weight_norm: " << mwn;
auto adam_mode_ =
BOOST_GET_CONST(std::string, op_desc->GetAttr("adam_mode"));
auto adam_mode = AdamModeFromStr(adam_mode_);
auto weight_decay_mode_ =
BOOST_GET_CONST(std::string, op_desc->GetAttr("weight_decay_mode"));
auto weight_decay_mode = WeightDecayModeFromStr(weight_decay_mode_);
resources_->optimizer_fn = [=](float lr) {
return std::make_unique<popart::Adam>(
popart::OptimizerValue(lr, false),
popart::OptimizerValue(weight_decay, true),
popart::OptimizerValue(beta1, true),
popart::OptimizerValue(beta2, true),
popart::OptimizerValue(eps, true),
popart::OptimizerValue(loss_scaling, true),
popart::OptimizerValue(mwn, true), adam_mode, weight_decay_mode,
popart::DataType::UNDEFINED, popart::DataType::FLOAT,
popart::DataType::FLOAT);
};
} else if (type == "adaptive") {
auto alpha = BOOST_GET_CONST(float, op_desc->GetAttr("alpha"));
auto momentum = BOOST_GET_CONST(float, op_desc->GetAttr("momentum"));
auto eps = BOOST_GET_CONST(float, op_desc->GetAttr("eps"));
auto weight_decay =
BOOST_GET_CONST(float, op_desc->GetAttr("weight_decay"));
auto adaptive_mode_ =
BOOST_GET_CONST(std::string, op_desc->GetAttr("adaptive_mode"));
auto adaptive_mode = AdaptiveModeFromStr(adaptive_mode_);
auto weight_decay_mode_ =
BOOST_GET_CONST(std::string, op_desc->GetAttr("weight_decay_mode"));
auto weight_decay_mode = WeightDecayModeFromStr(weight_decay_mode_);
resources_->optimizer_fn = [=](float lr) {
return std::make_unique<popart::Adaptive>(
popart::OptimizerValue(lr, false),
popart::OptimizerValue(weight_decay, true),
popart::OptimizerValue(alpha, true),
popart::OptimizerValue(momentum, true),
popart::OptimizerValue(eps, true),
popart::OptimizerValue(loss_scaling, true), adaptive_mode,
weight_decay_mode, popart::DataType::UNDEFINED,
popart::DataType::FLOAT, popart::DataType::FLOAT,
popart::DataType::FLOAT);
};
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"optimizer %s is not implemented", type));
}
}
}
}
void Compiler::InsertTensors(const std::vector<std::string>& output_names,
@@ -288,7 +449,7 @@ void Compiler::InsertTensors(const std::vector<std::string>& output_names,
platform::errors::Fatal("InsertTensors size mismatch"));
for (int i = 0; i < tensor_ids.size(); i++) {
std::string tensor_id = tensor_ids[i];
tensors_.emplace(output_names[i], tensor_ids[i]);
resources_->tensors.emplace(output_names[i], tensor_ids[i]);
}
}
@@ -296,11 +457,11 @@ void Compiler::InsertTensors(const std::vector<std::string>& output_names,
const std::string& tensor_id) {
PADDLE_ENFORCE_EQ(output_names.size(), 1,
platform::errors::Fatal("InsertTensors size mismatch"));
tensors_.emplace(output_names[0], tensor_id);
resources_->tensors.emplace(output_names[0], tensor_id);
}
void Compiler::SetIpuIndexStage(const std::vector<std::string>& tensor_ids,
const framework::OpDesc* op_desc) {
const OpDesc* op_desc) {
VLOG(10) << "enter Compiler::SetIpuIndexStage";
auto tensor_ids_set =
std::set<std::string>(tensor_ids.begin(), tensor_ids.end());
@@ -321,7 +482,7 @@ void Compiler::SetIpuIndexStage(const std::vector<std::string>& tensor_ids,
}
void Compiler::SetIpuIndexStage(const std::string& tensor_id,
const framework::OpDesc* op_desc) {
const OpDesc* op_desc) {
VLOG(10) << "enter Compiler::SetIpuIndexStage";
if (op_desc->HasAttr(sIpuIndexAttr)) {
@@ -339,20 +500,73 @@ void Compiler::SetIpuIndexStage(const std::string& tensor_id,
VLOG(10) << "leave Compiler::SetIpuIndexStage";
}
std::vector<popart::TensorId>& Compiler::GetWeights() { return weights_; }
void Compiler::SetAMPAttributes(const std::vector<std::string>& tensor_ids,
const OpDesc* op_desc) {
if (op_desc->Type() == "popart_matmul") {
for (const auto& tensor_id : tensor_ids) {
SetAMPAttributes(tensor_id, op_desc);
}
}
}
void Compiler::SetAMPAttributes(const std::string& tensor_id,
const OpDesc* op_desc) {
VLOG(10) << "enter Compiler::SetAMPAttributes";
if (op_desc->Type() == "popart_matmul") {
auto amp = ipu_strategy_->available_memory_proportion;
if (amp > 0.0f && amp <= 1.0) {
builder_->setAvailableMemoryProportion(tensor_id, amp);
}
}
VLOG(10) << "leave Compiler::SetAMPAttributes";
}
void Compiler::SetSerializeAttributes(
const std::vector<std::string>& tensor_ids, const OpDesc* op_desc) {
VLOG(10) << "enter Compiler::SetSerializeAttributes";
auto tensor_ids_set =
std::set<std::string>(tensor_ids.begin(), tensor_ids.end());
if (op_desc->Type() == "popart_matmul") {
if (op_desc->HasAttr(sMatmulSerializeFactor)) {
auto factor =
BOOST_GET_CONST(int, op_desc->GetAttr(sMatmulSerializeFactor));
std::string mode = "output_channels";
if (op_desc->HasAttr(sMatmulSerializeMode)) {
mode = BOOST_GET_CONST(std::string,
op_desc->GetAttr(sMatmulSerializeMode));
}
builder_->setSerializeMatMul(tensor_ids_set, mode, (int64_t)factor, true);
}
}
VLOG(10) << "leave Compiler::SetSerializeAttributes";
}
void Compiler::SetSerializeAttributes(const std::string& tensor_id,
const OpDesc* op_desc) {
std::vector<std::string> tensor_ids = {tensor_id};
SetSerializeAttributes(tensor_ids, op_desc);
}
// convertFloatsToHalfs
void Compiler::ConvertProtoToFp16() {
void Compiler::SetCustomOps(
const std::vector<IpuCustomOpIdentifier>& custom_ops) {
for (auto x : custom_ops) {
custom_ops_.emplace(x.paddle_op, x);
}
}
std::string Compiler::GetFP16ModelProto() {
popart::GraphTransformer graph_transformer(builder_->getModelProto());
graph_transformer.convertFloatsToHalfs();
converted_proto_ = graph_transformer.getModelProto();
return graph_transformer.getModelProto();
}
std::string Compiler::GetModelProto() {
if (converted_proto_.length()) {
return converted_proto_;
if (ipu_strategy_->enable_fp16) {
return GetFP16ModelProto();
} else {
return builder_->getModelProto();
}
return builder_->getModelProto();
}
void Compiler::SaveModelProto(const std::string& path) {
@@ -366,12 +580,12 @@ void Compiler::SaveModelProtoNoCheck(const std::string& path) {
onnxfile.close();
}
std::vector<std::string> Compiler::GetOpInputs(const framework::OpDesc* op) {
std::vector<std::string> Compiler::GetOpInputs(const OpDesc* op) {
auto ins = op->Input("__inputs__");
std::vector<std::string> inputs;
for (const auto& in : ins) {
if (tensors_.find(in) != tensors_.end()) {
inputs.push_back(tensors_[in]);
if (resources_->tensors.find(in) != resources_->tensors.end()) {
inputs.push_back(resources_->tensors[in]);
} else {
inputs.push_back(in);
}
@@ -379,12 +593,11 @@ std::vector<std::string> Compiler::GetOpInputs(const framework::OpDesc* op) {
return inputs;
}
const std::vector<std::string>& Compiler::GetOpOutputs(
const framework::OpDesc* op) {
const std::vector<std::string>& Compiler::GetOpOutputs(const OpDesc* op) {
return op->Output("__outputs__");
}
popart::DebugContext Compiler::BuildDebugContext(const framework::OpDesc* op) {
popart::DebugContext Compiler::BuildDebugContext(const OpDesc* op) {
auto op_identify_id =
BOOST_GET_CONST(std::string, op->GetAttr(sOpIdentifyIdAttr));
VLOG(10) << "op_identify_id of op: " << op->Type() << " is "
......
@@ -16,76 +16,119 @@
#include <popart/builder.hpp>
#include <popart/graphtransformer.hpp>
#include <popart/optimizer.hpp>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/ipu/common.h"
#include "paddle/fluid/platform/ipu/ipu_strategy.h"
#include "paddle/fluid/platform/device/ipu/ipu_names.h"
#include "paddle/fluid/platform/device/ipu/ipu_strategy.h"
#include "paddle/fluid/platform/device/ipu/ipu_utils.h"
namespace paddle {
namespace platform {
namespace ipu {
struct CompilerResources {
// popart input tensor_ids
std::vector<popart::TensorId> inputs;
// popart output tensor_ids
std::vector<popart::TensorId> outputs;
// <paddle_var_name, popart_tensor_ids>
std::map<std::string, popart::TensorId> tensors;
// popart_weight_ids
std::vector<popart::TensorId> weights;
// popart loss tensor_id
popart::TensorId loss_var;
// paddle lr var_name
std::string lr_var;
// lr value
float lr;
// flag for lr is constant or scheduling
bool with_lr_sched = false;
// paddle optimizer type, eg: momentum, lamb
std::string optimizer_type;
using OptimizerFn =
std::function<std::unique_ptr<popart::Optimizer>(float lr)>;
OptimizerFn optimizer_fn;
public:
popart::Optimizer *Optimizer() { return optimizer.get(); }
popart::Optimizer *NewOptimizer() {
optimizer = optimizer_fn(lr);
return optimizer.get();
}
popart::Optimizer *UpdateOptimizer(float lr_new) {
optimizer = optimizer_fn(lr_new);
return optimizer.get();
}
private:
std::unique_ptr<popart::Optimizer> optimizer;
};
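The optimizer_fn indirection decouples describing the optimizer (at lowering time) from building it (at run time), so the executor can rebuild it whenever the learning rate changes. A sketch of the intended flow, with placeholder hyper-parameter values mirroring the SGD case in LowerOptimier:
// Illustrative only; hyper-parameter values are placeholders.
resources->lr = 0.01f;
resources->optimizer_fn = [](float lr) {
  return std::make_unique<popart::SGD>(
      popart::OptimizerValue(lr, false),    // lr is non-const so it can change
      popart::OptimizerValue(0.0f, true),   // weight decay
      popart::OptimizerValue(0.9f, true),   // momentum
      popart::SGD::getUnsetDampening(),
      popart::SGD::getUnsetVelocityScaling(),
      popart::OptimizerValue(1.0f, true));  // loss scaling
};
auto* opt = resources->NewOptimizer();     // build with the initial lr
opt = resources->UpdateOptimizer(0.001f);  // rebuild when the lr schedule steps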
class Compiler {
public:
Compiler();
~Compiler();
void RegisterOpFunc();
void LowerBody(const framework::ir::Graph *graph);
void InitInputs(framework::ir::Graph *graph,
const std::vector<std::string> &feed_list);
void Prepare();
void LowerBody(const Graph *graph);
void InitInputs(Graph *graph, const std::vector<std::string> &feed_list);
void InitOutputs(const std::vector<std::string> &fetch_list);
void LowerWeights(const framework::ir::Graph *graph,
const framework::Scope *scope_);
void LowerConstants(const Graph *graph, const Scope *scope);
void LowerWeights(const Graph *graph, const Scope *scope);
void LowerOptimier(const Graph *graph, const Scope *scope);
void InsertTensors(const std::vector<std::string> &output_names,
const std::vector<std::string> &tensor_ids);
void InsertTensors(const std::vector<std::string> &output_names,
const std::string &tensor_id);
void SetIpuIndexStage(const std::vector<std::string> &tensor_ids,
const framework::OpDesc *op_desc);
void SetIpuIndexStage(const std::string &tensor_id,
const framework::OpDesc *op_desc);
std::vector<popart::TensorId> GetInputs() { return inputs_; }
std::vector<popart::TensorId> GetOutputs() { return outputs_; }
std::map<std::string, popart::TensorId> GetTensors() { return tensors_; }
std::vector<popart::TensorId> &GetWeights();
const OpDesc *op_desc);
void SetIpuIndexStage(const std::string &tensor_id, const OpDesc *op_desc);
void SetAMPAttributes(const std::vector<std::string> &tensor_ids,
const OpDesc *op_desc);
void SetAMPAttributes(const std::string &tensor_id, const OpDesc *op_desc);
void SetSerializeAttributes(const std::vector<std::string> &tensor_ids,
const OpDesc *op_desc);
void SetSerializeAttributes(const std::string &tensor_id,
const OpDesc *op_desc);
std::string GetModelProto();
void SetIpuStrategy(const IpuStrategy &strategy) {
ipu_strategy_ = &strategy;
};
}
void SetCustomOps(const std::vector<IpuCustomOpIdentifier> &custom_ops);
CompilerResources *GetResources() { return resources_.get(); }
std::string GetModelProto();
std::string GetFP16ModelProto();
void SaveModelProto(const std::string &path);
void SaveModelProtoNoCheck(const std::string &path);
void ConvertProtoToFp16();
private:
std::vector<std::string> GetOpInputs(const framework::OpDesc *op);
const std::vector<std::string> &GetOpOutputs(const framework::OpDesc *op);
popart::DebugContext BuildDebugContext(const framework::OpDesc *op);
std::vector<std::string> GetOpInputs(const OpDesc *op);
const std::vector<std::string> &GetOpOutputs(const OpDesc *op);
popart::DebugContext BuildDebugContext(const OpDesc *op);
private:
std::unique_ptr<popart::Builder> builder_;
std::unique_ptr<CompilerResources> resources_;
using OpFunc = std::function<void(framework::OpDesc *op_desc)>;
using OpFunc = std::function<void(OpDesc *op_desc)>;
std::unordered_map<std::string, OpFunc> name_function_;
// stateful variable
std::map<std::string, popart::TensorId> tensors_;
// feed_list_ & fetch_list save paddle tensor id
std::vector<std::string> feed_list_;
std::vector<std::string> fetch_list_;
// inputs_ & outputs_ save popart tensor id
std::vector<popart::TensorId> inputs_;
std::vector<popart::TensorId> outputs_;
// weights info map
std::vector<popart::TensorId> weights_;
std::string converted_proto_ = "";
const IpuStrategy *ipu_strategy_ = nullptr;
std::map<std::string, IpuCustomOpIdentifier> custom_ops_;
};
} // namespace ipu
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -12,26 +12,42 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/device/ipu/device.h"
#include "paddle/fluid/platform/device/ipu/ipu_device.h"
#include "paddle/fluid/platform/device/ipu/ipu_utils.h"
namespace paddle {
namespace platform {
namespace ipu {
Device::Device(const popart::DeviceInfo& device_info)
: id_(device_info.getId()), is_attached_(device_info.isAttached()) {
popart::DeviceType popart_device_type = device_info.getType();
switch (popart_device_type) {
case popart::DeviceType::IpuModel:
device_type_ = DeviceType::IpuModel;
break;
case popart::DeviceType::Ipu:
device_type_ = DeviceType::Ipu;
break;
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"popart::DeviceType:Unsupported type %d", popart_device_type));
int GetNumDevices() {
bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL");
if (ipu_model) {
return 1;
}
int num_devices =
popart::DeviceManager::createDeviceManager().enumerateDevices().size();
PADDLE_ENFORCE_GT(num_devices, 0, platform::errors::Unavailable(
"Do not found any IPU devices, please "
"make sure Poplar sdk is enabled"));
return num_devices;
}
std::vector<int> GetDeviceIds() {
bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL");
if (ipu_model) {
return {0};
}
std::vector<int> device_ids;
auto devices =
popart::DeviceManager::createDeviceManager().enumerateDevices();
PADDLE_ENFORCE_GT(
devices.size(), 0,
platform::errors::Unavailable("Do not found any IPU devices, please make "
"sure Poplar sdk is enabled."));
for (auto device : devices) {
device_ids.push_back(device->getId());
}
return device_ids;
}
} // namespace ipu
......
@@ -21,23 +21,11 @@ namespace paddle {
namespace platform {
namespace ipu {
enum class DeviceType { IpuModel = 0, Cpu, Ipu, OfflineIpu, Sim };
class Device {
public:
Device() {}
explicit Device(const popart::DeviceInfo& device_info);
int getId() const { return id_; }
bool isAttached() const { return is_attached_; }
DeviceType getType() const { return device_type_; }
private:
int id_;
bool is_attached_;
DeviceType device_type_;
/* TODO:: Add more elements in the future */
};
// get the number of all available IPUs
int GetNumDevices();
// get the device ids of all available IPUs
std::vector<int> GetDeviceIds();
} // namespace ipu
} // namespace platform
......
@@ -10,23 +10,18 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/device/ipu/ipu_info.h"
#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
#include "paddle/fluid/platform/device/ipu/ipu_device.h"
namespace paddle {
namespace platform {
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedIPUDevices() {
std::shared_ptr<platform::ipu::IpuBackend> ipu_backend =
platform::ipu::IpuBackend::GetInstance();
return ipu_backend->GetDeviceIds();
return platform::ipu::GetDeviceIds();
}
//! Get the total number of IPU devices in system.
int GetIPUDeviceCount() {
std::shared_ptr<platform::ipu::IpuBackend> ipu_backend =
platform::ipu::IpuBackend::GetInstance();
return ipu_backend->GetNumDevices();
}
int GetIPUDeviceCount() { return platform::ipu::GetNumDevices(); }
} // namespace platform
} // namespace paddle
@@ -17,8 +17,10 @@ limitations under the License. */
namespace paddle {
namespace platform {
std::vector<int> GetSelectedIPUDevices();
int GetIPUDeviceCount();
} // namespace platform
} // namespace paddle
#endif
@@ -22,6 +22,8 @@ namespace ipu {
static constexpr const char *sIpuIndexAttr = "ipu_index";
static constexpr const char *sIpuStageAttr = "ipu_stage";
static constexpr const char *sMatmulSerializeFactor = "serialize_factor";
static constexpr const char *sMatmulSerializeMode = "serialize_mode";
static constexpr const char *sOpIdentifyIdAttr = "op_identify_id";
static constexpr const char *sDebugInfoId = "__debug_info_id";
@@ -29,6 +31,7 @@ static constexpr const char *sBeta1 = "beta1";
static constexpr const char *sBeta2 = "beta2";
static constexpr const char *sBeta1Pow = "Beta1Pow";
static constexpr const char *sBeta2Pow = "Beta2Pow";
static constexpr const char *sLossScaling = "LossScaling";
} // namespace ipu
} // namespace platform
......
@@ -12,10 +12,27 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/ipu/ipu_strategy.h"
#include "paddle/fluid/platform/device/ipu/ipu_strategy.h"
#include <glog/logging.h>
namespace paddle {
namespace platform {
namespace ipu {} // namespace ipu
namespace ipu {
void IpuStrategy::enablePattern(const std::string& t) {
VLOG(10) << "enable popart pattern: " << t;
popart_patterns.enablePattern(t, true);
}
void IpuStrategy::disablePattern(const std::string& t) {
VLOG(10) << "disable popart pattern: " << t;
popart_patterns.enablePattern(t, false);
}
bool IpuStrategy::isPatternEnabled(const std::string& t) {
return popart_patterns.isPatternEnabled(t);
}
} // namespace ipu
} // namespace platform
} // namespace paddle
@@ -14,24 +14,86 @@ limitations under the License. */
#pragma once
#include <popart/op.hpp>
#include <popart/sessionoptions.hpp>
#include <popart/tensorlocation.hpp>
#include "popart/patterns/patterns.hpp"
namespace paddle {
namespace platform {
namespace ipu {
using VirtualGraphMode = popart::VirtualGraphMode;
using RecomputationType = popart::RecomputationType;
struct IpuStrategy {
IpuStrategy() {
// always store optimizer state off-chip and enable replicated tensor
// sharding (RTS) to save memory
auto storage = popart::TensorLocation(popart::TensorStorage::OffChip,
popart::ReplicatedTensorSharding::On);
popart_options.optimizerStateTensorLocationSettings =
popart::TensorLocationSettings(storage);
// divide by both the accumulationFactor and the replicatedGraphCount
// after the all-reduce
popart_options.accumulationAndReplicationReductionType =
popart::ReductionType::Mean;
popart_options.meanAccumulationAndReplicationReductionStrategy =
popart::MeanReductionStrategy::Post;
popart_options.enableFloatingPointChecks = false;
// A directory for log traces to be written into.
popart_options.logDir = "popart_log";
}
~IpuStrategy() {}
// Total number of IPUs needed: replica_num * ipus_per_replica
int num_ipus = 1;
// batches per step
int batches_per_step = 1;
int batch_size = 1;
// micro batch-size
int micro_batch_size = 1;
// training flag, true for training
bool is_training = true;
// save the onnx model lowered from the paddle program description
bool save_init_onnx = false;
bool save_last_onnx = true;
popart::SessionOptions popart_options_;
// save the trained model
bool save_onnx_checkpoint = false;
// save paddle model per n steps
int save_per_n_step = 1;
// average sharding, used for debugging
bool need_avg_shard = false;
// flag for fp16, true for pure fp16
bool enable_fp16 = false;
// available memory proportion, 0.0f for disable
float available_memory_proportion = 0.0f;
// loss scaling, currently we can't get loss scaling from
// optimizer_extract_pass, so we have to set it here
float loss_scaling = 1.0f;
// defaultMaxWeightNorm for adam optimizer
float max_weight_norm = 65504.0f;
// popart session option
popart::SessionOptions popart_options;
popart::Patterns popart_patterns;
public:
void enablePattern(const std::string& t);
void disablePattern(const std::string& t);
bool isPatternEnabled(const std::string& t);
};
} // namespace ipu
......
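A configuration sketch showing how the expanded strategy might be filled in before being handed to the backend; all field values, and the pattern name, are placeholders:
// Illustrative only; values and the pattern name are placeholders.
IpuStrategy strategy;
strategy.num_ipus = 2;                        // total needed; see RequestIpus
strategy.micro_batch_size = 4;
strategy.batches_per_step = 16;
strategy.is_training = true;
strategy.enable_fp16 = false;
strategy.available_memory_proportion = 0.3f;  // 0.0f disables the AMP hint
strategy.loss_scaling = 8.0f;
strategy.disablePattern("SomePattern");       // toggle a popart pattern by name
IpuBackend::GetInstance()->SetIpuStrategy(strategy);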
@@ -12,22 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/ipu/ipu_utils.h"
#include "paddle/fluid/platform/device/ipu/ipu_utils.h"
#include <cmath>
namespace paddle {
namespace platform {
namespace ipu {
void* PaddleIArray::data() { return tensor_->data(); }
void* PaddleIArray::data() { return tensor_.data(); }
popart::DataType PaddleIArray::dataType() const {
return VarType2PopartType(tensor_->type());
return VarType2PopartType(tensor_.type());
}
std::size_t PaddleIArray::rank() const { return tensor_->dims().size(); }
std::size_t PaddleIArray::rank() const { return tensor_.dims().size(); }
int64_t PaddleIArray::dim(size_t index) const {
return tensor_->dims().at(index);
return tensor_.dims().at(index);
}
std::size_t PaddleIArray::nelms() const {
@@ -150,6 +151,32 @@ bool GetBoolEnv(std::string str) {
}
}
std::vector<std::pair<std::string, std::string>> GetOptPrePostfix(
const std::string& opt_type) {
// format: {popart_tensor_id, paddle_tensor_id}, ...
std::vector<std::pair<std::string, std::string>> pre_post_fix;
if (opt_type == "adam" || opt_type == "lamb") {
pre_post_fix.push_back(std::make_pair("", ""));
pre_post_fix.push_back(std::make_pair("Accl1___", "_moment1_0"));
pre_post_fix.push_back(std::make_pair("Accl2___", "_moment2_0"));
pre_post_fix.push_back(std::make_pair("Step___", "_beta1_pow_acc_0"));
} else if (opt_type == "sgd" || opt_type == "momentum") {
// sgd
pre_post_fix.push_back(std::make_pair("", ""));
} else {
pre_post_fix.push_back(std::make_pair("", ""));
//
}
return pre_post_fix;
}
int RequestIpus(const int num_ipus) {
// num_ipus must be pow(2, n);
return std::pow(2, ceil(log2(num_ipus)));
}
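Two behaviors worth pinning down with a sketch: RequestIpus rounds any request up to the next power of two, and GetOptPrePostfix pairs a popart prefix with a paddle suffix around a shared weight name (the weight name fc.w_0 below is hypothetical):
// RequestIpus(1) == 1, RequestIpus(3) == 4, RequestIpus(5) == 8
int acquired = RequestIpus(3);  // -> 4
// For "adam"/"lamb": popart "Accl1___fc.w_0" pairs with paddle
// "fc.w_0_moment1_0" (prefix on the popart id, postfix on the paddle name).
for (const auto& fix : GetOptPrePostfix("adam")) {
  auto popart_id = fix.first + "fc.w_0";
  auto paddle_var = "fc.w_0" + fix.second;
}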
} // namespace ipu
} // namespace platform
} // namespace paddle
@@ -17,14 +17,27 @@ limitations under the License. */
#include <popart/ndarraywrapper.hpp>
#include <popart/tensordata.hpp>
#include <popart/tensorinfo.hpp>
#include <popart/vendored/any.hpp>
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace platform {
namespace ipu {
using float16 = platform::float16;
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using Scope = framework::Scope;
using OpDesc = framework::OpDesc;
using Graph = framework::ir::Graph;
using Node = framework::ir::Node;
using BlockDesc = framework::BlockDesc;
// onnx dtype
// https://github.com/onnx/onnx/blob/master/onnx/onnx-ml.proto3
enum ONNXDataType : int {
@@ -49,14 +62,15 @@ enum ONNXDataType : int {
class PaddleIArray final : public popart::IArray {
public:
explicit PaddleIArray(framework::Tensor *tensor) : tensor_(tensor) {
explicit PaddleIArray(const Tensor* tensor) {
tensor_.ShareDataWith(*tensor);
for (int i = 0; i < tensor->dims().size(); ++i) {
shape_.push_back(tensor->dims().at(i));
}
}
public:
void *data();
void* data();
popart::DataType dataType() const;
std::size_t rank() const;
int64_t dim(size_t index) const;
@@ -64,7 +78,7 @@ class PaddleIArray final : public popart::IArray {
const popart::Shape shape() const;
private:
framework::Tensor *tensor_;
Tensor tensor_;
std::vector<int64_t> shape_;
};
@@ -74,8 +88,7 @@ popart::DataType OnnxDtype2PopartType(const int type);
bool GetBoolEnv(std::string str);
template <typename T>
std::unique_ptr<popart::NDArrayWrapper<T>> Tensor2IArray(
const framework::Tensor &tensor) {
std::unique_ptr<popart::NDArrayWrapper<T>> Tensor2IArray(const Tensor& tensor) {
auto dtype = VarType2PopartType(tensor.type());
auto shape = std::vector<int64_t>();
for (size_t i = 0; i < tensor.dims().size(); ++i) {
@@ -84,18 +97,140 @@ std::unique_ptr<popart::NDArrayWrapper<T>> Tensor2IArray(
popart::TensorInfo tensor_info(dtype, shape);
return std::make_unique<popart::NDArrayWrapper<T>>(
reinterpret_cast<T *>(tensor.data()), tensor_info);
reinterpret_cast<T*>(tensor.data()), tensor_info);
}
template <typename T>
std::unique_ptr<popart::NDArrayWrapper<T>> LoDTensor2IArray(
framework::LoDTensor const &lod_tensor) {
PADDLE_ENFORCE_EQ(
lod_tensor.lod().size(), 0UL,
platform::errors::InvalidArgument("LoDTensor2IArray is Unimplemented"));
return Tensor2IArray<T>(lod_tensor);
LoDTensor const& lod_tensor) {
if (lod_tensor.lod().size() == 0) {
return Tensor2IArray<T>(lod_tensor);
} else {
PADDLE_THROW(
platform::errors::Unimplemented("LoDTensor2IArray is Unimplemented"));
}
}
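A sketch of the intended IO path after this change, assuming a filled FP32 LoDTensor named t; PaddleIArray now shares the buffer via ShareDataWith instead of holding a raw Tensor pointer:
// Illustrative only; t is assumed to hold FP32 data.
framework::LoDTensor t;
PaddleIArray iarray(&t);                     // shares, rather than copies, the data
auto wrapper = Tensor2IArray<float>(t);      // popart::NDArrayWrapper<float>
auto wrapper2 = LoDTensor2IArray<float>(t);  // throws unless t.lod() is empty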
template <typename T>
T GetSingleVarFromScope(const Scope* scope, const std::string& var_name) {
auto var = scope->GetVar(var_name);
auto tensor = var->Get<framework::LoDTensor>();
// TODO: check that T matches the tensor's dtype
return tensor.data<T>()[0];
}
struct CustomOpAttrVisitor : public boost::static_visitor<void> {
explicit CustomOpAttrVisitor(std::map<std::string, popart::any>* attr,
const std::string& attr_name)
: attrs_(attr), attr_name_(attr_name) {}
mutable std::map<std::string, popart::any>* attrs_;
std::string attr_name_;
void operator()(int v) const { attrs_->emplace(attr_name_, v); }
void operator()(float v) const { attrs_->emplace(attr_name_, v); }
void operator()(const std::string& v) const {
attrs_->emplace(attr_name_, v);
}
void operator()(const std::vector<int>& v) const {
attrs_->emplace(attr_name_, v);
}
void operator()(const std::vector<float>& v) const {
attrs_->emplace(attr_name_, v);
}
void operator()(const std::vector<std::string>& v) const {
attrs_->emplace(attr_name_, v);
}
void operator()(bool v) const { attrs_->emplace(attr_name_, v); }
void operator()(const std::vector<bool>& v) const {
attrs_->emplace(attr_name_, v);
}
void operator()(BlockDesc* desc) const {
PADDLE_THROW(platform::errors::Unavailable(
"Unsupported calling method for `BlockDesc` type."));
}
void operator()(const std::vector<BlockDesc*>& v) const {
PADDLE_THROW(platform::errors::Unavailable(
"Unsupported calling method for `BlockDesc` type."));
}
void operator()(int64_t v) const { attrs_->emplace(attr_name_, v); }
void operator()(const std::vector<int64_t>& v) const {
attrs_->emplace(attr_name_, v);
}
void operator()(const std::vector<double>& v) const {
attrs_->emplace(attr_name_, v);
}
void operator()(boost::blank) const {
PADDLE_THROW(platform::errors::Unavailable(
"Unsupported calling method for `boost::blank` type."));
}
};
struct IpuCustomOpIdentifier {
IpuCustomOpIdentifier(const std::string& _paddle_op,
const std::string& _popart_op,
const std::string& _domain, unsigned int _version)
: paddle_op(_paddle_op), popart_op(_domain, _popart_op, _version) {}
std::string repr() {
std::ostringstream os;
os << "paddle_op: " << paddle_op << ", domain: " << popart_op.domain
<< ", type: " << popart_op.type << ", version: " << popart_op.version;
return os.str();
}
std::string paddle_op;
popart::OperatorIdentifier popart_op;
};
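Registering a custom op end-to-end might look like the sketch below; the paddle op name, popart op name and domain are hypothetical, and the op itself must still be registered with popart separately:
// Hypothetical names; illustrative only.
std::vector<IpuCustomOpIdentifier> custom_ops;
custom_ops.emplace_back("custom_relu", "CustomRelu", "custom.ops", 1);
VLOG(10) << custom_ops.back().repr();
IpuBackend::GetInstance()->SetCustomOps(custom_ops);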
struct ConstantOpAttrVisitor : public boost::static_visitor<void> {
explicit ConstantOpAttrVisitor(framework::LoDTensor* tensor,
framework::proto::VarType::Type dtype)
: tensor_(tensor), dtype_(dtype) {}
framework::LoDTensor* tensor_;
framework::proto::VarType::Type dtype_;
void operator()(const std::vector<int>& vec) const {
framework::TensorFromVector<int>(vec, tensor_);
}
void operator()(const std::vector<float>& vec) const {
if (dtype_ == framework::proto::VarType::FP16) {
std::vector<float16> vec_fp16;
std::transform(vec.begin(), vec.end(), std::back_inserter(vec_fp16),
[](float f) -> float16 { return float16(f); });
framework::TensorFromVector<float16>(vec_fp16, tensor_);
} else {
framework::TensorFromVector<float>(vec, tensor_);
}
}
void operator()(const std::vector<bool>& vec) const {
framework::TensorFromVector<bool>(vec, tensor_);
}
void operator()(const std::vector<int64_t>& vec) const {
framework::TensorFromVector<int64_t>(vec, tensor_);
}
void operator()(const std::vector<double>& vec) const {
framework::TensorFromVector<double>(vec, tensor_);
}
void RaiseError() const {
PADDLE_THROW(
platform::errors::InvalidArgument("Constant value must be a vector"));
}
void operator()(int v) const { RaiseError(); }
void operator()(float v) const { RaiseError(); }
void operator()(const std::string& v) const { RaiseError(); }
void operator()(const std::vector<std::string>& v) const { RaiseError(); }
void operator()(bool v) const { RaiseError(); }
void operator()(BlockDesc* desc) const { RaiseError(); }
void operator()(const std::vector<BlockDesc*>& v) const { RaiseError(); }
void operator()(int64_t v) const { RaiseError(); }
void operator()(boost::blank) const { RaiseError(); }
};
std::vector<std::pair<std::string, std::string>> GetOptPrePostfix(
const std::string& opt_type);
int RequestIpus(const int num_ipus);
} // namespace ipu
} // namespace platform
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// clang-format off
#pragma once
OP_DECL(popart_nllloss_v2, aiGraphcoreOpset.nllloss, SIG_ARG(INT32,popart::ReductionType,reduction) OPT_ARG(INT32,ignoreIndex) ARG(BOOL,inputIsLogProbability) ) // NOLINT
// clang-format on