Unverified commit b2aee3e3 authored by Allen Guo, committed by GitHub

[IPU] update ipu_backend p0 (#38854)

* update ipu_backend

* sync with paddle internal
Co-authored-by: Xiaobing Wang <xiaobingw@graphcore.ai>
Co-authored-by: Allen Guo <alleng@graphcore.ai>
Co-authored-by: Zhixin Yao <zhixiny@graphcore.ai>
Co-authored-by: Haicheng Jiang <haichengj@graphcore.ai>
Co-authored-by: Han Zhao <hanzhao@graphcore.ai>

* apply comments 01

* update error message

* restore ipu_executor and ipu_optimizer

* add clang-format on
Co-authored-by: Xiaobing Wang <xiaobingw@graphcore.ai>
Co-authored-by: Zhixin Yao <zhixiny@graphcore.ai>
Co-authored-by: Haicheng Jiang <haichengj@graphcore.ai>
Co-authored-by: Han Zhao <hanzhao@graphcore.ai>
Parent b4cb3589
IF(WITH_IPU)
FILE(GLOB POPART_CANONICALIZATION_SRC ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/device/ipu/popart_canonicalization/*.cc)
cc_library(ipu_device SRCS device.cc DEPS enforce popart)
cc_library(ipu_utils SRCS ipu_utils.cc DEPS memory framework_proto popart)
cc_library(ipu_strategy SRCS ipu_strategy.cc DEPS popart graph framework_proto enforce)
cc_library(ipu_optimizer SRCS ipu_optimizer.cc DEPS popart enforce)
cc_library(ipu_executor SRCS ipu_executor.cc DEPS ipu_optimizer ipu_utils popart graph framework_proto)
cc_library(popart_canonicalization_utils SRCS ${POPART_CANONICALIZATION_SRC} DEPS framework_proto enforce ipu_utils)
cc_library(ipu_compiler SRCS ipu_compiler.cc DEPS popart graph ipu_utils graph_helper)
cc_library(ipu_backend SRCS ipu_backend.cc DEPS popart ipu_compiler graph framework_proto enforce ipu_utils ipu_strategy ipu_device ipu_executor graph_helper)
list(APPEND PADDLE_IPU_SRC ${POPART_CANONICALIZATION_SRC})
set(PADDLE_IPU_LIB "${CMAKE_CURRENT_BINARY_DIR}/libpaddle_ipu.so" CACHE STRING "")
set(PADDLE_IPU_LIB_DIR "${CMAKE_CURRENT_BINARY_DIR}" CACHE STRING "")
set(IPU_BACKEND_SRC
"ipu_device.cc"
"ipu_strategy.cc"
"ipu_executor.cc"
"ipu_compiler.cc"
"ipu_backend.cc"
"ipu_utils.cc"
)
cc_library(ipu_backend SRCS ${IPU_BACKEND_SRC} DEPS popart graph framework_proto enforce graph_helper timer)
cc_library(ipu_info SRCS ipu_info.cc DEPS ipu_backend)
cc_library(paddle_ipu SHARED SRCS ${PADDLE_IPU_SRC} DEPS popart)
add_dependencies(paddle_ipu ipu_backend)
ENDIF()
......
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/ipu/ipu_backend.h"
#include "paddle/fluid/platform/ipu/ipu_utils.h"
#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
#include "paddle/fluid/platform/device/ipu/ipu_utils.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/ir/graph.h"
......
@@ -24,170 +24,92 @@ namespace paddle {
namespace platform {
namespace ipu {
std::shared_ptr<IpuBackend> IpuBackend::instance_ = nullptr;
IpuBackend* IpuBackend::GetInstance() {
static IpuBackend instance;
return &instance;
}
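The new GetInstance replaces the shared_ptr-based singleton with a function-local static (a Meyers singleton), which C++11 initializes in a thread-safe way; callers now receive a raw, non-owning pointer. A minimal call-site sketch (the scope variable is assumed to exist):
// hedged sketch: the new call-site contract, no shared ownership to manage
auto* backend = paddle::platform::ipu::IpuBackend::GetInstance();
backend->SetScope(scope);  // scope: an existing framework::Scope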
IpuBackend::IpuBackend() {
compiler_ = std::make_shared<Compiler>();
compiler_ = std::make_unique<Compiler>();
executor_ = std::make_unique<Executor>();
}
void IpuBackend::Clear() {
IpuBackend::~IpuBackend() {
compiler_.reset();
executor_.reset();
// detach device
if (device_ != nullptr && device_->isAttached()) {
device_->detach();
device_.reset();
device_ = nullptr;
}
}
IpuBackend::~IpuBackend() { Clear(); }
std::shared_ptr<IpuBackend> IpuBackend::GetInstance() {
if (!instance_) {
instance_.reset(new IpuBackend());
}
return instance_;
}
// This API should only be called from Python; it always returns a new object
std::shared_ptr<IpuBackend> IpuBackend::GetNewInstance() {
instance_.reset(new IpuBackend());
return instance_;
}
void IpuBackend::Compile(framework::ir::Graph* graph,
void IpuBackend::Compile(Graph* graph,
const std::vector<std::string>& feed_list,
const std::vector<std::string>& fetch_list) {
VLOG(10) << "enter IpuBackend::Compile";
compiler_->Prepare();
executor_->SetCompilerResources(compiler_->GetResources());
compiler_->InitInputs(graph, feed_list);
compiler_->LowerConstants(graph, scope_);
compiler_->LowerWeights(graph, scope_);
compiler_->LowerBody(graph);
compiler_->InitOutputs(fetch_list);
executor_->SetWeights(compiler_->GetWeights());
if (ipu_strategy_->is_training) {
compiler_->LowerOptimier(graph, scope_);
}
is_compiled_ = true;
// every call to Compile implies a new graph
is_prepared_ = false;
VLOG(10) << "leave IpuBackend::Compile";
}
void IpuBackend::Run(const std::vector<const framework::Tensor*>& inputs,
const std::vector<framework::Tensor*>& outputs,
void IpuBackend::Run(const std::vector<const Tensor*>& inputs,
const std::vector<Tensor*>& outputs,
const framework::ExecutionContext& ctx) {
Prepare();
auto inputs_id = compiler_->GetInputs();
auto outputs_id = compiler_->GetOutputs();
executor_->Run(inputs_id, inputs, outputs_id, outputs, ctx);
timer_->Start();
executor_->Run(inputs, outputs, ctx);
timer_->Pause();
VLOG(10) << "[IPU Run]: " << timer_->ElapsedMS() << " (ms)";
}
void IpuBackend::Prepare() {
if (is_prepared_) {
return;
} else {
if (!is_prepared_) {
executor_->Prepare(compiler_->GetModelProto());
timer_.reset(new platform::Timer());
is_prepared_ = true;
}
// convert Model to fp16
if (ipu_strategy_->enable_fp16) {
compiler_->ConvertProtoToFp16();
}
auto proto = compiler_->GetModelProto();
auto tensors = compiler_->GetTensors();
auto outputs = compiler_->GetOutputs();
executor_->Prepare(proto, tensors, outputs, device_);
}
void IpuBackend::SetScope(const framework::Scope& scope) {
void IpuBackend::Detach() { executor_->Detach(); }
void IpuBackend::Reset() {
executor_->Detach();
compiler_.reset();
executor_.reset();
}
void IpuBackend::SetScope(const Scope& scope) {
scope_ = &scope;
executor_->SetScope(&scope);
}
void IpuBackend::SetIpuStrategy(const IpuStrategy& strategy) {
ipu_strategy_ = &strategy;
executor_->SetIpuStrategy(strategy);
compiler_->SetIpuStrategy(strategy);
executor_->SetIpuStrategy(strategy);
}
size_t IpuBackend::GetNumDevices() {
// IpuModel
bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL");
if (ipu_model) return 1;
// Real dev
size_t num_devices =
popart::DeviceManager::createDeviceManager().enumerateDevices().size();
PADDLE_ENFORCE_GT(
num_devices, 0,
platform::errors::Unavailable(
"Do not found any IPU devices, please make "
"sure Poplar sdk is enabled or enable ENV \"POPLAR_IPUMODEL=1\""));
return num_devices;
}
std::vector<int> IpuBackend::GetDeviceIds() {
bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL");
if (ipu_model) {
return {0};
}
std::vector<int> device_ids;
auto devices =
popart::DeviceManager::createDeviceManager().enumerateDevices();
PADDLE_ENFORCE_GT(
devices.size(), 0,
platform::errors::Unavailable("Do not found any IPU devices, please make "
"sure Poplar sdk is enabled."));
for (auto device : devices) {
device_ids.push_back(device->getId());
}
return device_ids;
}
Device IpuBackend::GetDevice(int id) {
bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL");
if (ipu_model) {
std::map<std::string, std::string> deviceOpts{{"numIPUs", "1"}};
device_ = popart::DeviceManager::createDeviceManager().createIpuModelDevice(
deviceOpts);
Device device(*device_.get());
return device;
}
size_t num_devices = GetNumDevices();
if (id < 0 || id >= num_devices) {
PADDLE_THROW(platform::errors::InvalidArgument(
"device id %d is invalid, number devices is %d", id, num_devices));
}
std::shared_ptr<popart::DeviceInfo> popart_device_info =
popart::DeviceManager::createDeviceManager().getDevice(
popart::SyncPattern::Full, id);
Device device(*popart_device_info.get());
return device;
}
void IpuBackend::AttachDevice(int id) {
// trick here
// the compile-time ipu is not the same as the runtime ipu
VLOG(10) << "compile ipu id = " << id;
bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL");
if (ipu_model) {
return;
}
device_ = popart::DeviceManager::createDeviceManager().acquireAvailableDevice(
UpperIpuNum());
PADDLE_ENFORCE_NOT_NULL(
device_, platform::errors::Unavailable("Can't attach IPU, ipu_num = %d.",
UpperIpuNum()));
void IpuBackend::SetCustomOps(
const std::vector<IpuCustomOpIdentifier>& custom_ops) {
compiler_->SetCustomOps(custom_ops);
}
bool IpuBackend::DeviceIsAttached() { return device_ != nullptr; }
// num_ipus must be pow(2,n);
int IpuBackend::UpperIpuNum() {
PADDLE_ENFORCE_GT(ipu_strategy_->num_ipus, 0,
platform::errors::Unavailable(
"The ipu num get is wrong, please make sure the "
"sharding or pipline parameter is right."));
int i = 0;
while (std::pow(2, i) < ipu_strategy_->num_ipus) {
i++;
void IpuBackend::SaveMoldeProto(const std::string& path) {
if (ipu_strategy_->is_training && is_prepared_) {
executor_->SaveModelToHost(path);
} else if (is_compiled_) {
compiler_->SaveModelProtoNoCheck(path);
} else {
LOG(WARNING) << "Model is empty";
}
return std::pow(2, i);
}
} // namespace ipu
......
......
@@ -14,88 +14,86 @@ limitations under the License. */
#pragma once
#include <cmath>
#include <popart/devicemanager.hpp>
#include <popart/names.hpp>
#include <popart/tensorinfo.hpp>
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device/ipu/ipu_compiler.h"
#include "paddle/fluid/platform/device/ipu/ipu_device.h"
#include "paddle/fluid/platform/device/ipu/ipu_executor.h"
#include "paddle/fluid/platform/device/ipu/ipu_strategy.h"
#include "paddle/fluid/platform/device/ipu/ipu_utils.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/ipu/device.h"
#include "paddle/fluid/platform/ipu/ipu_compiler.h"
#include "paddle/fluid/platform/ipu/ipu_executor.h"
#include "paddle/fluid/platform/ipu/ipu_strategy.h"
#include "paddle/fluid/platform/timer.h"
namespace paddle {
namespace platform {
namespace ipu {
// IpuBackend is the center of paddle-ipu; its functions include:
// 1. Compile a paddle model to a popart model
// 2. Run the popart model, for inference or training
// 3. Request and release the device
// 4. Other helper functions
class IpuBackend {
// IpuBackend is the center of paddle-ipu; its functions include:
// 1. Compile a paddle model to a popart model
// 2. Run the popart model, for inference or training
// 3. Request and release the device
// 4. Other helper functions
// (a usage sketch follows this class)
public:
static IpuBackend *GetInstance();
public:
IpuBackend();
~IpuBackend();
void Clear();
// return the existing instance if it exists, else create and return a new one
static std::shared_ptr<IpuBackend> GetInstance();
// always return a new instance_
static std::shared_ptr<IpuBackend> GetNewInstance();
// What Compile does (via compiler_):
// 1. map each paddle op to a popart op
// 2. construct the popart onnx compute graph
void Compile(framework::ir::Graph *graph,
const std::vector<std::string> &feed_list,
void Compile(Graph *graph, const std::vector<std::string> &feed_list,
const std::vector<std::string> &fetch_list);
// What Run does:
// 1. construct the forward onnx graph
// 2. graph-level optimization
// 3. autodiff
void Run(const std::vector<const framework::Tensor *> &inputs,
const std::vector<framework::Tensor *> &outputs,
void Run(const std::vector<const Tensor *> &inputs,
const std::vector<Tensor *> &outputs,
const framework::ExecutionContext &ctx);
Executor &GetExecutor() { return *executor_; }
// detach IPU manually
void Detach();
// reset manually
// call it before destruction when manual teardown is needed
void Reset();
void SetScope(const framework::Scope &scope);
const framework::Scope *GetScope() { return scope_; }
void SetScope(const Scope &scope);
const Scope *GetScope() { return scope_; }
void SetIpuStrategy(const IpuStrategy &strategy);
const IpuStrategy *GetIpuStrategy() { return ipu_strategy_; }
void SetCustomOps(const std::vector<IpuCustomOpIdentifier> &custom_ops);
// Device
size_t GetNumDevices();
std::vector<int> GetDeviceIds();
Device GetDevice(int id);
void AttachDevice(int id);
bool DeviceIsAttached();
// save compiled model to onnx
void SaveMoldeProto(const std::string &path);
private:
int UpperIpuNum();
void Prepare();
private:
std::shared_ptr<Compiler> compiler_;
std::unique_ptr<Compiler> compiler_;
std::unique_ptr<Executor> executor_;
std::shared_ptr<popart::DeviceInfo> device_;
bool is_compiled_ = false;
bool is_prepared_ = false;
// not owned
const framework::Scope *scope_ = nullptr;
const Scope *scope_ = nullptr;
const IpuStrategy *ipu_strategy_ = nullptr;
private:
static std::shared_ptr<IpuBackend> instance_;
// time record for IpuBackend::Run
std::unique_ptr<platform::Timer> timer_;
DISABLE_COPY_AND_ASSIGN(IpuBackend);
};
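Taken together, the public API implies the driver flow below; this is a hedged sketch (graph, scope, strategy, and the tensor vectors are assumed to come from the surrounding framework), not code from this patch:
// hedged sketch of the compile-then-run flow
auto* backend = paddle::platform::ipu::IpuBackend::GetInstance();
backend->SetScope(scope);                        // framework::Scope
backend->SetIpuStrategy(strategy);               // IpuStrategy
backend->Compile(graph, feed_list, fetch_list);  // paddle ir::Graph -> popart
backend->Run(inputs, outputs, ctx);              // executes on the IPU
backend->Detach();                               // release the device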
} // namespace ipu
......
......
@@ -16,76 +16,119 @@
#include <popart/builder.hpp>
#include <popart/graphtransformer.hpp>
#include <popart/optimizer.hpp>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/ipu/common.h"
#include "paddle/fluid/platform/ipu/ipu_strategy.h"
#include "paddle/fluid/platform/device/ipu/ipu_names.h"
#include "paddle/fluid/platform/device/ipu/ipu_strategy.h"
#include "paddle/fluid/platform/device/ipu/ipu_utils.h"
namespace paddle {
namespace platform {
namespace ipu {
struct CompilerResources {
// popart input tensor_ids
std::vector<popart::TensorId> inputs;
// popart output tensor_ids
std::vector<popart::TensorId> outputs;
// <paddle_var_name, popart_tensor_ids>
std::map<std::string, popart::TensorId> tensors;
// popart_weight_ids
std::vector<popart::TensorId> weights;
// popart loss tensor_id
popart::TensorId loss_var;
// paddle lr var_name
std::string lr_var;
// lr value
float lr;
// whether lr is constant or scheduled
bool with_lr_sched = false;
// paddle optimizer type, eg: momentum, lamb
std::string optimizer_type;
using OptimizerFn =
std::function<std::unique_ptr<popart::Optimizer>(float lr)>;
OptimizerFn optimizer_fn;
public:
popart::Optimizer *Optimizer() { return optimizer.get(); }
popart::Optimizer *NewOptimizer() {
optimizer = optimizer_fn(lr);
return optimizer.get();
}
popart::Optimizer *UpdateOptimizer(float lr_new) {
optimizer = optimizer_fn(lr_new);
return optimizer.get();
}
private:
std::unique_ptr<popart::Optimizer> optimizer;
};
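The optimizer_fn indirection is what makes learning-rate scheduling possible: the popart optimizer is rebuilt from the current lr on demand. A minimal sketch, assuming popart's simple ConstSGD (the real pass builds momentum/adam/lamb from the paddle optimizer op attributes):
CompilerResources res;
res.lr = 0.01f;
res.with_lr_sched = true;
res.optimizer_fn = [](float lr) {
  // ConstSGD is illustrative; any popart::Optimizer subclass works here
  return std::unique_ptr<popart::Optimizer>(new popart::ConstSGD(lr));
};
res.NewOptimizer();           // build with the initial lr
res.UpdateOptimizer(0.001f);  // rebuild after an lr-scheduler step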
class Compiler {
public:
Compiler();
~Compiler();
void RegisterOpFunc();
void LowerBody(const framework::ir::Graph *graph);
void InitInputs(framework::ir::Graph *graph,
const std::vector<std::string> &feed_list);
void Prepare();
void LowerBody(const Graph *graph);
void InitInputs(Graph *graph, const std::vector<std::string> &feed_list);
void InitOutputs(const std::vector<std::string> &fetch_list);
void LowerWeights(const framework::ir::Graph *graph,
const framework::Scope *scope_);
void LowerConstants(const Graph *graph, const Scope *scope);
void LowerWeights(const Graph *graph, const Scope *scope);
void LowerOptimier(const Graph *graph, const Scope *scope);
void InsertTensors(const std::vector<std::string> &output_names,
const std::vector<std::string> &tensor_ids);
void InsertTensors(const std::vector<std::string> &output_names,
const std::string &tensor_id);
void SetIpuIndexStage(const std::vector<std::string> &tensor_ids,
const framework::OpDesc *op_desc);
void SetIpuIndexStage(const std::string &tensor_id,
const framework::OpDesc *op_desc);
std::vector<popart::TensorId> GetInputs() { return inputs_; }
std::vector<popart::TensorId> GetOutputs() { return outputs_; }
std::map<std::string, popart::TensorId> GetTensors() { return tensors_; }
std::vector<popart::TensorId> &GetWeights();
const OpDesc *op_desc);
void SetIpuIndexStage(const std::string &tensor_id, const OpDesc *op_desc);
void SetAMPAttributes(const std::vector<std::string> &tensor_ids,
const OpDesc *op_desc);
void SetAMPAttributes(const std::string &tensor_id, const OpDesc *op_desc);
void SetSerializeAttributes(const std::vector<std::string> &tensor_ids,
const OpDesc *op_desc);
void SetSerializeAttributes(const std::string &tensor_id,
const OpDesc *op_desc);
std::string GetModelProto();
void SetIpuStrategy(const IpuStrategy &strategy) {
ipu_strategy_ = &strategy;
};
}
void SetCustomOps(const std::vector<IpuCustomOpIdentifier> &custom_ops);
CompilerResources *GetResources() { return resources_.get(); }
std::string GetModelProto();
std::string GetFP16ModelProto();
void SaveModelProto(const std::string &path);
void SaveModelProtoNoCheck(const std::string &path);
void ConvertProtoToFp16();
private:
std::vector<std::string> GetOpInputs(const framework::OpDesc *op);
const std::vector<std::string> &GetOpOutputs(const framework::OpDesc *op);
popart::DebugContext BuildDebugContext(const framework::OpDesc *op);
std::vector<std::string> GetOpInputs(const OpDesc *op);
const std::vector<std::string> &GetOpOutputs(const OpDesc *op);
popart::DebugContext BuildDebugContext(const OpDesc *op);
private:
std::unique_ptr<popart::Builder> builder_;
std::unique_ptr<CompilerResources> resources_;
using OpFunc = std::function<void(framework::OpDesc *op_desc)>;
using OpFunc = std::function<void(OpDesc *op_desc)>;
std::unordered_map<std::string, OpFunc> name_function_;
// stateful variable
std::map<std::string, popart::TensorId> tensors_;
// feed_list_ & fetch_list save paddle tensor id
std::vector<std::string> feed_list_;
std::vector<std::string> fetch_list_;
// inputs_ & outputs_ save popart tensor id
std::vector<popart::TensorId> inputs_;
std::vector<popart::TensorId> outputs_;
// weights info map
std::vector<popart::TensorId> weights_;
std::string converted_proto_ = "";
const IpuStrategy *ipu_strategy_ = nullptr;
std::map<std::string, IpuCustomOpIdentifier> custom_ops_;
};
} // namespace ipu
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......
@@ -12,26 +12,42 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/device/ipu/device.h"
#include "paddle/fluid/platform/device/ipu/ipu_device.h"
#include "paddle/fluid/platform/device/ipu/ipu_utils.h"
namespace paddle {
namespace platform {
namespace ipu {
Device::Device(const popart::DeviceInfo& device_info)
: id_(device_info.getId()), is_attached_(device_info.isAttached()) {
popart::DeviceType popart_device_type = device_info.getType();
switch (popart_device_type) {
case popart::DeviceType::IpuModel:
device_type_ = DeviceType::IpuModel;
break;
case popart::DeviceType::Ipu:
device_type_ = DeviceType::Ipu;
break;
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"popart::DeviceType: unsupported type %d", popart_device_type));
int GetNumDevices() {
bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL");
if (ipu_model) {
return 1;
}
int num_devices =
popart::DeviceManager::createDeviceManager().enumerateDevices().size();
PADDLE_ENFORCE_GT(num_devices, 0, platform::errors::Unavailable(
"Do not found any IPU devices, please "
"make sure Poplar sdk is enabled"));
return num_devices;
}
std::vector<int> GetDeviceIds() {
bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL");
if (ipu_model) {
return {0};
}
std::vector<int> device_ids;
auto devices =
popart::DeviceManager::createDeviceManager().enumerateDevices();
PADDLE_ENFORCE_GT(
devices.size(), 0,
platform::errors::Unavailable("Do not found any IPU devices, please make "
"sure Poplar sdk is enabled."));
for (auto device : devices) {
device_ids.push_back(device->getId());
}
return device_ids;
}
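Both helpers honor the POPLAR_IPUMODEL env var, so builds without physical IPUs still see one emulated device. A hypothetical caller (the helper name is illustrative):
// hedged sketch: pick the first visible IPU, emulated or real
int PickFirstIpu() {
  auto ids = paddle::platform::ipu::GetDeviceIds();  // {0} under IpuModel
  return ids.empty() ? -1 : ids.front();
}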
} // namespace ipu
......
......
@@ -21,23 +21,11 @@ namespace paddle {
namespace platform {
namespace ipu {
enum class DeviceType { IpuModel = 0, Cpu, Ipu, OfflineIpu, Sim };
class Device {
public:
Device() {}
explicit Device(const popart::DeviceInfo& device_info);
int getId() const { return id_; }
bool isAttached() const { return is_attached_; }
DeviceType getType() const { return device_type_; }
private:
int id_;
bool is_attached_;
DeviceType device_type_;
/* TODO:: Add more elements in the future */
};
// get the number of all available IPUs
int GetNumDevices();
// get the device ids of all available IPUs
std::vector<int> GetDeviceIds();
} // namespace ipu
} // namespace platform
......
......
@@ -10,23 +10,18 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/device/ipu/ipu_info.h"
#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
#include "paddle/fluid/platform/device/ipu/ipu_device.h"
namespace paddle {
namespace platform {
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedIPUDevices() {
std::shared_ptr<platform::ipu::IpuBackend> ipu_backend =
platform::ipu::IpuBackend::GetInstance();
return ipu_backend->GetDeviceIds();
return platform::ipu::GetDeviceIds();
}
//! Get the total number of IPU devices in system.
int GetIPUDeviceCount() {
std::shared_ptr<platform::ipu::IpuBackend> ipu_backend =
platform::ipu::IpuBackend::GetInstance();
return ipu_backend->GetNumDevices();
}
int GetIPUDeviceCount() { return platform::ipu::GetNumDevices(); }
} // namespace platform
} // namespace paddle
......
@@ -17,8 +17,10 @@ limitations under the License. */
namespace paddle {
namespace platform {
std::vector<int> GetSelectedIPUDevices();
int GetIPUDeviceCount();
} // namespace platform
} // namespace paddle
#endif
......
@@ -22,6 +22,8 @@ namespace ipu {
static constexpr const char *sIpuIndexAttr = "ipu_index";
static constexpr const char *sIpuStageAttr = "ipu_stage";
static constexpr const char *sMatmulSerializeFactor = "serialize_factor";
static constexpr const char *sMatmulSerializeMode = "serialize_mode";
static constexpr const char *sOpIdentifyIdAttr = "op_identify_id";
static constexpr const char *sDebugInfoId = "__debug_info_id";
......
@@ -29,6 +31,7 @@ static constexpr const char *sBeta1 = "beta1";
static constexpr const char *sBeta2 = "beta2";
static constexpr const char *sBeta1Pow = "Beta1Pow";
static constexpr const char *sBeta2Pow = "Beta2Pow";
static constexpr const char *sLossScaling = "LossScaling";
} // namespace ipu
} // namespace platform
......
......
@@ -12,10 +12,27 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/ipu/ipu_strategy.h"
#include "paddle/fluid/platform/device/ipu/ipu_strategy.h"
#include <glog/logging.h>
namespace paddle {
namespace platform {
namespace ipu {} // namespace ipu
namespace ipu {
void IpuStrategy::enablePattern(const std::string& t) {
VLOG(10) << "enable popart pattern: " << t;
popart_patterns.enablePattern(t, true);
}
void IpuStrategy::disablePattern(const std::string& t) {
VLOG(10) << "disable popart pattern: " << t;
popart_patterns.enablePattern(t, false);
}
const bool IpuStrategy::isPatternEnabled(const std::string& t) {
return popart_patterns.isPatternEnabled(t);
}
} // namespace ipu
} // namespace platform
} // namespace paddle
......
@@ -14,24 +14,86 @@ limitations under the License. */
#pragma once
#include <popart/op.hpp>
#include <popart/sessionoptions.hpp>
#include <popart/tensorlocation.hpp>
#include "popart/patterns/patterns.hpp"
namespace paddle {
namespace platform {
namespace ipu {
using VirtualGraphMode = popart::VirtualGraphMode;
using RecomputationType = popart::RecomputationType;
struct IpuStrategy {
IpuStrategy() {
// we always save the optimizer state to OffChip and enable RTS
// (replicated tensor sharding) to save memory
auto storage = popart::TensorLocation(popart::TensorStorage::OffChip,
popart::ReplicatedTensorSharding::On);
popart_options.optimizerStateTensorLocationSettings =
popart::TensorLocationSettings(storage);
// we divide by accumulationFactor and replicatedGraphCount after the
// all-reduce
popart_options.accumulationAndReplicationReductionType =
popart::ReductionType::Mean;
popart_options.meanAccumulationAndReplicationReductionStrategy =
popart::MeanReductionStrategy::Post;
popart_options.enableFloatingPointChecks = false;
// A directory for log traces to be written into.
popart_options.logDir = "popart_log";
}
~IpuStrategy() {}
// total number of IPUs needed: replica_num * ipus_per_replica
int num_ipus = 1;
// batches per step
int batches_per_step = 1;
int batch_size = 1;
// micro batch-size
int micro_batch_size = 1;
// training flag, true for training
bool is_training = true;
// save the onnx model lowered from the paddle program description
bool save_init_onnx = false;
bool save_last_onnx = true;
popart::SessionOptions popart_options_;
// save the trained model
bool save_onnx_checkpoint = false;
// save the paddle model every n steps
int save_per_n_step = 1;
// average sharding, used for debugging
bool need_avg_shard = false;
// flag for fp16, true for pure fp16
bool enable_fp16 = false;
// available memory proportion, 0.0f for disable
float available_memory_proportion = 0.0f;
// loss scaling, currently we can't get loss scaling from
// optimizer_extract_pass, so we have to set it here
float loss_scaling = 1.0f;
// defaultMaxWeightNorm for adam optimizer
float max_weight_norm = 65504.0f;
// popart session option
popart::SessionOptions popart_options;
popart::Patterns popart_patterns;
public:
void enablePattern(const std::string& t);
void disablePattern(const std::string& t);
const bool isPatternEnabled(const std::string& t);
};
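A hedged configuration sketch (the values and the pattern name are illustrative, not defaults from this patch):
paddle::platform::ipu::IpuStrategy strategy;
strategy.num_ipus = 2;               // rounded up to a power of two on request
strategy.micro_batch_size = 8;
strategy.is_training = true;
strategy.enable_fp16 = true;         // pure fp16 path
strategy.loss_scaling = 128.0f;
strategy.popart_options.enableStochasticRounding = true;
strategy.enablePattern("OpToIdentity");  // popart pattern name, illustrative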
} // namespace ipu
......
......
@@ -12,22 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/ipu/ipu_utils.h"
#include "paddle/fluid/platform/device/ipu/ipu_utils.h"
#include <cmath>
namespace paddle {
namespace platform {
namespace ipu {
void* PaddleIArray::data() { return tensor_->data(); }
void* PaddleIArray::data() { return tensor_.data(); }
popart::DataType PaddleIArray::dataType() const {
return VarType2PopartType(tensor_->type());
return VarType2PopartType(tensor_.type());
}
std::size_t PaddleIArray::rank() const { return tensor_->dims().size(); }
std::size_t PaddleIArray::rank() const { return tensor_.dims().size(); }
int64_t PaddleIArray::dim(size_t index) const {
return tensor_->dims().at(index);
return tensor_.dims().at(index);
}
std::size_t PaddleIArray::nelms() const {
......
@@ -150,6 +151,32 @@ bool GetBoolEnv(std::string str) {
}
}
std::vector<std::pair<std::string, std::string>> GetOptPrePostfix(
const std::string& opt_type) {
// format: {popart_tensor_id, paddle_tensor_id}, ...
std::vector<std::pair<std::string, std::string>> pre_post_fix;
if (opt_type == "adam" || opt_type == "lamb") {
pre_post_fix.push_back(std::make_pair("", ""));
pre_post_fix.push_back(std::make_pair("Accl1___", "_moment1_0"));
pre_post_fix.push_back(std::make_pair("Accl2___", "_moment2_0"));
pre_post_fix.push_back(std::make_pair("Step___", "_beta1_pow_acc_0"));
} else if (opt_type == "sgd" || opt_type == "momentum") {
// sgd
pre_post_fix.push_back(std::make_pair("", ""));
} else {
// other optimizer types: map only the weight itself
pre_post_fix.push_back(std::make_pair("", ""));
}
return pre_post_fix;
}
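For adam/lamb each pair prepends the popart prefix to the weight id and appends the paddle suffix to the variable name; e.g. for a weight fc_0.w_0 (names illustrative):
// popart "Accl1___fc_0.w_0" <-> paddle "fc_0.w_0_moment1_0"
// popart "Accl2___fc_0.w_0" <-> paddle "fc_0.w_0_moment2_0"
// popart "Step___fc_0.w_0"  <-> paddle "fc_0.w_0_beta1_pow_acc_0"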
int RequestIpus(const int num_ipus) {
// num_ipus must be pow(2, n);
return std::pow(2, ceil(log2(num_ipus)));
}
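IPUs are acquired in power-of-two groups, so RequestIpus rounds up. Expected values, derived from the formula rather than from tests in this patch:
// RequestIpus(1) == 1, RequestIpus(2) == 2,
// RequestIpus(3) == 4, RequestIpus(5) == 8, RequestIpus(8) == 8
assert(paddle::platform::ipu::RequestIpus(3) == 4);  // needs <cassert>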
} // namespace ipu
} // namespace platform
} // namespace paddle
......
@@ -17,14 +17,27 @@ limitations under the License. */
#include <popart/ndarraywrapper.hpp>
#include <popart/tensordata.hpp>
#include <popart/tensorinfo.hpp>
#include <popart/vendored/any.hpp>
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace platform {
namespace ipu {
using float16 = platform::float16;
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using Scope = framework::Scope;
using OpDesc = framework::OpDesc;
using Graph = framework::ir::Graph;
using Node = framework::ir::Node;
using BlockDesc = framework::BlockDesc;
// onnx dtype
// https://github.com/onnx/onnx/blob/master/onnx/onnx-ml.proto3
enum ONNXDataType : int {
......
@@ -49,14 +62,15 @@ enum ONNXDataType : int {
class PaddleIArray final : public popart::IArray {
public:
explicit PaddleIArray(framework::Tensor *tensor) : tensor_(tensor) {
explicit PaddleIArray(const Tensor* tensor) {
tensor_.ShareDataWith(*tensor);
for (int i = 0; i < tensor->dims().size(); ++i) {
shape_.push_back(tensor->dims().at(i));
}
}
public:
void *data();
void* data();
popart::DataType dataType() const;
std::size_t rank() const;
int64_t dim(size_t index) const;
......
@@ -64,7 +78,7 @@ class PaddleIArray final : public popart::IArray {
const popart::Shape shape() const;
private:
framework::Tensor *tensor_;
Tensor tensor_;
std::vector<int64_t> shape_;
};
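Holding the Tensor by value via ShareDataWith makes the IArray share the underlying buffer (and its reference-counted holder) instead of keeping a raw pointer that could dangle. A hedged wrapping sketch (place and shape are illustrative):
// hedged sketch: wrap a paddle tensor for a popart step-io feed
paddle::framework::Tensor t;
t.Resize(paddle::framework::make_ddim({2, 3}));
t.mutable_data<float>(paddle::platform::CPUPlace());
paddle::platform::ipu::PaddleIArray wrapper(&t);
// wrapper.data() aliases t's buffer; ShareDataWith copies no data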
......
@@ -74,8 +88,7 @@ popart::DataType OnnxDtype2PopartType(const int type);
bool GetBoolEnv(std::string str);
template <typename T>
std::unique_ptr<popart::NDArrayWrapper<T>> Tensor2IArray(
const framework::Tensor &tensor) {
std::unique_ptr<popart::NDArrayWrapper<T>> Tensor2IArray(const Tensor& tensor) {
auto dtype = VarType2PopartType(tensor.type());
auto shape = std::vector<int64_t>();
for (size_t i = 0; i < tensor.dims().size(); ++i) {
......
@@ -84,18 +97,140 @@ std::unique_ptr<popart::NDArrayWrapper<T>> Tensor2IArray(
popart::TensorInfo tensor_info(dtype, shape);
return std::make_unique<popart::NDArrayWrapper<T>>(
reinterpret_cast<T *>(tensor.data()), tensor_info);
reinterpret_cast<T*>(tensor.data()), tensor_info);
}
template <typename T>
std::unique_ptr<popart::NDArrayWrapper<T>> LoDTensor2IArray(
framework::LoDTensor const &lod_tensor) {
PADDLE_ENFORCE_EQ(
lod_tensor.lod().size(), 0UL,
platform::errors::InvalidArgument("LoDTensor2IArray is Unimplemented"));
return Tensor2IArray<T>(lod_tensor);
LoDTensor const& lod_tensor) {
if (lod_tensor.lod().size() == 0) {
return Tensor2IArray<T>(lod_tensor);
} else {
PADDLE_THROW(
platform::errors::Unimplemented("LoDTensor2IArray is Unimplemented"));
}
}
template <typename T>
T GetSingleVarFromScope(const Scope* scope, const std::string& var_name) {
auto var = scope->GetVar(var_name);
auto tensor = var->Get<framework::LoDTensor>();
// TODO: check that the variable dtype matches T
return tensor.data<T>()[0];
}
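A typical use is reading a scalar such as the learning rate out of the scope; a hedged example (the variable name is illustrative):
float lr = GetSingleVarFromScope<float>(scope, "learning_rate_0");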
struct CustomOpAttrVisitor : public boost::static_visitor<void> {
explicit CustomOpAttrVisitor(std::map<std::string, popart::any>* attr,
const std::string& attr_name)
: attrs_(attr), attr_name_(attr_name) {}
mutable std::map<std::string, popart::any>* attrs_;
std::string attr_name_;
void operator()(int v) const { attrs_->emplace(attr_name_, v); }
void operator()(float v) const { attrs_->emplace(attr_name_, v); }
void operator()(const std::string& v) const {
attrs_->emplace(attr_name_, v);
}
void operator()(const std::vector<int>& v) const {
attrs_->emplace(attr_name_, v);
}
void operator()(const std::vector<float>& v) const {
attrs_->emplace(attr_name_, v);
}
void operator()(const std::vector<std::string>& v) const {
attrs_->emplace(attr_name_, v);
}
void operator()(bool v) const { attrs_->emplace(attr_name_, v); }
void operator()(const std::vector<bool>& v) const {
attrs_->emplace(attr_name_, v);
}
void operator()(BlockDesc* desc) const {
PADDLE_THROW(platform::errors::Unavailable(
"Unsupported calling method for `BlockDesc` type."));
}
void operator()(const std::vector<BlockDesc*>& v) const {
PADDLE_THROW(platform::errors::Unavailable(
"Unsupported calling method for `BlockDesc` type."));
}
void operator()(int64_t v) const { attrs_->emplace(attr_name_, v); }
void operator()(const std::vector<int64_t>& v) const {
attrs_->emplace(attr_name_, v);
}
void operator()(const std::vector<double>& v) const {
attrs_->emplace(attr_name_, v);
}
void operator()(boost::blank) const {
PADDLE_THROW(platform::errors::Unavailable(
"Unsupported calling method for `boost::blank` type."));
}
};
struct IpuCustomOpIdentifier {
IpuCustomOpIdentifier(const std::string& _paddle_op,
const std::string& _popart_op,
const std::string& _domain, unsigned int _version)
: paddle_op(_paddle_op), popart_op(_domain, _popart_op, _version) {}
std::string repr() {
std::ostringstream os;
os << "paddle_op: " << paddle_op << ", domain: " << popart_op.domain
<< ", type: " << popart_op.type << ", version: " << popart_op.version;
return os.str();
}
std::string paddle_op;
popart::OperatorIdentifier popart_op;
};
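Constructing one binds a paddle op name to a popart operator identifier; a hedged example (op and domain names are made up):
// expose a paddle "custom_relu" op as popart custom.ops::Relu, version 1
paddle::platform::ipu::IpuCustomOpIdentifier op_id(
    "custom_relu", "Relu", "custom.ops", 1);
VLOG(10) << op_id.repr();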
struct ConstantOpAttrVisitor : public boost::static_visitor<void> {
explicit ConstantOpAttrVisitor(framework::LoDTensor* tensor,
framework::proto::VarType::Type dtype)
: tensor_(tensor), dtype_(dtype) {}
framework::LoDTensor* tensor_;
framework::proto::VarType::Type dtype_;
void operator()(const std::vector<int>& vec) const {
framework::TensorFromVector<int>(vec, tensor_);
}
void operator()(const std::vector<float>& vec) const {
if (dtype_ == framework::proto::VarType::FP16) {
std::vector<float16> vec_fp16;
std::transform(vec.begin(), vec.end(), std::back_inserter(vec_fp16),
[](float f) -> float16 { return float16(f); });
framework::TensorFromVector<float16>(vec_fp16, tensor_);
} else {
framework::TensorFromVector<float>(vec, tensor_);
}
}
void operator()(const std::vector<bool>& vec) const {
framework::TensorFromVector<bool>(vec, tensor_);
}
void operator()(const std::vector<int64_t>& vec) const {
framework::TensorFromVector<int64_t>(vec, tensor_);
}
void operator()(const std::vector<double>& vec) const {
framework::TensorFromVector<double>(vec, tensor_);
}
void RaiseError() const {
PADDLE_THROW(
platform::errors::InvalidArgument("Constant value must be a vector"));
}
void operator()(int v) const { RaiseError(); }
void operator()(float v) const { RaiseError(); }
void operator()(const std::string& v) const { RaiseError(); }
void operator()(const std::vector<std::string>& v) const { RaiseError(); }
void operator()(bool v) const { RaiseError(); }
void operator()(BlockDesc* desc) const { RaiseError(); }
void operator()(const std::vector<BlockDesc*>& v) const { RaiseError(); }
void operator()(int64_t v) const { RaiseError(); }
void operator()(boost::blank) const { RaiseError(); }
};
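Both visitors are dispatched with boost::apply_visitor over a paddle attribute (a boost::variant); a hedged sketch for the constant case (the attribute name "value" is illustrative):
// materialize a constant op attribute into a host tensor
framework::LoDTensor tensor;
ConstantOpAttrVisitor visitor(&tensor, framework::proto::VarType::FP32);
auto attr = op_desc->GetAttr("value");  // op_desc: an assumed framework::OpDesc*
boost::apply_visitor(visitor, attr);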
std::vector<std::pair<std::string, std::string>> GetOptPrePostfix(
const std::string& opt_type);
int RequestIpus(const int num_ipus);
} // namespace ipu
} // namespace platform
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// clang-format off
#pragma once
OP_DECL(popart_nllloss_v2, aiGraphcoreOpset.nllloss, SIG_ARG(INT32,popart::ReductionType,reduction) OPT_ARG(INT32,ignoreIndex) ARG(BOOL,inputIsLogProbability) ) // NOLINT
// clang-format on