From d67fe921f41689d5f34bacd01de33139e519a969 Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Mon, 21 Mar 2022 10:29:38 +0800 Subject: [PATCH] [IPU] update ipu_backend (#40685) * sync changes * copy sOpNamescope * fix UTs * add authors Co-authored-by: Xiaobing Wang Co-authored-by: Allen Guo Co-authored-by: Zhixin Yao Co-authored-by: Zhaorui Chen Co-authored-by: Han Zhao * fix code-format * fix compile error * add comments for feed_op Co-authored-by: Xiaobing Wang Co-authored-by: Zhixin Yao Co-authored-by: Zhaorui Chen Co-authored-by: Han Zhao --- .../ir/ipu/optimizer_extract_pass.cc | 24 +- .../framework/ir/ipu/transfer_cast_op_pass.cc | 3 +- paddle/fluid/framework/tensor_util.cc | 80 ++++-- paddle/fluid/operators/controlflow/feed_op.cc | 7 + .../fluid/platform/device/ipu/CMakeLists.txt | 2 +- .../fluid/platform/device/ipu/ipu_backend.cc | 24 +- .../fluid/platform/device/ipu/ipu_backend.h | 18 +- .../fluid/platform/device/ipu/ipu_compiler.cc | 252 ++++++++++++++++-- .../fluid/platform/device/ipu/ipu_compiler.h | 13 + .../fluid/platform/device/ipu/ipu_executor.cc | 76 +++--- .../fluid/platform/device/ipu/ipu_executor.h | 18 +- paddle/fluid/platform/device/ipu/ipu_names.h | 2 + .../fluid/platform/device/ipu/ipu_strategy.cc | 42 ++- .../fluid/platform/device/ipu/ipu_strategy.h | 53 +++- .../ipu/popart_canonicalization/op_builder.cc | 30 +-- paddle/fluid/pybind/pybind.cc | 10 + .../unittests/ipu/test_ipu_strategy_ipu.py | 16 +- .../tests/unittests/ipu/test_save_load_ipu.py | 12 +- 18 files changed, 507 insertions(+), 175 deletions(-) diff --git a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc index 9fe50deaf2..7cdb7a8854 100644 --- a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc +++ b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc @@ -25,14 +25,14 @@ std::set ignored_ops = { "sum", "clip", "clip_by_norm", - "square", "reduce_sum", "sqrt", "elementwise_max", "elementwise_div", "elementwise_mul", - "scale", // adamax - "assign", // adamw + "scale", // adamax + "assign", // adamw + "squared_l2_norm" // gradient_clip_norm }; const bool startswith(const std::string& str, const std::string& pre) { @@ -62,6 +62,10 @@ void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const { new_op.SetAttr("with_lr_sched", false); std::set set_ops{}; + // save the weight decay tensor_name and weight_decay_value for Lamb + std::vector weight_decay_vars{}; + std::vector weight_decay_values{}; + // use map store ? 
for (auto* node : graph->Nodes()) { if (!node->IsOp()) { @@ -75,6 +79,15 @@ void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const { auto op_role = static_cast(op_role_); if (op_role == OpRole::kOptimize) { + // save weight decay value from every lamb optimizer op + if (op_type == "lamb" && op->HasAttr("weight_decay")) { + auto weight_decay_value = + BOOST_GET_CONST(float, op->GetAttr("weight_decay")); + auto params = op->Output("ParamOut"); + weight_decay_vars.push_back(params[0]); + weight_decay_values.push_back(weight_decay_value); + } + if (set_ops.count(op_type)) { continue; } @@ -270,7 +283,10 @@ void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const { // seems with_lr_sched is always true new_op.SetAttr("with_lr_sched", true); - // setup weight deacy + // setup weight decay for Lamb + new_op.SetAttr("weight_decay_vars", weight_decay_vars); + new_op.SetAttr("weight_decay_values", weight_decay_values); + // weight_decay/coeff is "scale" attr of scale_op if (set_ops.count("scale") && set_ops.count("sum")) { if (set_ops.count("sign")) { diff --git a/paddle/fluid/framework/ir/ipu/transfer_cast_op_pass.cc b/paddle/fluid/framework/ir/ipu/transfer_cast_op_pass.cc index e754ba72ad..5cd8358dc0 100644 --- a/paddle/fluid/framework/ir/ipu/transfer_cast_op_pass.cc +++ b/paddle/fluid/framework/ir/ipu/transfer_cast_op_pass.cc @@ -30,7 +30,8 @@ void TransferCastOpPass::ApplyImpl(ir::Graph* graph) const { auto ipu_backend = platform::ipu::IpuBackend::GetInstance(); auto enable_fp16 = ipu_backend->GetIpuStrategy()->enable_fp16; - if (enable_fp16) { + auto transfer_cast_op = ipu_backend->GetIpuStrategy()->transfer_cast_op; + if (enable_fp16 && transfer_cast_op) { for (auto* node : graph->Nodes()) { if (node->IsOp() && node->Op()->Type() == "popart_cast") { if (BOOST_GET_CONST(std::string, node->Op()->GetAttr("to")) == diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 5de8612354..e8cd84248e 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -79,18 +79,6 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#ifdef PADDLE_WITH_IPU - else if (platform::is_ipu_place(src_place) && // NOLINT - platform::is_cpu_place(dst_place)) { - memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } else if (platform::is_cpu_place(src_place) && - platform::is_ipu_place(dst_place)) { - memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } else if (platform::is_ipu_place(src_place) && - platform::is_ipu_place(dst_place)) { - memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } -#endif #ifdef PADDLE_WITH_CUSTOM_DEVICE else if (platform::is_custom_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { @@ -390,6 +378,29 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, "Copying from %s to %s is not supported.", src_place, dst_place)); } #endif +#ifdef PADDLE_WITH_IPU + else if (platform::is_ipu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); + } + else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_ipu_place(dst_place)) { + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); + } + else if (platform::is_ipu_place(src_place) && // NOLINT + platform::is_ipu_place(dst_place)) { + if 
(src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data sync from " << src_place << " to " + << dst_place; + return; + } + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); + } + else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "Copying from %s to %s is not supported.", src_place, dst_place)); + } +#endif } template @@ -447,27 +458,15 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#ifdef PADDLE_WITH_IPU - else if (platform::is_ipu_place(src_place) && // NOLINT - platform::is_cpu_place(dst_place)) { - memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } else if (platform::is_cpu_place(src_place) && // NOLINT - platform::is_ipu_place(dst_place)) { - memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } else { // NOLINT - PADDLE_THROW(platform::errors::Unimplemented( - "Copy from %s to %s is not supported.", src_place, dst_place)); - } -#endif #ifdef PADDLE_WITH_CUSTOM_DEVICE else if (platform::is_custom_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { /* custom_device -> cpu*/ memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); - } + } // NOLINT else if (platform::is_cpu_place(src_place) && // NOLINT platform::is_custom_place(dst_place)) { /* cpu -> custom_device*/ memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); - } + } // NOLINT else if (platform::is_custom_place(src_place) && // NOLINT platform::is_custom_place( dst_place)) { /* custom_device -> custom_device*/ @@ -483,11 +482,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, else if (platform::is_xpu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } + } // NOLINT else if (platform::is_cpu_place(src_place) && // NOLINT platform::is_xpu_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } + } // NOLINT else if (platform::is_xpu_place(src_place) && // NOLINT platform::is_xpu_place(dst_place)) { if (src_ptr == dst_ptr) { @@ -502,7 +501,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, auto xpu_ctx = platform::DeviceContextPool::Instance().Get(xpu_dst_place); xpu_ctx->Wait(); } - } + } // NOLINT else { // NOLINT PADDLE_THROW(platform::errors::Unimplemented( "Copy from %s to %s is not supported.", src_place, dst_place)); @@ -601,6 +600,29 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, "Copy from %s to %s is not supported.", src_place, dst_place)); } #endif +#ifdef PADDLE_WITH_IPU + else if (platform::is_ipu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); + } + else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_ipu_place(dst_place)) { + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); + } + else if (platform::is_ipu_place(src_place) && // NOLINT + platform::is_ipu_place(dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data sync from " << src_place << " to " + << dst_place; + return; + } + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); + } + else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "Copy from %s to %s is not supported.", src_place, dst_place)); + } +#endif } template diff --git 
a/paddle/fluid/operators/controlflow/feed_op.cc b/paddle/fluid/operators/controlflow/feed_op.cc index bc29c92b09..8a190c1a1e 100644 --- a/paddle/fluid/operators/controlflow/feed_op.cc +++ b/paddle/fluid/operators/controlflow/feed_op.cc @@ -40,6 +40,13 @@ class FeedVariableVisitor : public boost::static_visitor { out_var_->GetMutable(); if (platform::is_same_place(in_tensor.place(), place_)) { out_tensor->ShareDataWith(in_tensor); +#ifdef PADDLE_WITH_IPU + } else if (platform::is_ipu_place(place_)) { + // For ipu, both in_tensor and out_tensor are allocated on cpu, + // PopART will copy tensor from host automatically, + // no TensorCopy() is required here. + out_tensor->ShareDataWith(in_tensor); +#endif } else { platform::DeviceContext *context = platform::DeviceContextPool::Instance().Get(place_); diff --git a/paddle/fluid/platform/device/ipu/CMakeLists.txt b/paddle/fluid/platform/device/ipu/CMakeLists.txt index acf914c508..42c949f7fe 100644 --- a/paddle/fluid/platform/device/ipu/CMakeLists.txt +++ b/paddle/fluid/platform/device/ipu/CMakeLists.txt @@ -13,7 +13,7 @@ IF(WITH_IPU) "ipu_device.cc" ) - cc_library(ipu_backend SRCS ${IPU_BACKEND_SRC} DEPS popart-only graph graph_helper) + cc_library(ipu_backend SRCS ${IPU_BACKEND_SRC} DEPS popart-only graph graph_helper popdist) cc_library(ipu_info SRCS ${IPU_INFO_SRC} DEPS popart-only enforce) add_library(paddle_ipu SHARED ${PADDLE_IPU_SRC}) add_dependencies(paddle_ipu ipu_backend) diff --git a/paddle/fluid/platform/device/ipu/ipu_backend.cc b/paddle/fluid/platform/device/ipu/ipu_backend.cc index e0b3b08a23..012294d0ff 100644 --- a/paddle/fluid/platform/device/ipu/ipu_backend.cc +++ b/paddle/fluid/platform/device/ipu/ipu_backend.cc @@ -32,6 +32,7 @@ IpuBackend* IpuBackend::GetInstance() { IpuBackend::IpuBackend() { compiler_ = std::make_unique(); executor_ = std::make_unique(); + timer_ = std::make_unique(); } IpuBackend::~IpuBackend() { @@ -43,6 +44,7 @@ void IpuBackend::Compile(Graph* graph, const std::vector& feed_list, const std::vector& fetch_list) { VLOG(10) << "enter IpuBackend::Compile"; + is_compiled_ = false; compiler_->Prepare(graph); compiler_->InitInputs(feed_list); compiler_->LowerConstants(scope_); @@ -52,31 +54,25 @@ void IpuBackend::Compile(Graph* graph, if (ipu_strategy_->is_training) { compiler_->LowerOptimizer(scope_); } + if (!ipu_strategy_->onnx_dump_path.empty()) { + SaveModelProto(ipu_strategy_->onnx_dump_path); + } executor_->SetCompilerResources(compiler_->GetResources()); - + executor_->Prepare(compiler_->GetModelProto()); is_compiled_ = true; - // when call compile, means a new graph - is_prepared_ = false; VLOG(10) << "leave IpuBackend::Compile"; } void IpuBackend::Run(const std::vector& inputs, const std::vector& outputs, const framework::ExecutionContext& ctx) { - Prepare(); timer_->Start(); executor_->Run(inputs, outputs, ctx); timer_->Pause(); VLOG(10) << "[IPU Run]: " << timer_->ElapsedMS() << " (ms)"; } -void IpuBackend::Prepare() { - if (!is_prepared_) { - executor_->Prepare(compiler_->GetModelProto()); - timer_.reset(new platform::Timer()); - is_prepared_ = true; - } -} +void IpuBackend::WeightsToHost() { executor_->WeightsToHost(); } void IpuBackend::Detach() { executor_->Detach(); } @@ -101,12 +97,10 @@ void IpuBackend::SetIpuStrategy(const IpuStrategy& strategy) { } void IpuBackend::SaveModelProto(const std::string& path) { - if (ipu_strategy_->is_training && is_prepared_) { + if (ipu_strategy_->is_training && is_compiled_) { executor_->SaveModelToHost(path); - } else if (is_compiled_) { - 
compiler_->SaveModelProtoNoCheck(path); } else { - LOG(WARNING) << "Model is empty"; + compiler_->SaveModelProtoNoCheck(path); } } diff --git a/paddle/fluid/platform/device/ipu/ipu_backend.h b/paddle/fluid/platform/device/ipu/ipu_backend.h index 1244192490..0578d9face 100644 --- a/paddle/fluid/platform/device/ipu/ipu_backend.h +++ b/paddle/fluid/platform/device/ipu/ipu_backend.h @@ -60,6 +60,9 @@ class IpuBackend { const std::vector &outputs, const framework::ExecutionContext &ctx); + // Sync weights from IPU while training + void WeightsToHost(); + // detach IPU manually void Detach(); @@ -76,22 +79,17 @@ class IpuBackend { void SaveModelProto(const std::string &path); private: - void Prepare(); - - private: - std::unique_ptr compiler_; - std::unique_ptr executor_; - bool is_compiled_ = false; - bool is_prepared_ = false; - // not own const Scope *scope_ = nullptr; const IpuStrategy *ipu_strategy_ = nullptr; - private: - // time record for IpuBackend::Run + // own + std::unique_ptr compiler_; + std::unique_ptr executor_; std::unique_ptr timer_; + bool is_compiled_ = false; + DISABLE_COPY_AND_ASSIGN(IpuBackend); }; diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.cc b/paddle/fluid/platform/device/ipu/ipu_compiler.cc index cdb3f6f9b3..1a3e600058 100644 --- a/paddle/fluid/platform/device/ipu/ipu_compiler.cc +++ b/paddle/fluid/platform/device/ipu/ipu_compiler.cc @@ -18,6 +18,7 @@ #include #include #include + #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/platform/device/ipu/ipu_utils.h" @@ -25,13 +26,20 @@ namespace paddle { namespace platform { namespace ipu { -popart::AdamMode AdamModeFromStr(const std::string& str) { +popart::AdamMode AdamModeFromStr(const std::string& str, + const bool& use_no_bias_optimizer) { if (str == "adam") { - return popart::AdamMode::Adam; + if (!use_no_bias_optimizer) + return popart::AdamMode::Adam; + else + return popart::AdamMode::AdamNoBias; } else if (str == "adamax") { return popart::AdamMode::AdaMax; } else if (str == "lamb") { - return popart::AdamMode::Lamb; + if (!use_no_bias_optimizer) + return popart::AdamMode::Lamb; + else + return popart::AdamMode::LambNoBias; } else { PADDLE_THROW(platform::errors::InvalidArgument( "Uknown AdamMode: %s, AdamMode must be one of these values: adam, " @@ -70,6 +78,17 @@ popart::WeightDecayMode WeightDecayModeFromStr(const std::string& str) { } } +popart::DataType DataTypeFromStr(const std::string& str) { + if (str == "FLOAT") { + return popart::DataType::FLOAT; + } else if (str == "FLOAT16") { + return popart::DataType::FLOAT16; + } else { + PADDLE_THROW( + platform::errors::Unimplemented("Unsupported DataType: %s", str)); + } +} + template T GetAttrAllowNull(std::string attr, OpDesc* op_desc) { if (op_desc->HasAttr(attr)) { @@ -122,6 +141,17 @@ void Compiler::Prepare(const Graph* graph) { builder_ = popart::Builder::create(); resources_ = std::make_unique(); graph_helper_ = std::make_unique(graph); + // Set the flag of set_amp_for_all_ + for (auto* node : graph_helper_->sorted_ops) { + auto* op_desc = node->Op(); + auto op_type = op_desc->Type(); + if (op_type == "popart_matmul") { + if (op_desc->HasAttr(sAvailMemAttribute)) { + set_amp_for_all_ = false; + return; + } + } + } } void Compiler::RegisterOpFunc() { @@ -155,7 +185,9 @@ void Compiler::RegisterOpFunc() { auto debug_context = BuildDebugContext(op_desc); \ auto aiGraphcoreOpset = builder_->aiGraphcoreOpset1(); \ auto aiOnnxOpset = builder_->aiOnnxOpset11(); \ + PushNameScope(op_desc); \ auto output_ids = OnnxImpl(inputs 
Args, debug_context); \ + PopNameScope(op_desc); \ SetIpuIndexStage(output_ids, op_desc); \ SetAMPAttributes(output_ids, op_desc); \ SetSerializeAttributes(output_ids, op_desc); \ @@ -241,7 +273,9 @@ void Compiler::LowerConstants(const Scope* scope) { popart::TensorInfo tensor_info(PdDataType2PopartType(tensor->dtype()), shape); const_data.reset(new popart::ConstVoidData(tensor->data(), tensor_info)); + PushNameScope(op_desc); popart::TensorId result = builder_->aiOnnxOpset11().constant(*const_data); + PopNameScope(op_desc); SetIpuIndexStage(result, op_desc); resources_->tensors.emplace(tensor_name, result); } @@ -261,6 +295,10 @@ void Compiler::LowerWeights(const Scope* scope) { VLOG(10) << "found existed one, skip lowering Weight: " << var_name; continue; } + if (var_name.rfind("learning_rate", 0) == 0) { + VLOG(10) << "skip learning_rate_var: " << var_name; + continue; + } VLOG(10) << "lowering weight: " << var_name; auto var = scope->FindVar(var_name); @@ -273,10 +311,15 @@ void Compiler::LowerWeights(const Scope* scope) { } popart::TensorInfo tensor_info(dtype, shape); popart::ConstVoidData const_data{tensor.data(), tensor_info}; - popart::TensorId result = - builder_->addInitializedInputTensor(const_data, var_name); - resources_->tensors.emplace(var_name, result); - resources_->weights.push_back(result); + if (!node->outputs.empty()) { + auto op_node = node->outputs[0]; + PushNameScope(op_node->Op()); + popart::TensorId result = + builder_->addInitializedInputTensor(const_data, var_name); + PopNameScope(op_node->Op()); + resources_->tensors.emplace(var_name, result); + resources_->weights.push_back(var_name); + } } } } @@ -298,7 +341,10 @@ void Compiler::LowerBody() { } else if (op_type == "popart_checkpointoutput") { auto inputs = GetOpInputs(op_desc); auto outputs = GetOpOutputs(op_desc); + PushNameScope(op_desc); auto output_ids = builder_->checkpointOutput(inputs); + PopNameScope(op_desc); + SetIpuIndexStage(output_ids, op_desc); InsertTensors(outputs, output_ids); } else if (op_type == "popart_custom_op") { auto inputs = GetOpInputs(op_desc); @@ -313,9 +359,11 @@ void Compiler::LowerBody() { BOOST_GET_CONST(std::string, op_desc->GetAttr("__op_type")); VLOG(10) << "Build graph from custom op: " << __op_type; auto it = custom_ops_.find(__op_type); + PushNameScope(op_desc); auto output_ids = builder_->customOp(it->second.popart_op, it->second.popart_op.version, inputs, outputs.size(), attributes, debug_context); + PopNameScope(op_desc); SetIpuIndexStage(output_ids, op_desc); InsertTensors(outputs, output_ids); } else if (op_type == "popart_printtensor") { @@ -325,8 +373,10 @@ void Compiler::LowerBody() { auto print_gradient = BOOST_GET_CONST(int64_t, op_desc->GetAttr("print_gradient")); auto title = BOOST_GET_CONST(std::string, op_desc->GetAttr("title")); + PushNameScope(op_desc); auto output_ids = builder_->aiGraphcoreOpset1().printtensor( inputs, print_gradient, debug_context, title); + PopNameScope(op_desc); SetIpuIndexStage(output_ids, op_desc); InsertTensors(outputs, output_ids); } else { @@ -367,8 +417,31 @@ void Compiler::LowerOptimizer(const Scope* scope) { resources_->with_lr_sched = false; } VLOG(10) << "Set initial lr: " << resources_->lr; - auto loss_scaling = ipu_strategy_->loss_scaling; + + // Get the type of optimizer auto type = BOOST_GET_CONST(std::string, op_desc->GetAttr("type")); + // Set weight decay by tensor names for Lamb + auto weight_decay_vars = BOOST_GET_CONST( + std::vector, op_desc->GetAttr("weight_decay_vars")); + auto weight_decay_values = 
BOOST_GET_CONST( + std::vector, op_desc->GetAttr("weight_decay_values")); + // Get the maximum permissible value for gradient clipping + std::vector clip_norm_settings = {}; + if (op_desc->HasAttr("clip_norm")) { + auto clip_norm = BOOST_GET_CONST(float, op_desc->GetAttr("clip_norm")); + clip_norm_settings.push_back( + popart::ClipNormSettings::clipAllWeights(clip_norm)); + VLOG(10) << "Set the global gradient clipping with the maximum " + "permissible value: " + << clip_norm; + } + + // Values from ipu_strategy + auto loss_scaling = ipu_strategy_->loss_scaling; + auto accl1_type = DataTypeFromStr(ipu_strategy_->accl1_type); + auto accl2_type = DataTypeFromStr(ipu_strategy_->accl2_type); + auto accl3_type = DataTypeFromStr(ipu_strategy_->accl3_type); + if (type == "sgd") { auto weight_decay = BOOST_GET_CONST(float, op_desc->GetAttr("weight_decay")); @@ -376,12 +449,18 @@ void Compiler::LowerOptimizer(const Scope* scope) { resources_->optimizer_fn = [=](float lr) { return std::make_unique( popart::OptimizerValue(lr, false), - popart::OptimizerValue(weight_decay, true), + popart::OptimizerValue(weight_decay, false), popart::OptimizerValue(momentum, true), popart::SGD::getUnsetDampening(), popart::SGD::getUnsetVelocityScaling(), - popart::OptimizerValue(loss_scaling, true)); + popart::OptimizerValue(loss_scaling, true), clip_norm_settings); }; + resources_->eval_optimizer = std::make_unique( + popart::OptimizerValue(0.0, false), + popart::OptimizerValue(0.0, false), + popart::OptimizerValue(0.0, true), popart::SGD::getUnsetDampening(), + popart::SGD::getUnsetVelocityScaling(), + popart::OptimizerValue(loss_scaling, true), clip_norm_settings); } else if (type == "adam") { auto weight_decay = BOOST_GET_CONST(float, op_desc->GetAttr("weight_decay")); @@ -392,22 +471,79 @@ void Compiler::LowerOptimizer(const Scope* scope) { VLOG(10) << "set max_weight_norm: " << mwn; auto adam_mode_ = BOOST_GET_CONST(std::string, op_desc->GetAttr("adam_mode")); - auto adam_mode = AdamModeFromStr(adam_mode_); - auto weight_decay_mode_ = - BOOST_GET_CONST(std::string, op_desc->GetAttr("weight_decay_mode")); + auto adam_mode = + AdamModeFromStr(adam_mode_, ipu_strategy_->use_no_bias_optimizer); + auto weight_decay_mode_ = ipu_strategy_->weight_decay_mode; + if (weight_decay_mode_.empty()) { + weight_decay_mode_ = BOOST_GET_CONST( + std::string, op_desc->GetAttr("weight_decay_mode")); + } auto weight_decay_mode = WeightDecayModeFromStr(weight_decay_mode_); resources_->optimizer_fn = [=](float lr) { - return std::make_unique( - popart::OptimizerValue(lr, false), - popart::OptimizerValue(weight_decay, true), - popart::OptimizerValue(beta1, true), - popart::OptimizerValue(beta2, true), + if (adam_mode == popart::AdamMode::Lamb || + adam_mode == popart::AdamMode::LambNoBias) { + const std::map> + optimizer_value = {{"defaultLearningRate", {lr, false}}, + {"defaultBeta1", {beta1, false}}, + {"defaultBeta2", {beta2, false}}, + {"defaultEps", {eps, true}}, + {"lossScaling", {loss_scaling, true}}, + {"defaultMaxWeightNorm", {mwn, true}}}; + auto optimizer_instance = std::make_unique( + optimizer_value, adam_mode, weight_decay_mode, + popart::DataType::UNDEFINED, accl1_type, accl2_type, + clip_norm_settings); + for (int i = 0; i < weight_decay_vars.size(); i++) { + optimizer_instance->insertSpecific( + weight_decay_vars[i], + {{"weightDecay", {weight_decay_values[i], false}}}); + VLOG(10) << "Set Tensor " << weight_decay_vars[i] + << " weight decay as " << weight_decay_values[i]; + } + return optimizer_instance; + } else { + 
return std::make_unique( + popart::OptimizerValue(lr, false), + popart::OptimizerValue(weight_decay, false), + popart::OptimizerValue(beta1, false), + popart::OptimizerValue(beta2, false), + popart::OptimizerValue(eps, true), + popart::OptimizerValue(loss_scaling, true), + popart::OptimizerValue(mwn, true), adam_mode, weight_decay_mode, + popart::DataType::UNDEFINED, accl1_type, accl2_type, + clip_norm_settings); + } + }; + if (adam_mode == popart::AdamMode::Lamb || + adam_mode == popart::AdamMode::LambNoBias) { + const std::map> optimizer_value = + {{"defaultLearningRate", {0.0, false}}, + {"defaultBeta1", {beta1, false}}, + {"defaultBeta2", {beta2, false}}, + {"defaultEps", {eps, true}}, + {"lossScaling", {loss_scaling, true}}, + {"defaultMaxWeightNorm", {mwn, true}}}; + auto eval_optimizer = std::make_unique( + optimizer_value, adam_mode, weight_decay_mode, + popart::DataType::UNDEFINED, popart::DataType::FLOAT, + popart::DataType::FLOAT, clip_norm_settings); + for (int i = 0; i < weight_decay_vars.size(); i++) { + eval_optimizer->insertSpecific(weight_decay_vars[i], + {{"weightDecay", {0.0, false}}}); + } + resources_->eval_optimizer = std::move(eval_optimizer); + } else { + resources_->eval_optimizer = std::make_unique( + popart::OptimizerValue(0.0, false), + popart::OptimizerValue(0.0, false), + popart::OptimizerValue(beta1, false), + popart::OptimizerValue(beta2, false), popart::OptimizerValue(eps, true), popart::OptimizerValue(loss_scaling, true), popart::OptimizerValue(mwn, true), adam_mode, weight_decay_mode, popart::DataType::UNDEFINED, popart::DataType::FLOAT, - popart::DataType::FLOAT); - }; + popart::DataType::FLOAT, clip_norm_settings); + } } else if (type == "adaptive") { auto alpha = BOOST_GET_CONST(float, op_desc->GetAttr("alpha")); auto momentum = BOOST_GET_CONST(float, op_desc->GetAttr("momentum")); @@ -417,21 +553,33 @@ void Compiler::LowerOptimizer(const Scope* scope) { auto adaptive_mode_ = BOOST_GET_CONST(std::string, op_desc->GetAttr("adaptive_mode")); auto adaptive_mode = AdaptiveModeFromStr(adaptive_mode_); - auto weight_decay_mode_ = - BOOST_GET_CONST(std::string, op_desc->GetAttr("weight_decay_mode")); + auto weight_decay_mode_ = ipu_strategy_->weight_decay_mode; + if (weight_decay_mode_.empty()) { + weight_decay_mode_ = BOOST_GET_CONST( + std::string, op_desc->GetAttr("weight_decay_mode")); + } auto weight_decay_mode = WeightDecayModeFromStr(weight_decay_mode_); resources_->optimizer_fn = [=](float lr) { return std::make_unique( popart::OptimizerValue(lr, false), - popart::OptimizerValue(weight_decay, true), + popart::OptimizerValue(weight_decay, false), popart::OptimizerValue(alpha, true), popart::OptimizerValue(momentum, true), popart::OptimizerValue(eps, true), popart::OptimizerValue(loss_scaling, true), adaptive_mode, - weight_decay_mode, popart::DataType::UNDEFINED, - popart::DataType::FLOAT, popart::DataType::FLOAT, - popart::DataType::FLOAT); + weight_decay_mode, popart::DataType::UNDEFINED, accl1_type, + accl2_type, accl3_type); }; + resources_->eval_optimizer = std::make_unique( + popart::OptimizerValue(0.0, false), + popart::OptimizerValue(0.0, false), + popart::OptimizerValue(alpha, true), + popart::OptimizerValue(momentum, true), + popart::OptimizerValue(eps, true), + popart::OptimizerValue(loss_scaling, true), adaptive_mode, + weight_decay_mode, popart::DataType::UNDEFINED, + popart::DataType::FLOAT, popart::DataType::FLOAT, + popart::DataType::UNDEFINED); } else { PADDLE_THROW(platform::errors::Unimplemented( "optimizer %s is not implemented", 
type)); @@ -510,9 +658,32 @@ void Compiler::SetAMPAttributes(const std::string& tensor_id, const OpDesc* op_desc) { VLOG(10) << "enter Compiler::SetAMPAttributes"; if (op_desc->Type() == "popart_matmul") { - auto amp = ipu_strategy_->available_memory_proportion; - if (amp > 0.0f && amp <= 1.0) { - builder_->setAvailableMemoryProportion(tensor_id, amp); + if (set_amp_for_all_) { + auto amp = ipu_strategy_->available_memory_proportion; + if (amp < 0.0f || amp > 1.0) { + PADDLE_THROW(platform::errors::InvalidArgument( + "AvailableMemoryProportion %f is invalid, which should be set 0 <= " + "amp <= 1", + amp)); + } + if (amp > 0.0f) { + builder_->setAvailableMemoryProportion(tensor_id, amp); + } + } else { + if (op_desc->HasAttr(sAvailMemAttribute)) { + auto amp = BOOST_GET_CONST(float, op_desc->GetAttr(sAvailMemAttribute)); + if (amp < 0.0f || amp > 1.0) { + PADDLE_THROW(platform::errors::InvalidArgument( + "AvailableMemoryProportion %f is invalid, which should be set 0 " + "<= amp <= 1", + amp)); + } + if (amp > 0.0f) { + builder_->setAvailableMemoryProportion(tensor_id, amp); + VLOG(10) << "set available_memory_proportion for tensor: " + << tensor_id << " as " << amp; + } + } } } VLOG(10) << "leave Compiler::SetAMPAttributes"; @@ -602,6 +773,29 @@ popart::DebugContext Compiler::BuildDebugContext(const OpDesc* op) { return popart::DebugContext(op_identify_id); } +void Compiler::PushNameScope(const OpDesc* op) { + auto op_namescope = BOOST_GET_CONST(std::string, op->GetAttr(sOpNamescope)); + if (op_namescope == "/") { + return; + } + if (!op_namescope.empty()) { + op_namescope.pop_back(); + } + if (!op_namescope.empty()) { + op_namescope.erase(op_namescope.begin()); + } + VLOG(10) << "name_scope is: " << op_namescope; + builder_->pushNameScope(op_namescope); +} + +void Compiler::PopNameScope(const OpDesc* op) { + auto op_namescope = BOOST_GET_CONST(std::string, op->GetAttr(sOpNamescope)); + if (op_namescope == "/") { + return; + } + builder_->popNameScope(); +} + } // namespace ipu } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.h b/paddle/fluid/platform/device/ipu/ipu_compiler.h index 5d1e8c2727..2d00970bf1 100644 --- a/paddle/fluid/platform/device/ipu/ipu_compiler.h +++ b/paddle/fluid/platform/device/ipu/ipu_compiler.h @@ -50,6 +50,8 @@ struct CompilerResources { using OptimizerFn = std::function(float lr)>; OptimizerFn optimizer_fn; + // The eval mode of optimizer in training + std::unique_ptr eval_optimizer; public: popart::Optimizer *Optimizer() { return optimizer.get(); } @@ -110,6 +112,7 @@ class Compiler { void RegisterOpFunc(); std::vector GetOpInputs(const OpDesc *op); const std::vector &GetOpOutputs(const OpDesc *op); + const std::string GetNameScope(const OpDesc *op); popart::DebugContext BuildDebugContext(const OpDesc *op); void InsertTensors(const std::vector &output_names, @@ -126,6 +129,8 @@ class Compiler { const OpDesc *op_desc); void SetSerializeAttributes(const std::string &tensor_id, const OpDesc *op_desc); + void PushNameScope(const OpDesc *op); + void PopNameScope(const OpDesc *op); private: std::unique_ptr builder_; @@ -137,6 +142,14 @@ class Compiler { const IpuStrategy *ipu_strategy_ = nullptr; std::map custom_ops_; + + // Used to choose the way to set amp for Ops + // If anyone op has the attr sAvailMemAttribute, the + // available_memory_proportion from ipu_strategy + // will be ignored and the Ops are set by their own sAvailMemAttribute. 
Else, + // all relevant Ops will be set by + // the available_memory_proportion from ipu_strategy. + bool set_amp_for_all_ = true; }; } // namespace ipu diff --git a/paddle/fluid/platform/device/ipu/ipu_executor.cc b/paddle/fluid/platform/device/ipu/ipu_executor.cc index c124d58957..649b291244 100644 --- a/paddle/fluid/platform/device/ipu/ipu_executor.cc +++ b/paddle/fluid/platform/device/ipu/ipu_executor.cc @@ -64,15 +64,10 @@ void Executor::Prepare(const std::string &proto) { WeightsFromPaddle(); VLOG(10) << "Copy weights from paddle to popart...done"; - VLOG(10) << "Copy weights from host to device..."; - session_->weightsFromHost(); - VLOG(10) << "Copy weights from host to device...done"; - - if (ipu_strategy_->save_init_onnx) { - session_->modelToHost("test_init.onnx"); + if (ipu_strategy_->random_seed != std::numeric_limits::max()) { + VLOG(10) << "Setting random seed to: " << ipu_strategy_->random_seed; + session_->setRandomSeed(ipu_strategy_->random_seed); } - // init run step - step_ = 0; } void Executor::Run(const std::vector &inputs, @@ -120,11 +115,17 @@ void Executor::Run(const std::vector &inputs, VLOG(10) << "Prepared inputs/anchors"; if (ipu_strategy_->is_training && compiler_resources_->with_lr_sched) { - VLOG(10) << "Update learning_rate"; - auto new_lr = - GetSingleVarFromScope(scope_, compiler_resources_->lr_var); - VLOG(10) << "New Lr: " << new_lr; - auto *optimizer = compiler_resources_->UpdateOptimizer(new_lr); + popart::Optimizer *optimizer; + if (ipu_strategy_->runtime_options.enable_eval) { + VLOG(10) << "Switch optimizer to eval mode"; + optimizer = compiler_resources_->eval_optimizer.get(); + } else { + VLOG(10) << "Update learning_rate"; + auto new_lr = + GetSingleVarFromScope(scope_, compiler_resources_->lr_var); + VLOG(10) << "New Lr: " << new_lr; + optimizer = compiler_resources_->UpdateOptimizer(new_lr); + } auto *session = dynamic_cast(session_.get()); session->updateOptimizerFromHost(optimizer); } @@ -133,15 +134,13 @@ void Executor::Run(const std::vector &inputs, VLOG(10) << "Running..."; session_->run(stepio); VLOG(10) << "Running...done"; +} - step_++; - if (ipu_strategy_->is_training && - step_ % ipu_strategy_->save_per_n_step == 0) { - session_->weightsToHost(); +void Executor::WeightsToHost() { + if (ipu_strategy_->is_training && session_) { WeightsToPaddle(); - if (ipu_strategy_->save_onnx_checkpoint) { - session_->modelToHost("test_last" + std::to_string(step_) + ".onnx"); - } + } else { + LOG(WARNING) << "For a non-trainning graph, cannot sync weights from IPU."; } } @@ -153,6 +152,7 @@ void Executor::AcquireDevice() { } bool use_ipu_model = GetBoolEnv("POPLAR_IPUMODEL"); + bool enable_distribution = ipu_strategy_->enable_distribution; if (use_ipu_model) { std::map deviceOpts{ { @@ -162,6 +162,16 @@ void Executor::AcquireDevice() { }; device_ = popart::DeviceManager::createDeviceManager().createIpuModelDevice( deviceOpts); + } else if (enable_distribution) { + auto ipus_per_replica = ipu_strategy_->num_ipus / + ipu_strategy_->popart_options.replicatedGraphCount; + auto device_id = popdist_get_device(ipus_per_replica); + device_ = popart::DeviceManager::createDeviceManager().acquireDeviceById( + device_id); + PADDLE_ENFORCE_NOT_NULL( + device_, platform::errors::Unavailable( + "Can't attach IPU in distribution, ipu_num = %d.", + RequestIpus(ipu_strategy_->num_ipus))); } else { device_ = popart::DeviceManager::createDeviceManager().acquireAvailableDevice( @@ -185,28 +195,29 @@ void Executor::SetWeightsIO() { auto opt_type = 
compiler_resources_->optimizer_type; VLOG(10) << "SetWeightsIO for " << opt_type; auto pre_post_fix = GetOptPrePostfix(opt_type); - for (const auto &weight_id : compiler_resources_->weights) { + for (const auto &weight_pd : compiler_resources_->weights) { for (const auto &pair : pre_post_fix) { // pair.first : popart prefix, pair.second : paddle postfix - auto popart_var_name = pair.first + weight_id; - auto paddle_var_name = weight_id + pair.second; + auto weight_pop = compiler_resources_->tensors[weight_pd]; + auto popart_var = pair.first + weight_pop; + auto paddle_var = weight_pd + pair.second; - if (scope_->FindVar(paddle_var_name) == nullptr) { + if (scope_->FindVar(paddle_var) == nullptr) { continue; } - - if (!session_->hasInfo(popart_var_name)) { + if (!session_->hasInfo(popart_var)) { continue; } - auto var = scope_->GetVar(paddle_var_name); + VLOG(10) << "Connect paddle weight: " << paddle_var + << " with popart weight: " << popart_var; + auto var = scope_->GetVar(paddle_var); auto data_ptr = var->GetMutable()->data(); - - auto tensor_info = session_->getInfo(popart_var_name); - executor_resources_->weights_io.insert(popart_var_name, + auto tensor_info = session_->getInfo(popart_var); + executor_resources_->weights_io.insert(popart_var, {data_ptr, tensor_info}); executor_resources_->weights_and_opt_state.emplace_back( - std::make_pair(popart_var_name, paddle_var_name)); + std::make_pair(popart_var, paddle_var)); } } } @@ -284,6 +295,7 @@ void Executor::ConvertWeights(bool align_to_popart) { void Executor::WeightsFromPaddle() { ConvertWeights(true); session_->writeWeights(executor_resources_->weights_io); + session_->weightsFromHost(); } // |-----------------------------------------------------| @@ -297,13 +309,13 @@ void Executor::WeightsFromPaddle() { // Paddle -> halfToFloat: cast then save to paddle // Popart -> Paddle: copy from paddle to popart void Executor::WeightsToPaddle() { + session_->weightsToHost(); session_->readWeights(executor_resources_->weights_io); ConvertWeights(false); } void Executor::SaveModelToHost(const std::string &path) { if (session_) { - session_->weightsToHost(); WeightsToPaddle(); session_->modelToHost(path); } else { diff --git a/paddle/fluid/platform/device/ipu/ipu_executor.h b/paddle/fluid/platform/device/ipu/ipu_executor.h index b08b94b45f..c59e623ab2 100644 --- a/paddle/fluid/platform/device/ipu/ipu_executor.h +++ b/paddle/fluid/platform/device/ipu/ipu_executor.h @@ -20,6 +20,7 @@ limitations under the License. 
*/ #include #include #include +#include #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/scope.h" @@ -36,8 +37,7 @@ struct ExecutorResources { // map popart::WeightsIO weights_io; // pairs, include weights and optimizer states - std::vector> - weights_and_opt_state; + std::vector> weights_and_opt_state; }; class Executor { @@ -53,14 +53,12 @@ class Executor { const std::vector &outputs, const framework::ExecutionContext &ctx); + // sync weights from popart to paddle + void WeightsToHost(); + // detach IPU void Detach(); - void SetWeightsIO(); - void ConvertWeights(bool align_to_popart); - void WeightsFromPaddle(); - void WeightsToPaddle(); - // Scope void SetScope(const Scope *scope) { scope_ = scope; } @@ -79,6 +77,10 @@ class Executor { private: void AcquireDevice(); + void SetWeightsIO(); + void ConvertWeights(bool); + void WeightsFromPaddle(); + void WeightsToPaddle(); private: // not own @@ -92,8 +94,6 @@ class Executor { std::unique_ptr session_; // one OneSession means a graph std::unique_ptr executor_resources_; - - int step_ = 0; }; } // namespace ipu diff --git a/paddle/fluid/platform/device/ipu/ipu_names.h b/paddle/fluid/platform/device/ipu/ipu_names.h index a809a8c6e5..b8a6ceffb5 100644 --- a/paddle/fluid/platform/device/ipu/ipu_names.h +++ b/paddle/fluid/platform/device/ipu/ipu_names.h @@ -24,6 +24,8 @@ static constexpr const char *sIpuIndexAttr = "ipu_index"; static constexpr const char *sIpuStageAttr = "ipu_stage"; static constexpr const char *sMatmulSerializeFactor = "serialize_factor"; static constexpr const char *sMatmulSerializeMode = "serialize_mode"; +static constexpr const char *sAvailMemAttribute = "__available_memory"; +static constexpr const char *sOpNamescope = "op_namescope"; static constexpr const char *sOpIdentifyIdAttr = "op_identify_id"; static constexpr const char *sDebugInfoId = "__debug_info_id"; diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.cc b/paddle/fluid/platform/device/ipu/ipu_strategy.cc index e806b0b30e..6172d4d7dc 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.cc +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.cc @@ -62,23 +62,40 @@ IpuStrategy::IpuStrategy() { [&]() { return name; }) ADD_BOOL_OPTION(is_training); - ADD_BOOL_OPTION(save_init_onnx); - ADD_BOOL_OPTION(save_onnx_checkpoint); ADD_BOOL_OPTION(need_avg_shard); ADD_BOOL_OPTION(enable_fp16); + ADD_BOOL_OPTION(transfer_cast_op); + ADD_BOOL_OPTION(use_no_bias_optimizer); + ADD_BOOL_OPTION(enable_distribution); ADD_UINT64_OPTION(num_ipus); ADD_UINT64_OPTION(batches_per_step); ADD_UINT64_OPTION(micro_batch_size); - ADD_UINT64_OPTION(save_per_n_step); + ADD_UINT64_OPTION(random_seed); ADD_DOUBLE_OPTION(available_memory_proportion); ADD_DOUBLE_OPTION(loss_scaling); ADD_DOUBLE_OPTION(max_weight_norm); + ADD_STRING_OPTION(accl1_type); + ADD_STRING_OPTION(accl2_type); + ADD_STRING_OPTION(accl3_type); + ADD_STRING_OPTION(onnx_dump_path); + ADD_STRING_OPTION(weight_decay_mode); #undef ADD_STRING_OPTION #undef ADD_DOUBLE_OPTION #undef ADD_UINT64_OPTION #undef ADD_BOOL_OPTION +#define ADD_RUNTIME_BOOL_OPTION(name, aliased_name) \ + RegisterSetter(bool_options, #name, \ + [&](bool value) { runtime_options.aliased_name = value; }); \ + RegisterGetter(options_getter, options_type, #name, "bool", [&]() { \ + return std::to_string(runtime_options.aliased_name); \ + }) + + ADD_RUNTIME_BOOL_OPTION(runtime_options.enable_eval, enable_eval); + +#undef ADD_RUNTIME_BOOL_OPTION + #define ADD_POPART_ENUM_OPTION_ALIAS(name, aliased_name, EnumType) \ 
RegisterSetter(uint64_options, #name, [&](std::uint64_t value) { \ PADDLE_ENFORCE_LT( \ @@ -171,6 +188,7 @@ IpuStrategy::IpuStrategy() { ADD_POPART_UINT64_OPTION_ALIAS(merge_var_update_mem_threshold, mergeVarUpdateMemThreshold); ADD_POPART_UINT64_OPTION_ALIAS(loose_threshold_at_peak, looseThresholdAtPeak); + ADD_POPART_UINT64_OPTION_ALIAS(replicated_graph_count, replicatedGraphCount); ADD_POPART_UINT64_OPTION_ALIAS(accumulation_factor, accumulationFactor); ADD_POPART_UINT64_OPTION_ALIAS(swap_limit_scheduler, swapLimitScheduler); ADD_POPART_UINT64_OPTION_ALIAS(global_replication_factor, @@ -462,12 +480,30 @@ void IpuStrategy::SetTensorLocation(const std::string& tensor, } else if (opt == "use_io_tiles_to_store") { settings->location.storageTileSet = value > 0 ? popart::TileSet::IO : popart::TileSet::Compute; + } else if (opt == "sharding_domain_with_all") { + settings->location.shardingDomain = + popart::CommGroup(popart::CommGroupType::All, value); + } else if (opt == "sharding_domain_with_consecutive") { + settings->location.shardingDomain = + popart::CommGroup(popart::CommGroupType::Consecutive, value); + } else if (opt == "sharding_domain_with_orthogonal") { + settings->location.shardingDomain = + popart::CommGroup(popart::CommGroupType::Orthogonal, value); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Unknown option ' %s' for tensor location: %s", opt, tensor)); } } +void IpuStrategy::SetAccumulateOuterFragmentSettings( + const std::uint64_t& schedule, const std::vector& values) { + VLOG(10) << "SetAccumulateOuterFragmentSettings schedule:" << schedule; + auto schedule_ = + static_cast(schedule); + popart_options.accumulateOuterFragmentSettings = + popart::AccumulateOuterFragmentSettings(schedule_, values); +} + void IpuStrategy::AddCustomOp(const std::string& paddle_op, const std::string& popart_op, const std::string& domain, int version) { diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.h b/paddle/fluid/platform/device/ipu/ipu_strategy.h index 571fb1e163..786e2419cc 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.h +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.h @@ -24,6 +24,11 @@ namespace paddle { namespace platform { namespace ipu { +struct RuntimeOptions { + // enable the eval mode in training by switching optimizers. 
+ bool enable_eval = false; +}; + class IpuStrategy { public: IpuStrategy(); @@ -32,19 +37,24 @@ class IpuStrategy { // training flag, true for training bool is_training = true; - // save the onnx model lowered by paddle program description - bool save_init_onnx = false; - - // save the trained model - bool save_onnx_checkpoint = false; - // average sharding, debugging used bool need_avg_shard = false; // flag for fp16, true for pure fp16 bool enable_fp16 = false; - // Number ipus total needed, replica * ipu_per_replica + // enable transfer cast Op target from fp32 to fp16 in fp16 mode + bool transfer_cast_op = true; + + // The mode of Adam/Lamb optimizer + // false: The standard Adam/Lamb optimizer + // true: The Adam_No_Bias/Lamb_No_Bias optimizer from PopART + bool use_no_bias_optimizer = false; + + // enable distributed computing for POD128 or POD256 + bool enable_distribution = false; + + // Number ipus total needed, local_replica * ipu_per_replica int num_ipus = 1; // batches per step @@ -53,8 +63,8 @@ class IpuStrategy { // micro batch-size int micro_batch_size = 1; - // save paddle model per n steps - int save_per_n_step = 1; + // random seed + std::uint64_t random_seed = std::numeric_limits::max(); // TODO(alleng) remove this param // available memory proportion, 0.0f for disable @@ -67,6 +77,29 @@ class IpuStrategy { // defaultMaxWeightNorm for adam optimizer float max_weight_norm = 65504.0f; + // file path for dumping compiled model in onnx format + std::string onnx_dump_path; + + // Data type to use for tensor that stores first-order momentum optimizer + // state. FLOAT or FLOAT16 + std::string accl1_type = "FLOAT"; + + // Data type to use for tensor that stores second-order momentum optimizer + // state. FLOAT or FLOAT16 + std::string accl2_type = "FLOAT"; + + // Data type to use for tensor that stores third-order momentum optimizer + // state. 
FLOAT or FLOAT16 + std::string accl3_type = "FLOAT"; + + // WeightDecayMode for setting the optimizer + // if set, it will override other settings + // value must be one of "decay" or "l2_regularization" or not set + std::string weight_decay_mode = ""; + + // Runtime Options + RuntimeOptions runtime_options; + // popart session option popart::SessionOptions popart_options; @@ -86,6 +119,8 @@ class IpuStrategy { const std::string &value); void SetTensorLocation(const std::string &tensor, const std::string &option, std::uint64_t value); + void SetAccumulateOuterFragmentSettings(const std::uint64_t &schedule, + const std::vector &values); void AddCustomOp(const std::string &paddle_op, const std::string &popart_op, const std::string &domain, int version); diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc index 3ec1999edc..0339097d58 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc @@ -32,30 +32,10 @@ const std::string GenerateOpName() { const std::string CreateOpIdentifyId(Node *node) { // format: - // if has custom op_namescope: - // {op_namescope}/op_type/_gen_* - // else: - // {op_type}/{out_var0}/{out_var1}/.../_gen_* + // op_type/_gen_* // this name will be used as op name when exporting onnx model from popart auto op_type = node->Name(); - std::string op_namescope; - if (node->Op()->HasAttr("op_namescope")) { - op_namescope = - BOOST_GET_CONST(std::string, node->Op()->GetAttr("op_namescope")); - } else { - op_namescope = "/"; - } - - if (op_namescope != "/") { - return {op_namescope + op_type + "/" + GenerateOpName()}; - } else { - std::string op_out = ""; - for (auto *out_node : node->outputs) { - op_out += "/"; - op_out += out_node->Name(); - } - return {op_type + op_out + "/" + GenerateOpName()}; - } + return {op_type + "/" + GenerateOpName()}; } Node *MakeVarNode(Graph *graph, Node *node) { @@ -122,6 +102,12 @@ Node *CreateBaseOp(Graph *graph, Node *node, const std::string &type, if (node->Op()->HasAttr(sMatmulSerializeMode)) { CopyOpAttr(sMatmulSerializeMode, node->Op(), new_node->Op()); } + if (node->Op()->HasAttr(sAvailMemAttribute)) { + CopyOpAttr(sAvailMemAttribute, node->Op(), new_node->Op()); + } + if (node->Op()->HasAttr(sOpNamescope)) { + CopyOpAttr(sOpNamescope, node->Op(), new_node->Op()); + } { new_node->Op()->SetAttr(sOpIdentifyIdAttr, CreateOpIdentifyId(node)); new_node->Op()->Flush(); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index ed42d0792e..bbaa7e3dd6 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -4264,6 +4264,7 @@ All parameter, weight, gradient are variables in Paddle. platform::ipu::IpuBackend::GetInstance()); }, py::return_value_policy::reference) + .def("weights_to_host", &platform::ipu::IpuBackend::WeightsToHost) .def("detach", &platform::ipu::IpuBackend::Detach) .def("reset", &platform::ipu::IpuBackend::Reset) .def("set_scope", &platform::ipu::IpuBackend::SetScope) @@ -4311,6 +4312,15 @@ All parameter, weight, gradient are variables in Paddle. 
option_name, option.first.cast(), option.second.cast()); } + } else if (option_name == "accumulate_outer_fragment") { + for (auto option : element.second.cast()) { + std::vector values; + for (auto value : option.second.cast()) { + values.push_back(value.cast()); + } + self.SetAccumulateOuterFragmentSettings( + option.first.cast(), values); + } } else if (option_name == "custom_op") { std::string paddle_op; std::string popart_op; diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py index f120f55949..debd9ed198 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py @@ -26,7 +26,13 @@ class TestIpuStrategy(unittest.TestCase): def test_set_options(self): ipu_strategy = paddle.static.IpuStrategy() all_option_names = ipu_strategy._ipu_strategy.get_all_option_names() + skip_options = [] + skip_options.append('random_seed') + for option_name in all_option_names: + if option_name in skip_options: + continue + option = ipu_strategy._ipu_strategy.get_option(option_name) option_type = option['type'] option_value = option['value'] @@ -38,9 +44,13 @@ class TestIpuStrategy(unittest.TestCase): set_value = not option_value else: continue - ipu_strategy.set_options({option_name: set_value}) - new_value = ipu_strategy.get_option(option_name) - assert new_value == set_value, f"set {option_name} to {set_value} failed" + + try: + ipu_strategy.set_options({option_name: set_value}) + new_value = ipu_strategy.get_option(option_name) + assert new_value == set_value, f"set {option_name} to {set_value} failed" + except: + raise Exception(f"set {option_name} to {set_value} failed") def test_set_string_options(self): ipu_strategy = paddle.static.IpuStrategy() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py index 3a69487306..ba6eb4d38b 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py @@ -95,12 +95,9 @@ class TestBase(IPUOpTest): is_training=self.attrs['is_training']) ipu_strategy.set_precision_config( enable_fp16=self.attrs['enable_fp16']) - ipu_strategy.set_options({ - 'save_per_n_step': self.attrs['save_at_step'] - }) - program = paddle.static.IpuCompiledProgram( - main_prog, ipu_strategy=ipu_strategy).compile( - self.feed_list, fetch_list) + ipu_program = paddle.static.IpuCompiledProgram( + main_prog, ipu_strategy=ipu_strategy) + program = ipu_program.compile(self.feed_list, fetch_list) result = [] run_steps = self.attrs['steps'] if save_otherwise_load \ @@ -111,10 +108,9 @@ class TestBase(IPUOpTest): for i in range(run_steps): tmp = exe.run(program, feed=feed, fetch_list=fetch_list) - # currently, we update opt state every sess.run, - # will optimize if save_otherwise_load and \ i == self.attrs['save_at_step'] - 1: + ipu_program._backend.weights_to_host() paddle.static.save(main_prog, self.attrs['model_path'].name) -- GitLab
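
For reference, a minimal Python usage sketch of the knobs added by this patch (onnx_dump_path, random_seed, runtime_options.enable_eval) and the new explicit weights_to_host() sync that replaces save_per_n_step. This is illustrative only, not part of the patch: it assumes a Paddle build compiled with WITH_IPU, and main_prog / feed_list / fetch_list / exe stand in for a model and executor defined elsewhere.

import paddle

paddle.enable_static()

# Build the strategy and set a few of the options registered in ipu_strategy.cc.
ipu_strategy = paddle.static.IpuStrategy()
ipu_strategy.set_options({
    'is_training': True,
    'random_seed': 42,                        # forwarded to session->setRandomSeed()
    'onnx_dump_path': 'compiled_model.onnx',  # dump the lowered graph after Compile()
    'runtime_options.enable_eval': False,     # switch to the eval optimizer at run time
})

# Compile and run as in test_save_load_ipu.py (placeholders, shown as comments):
# ipu_program = paddle.static.IpuCompiledProgram(main_prog, ipu_strategy=ipu_strategy)
# program = ipu_program.compile(feed_list, fetch_list)
# for feed in data_loader():
#     exe.run(program, feed=feed, fetch_list=fetch_list)
#
# Weights must now be synced back from the IPU explicitly before saving,
# since the old save_per_n_step / save_onnx_checkpoint options were removed:
# ipu_program._backend.weights_to_host()
# paddle.static.save(main_prog, 'ipu_model')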