From fc621dfea375095fb8a3d39801224859b8cf6aa7 Mon Sep 17 00:00:00 2001
From: Allen Guo <alleng@graphcore.ai>
Date: Mon, 11 Apr 2022 16:04:13 +0800
Subject: [PATCH] support more ops (#41421)

---
 cmake/external/poplar.cmake                   | 15 +++++++++
 cmake/inference_lib.cmake                     |  7 +++-
 .../fluid/platform/device/ipu/ipu_compiler.cc | 32 +++++++++++++++----
 .../fluid/platform/device/ipu/ipu_strategy.cc |  1 +
 .../fluid/platform/device/ipu/ipu_strategy.h  | 28 ++++++++--------
 .../ipu/popart_canonicalization/math_ops.cc   |  2 +-
 .../ipu/popart_canonicalization/nn_ops.cc     | 27 ++++++++++++++++
 .../ipu/popart_canonicalization/tensor_ops.cc |  4 +--
 8 files changed, 93 insertions(+), 23 deletions(-)

diff --git a/cmake/external/poplar.cmake b/cmake/external/poplar.cmake
index 7a8fa3ef5d7..8b2de14e966 100644
--- a/cmake/external/poplar.cmake
+++ b/cmake/external/poplar.cmake
@@ -12,6 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+macro(find_popart_version popart_version_file)
+  # Reads POPART_VERSION_STRING ("<major>.<minor>.<patch>+<id>") from the given
+  # header into POPART_VERSION; sets "Unknown version" when it is not found.
+  file(READ ${popart_version_file} popart_version_file_content)
+  string(REGEX MATCH "POPART_VERSION_STRING[ \t\r\n]+\"([0-9]+\\.[0-9]+\\.[0-9]+\\+[A-Za-z0-9_]*)\"" POPART_VERSION "${popart_version_file_content}")
+  if(POPART_VERSION)
+    set(POPART_VERSION "${CMAKE_MATCH_1}")
+    message(STATUS "Current PopART version is ${POPART_VERSION}")
+  else()
+    set(POPART_VERSION "Unknown version")
+  endif()
+endmacro()
+
 if(WITH_IPU)
   set(POPLAR_DIR CACHE PATH "Path to a Poplar install")
   set(POPART_DIR CACHE PATH "Path to a Popart install")
@@ -64,6 +77,8 @@ if(WITH_IPU)
     message(FATAL_ERROR "You must provide a path to a Popart build using -DPOPART_DIR=/path/to/popart/build")
   endif()
 
+  find_popart_version("${POPART_DIR}/include/popart/version.hpp")
+
   add_definitions(-DONNX_NAMESPACE=onnx)
   add_custom_target(extern_poplar DEPENDS poplar popart-only)
 endif()
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index e3e6e1cced2..1b38f208716 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -398,7 +398,8 @@ function(version version_file)
             "WITH_GPU: ${WITH_GPU}\n"
             "WITH_ROCM: ${WITH_ROCM}\n"
             "WITH_ASCEND_CL: ${WITH_ASCEND_CL}\n"
-            "WITH_ASCEND_CXX11: ${WITH_ASCEND_CXX11}\n")
+            "WITH_ASCEND_CXX11: ${WITH_ASCEND_CXX11}\n"
+            "WITH_IPU: ${WITH_IPU}\n")
     if(WITH_GPU)
         file(APPEND ${version_file}
                 "CUDA version: ${CUDA_VERSION}\n"
@@ -414,6 +415,10 @@ function(version version_file)
                 "Ascend Toolkit version: ${ASCEND_TOOLKIT_VERSION}\n"
                 "Ascend Driver version: ${ASCEND_DRIVER_VERSION}\n")
     endif()
+    if(WITH_IPU)
+        file(APPEND ${version_file}
+                "PopART version: ${POPART_VERSION}\n")
+    endif()
     file(APPEND ${version_file} "CXX compiler version: ${CMAKE_CXX_COMPILER_VERSION}\n")
     if(TENSORRT_FOUND)
         file(APPEND ${version_file}
diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.cc b/paddle/fluid/platform/device/ipu/ipu_compiler.cc
index 1a3e600058b..7ae3b2303de 100644
--- a/paddle/fluid/platform/device/ipu/ipu_compiler.cc
+++ b/paddle/fluid/platform/device/ipu/ipu_compiler.cc
@@ -474,6 +474,7 @@ void Compiler::LowerOptimizer(const Scope* scope) {
         auto adam_mode =
             AdamModeFromStr(adam_mode_, ipu_strategy_->use_no_bias_optimizer);
         auto weight_decay_mode_ = ipu_strategy_->weight_decay_mode;
+        auto scaled_optimizer_state_ = ipu_strategy_->scaled_optimizer_state;
         if (weight_decay_mode_.empty()) {
           weight_decay_mode_ = BOOST_GET_CONST(
               std::string, op_desc->GetAttr("weight_decay_mode"));
@@ -492,7 +493,7 @@ void Compiler::LowerOptimizer(const Scope* scope) {
             auto optimizer_instance = std::make_unique<popart::Adam>(
                 optimizer_value, adam_mode, weight_decay_mode,
                 popart::DataType::UNDEFINED, accl1_type, accl2_type,
-                clip_norm_settings);
+                clip_norm_settings, scaled_optimizer_state_);
             for (int i = 0; i < weight_decay_vars.size(); i++) {
               optimizer_instance->insertSpecific(
                   weight_decay_vars[i],
@@ -511,11 +512,10 @@ void Compiler::LowerOptimizer(const Scope* scope) {
                 popart::OptimizerValue(loss_scaling, true),
                 popart::OptimizerValue(mwn, true), adam_mode, weight_decay_mode,
                 popart::DataType::UNDEFINED, accl1_type, accl2_type,
-                clip_norm_settings);
+                clip_norm_settings, scaled_optimizer_state_);
           }
         };
-        if (adam_mode == popart::AdamMode::Lamb ||
-            adam_mode == popart::AdamMode::LambNoBias) {
+        if (adam_mode == popart::AdamMode::Lamb) {
           const std::map<std::string, std::pair<float, bool>> optimizer_value =
               {{"defaultLearningRate", {0.0, false}},
                {"defaultBeta1", {beta1, false}},
@@ -526,7 +526,26 @@ void Compiler::LowerOptimizer(const Scope* scope) {
           auto eval_optimizer = std::make_unique<popart::Adam>(
               optimizer_value, adam_mode, weight_decay_mode,
               popart::DataType::UNDEFINED, popart::DataType::FLOAT,
-              popart::DataType::FLOAT, clip_norm_settings);
+              popart::DataType::FLOAT, clip_norm_settings,
+              scaled_optimizer_state_);
+          for (int i = 0; i < weight_decay_vars.size(); i++) {
+            eval_optimizer->insertSpecific(weight_decay_vars[i],
+                                           {{"weightDecay", {0.0, false}}});
+          }
+          resources_->eval_optimizer = std::move(eval_optimizer);
+        } else if (adam_mode == popart::AdamMode::LambNoBias) {
+          const std::map<std::string, std::pair<float, bool>> optimizer_value =
+              {{"defaultLearningRate", {0.0, false}},
+               {"defaultBeta1", {1.0, false}},
+               {"defaultBeta2", {1.0, false}},
+               {"defaultEps", {eps, true}},
+               {"lossScaling", {loss_scaling, true}},
+               {"defaultMaxWeightNorm", {mwn, true}}};
+          auto eval_optimizer = std::make_unique<popart::Adam>(
+              optimizer_value, adam_mode, weight_decay_mode,
+              popart::DataType::UNDEFINED, popart::DataType::FLOAT,
+              popart::DataType::FLOAT, clip_norm_settings,
+              scaled_optimizer_state_);
           for (int i = 0; i < weight_decay_vars.size(); i++) {
             eval_optimizer->insertSpecific(weight_decay_vars[i],
                                            {{"weightDecay", {0.0, false}}});
@@ -542,7 +561,8 @@ void Compiler::LowerOptimizer(const Scope* scope) {
               popart::OptimizerValue(loss_scaling, true),
               popart::OptimizerValue(mwn, true), adam_mode, weight_decay_mode,
               popart::DataType::UNDEFINED, popart::DataType::FLOAT,
-              popart::DataType::FLOAT, clip_norm_settings);
+              popart::DataType::FLOAT, clip_norm_settings,
+              scaled_optimizer_state_);
         }
       } else if (type == "adaptive") {
         auto alpha = BOOST_GET_CONST(float, op_desc->GetAttr("alpha"));
diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.cc b/paddle/fluid/platform/device/ipu/ipu_strategy.cc
index 6172d4d7dc6..f52499a8d8f 100644
--- a/paddle/fluid/platform/device/ipu/ipu_strategy.cc
+++ b/paddle/fluid/platform/device/ipu/ipu_strategy.cc
@@ -67,6 +67,7 @@ IpuStrategy::IpuStrategy() {
   ADD_BOOL_OPTION(transfer_cast_op);
   ADD_BOOL_OPTION(use_no_bias_optimizer);
   ADD_BOOL_OPTION(enable_distribution);
+  ADD_BOOL_OPTION(scaled_optimizer_state);
   ADD_UINT64_OPTION(num_ipus);
   ADD_UINT64_OPTION(batches_per_step);
   ADD_UINT64_OPTION(micro_batch_size);
diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.h b/paddle/fluid/platform/device/ipu/ipu_strategy.h
index 786e2419cc0..1802eb16e58 100644
--- a/paddle/fluid/platform/device/ipu/ipu_strategy.h
+++ b/paddle/fluid/platform/device/ipu/ipu_strategy.h
@@ -37,13 +37,13 @@ class IpuStrategy {
   // training flag, true for training
   bool is_training = true;
 
-  // average sharding, debugging used
+  // Average sharding, debugging used
   bool need_avg_shard = false;
 
-  // flag for fp16, true for pure fp16
+  // Flag for fp16, true for pure fp16
   bool enable_fp16 = false;
 
-  // enable transfer cast Op target from fp32 to fp16 in fp16 mode
+  // Enable transfer cast Op target from fp32 to fp16 in fp16 mode
   bool transfer_cast_op = true;
 
   // The mode of Adam/Lamb optimizer
@@ -51,33 +51,35 @@ class IpuStrategy {
   // true: The Adam_No_Bias/Lamb_No_Bias optimizer from PopART
   bool use_no_bias_optimizer = false;
 
-  // enable distributed computing for POD128 or POD256
+  // Enable distributed computing for POD128 or POD256
   bool enable_distribution = false;
 
+  // Enable scaled optimizer state (only applies to Adam and Lamb)
+  bool scaled_optimizer_state = false;
+
   // Number ipus total needed, local_replica * ipu_per_replica
   int num_ipus = 1;
 
-  // batches per step
+  // Batches per step
   int batches_per_step = 1;
 
-  // micro batch-size
+  // Micro batch-size
   int micro_batch_size = 1;
 
-  // random seed
+  // Random seed
   std::uint64_t random_seed = std::numeric_limits<std::uint64_t>::max();
 
-  // TODO(alleng) remove this param
-  // available memory proportion, 0.0f for disable
+  // Available memory proportion, 0.0f for disable
   float available_memory_proportion = 0.0f;
 
-  // loss scaling, currently we can't get loss scaling from
+  // Loss scaling, currently we can't get loss scaling from
   // optimizer_extract_pass, so we have to set it here
   float loss_scaling = 1.0f;
 
-  // defaultMaxWeightNorm for adam optimizer
+  // Value of defaultMaxWeightNorm for adam optimizer
   float max_weight_norm = 65504.0f;
 
-  // file path for dumping compiled model in onnx format
+  // File path for dumping compiled model in onnx format
   std::string onnx_dump_path;
 
   // Data type to use for tensor that stores first-order momentum optimizer
@@ -106,7 +108,7 @@ class IpuStrategy {
   // popart pattern manager
   popart::Patterns popart_patterns;
 
-  // custom ops
+  // Custom ops
   std::vector<IpuCustomOpIdentifier> custom_ops;
 
  public:
diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc
index 9a907cf5e88..444b55959cf 100644
--- a/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc
+++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc
@@ -157,7 +157,6 @@ Node *softmax_handler(Graph *graph, Node *node) {
 
 Node *scale_handler(Graph *graph, Node *node) {
   auto *op = node->Op();
-  auto scale_ = BOOST_GET_CONST(float, op->GetAttr("scale"));
   auto bias_ = BOOST_GET_CONST(float, op->GetAttr("bias"));
   auto bias_after_scale_ =
       BOOST_GET_CONST(bool, op->GetAttr("bias_after_scale"));
@@ -191,6 +190,7 @@ Node *scale_handler(Graph *graph, Node *node) {
       }
     }
   } else {
+    auto scale_ = BOOST_GET_CONST(float, op->GetAttr("scale"));
     if (is_float_equal(bias_, 0.0) && is_float_equal(scale_, 1.0)) {
       return CreateBaseOp(graph, node, "popart_identity",
                           {GetInputVarNode("X", node)}, node->outputs, {});
diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc
index a529a34e6d7..a08fbaa26d9 100644
--- a/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc
+++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc
@@ -95,6 +95,21 @@ Node *pool2d_handler(Graph *graph, Node *node) {
   auto *op = node->Op();
   auto pooling_type = BOOST_GET_CONST(std::string, op->GetAttr("pooling_type"));
   auto global_pooling = BOOST_GET_CONST(bool, op->GetAttr("global_pooling"));
+  if (op->HasAttr("adaptive")) {
+    auto adaptive = BOOST_GET_CONST(bool, op->GetAttr("adaptive"));
+    if (adaptive) {
+      auto ksize = BOOST_GET_CONST(std::vector<int>, op->GetAttr("ksize"));
+      if (ksize[0] != 1 || ksize[1] != 1) {
+        PADDLE_THROW(platform::errors::InvalidArgument(
+            "Only support pool_size=1 with adaptive mode."));
+      }
+      // adaptive maxpool op is max_pool2d_with_index. Only process avgpool
+      // here.
+      return CreateBaseOp(graph, node, "popart_globalaveragepool", node->inputs,
+                          node->outputs);
+    }
+  }
+
   if (global_pooling) {
     if (pooling_type == "max") {
       return CreateBaseOp(graph, node, "popart_globalmaxpool", node->inputs,
@@ -159,6 +174,17 @@ Node *pool2d_handler(Graph *graph, Node *node) {
   }
 }
 
+// Lowers max_pool2d_with_index (1x1 ksize only) to popart_globalmaxpool.
+Node *max_pool2d_with_index_handler(Graph *graph, Node *node) {
+  auto ksize = BOOST_GET_CONST(std::vector<int>, node->Op()->GetAttr("ksize"));
+  if (ksize.size() < 2 || ksize[0] != 1 || ksize[1] != 1) {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "Only support pool_size=1 with adaptive mode."));
+  }
+  return CreateBaseOp(graph, node, "popart_globalmaxpool", node->inputs,
+                      {GetOutputVarNode("Out", node)});
+}
+
 Node *group_norm_handler(Graph *graph, Node *node) {
   auto *op = node->Op();
   auto epsilon_ = BOOST_GET_CONST(float, op->GetAttr("epsilon"));
@@ -304,6 +330,7 @@ Node *dropout_handler(Graph *graph, Node *node) {
 }  // namespace paddle
 
 REGISTER_HANDLER(pool2d, pool2d_handler);
+REGISTER_HANDLER(max_pool2d_with_index, max_pool2d_with_index_handler);
 REGISTER_HANDLER(batch_norm, batch_norm_handler);
 REGISTER_HANDLER(group_norm, group_norm_handler);
 REGISTER_HANDLER(instance_norm, instance_norm_handler);
diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc
index 4c086bffb24..55c25bce159 100644
--- a/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc
+++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc
@@ -331,7 +331,7 @@ Node *shape_handler(Graph *graph, Node *node) {
 Node *slice_handler(Graph *graph, Node *node) {
   auto *op = node->Op();
   Node *starts = nullptr;
-  if (!op->Input("StartsTensor").empty()) {
+  if (!op->HasAttr("starts")) {
     starts = GetInputVarNode("StartsTensor", node);
   } else {
     auto starts_ = BOOST_GET_CONST(std::vector<int>, op->GetAttr("starts"));
@@ -341,7 +341,7 @@ Node *slice_handler(Graph *graph, Node *node) {
     starts = starts->outputs[0];
   }
   Node *ends = nullptr;
-  if (!op->Input("EndsTensor").empty()) {
+  if (!op->HasAttr("ends")) {
     ends = GetInputVarNode("EndsTensor", node);
   } else {
     auto ends_ = BOOST_GET_CONST(std::vector<int>, op->GetAttr("ends"));
-- 
GitLab