From 9f2ae3600b2443267e23a9f409d3712a3d54436c Mon Sep 17 00:00:00 2001
From: Allen Guo
Date: Fri, 15 Apr 2022 11:24:02 +0800
Subject: [PATCH] support more ops (#41421) (#41731)

---
 cmake/external/poplar.cmake                   | 15 +++++++++
 cmake/inference_lib.cmake                     |  7 +++-
 .../fluid/platform/device/ipu/ipu_compiler.cc | 32 +++++++++++++++----
 .../fluid/platform/device/ipu/ipu_strategy.cc |  1 +
 .../fluid/platform/device/ipu/ipu_strategy.h  | 28 ++++++++--------
 .../ipu/popart_canonicalization/math_ops.cc   |  2 +-
 .../ipu/popart_canonicalization/nn_ops.cc     | 27 ++++++++++++++++
 .../ipu/popart_canonicalization/tensor_ops.cc |  4 +--
 8 files changed, 93 insertions(+), 23 deletions(-)

diff --git a/cmake/external/poplar.cmake b/cmake/external/poplar.cmake
index 7a8fa3ef5d7..8b2de14e966 100644
--- a/cmake/external/poplar.cmake
+++ b/cmake/external/poplar.cmake
@@ -12,6 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+macro(find_popart_version popart_version_file)
+  file(READ ${popart_version_file} popart_version_file_content)
+  string(REGEX MATCH "(POPART_VERSION_STRING)[ \t\r\n](\")([0-9]+\.[0-9]+\.[0-9]+)(\\+)([A-Za-z0-9_]*)(\")" POPART_VERSION ${popart_version_file_content})
+  string(REPLACE "POPART_VERSION_STRING" "" POPART_VERSION "${POPART_VERSION}")
+  string(REPLACE "\"" "" POPART_VERSION "${POPART_VERSION}")
+  string(REPLACE " " "" POPART_VERSION "${POPART_VERSION}")
+  if(NOT POPART_VERSION)
+    set(POPART_VERSION "Unknown version")
+  else()
+    message(STATUS "Current PopART version is ${POPART_VERSION}")
+  endif()
+endmacro()
+
 if(WITH_IPU)
   set(POPLAR_DIR CACHE PATH "Path to a Poplar install")
   set(POPART_DIR CACHE PATH "Path to a Popart install")
@@ -64,6 +77,8 @@ if(WITH_IPU)
     message(FATAL_ERROR "You must provide a path to a Popart build using -DPOPART_DIR=/path/to/popart/build")
   endif()
 
+  find_popart_version("${POPART_DIR}/include/popart/version.hpp")
+
   add_definitions(-DONNX_NAMESPACE=onnx)
   add_custom_target(extern_poplar DEPENDS poplar popart-only)
 endif()
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index e3e6e1cced2..1b38f208716 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -398,7 +398,8 @@ function(version version_file)
     "WITH_GPU: ${WITH_GPU}\n"
     "WITH_ROCM: ${WITH_ROCM}\n"
     "WITH_ASCEND_CL: ${WITH_ASCEND_CL}\n"
-    "WITH_ASCEND_CXX11: ${WITH_ASCEND_CXX11}\n")
+    "WITH_ASCEND_CXX11: ${WITH_ASCEND_CXX11}\n"
+    "WITH_IPU: ${WITH_IPU}\n")
   if(WITH_GPU)
     file(APPEND ${version_file}
          "CUDA version: ${CUDA_VERSION}\n"
@@ -414,6 +415,10 @@ function(version version_file)
          "Ascend Toolkit version: ${ASCEND_TOOLKIT_VERSION}\n"
          "Ascend Driver version: ${ASCEND_DRIVER_VERSION}\n")
   endif()
+  if(WITH_IPU)
+    file(APPEND ${version_file}
+         "PopART version: ${POPART_VERSION}\n")
+  endif()
   file(APPEND ${version_file} "CXX compiler version: ${CMAKE_CXX_COMPILER_VERSION}\n")
   if(TENSORRT_FOUND)
     file(APPEND ${version_file}
diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.cc b/paddle/fluid/platform/device/ipu/ipu_compiler.cc
index 1a3e600058b..7ae3b2303de 100644
--- a/paddle/fluid/platform/device/ipu/ipu_compiler.cc
+++ b/paddle/fluid/platform/device/ipu/ipu_compiler.cc
@@ -474,6 +474,7 @@ void Compiler::LowerOptimizer(const Scope* scope) {
       auto adam_mode =
           AdamModeFromStr(adam_mode_, ipu_strategy_->use_no_bias_optimizer);
       auto weight_decay_mode_ = ipu_strategy_->weight_decay_mode;
+      auto scaled_optimizer_state_ = ipu_strategy_->scaled_optimizer_state;
       if (weight_decay_mode_.empty()) {
        weight_decay_mode_ = BOOST_GET_CONST(
            std::string, op_desc->GetAttr("weight_decay_mode"));
@@ -492,7 +493,7 @@ void Compiler::LowerOptimizer(const Scope* scope) {
          auto optimizer_instance = std::make_unique<popart::Adam>(
              optimizer_value, adam_mode, weight_decay_mode,
              popart::DataType::UNDEFINED, accl1_type, accl2_type,
-              clip_norm_settings);
+              clip_norm_settings, scaled_optimizer_state_);
          for (int i = 0; i < weight_decay_vars.size(); i++) {
            optimizer_instance->insertSpecific(
                weight_decay_vars[i],
@@ -511,11 +512,10 @@ void Compiler::LowerOptimizer(const Scope* scope) {
              popart::OptimizerValue(loss_scaling, true),
              popart::OptimizerValue(mwn, true), adam_mode, weight_decay_mode,
              popart::DataType::UNDEFINED, accl1_type, accl2_type,
-              clip_norm_settings);
+              clip_norm_settings, scaled_optimizer_state_);
        }
      };
-      if (adam_mode == popart::AdamMode::Lamb ||
-          adam_mode == popart::AdamMode::LambNoBias) {
+      if (adam_mode == popart::AdamMode::Lamb) {
        const std::map<std::string, std::pair<float, bool>> optimizer_value =
            {{"defaultLearningRate", {0.0, false}},
             {"defaultBeta1", {beta1, false}},
@@ -526,7 +526,26 @@ void Compiler::LowerOptimizer(const Scope* scope) {
        auto eval_optimizer = std::make_unique<popart::Adam>(
            optimizer_value, adam_mode, weight_decay_mode,
            popart::DataType::UNDEFINED, popart::DataType::FLOAT,
-            popart::DataType::FLOAT, clip_norm_settings);
+            popart::DataType::FLOAT, clip_norm_settings,
+            scaled_optimizer_state_);
+        for (int i = 0; i < weight_decay_vars.size(); i++) {
+          eval_optimizer->insertSpecific(weight_decay_vars[i],
+                                         {{"weightDecay", {0.0, false}}});
+        }
+        resources_->eval_optimizer = std::move(eval_optimizer);
+      } else if (adam_mode == popart::AdamMode::LambNoBias) {
+        const std::map<std::string, std::pair<float, bool>> optimizer_value =
+            {{"defaultLearningRate", {0.0, false}},
+             {"defaultBeta1", {1.0, false}},
+             {"defaultBeta2", {1.0, false}},
+             {"defaultEps", {eps, true}},
+             {"lossScaling", {loss_scaling, true}},
+             {"defaultMaxWeightNorm", {mwn, true}}};
+        auto eval_optimizer = std::make_unique<popart::Adam>(
+            optimizer_value, adam_mode, weight_decay_mode,
+            popart::DataType::UNDEFINED, popart::DataType::FLOAT,
+            popart::DataType::FLOAT, clip_norm_settings,
+            scaled_optimizer_state_);
        for (int i = 0; i < weight_decay_vars.size(); i++) {
          eval_optimizer->insertSpecific(weight_decay_vars[i],
                                         {{"weightDecay", {0.0, false}}});
@@ -542,7 +561,8 @@ void Compiler::LowerOptimizer(const Scope* scope) {
            popart::OptimizerValue(loss_scaling, true),
            popart::OptimizerValue(mwn, true), adam_mode, weight_decay_mode,
            popart::DataType::UNDEFINED, popart::DataType::FLOAT,
-            popart::DataType::FLOAT, clip_norm_settings);
+            popart::DataType::FLOAT, clip_norm_settings,
+            scaled_optimizer_state_);
      }
    } else if (type == "adaptive") {
      auto alpha = BOOST_GET_CONST(float, op_desc->GetAttr("alpha"));
diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.cc b/paddle/fluid/platform/device/ipu/ipu_strategy.cc
index 6172d4d7dc6..f52499a8d8f 100644
--- a/paddle/fluid/platform/device/ipu/ipu_strategy.cc
+++ b/paddle/fluid/platform/device/ipu/ipu_strategy.cc
@@ -67,6 +67,7 @@ IpuStrategy::IpuStrategy() {
   ADD_BOOL_OPTION(transfer_cast_op);
   ADD_BOOL_OPTION(use_no_bias_optimizer);
   ADD_BOOL_OPTION(enable_distribution);
+  ADD_BOOL_OPTION(scaled_optimizer_state);
   ADD_UINT64_OPTION(num_ipus);
   ADD_UINT64_OPTION(batches_per_step);
   ADD_UINT64_OPTION(micro_batch_size);
diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.h b/paddle/fluid/platform/device/ipu/ipu_strategy.h
index 786e2419cc0..1802eb16e58 100644
--- a/paddle/fluid/platform/device/ipu/ipu_strategy.h
+++ b/paddle/fluid/platform/device/ipu/ipu_strategy.h
@@ -37,13 +37,13 @@ class IpuStrategy {
   // training flag, true for training
   bool is_training = true;
 
-  // average sharding, debugging used
+  // Average sharding, debugging used
   bool need_avg_shard = false;
 
-  // flag for fp16, true for pure fp16
+  // Flag for fp16, true for pure fp16
   bool enable_fp16 = false;
 
-  // enable transfer cast Op target from fp32 to fp16 in fp16 mode
+  // Enable transfer cast Op target from fp32 to fp16 in fp16 mode
   bool transfer_cast_op = true;
 
   // The mode of Adam/Lamb optimizer
@@ -51,33 +51,35 @@ class IpuStrategy {
   // true: The Adam_No_Bias/Lamb_No_Bias optimizer from PopART
   bool use_no_bias_optimizer = false;
 
-  // enable distributed computing for POD128 or POD256
+  // Enable distributed computing for POD128 or POD256
   bool enable_distribution = false;
 
+  // Enable Scaled optimizer state only for Adam and Lamb
+  bool scaled_optimizer_state = false;
+
   // Number ipus total needed, local_replica * ipu_per_replica
   int num_ipus = 1;
 
-  // batches per step
+  // Batches per step
   int batches_per_step = 1;
 
-  // micro batch-size
+  // Micro batch-size
   int micro_batch_size = 1;
 
-  // random seed
+  // Random seed
   std::uint64_t random_seed = std::numeric_limits<std::uint64_t>::max();
 
-  // TODO(alleng) remove this param
-  // available memory proportion, 0.0f for disable
+  // Available memory proportion, 0.0f for disable
   float available_memory_proportion = 0.0f;
 
-  // loss scaling, currently we can't get loss scaling from
+  // Loss scaling, currently we can't get loss scaling from
   // optimizer_extract_pass, so we have to set it here
   float loss_scaling = 1.0f;
 
-  // defaultMaxWeightNorm for adam optimizer
+  // DefaultMaxWeightNorm for adam optimizer
   float max_weight_norm = 65504.0f;
 
-  // file path for dumping compiled model in onnx format
+  // File path for dumping compiled model in onnx format
   std::string onnx_dump_path;
 
   // Data type to use for tensor that stores first-order momentum optimizer
@@ -106,7 +108,7 @@ class IpuStrategy {
   // popart pattern manager
   popart::Patterns popart_patterns;
 
-  // custom ops
+  // Custom ops
   std::vector<IpuCustomOpIdentifier> custom_ops;
 
  public:
diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc
index 9a907cf5e88..444b55959cf 100644
--- a/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc
+++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc
@@ -157,7 +157,6 @@ Node *softmax_handler(Graph *graph, Node *node) {
 
 Node *scale_handler(Graph *graph, Node *node) {
   auto *op = node->Op();
-  auto scale_ = BOOST_GET_CONST(float, op->GetAttr("scale"));
   auto bias_ = BOOST_GET_CONST(float, op->GetAttr("bias"));
   auto bias_after_scale_ =
       BOOST_GET_CONST(bool, op->GetAttr("bias_after_scale"));
@@ -191,6 +190,7 @@
       }
     }
   } else {
+    auto scale_ = BOOST_GET_CONST(float, op->GetAttr("scale"));
     if (is_float_equal(bias_, 0.0) && is_float_equal(scale_, 1.0)) {
       return CreateBaseOp(graph, node, "popart_identity",
                           {GetInputVarNode("X", node)}, node->outputs, {});
diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc
index a529a34e6d7..a08fbaa26d9 100644
--- a/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc
+++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc
@@ -95,6 +95,21 @@ Node *pool2d_handler(Graph *graph, Node *node) {
   auto *op = node->Op();
   auto pooling_type = BOOST_GET_CONST(std::string, op->GetAttr("pooling_type"));
   auto global_pooling = BOOST_GET_CONST(bool, op->GetAttr("global_pooling"));
+  if (op->HasAttr("adaptive")) {
+    auto adaptive = BOOST_GET_CONST(bool, op->GetAttr("adaptive"));
+    if (adaptive) {
+      auto ksize = BOOST_GET_CONST(std::vector<int>, op->GetAttr("ksize"));
+      if (ksize[0] != 1 || ksize[1] != 1) {
+        PADDLE_THROW(platform::errors::InvalidArgument(
+            "Only support pool_size=1 with adaptive mode."));
+      }
+      // adaptive maxpool op is max_pool2d_with_index. Only process avgpool
+      // here.
+      return CreateBaseOp(graph, node, "popart_globalaveragepool", node->inputs,
+                          node->outputs);
+    }
+  }
+
   if (global_pooling) {
     if (pooling_type == "max") {
       return CreateBaseOp(graph, node, "popart_globalmaxpool", node->inputs,
@@ -159,6 +174,17 @@
   }
 }
 
+Node *max_pool2d_with_index_handler(Graph *graph, Node *node) {
+  auto *op = node->Op();
+  auto ksize = BOOST_GET_CONST(std::vector<int>, op->GetAttr("ksize"));
+  if (ksize[0] != 1 || ksize[1] != 1) {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "Only support pool_size=1 with adaptive mode."));
+  }
+  return CreateBaseOp(graph, node, "popart_globalmaxpool", node->inputs,
+                      {GetOutputVarNode("Out", node)});
+}
+
 Node *group_norm_handler(Graph *graph, Node *node) {
   auto *op = node->Op();
   auto epsilon_ = BOOST_GET_CONST(float, op->GetAttr("epsilon"));
@@ -304,6 +330,7 @@ Node *dropout_handler(Graph *graph, Node *node) {
 }  // namespace paddle
 
 REGISTER_HANDLER(pool2d, pool2d_handler);
+REGISTER_HANDLER(max_pool2d_with_index, max_pool2d_with_index_handler);
 REGISTER_HANDLER(batch_norm, batch_norm_handler);
 REGISTER_HANDLER(group_norm, group_norm_handler);
 REGISTER_HANDLER(instance_norm, instance_norm_handler);
diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc
index 4c086bffb24..55c25bce159 100644
--- a/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc
+++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc
@@ -331,7 +331,7 @@ Node *shape_handler(Graph *graph, Node *node) {
 Node *slice_handler(Graph *graph, Node *node) {
   auto *op = node->Op();
   Node *starts = nullptr;
-  if (!op->Input("StartsTensor").empty()) {
+  if (!op->HasAttr("starts")) {
     starts = GetInputVarNode("StartsTensor", node);
   } else {
     auto starts_ = BOOST_GET_CONST(std::vector<int>, op->GetAttr("starts"));
@@ -341,7 +341,7 @@ Node *slice_handler(Graph *graph, Node *node) {
     starts = starts->outputs[0];
   }
   Node *ends = nullptr;
-  if (!op->Input("EndsTensor").empty()) {
+  if (!op->HasAttr("ends")) {
     ends = GetInputVarNode("EndsTensor", node);
   } else {
     auto ends_ = BOOST_GET_CONST(std::vector<int>, op->GetAttr("ends"));
-- 
GitLab