From 84f257bd820de552546ec29f7c7a2ae803671644 Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Mon, 17 Jan 2022 12:40:47 +0800 Subject: [PATCH] [IPU] update ipu releated passes p0 (#38846) * update ipu releated passes Co-authored-by: Xiaobing Wang Co-authored-by: Allen Guo Co-authored-by: Zhixin Yao Co-authored-by: Haicheng Jiang Co-authored-by: Han Zhao * remove ipu_pass_base * update error msg * update error msg 02 * split pr 01 * restore ipu_pass_base Co-authored-by: Xiaobing Wang Co-authored-by: Zhixin Yao Co-authored-by: Haicheng Jiang Co-authored-by: Han Zhao --- .../fluid/framework/ir/ipu/avg_shard_pass.cc | 6 +- .../fluid/framework/ir/ipu/avg_shard_pass.h | 4 +- .../ir/ipu/forward_graph_extract_pass.h | 4 +- .../framework/ir/ipu/infer_shape_pass.cc | 90 +++-- .../fluid/framework/ir/ipu/infer_shape_pass.h | 4 +- .../ir/ipu/inference_postprocess_pass.h | 4 +- .../ir/ipu/inference_process_pass.cc | 49 ++- .../framework/ir/ipu/inference_process_pass.h | 4 +- .../ir/ipu/ipu_graph_builder_pass.cc | 3 +- .../framework/ir/ipu/ipu_graph_builder_pass.h | 4 +- .../fluid/framework/ir/ipu/ipu_inplace_pass.h | 4 +- .../ir/ipu/ipu_runtime_replacer_pass.cc | 13 - .../ir/ipu/ipu_runtime_replacer_pass.h | 4 +- .../ir/ipu/optimizer_extract_pass.cc | 329 +++++++++++++++--- .../framework/ir/ipu/optimizer_extract_pass.h | 5 +- .../ir/ipu/optimizer_state_align_pass.cc | 8 +- .../ir/ipu/optimizer_state_align_pass.h | 4 +- .../ir/ipu/popart_canonicalization_pass.cc | 42 ++- .../ir/ipu/popart_canonicalization_pass.h | 4 +- 19 files changed, 432 insertions(+), 153 deletions(-) diff --git a/paddle/fluid/framework/ir/ipu/avg_shard_pass.cc b/paddle/fluid/framework/ir/ipu/avg_shard_pass.cc index 9dcbbb9c985..f1ee3c26b8f 100644 --- a/paddle/fluid/framework/ir/ipu/avg_shard_pass.cc +++ b/paddle/fluid/framework/ir/ipu/avg_shard_pass.cc @@ -26,13 +26,15 @@ namespace ir { void AvgShardPass::ApplyImpl(ir::Graph* graph) const { VLOG(10) << "enter AvgShardPass::ApplyImpl"; - std::shared_ptr ipu_backend = - platform::ipu::IpuBackend::GetInstance(); + auto ipu_backend = platform::ipu::IpuBackend::GetInstance(); if (ipu_backend->GetIpuStrategy()->need_avg_shard) { VLOG(10) << "start AvgShardPass"; auto nodes = ir::TopologySortOperations(*graph); auto num_ipus = ipu_backend->GetIpuStrategy()->num_ipus; + auto replica_factor = + ipu_backend->GetIpuStrategy()->popart_options.replicatedGraphCount; + num_ipus = num_ipus / replica_factor; int shard_position = nodes.size() / num_ipus; int index_and_stage = -1; diff --git a/paddle/fluid/framework/ir/ipu/avg_shard_pass.h b/paddle/fluid/framework/ir/ipu/avg_shard_pass.h index b13acbd198d..fc96cfa95fe 100644 --- a/paddle/fluid/framework/ir/ipu/avg_shard_pass.h +++ b/paddle/fluid/framework/ir/ipu/avg_shard_pass.h @@ -14,13 +14,13 @@ #pragma once -#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" namespace paddle { namespace framework { namespace ir { -class AvgShardPass : public IPUPassBase { +class AvgShardPass : public Pass { protected: void ApplyImpl(ir::Graph* graph) const override; }; diff --git a/paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.h b/paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.h index afa9f1c15f2..35d6a87c429 100644 --- a/paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.h +++ b/paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.h @@ -15,13 +15,13 @@ #pragma once #include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" +#include 
"paddle/fluid/framework/ir/pass.h" namespace paddle { namespace framework { namespace ir { -class ForwardGraphExtractPass : public IPUPassBase { +class ForwardGraphExtractPass : public Pass { protected: void ApplyImpl(ir::Graph* graph) const override; }; diff --git a/paddle/fluid/framework/ir/ipu/infer_shape_pass.cc b/paddle/fluid/framework/ir/ipu/infer_shape_pass.cc index 0878192ebf8..9ab70ee321a 100644 --- a/paddle/fluid/framework/ir/ipu/infer_shape_pass.cc +++ b/paddle/fluid/framework/ir/ipu/infer_shape_pass.cc @@ -29,10 +29,10 @@ void InferShapePass::ApplyImpl(ir::Graph* graph) const { VLOG(10) << "Raw Graph: "; VLOG(10) << DebugString(graph); - std::shared_ptr ipu_backend = - platform::ipu::IpuBackend::GetInstance(); - auto batch_size = ipu_backend->GetIpuStrategy()->batch_size; - + // Make batch_size fixed + bool need_infer_shape = false; + auto ipu_backend = platform::ipu::IpuBackend::GetInstance(); + auto micro_batch_size = ipu_backend->GetIpuStrategy()->micro_batch_size; auto feed_list = Get>("feed_list"); for (auto node : graph->Nodes()) { if (!node->IsVar()) { @@ -43,8 +43,9 @@ void InferShapePass::ApplyImpl(ir::Graph* graph) const { if (is_feed) { auto input_shape = node->Var()->GetShape(); if (input_shape[0] <= -1) { - input_shape[0] = batch_size; + input_shape[0] = micro_batch_size; node->Var()->SetShape(input_shape); + need_infer_shape = true; } // int64->int32 if (node->Var()->GetDataType() == proto::VarType::INT64) { @@ -54,44 +55,63 @@ void InferShapePass::ApplyImpl(ir::Graph* graph) const { } // temp scope for shape inference - std::shared_ptr scope( - new paddle::framework::Scope()); - for (auto node : graph->Nodes()) { - if (!node->IsVar()) { - continue; - } - auto var_desc = node->Var(); - auto* ptr = scope->Var(var_desc->Name()); - paddle::framework::InitializeVariable(ptr, var_desc->GetType()); + if (need_infer_shape) { + std::shared_ptr scope( + new paddle::framework::Scope()); + for (auto node : graph->Nodes()) { + if (!node->IsVar()) { + continue; + } + auto var_desc = node->Var(); + auto* ptr = scope->Var(var_desc->Name()); + paddle::framework::InitializeVariable(ptr, var_desc->GetType()); - auto tensor = ptr->GetMutable(); - tensor->Resize(paddle::framework::make_ddim(var_desc->GetShape())); - } + auto tensor = ptr->GetMutable(); + tensor->Resize(paddle::framework::make_ddim(var_desc->GetShape())); + } - // infer shape - auto nodes = ir::TopologySortOperations(*graph); - for (auto node : nodes) { - auto op_desc = node->Op(); - auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); - paddle::framework::RuntimeContext ctx(op->Inputs(), op->Outputs(), *scope); - op->RuntimeInferShape(*scope, paddle::platform::CPUPlace(), ctx); + // infer shape + auto nodes = ir::TopologySortOperations(*graph); + for (auto node : nodes) { + VLOG(10) << "InferShapePass: Infer shape for Op (" << node->Name() << ")"; + auto op_desc = node->Op(); + if (op_desc->Type() == "popart_optimizer") { + continue; + } + auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); + paddle::framework::RuntimeContext ctx(op->Inputs(), op->Outputs(), + *scope); + op->RuntimeInferShape(*scope, paddle::platform::CPUPlace(), ctx); - for (auto it = ctx.outputs.begin(); it != ctx.outputs.end(); it++) { - for (int i = 0; i < it->second.size(); i++) { - auto output_name = op_desc->Output(it->first)[i]; - auto dim = - it->second[i]->GetMutable()->dims(); - auto new_shape = paddle::framework::vectorize(dim); - for (auto output_node : node->outputs) { - if (output_node->Name() == output_name) { - 
output_node->Var()->SetShape(new_shape); + for (auto it = ctx.outputs.begin(); it != ctx.outputs.end(); it++) { + for (int i = 0; i < it->second.size(); i++) { + auto output_name = op_desc->Output(it->first)[i]; + auto dim = + it->second[i]->GetMutable()->dims(); + auto new_shape = paddle::framework::vectorize(dim); + for (auto output_node : node->outputs) { + if (output_node->Name() == output_name) { + output_node->Var()->SetShape(new_shape); + if (VLOG_IS_ON(10)) { + std::ostringstream sout; + sout << "InferShapePass: output[" << output_node->Name() + << "], infer shape:["; + for (auto s : new_shape) { + sout << std::to_string(s) << ", "; + } + sout << "]"; + VLOG(10) << sout.str(); + } + } } } } + VLOG(10) << "InferShapePass: Infer shape for Op (" << node->Name() + << ") finished"; } + // release the temp scope + scope.reset(); } - // release the temp scope - scope.reset(); VLOG(10) << "Post Graph: "; VLOG(10) << DebugString(graph); diff --git a/paddle/fluid/framework/ir/ipu/infer_shape_pass.h b/paddle/fluid/framework/ir/ipu/infer_shape_pass.h index 3e8148b7f06..e2d70ff40ab 100644 --- a/paddle/fluid/framework/ir/ipu/infer_shape_pass.h +++ b/paddle/fluid/framework/ir/ipu/infer_shape_pass.h @@ -14,13 +14,13 @@ #pragma once -#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" namespace paddle { namespace framework { namespace ir { -class InferShapePass : public IPUPassBase { +class InferShapePass : public Pass { protected: void ApplyImpl(ir::Graph* graph) const override; }; diff --git a/paddle/fluid/framework/ir/ipu/inference_postprocess_pass.h b/paddle/fluid/framework/ir/ipu/inference_postprocess_pass.h index e80e1905d4a..0bcbaef83f9 100644 --- a/paddle/fluid/framework/ir/ipu/inference_postprocess_pass.h +++ b/paddle/fluid/framework/ir/ipu/inference_postprocess_pass.h @@ -14,13 +14,13 @@ #pragma once -#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" namespace paddle { namespace framework { namespace ir { -class InferencePostprocessPass : public IPUPassBase { +class InferencePostprocessPass : public Pass { protected: void ApplyImpl(ir::Graph* graph) const override; }; diff --git a/paddle/fluid/framework/ir/ipu/inference_process_pass.cc b/paddle/fluid/framework/ir/ipu/inference_process_pass.cc index d02dcce0cc6..05490773301 100644 --- a/paddle/fluid/framework/ir/ipu/inference_process_pass.cc +++ b/paddle/fluid/framework/ir/ipu/inference_process_pass.cc @@ -29,8 +29,7 @@ void InferenceProcessPass::ApplyImpl(ir::Graph* graph) const { VLOG(10) << "enter InferenceProcessPass::ApplyImpl"; // Get a new instance of ipu_backend - std::shared_ptr ipu_backend = - platform::ipu::IpuBackend::GetNewInstance(); + auto ipu_backend = platform::ipu::IpuBackend::GetInstance(); // Set scope auto& scope = graph->Get(kParamScopeAttr); @@ -40,18 +39,34 @@ void InferenceProcessPass::ApplyImpl(ir::Graph* graph) const { static std::shared_ptr ipu_strategy_instance_( new platform::ipu::IpuStrategy()); ipu_strategy_instance_->is_training = false; + // Set graph replication + auto replica_num = graph->Get("replica_num"); + if (replica_num > 1) { + ipu_strategy_instance_->popart_options.enableReplicatedGraphs = true; + ipu_strategy_instance_->popart_options.replicatedGraphCount = replica_num; + } + // Set the num of IPUs auto num_ipus = graph->Get("num_ipus"); - ipu_strategy_instance_->num_ipus = num_ipus; + // Set sharding if (num_ipus > 1) { - ipu_strategy_instance_->popart_options_.virtualGraphMode = + 
ipu_strategy_instance_->need_avg_shard = true; + ipu_strategy_instance_->popart_options.virtualGraphMode = platform::ipu::VirtualGraphMode::Manual; } else { - ipu_strategy_instance_->popart_options_.virtualGraphMode = + ipu_strategy_instance_->need_avg_shard = false; + ipu_strategy_instance_->popart_options.virtualGraphMode = platform::ipu::VirtualGraphMode::Off; } + // total num IPUs = num_ipus * replica_num + ipu_strategy_instance_->num_ipus = num_ipus * replica_num; + + // Set micro_batch_size for shape inference + ipu_strategy_instance_->micro_batch_size = + graph->Get("micro_batch_size"); + // Set pipelining auto enable_pipelining = graph->Get("enable_pipelining"); - ipu_strategy_instance_->popart_options_.enablePipelining = enable_pipelining; + ipu_strategy_instance_->popart_options.enablePipelining = enable_pipelining; if (enable_pipelining) { auto batches_per_step = graph->Get("batches_per_step"); PADDLE_ENFORCE_GE( @@ -60,8 +75,20 @@ void InferenceProcessPass::ApplyImpl(ir::Graph* graph) const { "greater than the number of IPUs")); ipu_strategy_instance_->batches_per_step = batches_per_step; } - ipu_strategy_instance_->batch_size = graph->Get("batch_size"); - ipu_strategy_instance_->need_avg_shard = graph->Get("need_avg_shard"); + + // Set FP16 + auto enable_fp16 = graph->Get("enable_fp16"); + ipu_strategy_instance_->enable_fp16 = enable_fp16; + if (enable_fp16) { + auto enable_half_partial = graph->Get("enable_half_partial"); + if (enable_half_partial) { + ipu_strategy_instance_->popart_options.partialsTypeMatMuls = "half"; + } + } + + // Set available memory proportion for matmul/conv + ipu_strategy_instance_->available_memory_proportion = + graph->Get("available_memory_proportion"); ipu_backend->SetIpuStrategy(*(ipu_strategy_instance_.get())); @@ -94,9 +121,9 @@ void InferenceProcessPass::ApplyImpl(ir::Graph* graph) const { } // Run passes - std::vector graph_pass = {"forward_graph_extract_pass", - "infer_shape_pass", "avg_shard_pass", - "popart_canonicalization_pass"}; + std::vector graph_pass = { + "forward_graph_extract_pass", "infer_shape_pass", "avg_shard_pass", + "popart_canonicalization_pass", "transfer_cast_op_pass"}; std::vector compile_pass = { "ipu_inplace_pass", "ipu_graph_builder_pass", "ipu_runtime_replacer_pass", "inference_postprocess_pass"}; diff --git a/paddle/fluid/framework/ir/ipu/inference_process_pass.h b/paddle/fluid/framework/ir/ipu/inference_process_pass.h index bac0e88377f..bd6c9d1914e 100644 --- a/paddle/fluid/framework/ir/ipu/inference_process_pass.h +++ b/paddle/fluid/framework/ir/ipu/inference_process_pass.h @@ -14,13 +14,13 @@ #pragma once -#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" namespace paddle { namespace framework { namespace ir { -class InferenceProcessPass : public IPUPassBase { +class InferenceProcessPass : public Pass { protected: void ApplyImpl(ir::Graph* graph) const override; }; diff --git a/paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.cc b/paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.cc index 5a53466089b..17269ac8889 100644 --- a/paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.cc +++ b/paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.cc @@ -32,8 +32,7 @@ void IpuGraphBuilderPass::ApplyImpl(ir::Graph* graph) const { std::vector fetch_list; fetch_list = Get>("fetch_list"); - std::shared_ptr ipu_backend = - platform::ipu::IpuBackend::GetInstance(); + auto ipu_backend = platform::ipu::IpuBackend::GetInstance(); ipu_backend->Compile(graph, feed_list, 
fetch_list); diff --git a/paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.h b/paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.h index 6237df36480..7f60cf6a34b 100644 --- a/paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.h +++ b/paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.h @@ -15,13 +15,13 @@ #pragma once #include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" namespace paddle { namespace framework { namespace ir { -class IpuGraphBuilderPass : public IPUPassBase { +class IpuGraphBuilderPass : public Pass { protected: void ApplyImpl(ir::Graph* graph) const override; }; diff --git a/paddle/fluid/framework/ir/ipu/ipu_inplace_pass.h b/paddle/fluid/framework/ir/ipu/ipu_inplace_pass.h index 86756276c8c..8c42d5db1af 100644 --- a/paddle/fluid/framework/ir/ipu/ipu_inplace_pass.h +++ b/paddle/fluid/framework/ir/ipu/ipu_inplace_pass.h @@ -14,13 +14,13 @@ #pragma once -#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" namespace paddle { namespace framework { namespace ir { -class IpuInplacePass : public IPUPassBase { +class IpuInplacePass : public Pass { protected: void ApplyImpl(ir::Graph* graph) const override; }; diff --git a/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.cc b/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.cc index a3e020714e1..65ebd3ec808 100644 --- a/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.cc +++ b/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.cc @@ -56,19 +56,6 @@ void IpuRuntimeReplacerPass::ApplyImpl(ir::Graph* graph) const { } } - // set ipu_runtime_op dtype attr - if (fetch_list.size() == 1) { - for (auto* node : graph->Nodes()) { - if (node->IsVar()) { - for (auto fetch : fetch_list) { - if (node->Name() == fetch) { - ipu_rt_node->Op()->SetAttr("dtype", node->Var()->GetDataType()); - } - } - } - } - } - // Remove unneeded nodes. 
std::unordered_set marked_nodes; for (auto* node : graph->Nodes()) { diff --git a/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.h b/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.h index ba2cc8702fa..993b21e943d 100644 --- a/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.h +++ b/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.h @@ -15,13 +15,13 @@ #pragma once #include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" namespace paddle { namespace framework { namespace ir { -class IpuRuntimeReplacerPass : public IPUPassBase { +class IpuRuntimeReplacerPass : public Pass { protected: void ApplyImpl(ir::Graph* graph) const override; }; diff --git a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc index c6be2c775bd..3d8d353cbf5 100644 --- a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc +++ b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc @@ -15,72 +15,303 @@ #include "paddle/fluid/framework/ir/ipu/optimizer_extract_pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" -#include "paddle/fluid/platform/device/ipu/ipu_backend.h" namespace paddle { namespace framework { namespace ir { -void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const { - VLOG(10) << "enter IpuOptimizerExtractPass::ApplyImpl"; - VLOG(10) << "Raw Graph: "; - VLOG(10) << DebugString(graph); +std::set ignored_ops = { + "sign", + "sum", + "clip", + "clip_by_norm", + "square", + "reduce_sum", + "sqrt", + "elementwise_max", + "elementwise_div", + "elementwise_mul", + "scale", // adamax + "assign", // adamw +}; + +const bool startswith(const std::string& str, const std::string& pre) { + if (str.rfind(pre, 0) == 0) { + return true; + } else { + return false; + } +} + +const bool is_grad_clip_op(const std::string& op_namescope) { + return startswith(op_namescope, "/gradient_clip"); +} - auto ipu_backend = paddle::platform::ipu::IpuBackend::GetInstance(); +const bool is_optimizer_op(const std::string& op_namescope) { + return startswith(op_namescope, "/optimizer"); +} + +const bool is_regularization_op(const std::string& op_namescope) { + return startswith(op_namescope, "/regularization"); +} +void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const { + // The op built here follows the popart definition; some of the values involved are obtained later in LowerOptimizer + OpDesc new_op("popart_optimizer", {}, {}, {}); + new_op.SetAttr("op_role", 0); + new_op.SetAttr("with_lr_sched", false); + + std::set set_ops{}; + // use a map to store these? 
for (auto* node : graph->Nodes()) { - if (node->IsOp() && node->Op()) { - int op_role = BOOST_GET_CONST( - int, node->Op()->GetAttr( - framework::OpProtoAndCheckerMaker::OpRoleAttrName())); - - // graph usually have multiple optimizer node for different parameter, - // and these node have the same type and attr value usually - if ((op_role == static_cast(framework::OpRole::kOptimize))) { - ipu_backend->GetExecutor().SetOptimizerType(node->Op()->Type()); - VLOG(10) << "found optimizer type: " << node->Op()->Type(); - - for (const std::string& attr_name : node->Op()->AttrNames()) { - auto attr_type = node->Op()->GetAttrType(attr_name); - // with adam, attr are float - if (attr_type == proto::AttrType::FLOAT) { - auto attr_value = - BOOST_GET_CONST(float, node->Op()->GetAttr(attr_name)); - ipu_backend->GetExecutor().SetOptimizerAttr(attr_name, attr_value); - } else { - VLOG(10) << "Skip " << attr_type; - } - } + if (!node->IsOp()) { + continue; + } - auto lr_var_name = node->Op()->Input("LearningRate"); - PADDLE_ENFORCE_EQ(lr_var_name.size(), 1u, - platform::errors::InvalidArgument( - "In op(%s), find input(LearningRate) failed.", - node->Op()->Type())); + auto op = node->Op(); + auto op_type = op->Type(); + int op_role_ = BOOST_GET_CONST( + int, op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); + auto op_role = static_cast(op_role_); - ipu_backend->GetExecutor().SetLRVarName(lr_var_name[0]); + if (op_role == OpRole::kOptimize) { + if (set_ops.count(op_type)) { + continue; } - if ((op_role == static_cast(framework::OpRole::kLoss))) { - VLOG(10) << "found loss op type: " << node->Op()->Type(); - auto outputs = node->Op()->Outputs(); - PADDLE_ENFORCE_EQ( - outputs.size(), 1, - platform::errors::InvalidArgument("Can only support one loss key")); - - auto losses_name = outputs.begin()->second; - PADDLE_ENFORCE_EQ(losses_name.size(), 1, - platform::errors::InvalidArgument( - "Can only support one loss name")); + auto op_namescope = + BOOST_GET_CONST(std::string, op->GetAttr("op_namescope")); + bool is_grad_clip = is_grad_clip_op(op_namescope); + // bool is_optimizer = is_optimizer_op(op_namescope); + bool is_regularization = is_regularization_op(op_namescope); - ipu_backend->GetExecutor().SetLoss(losses_name[0]); + VLOG(10) << "found optimizer related op: " << op_type; + // initial learning_rate will be set in LowerOptimizer + set_ops.insert(op_type); + if (op_type == "sgd") { + auto type = std::string{"sgd"}; + auto lr_var = op->Input("LearningRate").front(); + new_op.SetAttr("type", type); + new_op.SetAttr("lr_var", lr_var); + new_op.SetAttr("weight_decay", 0.0f); + new_op.SetAttr("momentum", 0.0f); + new_op.SetAttr("raw_type", op_type); + } else if (op_type == "momentum") { + auto type = std::string{"sgd"}; + // auto LearningRate = op->Input("LearningRate"); + auto use_nesterov = BOOST_GET_CONST(bool, op->GetAttr("use_nesterov")); + PADDLE_ENFORCE_EQ(use_nesterov, false, + platform::errors::Unimplemented( + "ipu does not support nesterov mode.")); + auto regularization_method = + BOOST_GET_CONST(std::string, op->GetAttr("regularization_method")); + PADDLE_ENFORCE_NE(regularization_method, "l1_decay", + platform::errors::Unimplemented( + "ipu does not support l1_decay mode.")); + auto multi_precision = + BOOST_GET_CONST(bool, op->GetAttr("multi_precision")); + PADDLE_ENFORCE_EQ(multi_precision, false, + platform::errors::Unimplemented( + "ipu does not support multi_precision mode.")); + auto rescale_grad = BOOST_GET_CONST(float, op->GetAttr("rescale_grad")); + 
PADDLE_ENFORCE_EQ(rescale_grad, 1.0, + platform::errors::Unimplemented( + "ipu does not support rescale_grad mode.")); + auto regularization_coeff = + BOOST_GET_CONST(float, op->GetAttr("regularization_coeff")); + auto lr_var = op->Input("LearningRate").front(); + auto momentum = BOOST_GET_CONST(float, op->GetAttr("mu")); + new_op.SetAttr("type", type); + new_op.SetAttr("lr_var", lr_var); + new_op.SetAttr("momentum", momentum); + new_op.SetAttr("weight_decay", regularization_coeff); + new_op.SetAttr("raw_type", op_type); + } else if (op_type == "adam" || op_type == "adamw") { + auto type = std::string{"adam"}; + auto lr_var = op->Input("LearningRate").front(); + auto beta1 = BOOST_GET_CONST(float, op->GetAttr("beta1")); + auto beta2 = BOOST_GET_CONST(float, op->GetAttr("beta2")); + auto epsilon = BOOST_GET_CONST(float, op->GetAttr("epsilon")); + auto lazy_mode = BOOST_GET_CONST(bool, op->GetAttr("lazy_mode")); + auto multi_precision = + BOOST_GET_CONST(bool, op->GetAttr("multi_precision")); + PADDLE_ENFORCE_EQ(lazy_mode, false, + platform::errors::Unimplemented( + "ipu does not support lazy_mode mode.")); + PADDLE_ENFORCE_EQ(multi_precision, false, + platform::errors::Unimplemented( + "ipu does not support multi_precision mode.")); + new_op.SetAttr("type", type); + new_op.SetAttr("lr_var", lr_var); + new_op.SetAttr("weight_decay", 0.0f); + new_op.SetAttr("beta1", beta1); + new_op.SetAttr("beta2", beta2); + new_op.SetAttr("eps", epsilon); + new_op.SetAttr("adam_mode", std::string{"adam"}); + // adam or adamw + if (op_type == "adam") { + new_op.SetAttr("weight_decay_mode", std::string{"l2_regularization"}); + new_op.SetAttr("raw_type", std::string{"adam"}); + } else { + new_op.SetAttr("weight_decay_mode", std::string{"decay"}); + new_op.SetAttr("raw_type", std::string{"adamw"}); + } + } else if (op_type == "adamax") { + auto type = std::string{"adam"}; + auto lr_var = op->Input("LearningRate").front(); + auto beta1 = BOOST_GET_CONST(float, op->GetAttr("beta1")); + auto beta2 = BOOST_GET_CONST(float, op->GetAttr("beta2")); + auto epsilon = BOOST_GET_CONST(float, op->GetAttr("epsilon")); + new_op.SetAttr("type", type); + new_op.SetAttr("lr_var", lr_var); + new_op.SetAttr("weight_decay", 0.0f); + new_op.SetAttr("beta1", beta1); + new_op.SetAttr("beta2", beta2); + new_op.SetAttr("eps", epsilon); + new_op.SetAttr("adam_mode", std::string{"adamax"}); + new_op.SetAttr("weight_decay_mode", std::string{"l2_regularization"}); + new_op.SetAttr("raw_type", op_type); + } else if (op_type == "lamb") { + // use decay mode + auto type = std::string{"adam"}; + auto lr_var = op->Input("LearningRate").front(); + auto weight_decay = BOOST_GET_CONST(float, op->GetAttr("weight_decay")); + auto beta1 = BOOST_GET_CONST(float, op->GetAttr("beta1")); + auto beta2 = BOOST_GET_CONST(float, op->GetAttr("beta2")); + auto epsilon = BOOST_GET_CONST(float, op->GetAttr("epsilon")); + new_op.SetAttr("type", type); + new_op.SetAttr("lr_var", lr_var); + new_op.SetAttr("weight_decay", weight_decay); + new_op.SetAttr("beta1", beta1); + new_op.SetAttr("beta2", beta2); + new_op.SetAttr("eps", epsilon); + new_op.SetAttr("adam_mode", std::string{"lamb"}); + new_op.SetAttr("weight_decay_mode", std::string{"decay"}); + new_op.SetAttr("raw_type", op_type); + } else if (op_type == "adadelta") { + // NO LearningRate + auto type = std::string{"adaptive"}; + auto rho = BOOST_GET_CONST(float, op->GetAttr("rho")); + auto epsilon = BOOST_GET_CONST(float, op->GetAttr("epsilon")); + new_op.SetAttr("type", type); + new_op.SetAttr("weight_decay", 
0.0f); + new_op.SetAttr("alpha", rho); + new_op.SetAttr("eps", epsilon); + new_op.SetAttr("momentum", 0.0f); + new_op.SetAttr("adaptive_mode", std::string{"adadelta"}); + new_op.SetAttr("weight_decay_mode", std::string{"l2_regularization"}); + new_op.SetAttr("raw_type", op_type); + } else if (op_type == "adagrad") { + auto type = std::string{"adaptive"}; + auto lr_var = op->Input("LearningRate").front(); + auto epsilon = BOOST_GET_CONST(float, op->GetAttr("epsilon")); + new_op.SetAttr("type", type); + new_op.SetAttr("lr_var", lr_var); + new_op.SetAttr("weight_decay", 0.0f); + // `alpha` uses the default + new_op.SetAttr("alpha", 0.99f); + new_op.SetAttr("eps", epsilon); + new_op.SetAttr("momentum", 0.0f); + new_op.SetAttr("adaptive_mode", std::string{"adagrad"}); + new_op.SetAttr("weight_decay_mode", std::string{"l2_regularization"}); + new_op.SetAttr("raw_type", op_type); + } else if (op_type == "rmsprop") { + auto type = std::string{"adaptive"}; + auto lr_var = op->Input("LearningRate").front(); + auto epsilon = BOOST_GET_CONST(float, op->GetAttr("epsilon")); + auto decay = BOOST_GET_CONST(float, op->GetAttr("decay")); + auto momentum = BOOST_GET_CONST(float, op->GetAttr("momentum")); + auto centered = BOOST_GET_CONST(bool, op->GetAttr("centered")); + new_op.SetAttr("type", type); + new_op.SetAttr("weight_decay", 0.0f); + new_op.SetAttr("alpha", decay); + new_op.SetAttr("eps", epsilon); + new_op.SetAttr("momentum", momentum); + new_op.SetAttr("weight_decay_mode", std::string{"l2_regularization"}); + if (centered) { + new_op.SetAttr("adaptive_mode", std::string{"centered_rmsprop"}); + new_op.SetAttr("raw_type", op_type); + } else { + new_op.SetAttr("adaptive_mode", std::string{"rmsprop"}); + new_op.SetAttr("raw_type", op_type); + } + } else if (is_regularization && op_type == "scale") { + // set weight_decay for L2Decay + auto scale = BOOST_GET_CONST(float, op->GetAttr("scale")); + new_op.SetAttr("weight_decay", scale); + } else if (is_grad_clip && op_type == "fill_constant") { + // set clip_norm for ClipGradByGlobalNorm + auto value = BOOST_GET_CONST(float, op->GetAttr("value")); + new_op.SetAttr("clip_norm", value); + } else if (ignored_ops.count(op_type)) { + VLOG(10) << "Ignore optimizer related op: " << op_type; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unknown optimizer related op_type: %s", op_type)); } + } else if (op_role == OpRole::kLoss) { + VLOG(10) << "found loss op type: " << op->Type(); + auto outputs = op->Outputs(); + PADDLE_ENFORCE_EQ( + outputs.size(), 1, + platform::errors::InvalidArgument("Can only support one loss key")); + auto losses = outputs.begin()->second; + PADDLE_ENFORCE_EQ( + losses.size(), 1, + platform::errors::InvalidArgument("Can only support one loss name")); + auto loss_var = losses.front(); + new_op.SetAttr("loss_var", loss_var); + } else if (op_role == OpRole::kLRSched) { + // op_role == OpRole::kLRSched | OpRole::kOptimize + new_op.SetAttr("with_lr_sched", true); + } + } + + // seems with_lr_sched is always true + new_op.SetAttr("with_lr_sched", true); + + // setup weight decay + // weight_decay/coeff is "scale" attr of scale_op + if (set_ops.count("scale") && set_ops.count("sum")) { + if (set_ops.count("sign")) { + // L1Decay + // sign + scale + sum + PADDLE_THROW( + platform::errors::Unimplemented("Unsupported L1Decay regularizer")); + } else { + // L2Decay + // scale + sum + new_op.SetAttr("weight_decay_mode", std::string{"l2_regularization"}); } + } else { + VLOG(10) << "No weight decay setting found"; + } + + // setup grad 
clip + if (set_ops.count("clip")) { + // ClipGradByValue + PADDLE_THROW( + platform::errors::Unimplemented("Unsupported ClipGradByValue")); + } else if (set_ops.count("clip_by_norm")) { + // ClipGradByNorm + PADDLE_THROW(platform::errors::Unimplemented("Unsupported ClipGradByNorm")); } - VLOG(10) << "Post Graph: "; - VLOG(10) << DebugString(graph); - VLOG(10) << "leave IpuOptimizerExtractPass::ApplyImpl"; + // ClipGradByGlobalNorm + // use graph pattern match ClipGradByGlobalNorm + // square + reduce_sum + sum + sqrt + fill_constant + // + elementwise_max + elementwise_div + elementwise_mul + // clip_norm from fill_constant`s attr `value` dtype float + + if (new_op.HasAttr("type")) { + auto new_node = graph->CreateOpNode(&new_op); + VLOG(10) << "New Optimizer Node:"; + VLOG(10) << DebugString(new_node); + } else { + PADDLE_THROW(platform::errors::NotFound( + "No optimizer found, optimizer must be one of these types: sgd, " + "momentum, adam, adamw, adamax, lamb, adadelta, adagrad or rmsprop")); + } } } // namespace ir diff --git a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.h b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.h index fd274ded8f5..8b015fdc4f8 100644 --- a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.h +++ b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.h @@ -14,14 +14,13 @@ #pragma once -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" namespace paddle { namespace framework { namespace ir { -class IpuOptimizerExtractPass : public IPUPassBase { +class IpuOptimizerExtractPass : public Pass { protected: void ApplyImpl(ir::Graph* graph) const override; }; diff --git a/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.cc b/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.cc index c23bfdcb154..4da913e7176 100644 --- a/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.cc +++ b/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.cc @@ -14,23 +14,19 @@ #include "paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" -#include "paddle/fluid/platform/device/ipu/common.h" #include "paddle/fluid/platform/device/ipu/ipu_backend.h" +#include "paddle/fluid/platform/device/ipu/ipu_names.h" namespace paddle { namespace framework { namespace ir { -using paddle::platform::ipu::IpuBackend; -using framework::ir::Graph; -using framework::ir::Node; - void IpuOptimizerStateAlignPass::ApplyImpl(ir::Graph* graph) const { VLOG(10) << "enter IpuOptimizerStateAlignPass::ApplyImpl"; VLOG(10) << "Raw Graph: "; VLOG(10) << DebugString(graph); - auto ipu_backend = IpuBackend::GetInstance(); + auto ipu_backend = platform::ipu::IpuBackend::GetInstance(); const auto* scope_ = ipu_backend->GetScope(); for (auto* node : graph->Nodes()) { diff --git a/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.h b/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.h index 21a1017d884..96aca44d84a 100644 --- a/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.h +++ b/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.h @@ -15,7 +15,7 @@ #pragma once #include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" namespace paddle { namespace framework { @@ -26,7 +26,7 @@ namespace ir { * include Adam/Lamb. 
*/ -class IpuOptimizerStateAlignPass : public IPUPassBase { +class IpuOptimizerStateAlignPass : public Pass { protected: void ApplyImpl(ir::Graph* graph) const override; }; diff --git a/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc b/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc index d2d76f9a9a2..975a4b62cc7 100644 --- a/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc +++ b/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc @@ -21,15 +21,13 @@ namespace paddle { namespace framework { namespace ir { -using framework::ir::Graph; -using framework::ir::Node; -using platform::ipu::SymbolHandler; - void PopartCanonicalizationPass::ApplyImpl(ir::Graph* graph) const { VLOG(10) << "enter PopartCanonicalizationPass::ApplyImpl"; VLOG(10) << "Raw Graph: "; VLOG(10) << DebugString(graph); + auto custom_ops = Get>("custom_ops"); + std::vector missing_ops; auto nodes = graph->Nodes(); for (auto* node : nodes) { if (!node->IsOp()) { @@ -39,21 +37,40 @@ void PopartCanonicalizationPass::ApplyImpl(ir::Graph* graph) const { auto op_type = op->Type(); ir::Node* new_node = nullptr; - SymbolHandler handler = platform::ipu::GetHandler(op_type); + platform::ipu::SymbolHandler handler = platform::ipu::GetHandler(op_type); + if (!handler && !custom_ops.empty()) { + if (custom_ops.count(op_type)) { + VLOG(10) << "Found custom op: " << op_type; + handler = platform::ipu::GetHandler("custom_op"); + } + } + if (handler) { VLOG(11) << "Raw Paddle Node:"; VLOG(11) << node->Op()->Proto()->DebugString(); new_node = handler(graph, node); - VLOG(11) << "Post Popart Node:"; - VLOG(11) << new_node->Op()->Proto()->DebugString(); - - platform::ipu::ClearNode(node); - graph->RemoveNode(node); + if (new_node) { + VLOG(11) << "Post Popart Node:"; + VLOG(11) << new_node->Op()->Proto()->DebugString(); + platform::ipu::ClearNode(node); + graph->RemoveNode(node); + } } else { - LOG(ERROR) << "Can not find OpHandler for op_type: " << op_type; + missing_ops.push_back(op_type); } } + if (!missing_ops.empty()) { + LOG(ERROR) << "Can not find OpHandler for op_type: "; + for (auto& op_type : missing_ops) { + LOG(ERROR) << op_type; + } + PADDLE_THROW(platform::errors::Unimplemented( + "Found unimplemented op_handler(s) for IPU")); + } + + // post popart_canonicalization + VLOG(10) << "Post Graph: "; VLOG(10) << DebugString(graph); VLOG(10) << "leave PopartCanonicalizationPass::ApplyImpl"; @@ -64,4 +81,5 @@ void PopartCanonicalizationPass::ApplyImpl(ir::Graph* graph) const { } // namespace paddle REGISTER_PASS(popart_canonicalization_pass, - paddle::framework::ir::PopartCanonicalizationPass); + paddle::framework::ir::PopartCanonicalizationPass) + .DefaultPassAttr("custom_ops", new std::unordered_set{}); diff --git a/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.h b/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.h index 6690873f2a9..b01453ea366 100644 --- a/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.h +++ b/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.h @@ -14,13 +14,13 @@ #pragma once -#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" namespace paddle { namespace framework { namespace ir { -class PopartCanonicalizationPass : public IPUPassBase { +class PopartCanonicalizationPass : public Pass { protected: void ApplyImpl(ir::Graph* graph) const override; }; -- GitLab