diff --git a/paddle/fluid/framework/ir/ipu/avg_shard_pass.cc b/paddle/fluid/framework/ir/ipu/avg_shard_pass.cc index 9dcbbb9c9856e5e40d4d79bebc3cdac42cf407c8..f1ee3c26b8f4840d7ed7bbb1991bd1e73eee4175 100644 --- a/paddle/fluid/framework/ir/ipu/avg_shard_pass.cc +++ b/paddle/fluid/framework/ir/ipu/avg_shard_pass.cc @@ -26,13 +26,15 @@ namespace ir { void AvgShardPass::ApplyImpl(ir::Graph* graph) const { VLOG(10) << "enter AvgShardPass::ApplyImpl"; - std::shared_ptr ipu_backend = - platform::ipu::IpuBackend::GetInstance(); + auto ipu_backend = platform::ipu::IpuBackend::GetInstance(); if (ipu_backend->GetIpuStrategy()->need_avg_shard) { VLOG(10) << "start AvgShardPass"; auto nodes = ir::TopologySortOperations(*graph); auto num_ipus = ipu_backend->GetIpuStrategy()->num_ipus; + auto replica_factor = + ipu_backend->GetIpuStrategy()->popart_options.replicatedGraphCount; + num_ipus = num_ipus / replica_factor; int shard_position = nodes.size() / num_ipus; int index_and_stage = -1; diff --git a/paddle/fluid/framework/ir/ipu/avg_shard_pass.h b/paddle/fluid/framework/ir/ipu/avg_shard_pass.h index b13acbd198dd524d7e8c14eb03ca1dbc367b257a..fc96cfa95fe512ce56e5ba0cf6990becd8fd66ca 100644 --- a/paddle/fluid/framework/ir/ipu/avg_shard_pass.h +++ b/paddle/fluid/framework/ir/ipu/avg_shard_pass.h @@ -14,13 +14,13 @@ #pragma once -#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" namespace paddle { namespace framework { namespace ir { -class AvgShardPass : public IPUPassBase { +class AvgShardPass : public Pass { protected: void ApplyImpl(ir::Graph* graph) const override; }; diff --git a/paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.h b/paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.h index afa9f1c15f2ab8c3c27dda9aa3485320725f0ea4..35d6a87c429d7bf1d7c5e17091fac7b47b9df227 100644 --- a/paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.h +++ b/paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.h @@ -15,13 +15,13 @@ #pragma once #include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" namespace paddle { namespace framework { namespace ir { -class ForwardGraphExtractPass : public IPUPassBase { +class ForwardGraphExtractPass : public Pass { protected: void ApplyImpl(ir::Graph* graph) const override; }; diff --git a/paddle/fluid/framework/ir/ipu/infer_shape_pass.cc b/paddle/fluid/framework/ir/ipu/infer_shape_pass.cc index 0878192ebf8d529bbfa8d3dfcc5ad2d821e24813..9ab70ee321a95c38b82482850c129592d633798e 100644 --- a/paddle/fluid/framework/ir/ipu/infer_shape_pass.cc +++ b/paddle/fluid/framework/ir/ipu/infer_shape_pass.cc @@ -29,10 +29,10 @@ void InferShapePass::ApplyImpl(ir::Graph* graph) const { VLOG(10) << "Raw Graph: "; VLOG(10) << DebugString(graph); - std::shared_ptr ipu_backend = - platform::ipu::IpuBackend::GetInstance(); - auto batch_size = ipu_backend->GetIpuStrategy()->batch_size; - + // Make batch_size fixed + bool need_infer_shape = false; + auto ipu_backend = platform::ipu::IpuBackend::GetInstance(); + auto micro_batch_size = ipu_backend->GetIpuStrategy()->micro_batch_size; auto feed_list = Get>("feed_list"); for (auto node : graph->Nodes()) { if (!node->IsVar()) { @@ -43,8 +43,9 @@ void InferShapePass::ApplyImpl(ir::Graph* graph) const { if (is_feed) { auto input_shape = node->Var()->GetShape(); if (input_shape[0] <= -1) { - input_shape[0] = batch_size; + input_shape[0] = micro_batch_size; node->Var()->SetShape(input_shape); + 
need_infer_shape = true; } // int64->int32 if (node->Var()->GetDataType() == proto::VarType::INT64) { @@ -54,44 +55,63 @@ void InferShapePass::ApplyImpl(ir::Graph* graph) const { } // temp scope for shape inference - std::shared_ptr scope( - new paddle::framework::Scope()); - for (auto node : graph->Nodes()) { - if (!node->IsVar()) { - continue; - } - auto var_desc = node->Var(); - auto* ptr = scope->Var(var_desc->Name()); - paddle::framework::InitializeVariable(ptr, var_desc->GetType()); + if (need_infer_shape) { + std::shared_ptr scope( + new paddle::framework::Scope()); + for (auto node : graph->Nodes()) { + if (!node->IsVar()) { + continue; + } + auto var_desc = node->Var(); + auto* ptr = scope->Var(var_desc->Name()); + paddle::framework::InitializeVariable(ptr, var_desc->GetType()); - auto tensor = ptr->GetMutable(); - tensor->Resize(paddle::framework::make_ddim(var_desc->GetShape())); - } + auto tensor = ptr->GetMutable(); + tensor->Resize(paddle::framework::make_ddim(var_desc->GetShape())); + } - // infer shape - auto nodes = ir::TopologySortOperations(*graph); - for (auto node : nodes) { - auto op_desc = node->Op(); - auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); - paddle::framework::RuntimeContext ctx(op->Inputs(), op->Outputs(), *scope); - op->RuntimeInferShape(*scope, paddle::platform::CPUPlace(), ctx); + // infer shape + auto nodes = ir::TopologySortOperations(*graph); + for (auto node : nodes) { + VLOG(10) << "InferShapePass: Infer shape for Op (" << node->Name() << ")"; + auto op_desc = node->Op(); + if (op_desc->Type() == "popart_optimizer") { + continue; + } + auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); + paddle::framework::RuntimeContext ctx(op->Inputs(), op->Outputs(), + *scope); + op->RuntimeInferShape(*scope, paddle::platform::CPUPlace(), ctx); - for (auto it = ctx.outputs.begin(); it != ctx.outputs.end(); it++) { - for (int i = 0; i < it->second.size(); i++) { - auto output_name = op_desc->Output(it->first)[i]; - auto dim = - it->second[i]->GetMutable()->dims(); - auto new_shape = paddle::framework::vectorize(dim); - for (auto output_node : node->outputs) { - if (output_node->Name() == output_name) { - output_node->Var()->SetShape(new_shape); + for (auto it = ctx.outputs.begin(); it != ctx.outputs.end(); it++) { + for (int i = 0; i < it->second.size(); i++) { + auto output_name = op_desc->Output(it->first)[i]; + auto dim = + it->second[i]->GetMutable()->dims(); + auto new_shape = paddle::framework::vectorize(dim); + for (auto output_node : node->outputs) { + if (output_node->Name() == output_name) { + output_node->Var()->SetShape(new_shape); + if (VLOG_IS_ON(10)) { + std::ostringstream sout; + sout << "InferShapePass: output[" << output_node->Name() + << "], infer shape:["; + for (auto s : new_shape) { + sout << std::to_string(s) << ", "; + } + sout << "]"; + VLOG(10) << sout.str(); + } + } } } } + VLOG(10) << "InferShapePass: Infer shape for Op (" << node->Name() + << ") finished"; } + // release the temp scope + scope.reset(); } - // release the temp scope - scope.reset(); VLOG(10) << "Post Graph: "; VLOG(10) << DebugString(graph); diff --git a/paddle/fluid/framework/ir/ipu/infer_shape_pass.h b/paddle/fluid/framework/ir/ipu/infer_shape_pass.h index 3e8148b7f066d9e6f3d4f2bd34b1d561bf4293a8..e2d70ff40ab3448c42aa9769d52fe02d56690438 100644 --- a/paddle/fluid/framework/ir/ipu/infer_shape_pass.h +++ b/paddle/fluid/framework/ir/ipu/infer_shape_pass.h @@ -14,13 +14,13 @@ #pragma once -#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" 
+#include "paddle/fluid/framework/ir/pass.h" namespace paddle { namespace framework { namespace ir { -class InferShapePass : public IPUPassBase { +class InferShapePass : public Pass { protected: void ApplyImpl(ir::Graph* graph) const override; }; diff --git a/paddle/fluid/framework/ir/ipu/inference_postprocess_pass.h b/paddle/fluid/framework/ir/ipu/inference_postprocess_pass.h index e80e1905d4ad79c1df1caa4f9e8bd463dfa6654e..0bcbaef83f992f1fea9757321411b2c7f55163fd 100644 --- a/paddle/fluid/framework/ir/ipu/inference_postprocess_pass.h +++ b/paddle/fluid/framework/ir/ipu/inference_postprocess_pass.h @@ -14,13 +14,13 @@ #pragma once -#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" namespace paddle { namespace framework { namespace ir { -class InferencePostprocessPass : public IPUPassBase { +class InferencePostprocessPass : public Pass { protected: void ApplyImpl(ir::Graph* graph) const override; }; diff --git a/paddle/fluid/framework/ir/ipu/inference_process_pass.cc b/paddle/fluid/framework/ir/ipu/inference_process_pass.cc index d02dcce0cc62c8d96cab1ace860c7fbe913a6e10..05490773301649f1cc172a8e56ca18e4205a7a10 100644 --- a/paddle/fluid/framework/ir/ipu/inference_process_pass.cc +++ b/paddle/fluid/framework/ir/ipu/inference_process_pass.cc @@ -29,8 +29,7 @@ void InferenceProcessPass::ApplyImpl(ir::Graph* graph) const { VLOG(10) << "enter InferenceProcessPass::ApplyImpl"; // Get a new instance of ipu_backend - std::shared_ptr ipu_backend = - platform::ipu::IpuBackend::GetNewInstance(); + auto ipu_backend = platform::ipu::IpuBackend::GetInstance(); // Set scope auto& scope = graph->Get(kParamScopeAttr); @@ -40,18 +39,34 @@ void InferenceProcessPass::ApplyImpl(ir::Graph* graph) const { static std::shared_ptr ipu_strategy_instance_( new platform::ipu::IpuStrategy()); ipu_strategy_instance_->is_training = false; + // Set graph replication + auto replica_num = graph->Get("replica_num"); + if (replica_num > 1) { + ipu_strategy_instance_->popart_options.enableReplicatedGraphs = true; + ipu_strategy_instance_->popart_options.replicatedGraphCount = replica_num; + } + // Set the num of IPUs auto num_ipus = graph->Get("num_ipus"); - ipu_strategy_instance_->num_ipus = num_ipus; + // Set sharding if (num_ipus > 1) { - ipu_strategy_instance_->popart_options_.virtualGraphMode = + ipu_strategy_instance_->need_avg_shard = true; + ipu_strategy_instance_->popart_options.virtualGraphMode = platform::ipu::VirtualGraphMode::Manual; } else { - ipu_strategy_instance_->popart_options_.virtualGraphMode = + ipu_strategy_instance_->need_avg_shard = false; + ipu_strategy_instance_->popart_options.virtualGraphMode = platform::ipu::VirtualGraphMode::Off; } + // total num IPUs = num_ipus * replica_num + ipu_strategy_instance_->num_ipus = num_ipus * replica_num; + + // Set micro_batch_size for shape inference + ipu_strategy_instance_->micro_batch_size = + graph->Get("micro_batch_size"); + // Set pipelining auto enable_pipelining = graph->Get("enable_pipelining"); - ipu_strategy_instance_->popart_options_.enablePipelining = enable_pipelining; + ipu_strategy_instance_->popart_options.enablePipelining = enable_pipelining; if (enable_pipelining) { auto batches_per_step = graph->Get("batches_per_step"); PADDLE_ENFORCE_GE( @@ -60,8 +75,20 @@ void InferenceProcessPass::ApplyImpl(ir::Graph* graph) const { "greater than the number of IPUs")); ipu_strategy_instance_->batches_per_step = batches_per_step; } - ipu_strategy_instance_->batch_size = graph->Get("batch_size"); - 
ipu_strategy_instance_->need_avg_shard = graph->Get("need_avg_shard"); + + // Set FP16 + auto enable_fp16 = graph->Get("enable_fp16"); + ipu_strategy_instance_->enable_fp16 = enable_fp16; + if (enable_fp16) { + auto enable_half_partial = graph->Get("enable_half_partial"); + if (enable_half_partial) { + ipu_strategy_instance_->popart_options.partialsTypeMatMuls = "half"; + } + } + + // Set available memory proportion for matmul/conv + ipu_strategy_instance_->available_memory_proportion = + graph->Get("available_memory_proportion"); ipu_backend->SetIpuStrategy(*(ipu_strategy_instance_.get())); @@ -94,9 +121,9 @@ void InferenceProcessPass::ApplyImpl(ir::Graph* graph) const { } // Run passes - std::vector graph_pass = {"forward_graph_extract_pass", - "infer_shape_pass", "avg_shard_pass", - "popart_canonicalization_pass"}; + std::vector graph_pass = { + "forward_graph_extract_pass", "infer_shape_pass", "avg_shard_pass", + "popart_canonicalization_pass", "transfer_cast_op_pass"}; std::vector compile_pass = { "ipu_inplace_pass", "ipu_graph_builder_pass", "ipu_runtime_replacer_pass", "inference_postprocess_pass"}; diff --git a/paddle/fluid/framework/ir/ipu/inference_process_pass.h b/paddle/fluid/framework/ir/ipu/inference_process_pass.h index bac0e88377f7c6d8ef124817256c5059345f714e..bd6c9d1914e8c7894aa2f4ee369662f26a507cfd 100644 --- a/paddle/fluid/framework/ir/ipu/inference_process_pass.h +++ b/paddle/fluid/framework/ir/ipu/inference_process_pass.h @@ -14,13 +14,13 @@ #pragma once -#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" namespace paddle { namespace framework { namespace ir { -class InferenceProcessPass : public IPUPassBase { +class InferenceProcessPass : public Pass { protected: void ApplyImpl(ir::Graph* graph) const override; }; diff --git a/paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.cc b/paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.cc index 5a53466089bc88fe25b3aaed54736f8e78994513..17269ac8889c74ad3396cc789091cf12d69f73f9 100644 --- a/paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.cc +++ b/paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.cc @@ -32,8 +32,7 @@ void IpuGraphBuilderPass::ApplyImpl(ir::Graph* graph) const { std::vector fetch_list; fetch_list = Get>("fetch_list"); - std::shared_ptr ipu_backend = - platform::ipu::IpuBackend::GetInstance(); + auto ipu_backend = platform::ipu::IpuBackend::GetInstance(); ipu_backend->Compile(graph, feed_list, fetch_list); diff --git a/paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.h b/paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.h index 6237df36480335e3971aaffe120ca2374966fcb3..7f60cf6a34bad9d29443e9080a0cd8bee669e7a2 100644 --- a/paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.h +++ b/paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.h @@ -15,13 +15,13 @@ #pragma once #include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" namespace paddle { namespace framework { namespace ir { -class IpuGraphBuilderPass : public IPUPassBase { +class IpuGraphBuilderPass : public Pass { protected: void ApplyImpl(ir::Graph* graph) const override; }; diff --git a/paddle/fluid/framework/ir/ipu/ipu_inplace_pass.h b/paddle/fluid/framework/ir/ipu/ipu_inplace_pass.h index 86756276c8c3dcf6c010ad05b9a39bfa3bda71f5..8c42d5db1af8d8798beeed79816e4f2b245331df 100644 --- a/paddle/fluid/framework/ir/ipu/ipu_inplace_pass.h +++ b/paddle/fluid/framework/ir/ipu/ipu_inplace_pass.h @@ 
-14,13 +14,13 @@ #pragma once -#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" namespace paddle { namespace framework { namespace ir { -class IpuInplacePass : public IPUPassBase { +class IpuInplacePass : public Pass { protected: void ApplyImpl(ir::Graph* graph) const override; }; diff --git a/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.cc b/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.cc index a3e020714e1db90154b90b3785ea99e9eabd3256..65ebd3ec8080d96ff0110ea66c7e1006b71b624f 100644 --- a/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.cc +++ b/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.cc @@ -56,19 +56,6 @@ void IpuRuntimeReplacerPass::ApplyImpl(ir::Graph* graph) const { } } - // set ipu_runtime_op dtype attr - if (fetch_list.size() == 1) { - for (auto* node : graph->Nodes()) { - if (node->IsVar()) { - for (auto fetch : fetch_list) { - if (node->Name() == fetch) { - ipu_rt_node->Op()->SetAttr("dtype", node->Var()->GetDataType()); - } - } - } - } - } - // Remove unneeded nodes. std::unordered_set marked_nodes; for (auto* node : graph->Nodes()) { diff --git a/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.h b/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.h index ba2cc8702fa4731a388986bee8436e6bff31c586..993b21e943db297a927f1978bdf065ddc7b949ed 100644 --- a/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.h +++ b/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.h @@ -15,13 +15,13 @@ #pragma once #include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" namespace paddle { namespace framework { namespace ir { -class IpuRuntimeReplacerPass : public IPUPassBase { +class IpuRuntimeReplacerPass : public Pass { protected: void ApplyImpl(ir::Graph* graph) const override; }; diff --git a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc index c6be2c775bd2116e11e24d17da225d2c76679399..3d8d353cbf530ebe9cc9ea90937b9acf5ddd4a0f 100644 --- a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc +++ b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc @@ -15,72 +15,303 @@ #include "paddle/fluid/framework/ir/ipu/optimizer_extract_pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" -#include "paddle/fluid/platform/device/ipu/ipu_backend.h" namespace paddle { namespace framework { namespace ir { -void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const { - VLOG(10) << "enter IpuOptimizerExtractPass::ApplyImpl"; - VLOG(10) << "Raw Graph: "; - VLOG(10) << DebugString(graph); +std::set ignored_ops = { + "sign", + "sum", + "clip", + "clip_by_norm", + "square", + "reduce_sum", + "sqrt", + "elementwise_max", + "elementwise_div", + "elementwise_mul", + "scale", // adamax + "assign", // adamw +}; + +const bool startswith(const std::string& str, const std::string& pre) { + if (str.rfind(pre, 0) == 0) { + return true; + } else { + return false; + } +} + +const bool is_grad_clip_op(const std::string& op_namescope) { + return startswith(op_namescope, "/gradient_clip"); +} - auto ipu_backend = paddle::platform::ipu::IpuBackend::GetInstance(); +const bool is_optimizer_op(const std::string& op_namescope) { + return startswith(op_namescope, "/optimizer"); +} + +const bool is_regularization_op(const std::string& op_namescope) { + return startswith(op_namescope, "/regularization"); +} +void 
IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const { + // The op built here follows the popart definition; some of the values it needs are filled in later, in LowerOptimizer. + OpDesc new_op("popart_optimizer", {}, {}, {}); + new_op.SetAttr("op_role", 0); + new_op.SetAttr("with_lr_sched", false); + + std::set<std::string> set_ops{}; + // use a map to store these? for (auto* node : graph->Nodes()) { - if (node->IsOp() && node->Op()) { - int op_role = BOOST_GET_CONST( - int, node->Op()->GetAttr( - framework::OpProtoAndCheckerMaker::OpRoleAttrName())); - - // graph usually have multiple optimizer node for different parameter, - // and these node have the same type and attr value usually - if ((op_role == static_cast<int>(framework::OpRole::kOptimize))) { - ipu_backend->GetExecutor().SetOptimizerType(node->Op()->Type()); - VLOG(10) << "found optimizer type: " << node->Op()->Type(); - - for (const std::string& attr_name : node->Op()->AttrNames()) { - auto attr_type = node->Op()->GetAttrType(attr_name); - // with adam, attr are float - if (attr_type == proto::AttrType::FLOAT) { - auto attr_value = - BOOST_GET_CONST(float, node->Op()->GetAttr(attr_name)); - ipu_backend->GetExecutor().SetOptimizerAttr(attr_name, attr_value); - } else { - VLOG(10) << "Skip " << attr_type; - } - } + if (!node->IsOp()) { + continue; + } - auto lr_var_name = node->Op()->Input("LearningRate"); - PADDLE_ENFORCE_EQ(lr_var_name.size(), 1u, - platform::errors::InvalidArgument( - "In op(%s), find input(LearningRate) failed.", - node->Op()->Type())); + auto op = node->Op(); + auto op_type = op->Type(); + int op_role_ = BOOST_GET_CONST( + int, op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); + auto op_role = static_cast<OpRole>(op_role_); - ipu_backend->GetExecutor().SetLRVarName(lr_var_name[0]); + if (op_role == OpRole::kOptimize) { + if (set_ops.count(op_type)) { + continue; } - if ((op_role == static_cast<int>(framework::OpRole::kLoss))) { - VLOG(10) << "found loss op type: " << node->Op()->Type(); - auto outputs = node->Op()->Outputs(); - PADDLE_ENFORCE_EQ( - outputs.size(), 1, - platform::errors::InvalidArgument("Can only support one loss key")); - - auto losses_name = outputs.begin()->second; - PADDLE_ENFORCE_EQ(losses_name.size(), 1, - platform::errors::InvalidArgument( - "Can only support one loss name")); + auto op_namescope = + BOOST_GET_CONST(std::string, op->GetAttr("op_namescope")); + bool is_grad_clip = is_grad_clip_op(op_namescope); + // bool is_optimizer = is_optimizer_op(op_namescope); + bool is_regularization = is_regularization_op(op_namescope); - ipu_backend->GetExecutor().SetLoss(losses_name[0]); + VLOG(10) << "found optimizer related op: " << op_type; + // the initial learning_rate will be set in LowerOptimizer + set_ops.insert(op_type); + if (op_type == "sgd") { + auto type = std::string{"sgd"}; + auto lr_var = op->Input("LearningRate").front(); + new_op.SetAttr("type", type); + new_op.SetAttr("lr_var", lr_var); + new_op.SetAttr("weight_decay", 0.0f); + new_op.SetAttr("momentum", 0.0f); + new_op.SetAttr("raw_type", op_type); + } else if (op_type == "momentum") { + auto type = std::string{"sgd"}; + // auto LearningRate = op->Input("LearningRate"); + auto use_nesterov = BOOST_GET_CONST(bool, op->GetAttr("use_nesterov")); + PADDLE_ENFORCE_EQ(use_nesterov, false, + platform::errors::Unimplemented( + "ipu does not support nesterov mode.")); + auto regularization_method = + BOOST_GET_CONST(std::string, op->GetAttr("regularization_method")); + PADDLE_ENFORCE_NE(regularization_method, "l1_decay", + platform::errors::Unimplemented( + "ipu does not support l1_decay mode.")); + auto multi_precision =
BOOST_GET_CONST(bool, op->GetAttr("multi_precision")); + PADDLE_ENFORCE_EQ(multi_precision, false, + platform::errors::Unimplemented( + "ipu does not support multi_precision mode.")); + auto rescale_grad = BOOST_GET_CONST(float, op->GetAttr("rescale_grad")); + PADDLE_ENFORCE_EQ(rescale_grad, 1.0, + platform::errors::Unimplemented( + "ipu does not support rescale_grad mode.")); + auto regularization_coeff = + BOOST_GET_CONST(float, op->GetAttr("regularization_coeff")); + auto lr_var = op->Input("LearningRate").front(); + auto momentum = BOOST_GET_CONST(float, op->GetAttr("mu")); + new_op.SetAttr("type", type); + new_op.SetAttr("lr_var", lr_var); + new_op.SetAttr("momentum", momentum); + new_op.SetAttr("weight_decay", regularization_coeff); + new_op.SetAttr("raw_type", op_type); + } else if (op_type == "adam" || op_type == "adamw") { + auto type = std::string{"adam"}; + auto lr_var = op->Input("LearningRate").front(); + auto beta1 = BOOST_GET_CONST(float, op->GetAttr("beta1")); + auto beta2 = BOOST_GET_CONST(float, op->GetAttr("beta2")); + auto epsilon = BOOST_GET_CONST(float, op->GetAttr("epsilon")); + auto lazy_mode = BOOST_GET_CONST(bool, op->GetAttr("lazy_mode")); + auto multi_precision = + BOOST_GET_CONST(bool, op->GetAttr("multi_precision")); + PADDLE_ENFORCE_EQ(lazy_mode, false, + platform::errors::Unimplemented( + "ipu does not support lazy_mode mode.")); + PADDLE_ENFORCE_EQ(multi_precision, false, + platform::errors::Unimplemented( + "ipu does not support multi_precision mode.")); + new_op.SetAttr("type", type); + new_op.SetAttr("lr_var", lr_var); + new_op.SetAttr("weight_decay", 0.0f); + new_op.SetAttr("beta1", beta1); + new_op.SetAttr("beta2", beta2); + new_op.SetAttr("eps", epsilon); + new_op.SetAttr("adam_mode", std::string{"adam"}); + // adam or adamw + if (op_type == "adam") { + new_op.SetAttr("weight_decay_mode", std::string{"l2_regularization"}); + new_op.SetAttr("raw_type", std::string{"adam"}); + } else { + new_op.SetAttr("weight_decay_mode", std::string{"decay"}); + new_op.SetAttr("raw_type", std::string{"adamw"}); + } + } else if (op_type == "adamax") { + auto type = std::string{"adam"}; + auto lr_var = op->Input("LearningRate").front(); + auto beta1 = BOOST_GET_CONST(float, op->GetAttr("beta1")); + auto beta2 = BOOST_GET_CONST(float, op->GetAttr("beta2")); + auto epsilon = BOOST_GET_CONST(float, op->GetAttr("epsilon")); + new_op.SetAttr("type", type); + new_op.SetAttr("lr_var", lr_var); + new_op.SetAttr("weight_decay", 0.0f); + new_op.SetAttr("beta1", beta1); + new_op.SetAttr("beta2", beta2); + new_op.SetAttr("eps", epsilon); + new_op.SetAttr("adam_mode", std::string{"adamax"}); + new_op.SetAttr("weight_decay_mode", std::string{"l2_regularization"}); + new_op.SetAttr("raw_type", op_type); + } else if (op_type == "lamb") { + // use decay mode + auto type = std::string{"adam"}; + auto lr_var = op->Input("LearningRate").front(); + auto weight_decay = BOOST_GET_CONST(float, op->GetAttr("weight_decay")); + auto beta1 = BOOST_GET_CONST(float, op->GetAttr("beta1")); + auto beta2 = BOOST_GET_CONST(float, op->GetAttr("beta2")); + auto epsilon = BOOST_GET_CONST(float, op->GetAttr("epsilon")); + new_op.SetAttr("type", type); + new_op.SetAttr("lr_var", lr_var); + new_op.SetAttr("weight_decay", weight_decay); + new_op.SetAttr("beta1", beta1); + new_op.SetAttr("beta2", beta2); + new_op.SetAttr("eps", epsilon); + new_op.SetAttr("adam_mode", std::string{"lamb"}); + new_op.SetAttr("weight_decay_mode", std::string{"decay"}); + new_op.SetAttr("raw_type", op_type); + } else if (op_type == 
"adadelta") { + // NO LearningRate + auto type = std::string{"adaptive"}; + auto rho = BOOST_GET_CONST(float, op->GetAttr("rho")); + auto epsilon = BOOST_GET_CONST(float, op->GetAttr("epsilon")); + new_op.SetAttr("type", type); + new_op.SetAttr("weight_decay", 0.0f); + new_op.SetAttr("alpha", rho); + new_op.SetAttr("eps", epsilon); + new_op.SetAttr("momentum", 0.0f); + new_op.SetAttr("adaptive_mode", std::string{"adadelta"}); + new_op.SetAttr("weight_decay_mode", std::string{"l2_regularization"}); + new_op.SetAttr("raw_type", op_type); + } else if (op_type == "adagrad") { + auto type = std::string{"adaptive"}; + auto lr_var = op->Input("LearningRate").front(); + auto epsilon = BOOST_GET_CONST(float, op->GetAttr("epsilon")); + new_op.SetAttr("type", type); + new_op.SetAttr("lr_var", lr_var); + new_op.SetAttr("weight_decay", 0.0f); + // `alpha` use default + new_op.SetAttr("alpha", 0.99f); + new_op.SetAttr("eps", epsilon); + new_op.SetAttr("momentum", 0.0f); + new_op.SetAttr("adaptive_mode", std::string{"adagrad"}); + new_op.SetAttr("weight_decay_mode", std::string{"l2_regularization"}); + new_op.SetAttr("raw_type", op_type); + } else if (op_type == "rmsprop") { + auto type = std::string{"adaptive"}; + auto lr_var = op->Input("LearningRate").front(); + auto epsilon = BOOST_GET_CONST(float, op->GetAttr("epsilon")); + auto decay = BOOST_GET_CONST(float, op->GetAttr("decay")); + auto momentum = BOOST_GET_CONST(float, op->GetAttr("momentum")); + auto centered = BOOST_GET_CONST(bool, op->GetAttr("centered")); + new_op.SetAttr("type", type); + new_op.SetAttr("weight_decay", 0.0f); + new_op.SetAttr("alpha", decay); + new_op.SetAttr("eps", epsilon); + new_op.SetAttr("momentum", momentum); + new_op.SetAttr("weight_decay_mode", std::string{"l2_regularization"}); + if (centered) { + new_op.SetAttr("adaptive_mode", std::string{"centered_rmsprop"}); + new_op.SetAttr("raw_type", op_type); + } else { + new_op.SetAttr("adaptive_mode", std::string{"rmsprop"}); + new_op.SetAttr("raw_type", op_type); + } + } else if (is_regularization && op_type == "scale") { + // set weight_decay for L2Decay + auto scale = BOOST_GET_CONST(float, op->GetAttr("scale")); + new_op.SetAttr("weight_decay", scale); + } else if (is_grad_clip && op_type == "fill_constant") { + // set clip_norm for ClipGradByGlobalNorm + auto value = BOOST_GET_CONST(float, op->GetAttr("value")); + new_op.SetAttr("clip_norm", value); + } else if (ignored_ops.count(op_type)) { + VLOG(10) << "Ignore optimizer releated op: " << op_type; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unknown optimizer releated op_type: %s", op_type)); } + } else if (op_role == OpRole::kLoss) { + VLOG(10) << "found loss op type: " << op->Type(); + auto outputs = op->Outputs(); + PADDLE_ENFORCE_EQ( + outputs.size(), 1, + platform::errors::InvalidArgument("Can only support one loss key")); + auto losses = outputs.begin()->second; + PADDLE_ENFORCE_EQ( + losses.size(), 1, + platform::errors::InvalidArgument("Can only support one loss name")); + auto loss_var = losses.front(); + new_op.SetAttr("loss_var", loss_var); + } else if (op_role == OpRole::kLRSched) { + // op_role == OpRole::kLRSched | OpRole::kOptimize + new_op.SetAttr("with_lr_sched", true); + } + } + + // seems with_lr_sched is always true + new_op.SetAttr("with_lr_sched", true); + + // setup weight deacy + // weight_decay/coeff is "scale" attr of scale_op + if (set_ops.count("scale") && set_ops.count("sum")) { + if (set_ops.count("sign")) { + // L1Decay + // sign + scale + sum + PADDLE_THROW( + 
platform::errors::Unimplemented("Unsupported L1Decay regularizer")); + } else { + // L2Decay + // scale + sum + new_op.SetAttr("weight_decay_mode", std::string{"l2_regularization"}); } + } else { + VLOG(10) << "No weight decay setting found"; + } + + // setup grad clip + if (set_ops.count("clip")) { + // ClipGradByValue + PADDLE_THROW( + platform::errors::Unimplemented("Unsupported ClipGradByValue")); + } else if (set_ops.count("clip_by_norm")) { + // ClipGradByNorm + PADDLE_THROW(platform::errors::Unimplemented("Unsupported ClipGradByNorm")); } - VLOG(10) << "Post Graph: "; - VLOG(10) << DebugString(graph); - VLOG(10) << "leave IpuOptimizerExtractPass::ApplyImpl"; + // ClipGradByGlobalNorm + // use graph pattern match ClipGradByGlobalNorm + // square + reduce_sum + sum + sqrt + fill_constant + // + elementwise_max + elementwise_div + elementwise_mul + // clip_norm from fill_constant's attr `value`, dtype float + + if (new_op.HasAttr("type")) { + auto new_node = graph->CreateOpNode(&new_op); + VLOG(10) << "New Optimizer Node:"; + VLOG(10) << DebugString(new_node); + } else { + PADDLE_THROW(platform::errors::NotFound( + "No optimizer found, optimizer must be one of these types: sgd, " + "momentum, adam, adamw, adamax, lamb, adadelta, adagrad or rmsprop")); + } } } // namespace ir diff --git a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.h b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.h index fd274ded8f5bd1641bd1eb4e6999a6fa38dca090..8b015fdc4f80288433f592b41f394576ef9b46ff 100644 --- a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.h +++ b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.h @@ -14,14 +14,13 @@ #pragma once -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" namespace paddle { namespace framework { namespace ir { -class IpuOptimizerExtractPass : public IPUPassBase { +class IpuOptimizerExtractPass : public Pass { protected: void ApplyImpl(ir::Graph* graph) const override; }; diff --git a/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.cc b/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.cc index c23bfdcb154f16a933c389d9d9032364995bd58c..4da913e7176ca3bf36b05514b5a6854e6fde806a 100644 --- a/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.cc +++ b/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.cc @@ -14,23 +14,19 @@ #include "paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" -#include "paddle/fluid/platform/device/ipu/common.h" #include "paddle/fluid/platform/device/ipu/ipu_backend.h" +#include "paddle/fluid/platform/device/ipu/ipu_names.h" namespace paddle { namespace framework { namespace ir { -using paddle::platform::ipu::IpuBackend; -using framework::ir::Graph; -using framework::ir::Node; - void IpuOptimizerStateAlignPass::ApplyImpl(ir::Graph* graph) const { VLOG(10) << "enter IpuOptimizerStateAlignPass::ApplyImpl"; VLOG(10) << "Raw Graph: "; VLOG(10) << DebugString(graph); - auto ipu_backend = IpuBackend::GetInstance(); + auto ipu_backend = platform::ipu::IpuBackend::GetInstance(); const auto* scope_ = ipu_backend->GetScope(); for (auto* node : graph->Nodes()) { diff --git a/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.h b/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.h index 21a1017d88452aa950949c44a862b78c11bc5793..96aca44d84a8acbf54ba134017141554e7855736 100644 --- a/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.h
+++ b/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.h @@ -15,7 +15,7 @@ #pragma once #include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" namespace paddle { namespace framework { @@ -26,7 +26,7 @@ namespace ir { * include Adam/Lamb. */ -class IpuOptimizerStateAlignPass : public IPUPassBase { +class IpuOptimizerStateAlignPass : public Pass { protected: void ApplyImpl(ir::Graph* graph) const override; }; diff --git a/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc b/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc index d2d76f9a9a2f92762f77ebc5e5f1ecb7d4d55bcb..975a4b62cc708859803a2137741caaf413e50210 100644 --- a/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc +++ b/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc @@ -21,15 +21,13 @@ namespace paddle { namespace framework { namespace ir { -using framework::ir::Graph; -using framework::ir::Node; -using platform::ipu::SymbolHandler; - void PopartCanonicalizationPass::ApplyImpl(ir::Graph* graph) const { VLOG(10) << "enter PopartCanonicalizationPass::ApplyImpl"; VLOG(10) << "Raw Graph: "; VLOG(10) << DebugString(graph); + auto custom_ops = Get<std::unordered_set<std::string>>("custom_ops"); + std::vector<std::string> missing_ops; auto nodes = graph->Nodes(); for (auto* node : nodes) { if (!node->IsOp()) { continue; } @@ -39,21 +37,40 @@ void PopartCanonicalizationPass::ApplyImpl(ir::Graph* graph) const { auto op_type = op->Type(); ir::Node* new_node = nullptr; - SymbolHandler handler = platform::ipu::GetHandler(op_type); + platform::ipu::SymbolHandler handler = platform::ipu::GetHandler(op_type); + if (!handler && !custom_ops.empty()) { + if (custom_ops.count(op_type)) { + VLOG(10) << "Found custom op: " << op_type; + handler = platform::ipu::GetHandler("custom_op"); + } + } + if (handler) { VLOG(11) << "Raw Paddle Node:"; VLOG(11) << node->Op()->Proto()->DebugString(); new_node = handler(graph, node); - VLOG(11) << "Post Popart Node:"; - VLOG(11) << new_node->Op()->Proto()->DebugString(); - - platform::ipu::ClearNode(node); - graph->RemoveNode(node); + if (new_node) { + VLOG(11) << "Post Popart Node:"; + VLOG(11) << new_node->Op()->Proto()->DebugString(); + platform::ipu::ClearNode(node); + graph->RemoveNode(node); + } } else { - LOG(ERROR) << "Can not find OpHandler for op_type: " << op_type; + missing_ops.push_back(op_type); } } + if (!missing_ops.empty()) { + LOG(ERROR) << "Can not find OpHandler for op_type: "; + for (auto& op_type : missing_ops) { + LOG(ERROR) << op_type; + } + PADDLE_THROW(platform::errors::Unimplemented( + "Found unimplemented op_handler(s) for IPU")); + } + + // post popart_canonicalization + VLOG(10) << "Post Graph: "; VLOG(10) << DebugString(graph); VLOG(10) << "leave PopartCanonicalizationPass::ApplyImpl"; @@ -64,4 +81,5 @@ void PopartCanonicalizationPass::ApplyImpl(ir::Graph* graph) const { } // namespace paddle REGISTER_PASS(popart_canonicalization_pass, - paddle::framework::ir::PopartCanonicalizationPass); + paddle::framework::ir::PopartCanonicalizationPass) + .DefaultPassAttr("custom_ops", new std::unordered_set<std::string>{}); diff --git a/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.h b/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.h index 6690873f2a9ac2532f02b59b241a2858dc978beb..b01453ea3668a120d43d29b8193e7643be4b6ae2 100644 --- a/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.h +++ b/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.h @@ -14,13 +14,13 @@ #pragma once
-#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" namespace paddle { namespace framework { namespace ir { -class PopartCanonicalizationPass : public IPUPassBase { +class PopartCanonicalizationPass : public Pass { protected: void ApplyImpl(ir::Graph* graph) const override; };
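
For readers of this patch, a minimal sketch of the op that IpuOptimizerExtractPass now emits, shown for a plain momentum optimizer. The attribute names mirror the branches in optimizer_extract_pass.cc above; the concrete values and the learning-rate variable name are illustrative assumptions only (the real lr_var is taken from the optimizer op's "LearningRate" input, and the initial learning rate is resolved later, during LowerOptimizer in the IPU backend).

#include <string>
#include "paddle/fluid/framework/op_desc.h"

// Sketch only: what the pass builds for a "momentum" op before adding it to the graph.
paddle::framework::OpDesc MakePopartOptimizerDesc() {
  paddle::framework::OpDesc new_op("popart_optimizer", {}, {}, {});
  new_op.SetAttr("op_role", 0);
  new_op.SetAttr("with_lr_sched", true);                      // the pass forces this to true at the end
  new_op.SetAttr("type", std::string{"sgd"});                 // popart optimizer family
  new_op.SetAttr("lr_var", std::string{"learning_rate_0"});   // hypothetical variable name
  new_op.SetAttr("momentum", 0.9f);                           // from the op's "mu" attr
  new_op.SetAttr("weight_decay", 0.0f);                       // regularization_coeff
  new_op.SetAttr("raw_type", std::string{"momentum"});
  return new_op;
}
// The pass then materializes this desc with graph->CreateOpNode(&new_op).

Collecting every hyperparameter on a single popart_optimizer node lets the later lowering step read the whole configuration from the graph, replacing the removed SetOptimizerType/SetOptimizerAttr/SetLRVarName calls into the backend executor.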