From d67fe921f41689d5f34bacd01de33139e519a969 Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Mon, 21 Mar 2022 10:29:38 +0800 Subject: [PATCH] [IPU] update ipu_backend (#40685) * sync changes * copy sOpNamescope * fix UTs * add authors Co-authored-by: Xiaobing Wang Co-authored-by: Allen Guo Co-authored-by: Zhixin Yao Co-authored-by: Zhaorui Chen Co-authored-by: Han Zhao * fix code-format * fix compile error * add comments for feed_op Co-authored-by: Xiaobing Wang Co-authored-by: Zhixin Yao Co-authored-by: Zhaorui Chen Co-authored-by: Han Zhao --- .../ir/ipu/optimizer_extract_pass.cc | 24 +- .../framework/ir/ipu/transfer_cast_op_pass.cc | 3 +- paddle/fluid/framework/tensor_util.cc | 80 ++++-- paddle/fluid/operators/controlflow/feed_op.cc | 7 + .../fluid/platform/device/ipu/CMakeLists.txt | 2 +- .../fluid/platform/device/ipu/ipu_backend.cc | 24 +- .../fluid/platform/device/ipu/ipu_backend.h | 18 +- .../fluid/platform/device/ipu/ipu_compiler.cc | 252 ++++++++++++++++-- .../fluid/platform/device/ipu/ipu_compiler.h | 13 + .../fluid/platform/device/ipu/ipu_executor.cc | 76 +++--- .../fluid/platform/device/ipu/ipu_executor.h | 18 +- paddle/fluid/platform/device/ipu/ipu_names.h | 2 + .../fluid/platform/device/ipu/ipu_strategy.cc | 42 ++- .../fluid/platform/device/ipu/ipu_strategy.h | 53 +++- .../ipu/popart_canonicalization/op_builder.cc | 30 +-- paddle/fluid/pybind/pybind.cc | 10 + .../unittests/ipu/test_ipu_strategy_ipu.py | 16 +- .../tests/unittests/ipu/test_save_load_ipu.py | 12 +- 18 files changed, 507 insertions(+), 175 deletions(-) diff --git a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc index 9fe50deaf2..7cdb7a8854 100644 --- a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc +++ b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc @@ -25,14 +25,14 @@ std::set ignored_ops = { "sum", "clip", "clip_by_norm", - "square", "reduce_sum", "sqrt", "elementwise_max", "elementwise_div", "elementwise_mul", - "scale", // adamax - "assign", // adamw + "scale", // adamax + "assign", // adamw + "squared_l2_norm" // gradient_clip_norm }; const bool startswith(const std::string& str, const std::string& pre) { @@ -62,6 +62,10 @@ void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const { new_op.SetAttr("with_lr_sched", false); std::set set_ops{}; + // save the weight decay tensor_name and weight_decay_value for Lamb + std::vector weight_decay_vars{}; + std::vector weight_decay_values{}; + // use map store ? 
for (auto* node : graph->Nodes()) { if (!node->IsOp()) { @@ -75,6 +79,15 @@ void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const { auto op_role = static_cast(op_role_); if (op_role == OpRole::kOptimize) { + // save weight decay value from every lamb optimizer op + if (op_type == "lamb" && op->HasAttr("weight_decay")) { + auto weight_decay_value = + BOOST_GET_CONST(float, op->GetAttr("weight_decay")); + auto params = op->Output("ParamOut"); + weight_decay_vars.push_back(params[0]); + weight_decay_values.push_back(weight_decay_value); + } + if (set_ops.count(op_type)) { continue; } @@ -270,7 +283,10 @@ void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const { // seems with_lr_sched is always true new_op.SetAttr("with_lr_sched", true); - // setup weight deacy + // setup weight decay for Lamb + new_op.SetAttr("weight_decay_vars", weight_decay_vars); + new_op.SetAttr("weight_decay_values", weight_decay_values); + // weight_decay/coeff is "scale" attr of scale_op if (set_ops.count("scale") && set_ops.count("sum")) { if (set_ops.count("sign")) { diff --git a/paddle/fluid/framework/ir/ipu/transfer_cast_op_pass.cc b/paddle/fluid/framework/ir/ipu/transfer_cast_op_pass.cc index e754ba72ad..5cd8358dc0 100644 --- a/paddle/fluid/framework/ir/ipu/transfer_cast_op_pass.cc +++ b/paddle/fluid/framework/ir/ipu/transfer_cast_op_pass.cc @@ -30,7 +30,8 @@ void TransferCastOpPass::ApplyImpl(ir::Graph* graph) const { auto ipu_backend = platform::ipu::IpuBackend::GetInstance(); auto enable_fp16 = ipu_backend->GetIpuStrategy()->enable_fp16; - if (enable_fp16) { + auto transfer_cast_op = ipu_backend->GetIpuStrategy()->transfer_cast_op; + if (enable_fp16 && transfer_cast_op) { for (auto* node : graph->Nodes()) { if (node->IsOp() && node->Op()->Type() == "popart_cast") { if (BOOST_GET_CONST(std::string, node->Op()->GetAttr("to")) == diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 5de8612354..e8cd84248e 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -79,18 +79,6 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#ifdef PADDLE_WITH_IPU - else if (platform::is_ipu_place(src_place) && // NOLINT - platform::is_cpu_place(dst_place)) { - memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } else if (platform::is_cpu_place(src_place) && - platform::is_ipu_place(dst_place)) { - memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } else if (platform::is_ipu_place(src_place) && - platform::is_ipu_place(dst_place)) { - memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } -#endif #ifdef PADDLE_WITH_CUSTOM_DEVICE else if (platform::is_custom_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { @@ -390,6 +378,29 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, "Copying from %s to %s is not supported.", src_place, dst_place)); } #endif +#ifdef PADDLE_WITH_IPU + else if (platform::is_ipu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); + } + else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_ipu_place(dst_place)) { + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); + } + else if (platform::is_ipu_place(src_place) && // NOLINT + platform::is_ipu_place(dst_place)) { + if 
(src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data sync from " << src_place << " to " + << dst_place; + return; + } + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); + } + else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "Copying from %s to %s is not supported.", src_place, dst_place)); + } +#endif } template @@ -447,27 +458,15 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#ifdef PADDLE_WITH_IPU - else if (platform::is_ipu_place(src_place) && // NOLINT - platform::is_cpu_place(dst_place)) { - memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } else if (platform::is_cpu_place(src_place) && // NOLINT - platform::is_ipu_place(dst_place)) { - memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } else { // NOLINT - PADDLE_THROW(platform::errors::Unimplemented( - "Copy from %s to %s is not supported.", src_place, dst_place)); - } -#endif #ifdef PADDLE_WITH_CUSTOM_DEVICE else if (platform::is_custom_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { /* custom_device -> cpu*/ memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); - } + } // NOLINT else if (platform::is_cpu_place(src_place) && // NOLINT platform::is_custom_place(dst_place)) { /* cpu -> custom_device*/ memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); - } + } // NOLINT else if (platform::is_custom_place(src_place) && // NOLINT platform::is_custom_place( dst_place)) { /* custom_device -> custom_device*/ @@ -483,11 +482,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, else if (platform::is_xpu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } + } // NOLINT else if (platform::is_cpu_place(src_place) && // NOLINT platform::is_xpu_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } + } // NOLINT else if (platform::is_xpu_place(src_place) && // NOLINT platform::is_xpu_place(dst_place)) { if (src_ptr == dst_ptr) { @@ -502,7 +501,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, auto xpu_ctx = platform::DeviceContextPool::Instance().Get(xpu_dst_place); xpu_ctx->Wait(); } - } + } // NOLINT else { // NOLINT PADDLE_THROW(platform::errors::Unimplemented( "Copy from %s to %s is not supported.", src_place, dst_place)); @@ -601,6 +600,29 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, "Copy from %s to %s is not supported.", src_place, dst_place)); } #endif +#ifdef PADDLE_WITH_IPU + else if (platform::is_ipu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); + } + else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_ipu_place(dst_place)) { + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); + } + else if (platform::is_ipu_place(src_place) && // NOLINT + platform::is_ipu_place(dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data sync from " << src_place << " to " + << dst_place; + return; + } + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); + } + else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "Copy from %s to %s is not supported.", src_place, dst_place)); + } +#endif } template diff --git 
a/paddle/fluid/operators/controlflow/feed_op.cc b/paddle/fluid/operators/controlflow/feed_op.cc index bc29c92b09..8a190c1a1e 100644 --- a/paddle/fluid/operators/controlflow/feed_op.cc +++ b/paddle/fluid/operators/controlflow/feed_op.cc @@ -40,6 +40,13 @@ class FeedVariableVisitor : public boost::static_visitor { out_var_->GetMutable(); if (platform::is_same_place(in_tensor.place(), place_)) { out_tensor->ShareDataWith(in_tensor); +#ifdef PADDLE_WITH_IPU + } else if (platform::is_ipu_place(place_)) { + // For ipu, both in_tensor and out_tensor are allocated on cpu, + // PopART will copy tensor from host automatically, + // no TensorCopy() is required here. + out_tensor->ShareDataWith(in_tensor); +#endif } else { platform::DeviceContext *context = platform::DeviceContextPool::Instance().Get(place_); diff --git a/paddle/fluid/platform/device/ipu/CMakeLists.txt b/paddle/fluid/platform/device/ipu/CMakeLists.txt index acf914c508..42c949f7fe 100644 --- a/paddle/fluid/platform/device/ipu/CMakeLists.txt +++ b/paddle/fluid/platform/device/ipu/CMakeLists.txt @@ -13,7 +13,7 @@ IF(WITH_IPU) "ipu_device.cc" ) - cc_library(ipu_backend SRCS ${IPU_BACKEND_SRC} DEPS popart-only graph graph_helper) + cc_library(ipu_backend SRCS ${IPU_BACKEND_SRC} DEPS popart-only graph graph_helper popdist) cc_library(ipu_info SRCS ${IPU_INFO_SRC} DEPS popart-only enforce) add_library(paddle_ipu SHARED ${PADDLE_IPU_SRC}) add_dependencies(paddle_ipu ipu_backend) diff --git a/paddle/fluid/platform/device/ipu/ipu_backend.cc b/paddle/fluid/platform/device/ipu/ipu_backend.cc index e0b3b08a23..012294d0ff 100644 --- a/paddle/fluid/platform/device/ipu/ipu_backend.cc +++ b/paddle/fluid/platform/device/ipu/ipu_backend.cc @@ -32,6 +32,7 @@ IpuBackend* IpuBackend::GetInstance() { IpuBackend::IpuBackend() { compiler_ = std::make_unique(); executor_ = std::make_unique(); + timer_ = std::make_unique(); } IpuBackend::~IpuBackend() { @@ -43,6 +44,7 @@ void IpuBackend::Compile(Graph* graph, const std::vector& feed_list, const std::vector& fetch_list) { VLOG(10) << "enter IpuBackend::Compile"; + is_compiled_ = false; compiler_->Prepare(graph); compiler_->InitInputs(feed_list); compiler_->LowerConstants(scope_); @@ -52,31 +54,25 @@ void IpuBackend::Compile(Graph* graph, if (ipu_strategy_->is_training) { compiler_->LowerOptimizer(scope_); } + if (!ipu_strategy_->onnx_dump_path.empty()) { + SaveModelProto(ipu_strategy_->onnx_dump_path); + } executor_->SetCompilerResources(compiler_->GetResources()); - + executor_->Prepare(compiler_->GetModelProto()); is_compiled_ = true; - // when call compile, means a new graph - is_prepared_ = false; VLOG(10) << "leave IpuBackend::Compile"; } void IpuBackend::Run(const std::vector& inputs, const std::vector& outputs, const framework::ExecutionContext& ctx) { - Prepare(); timer_->Start(); executor_->Run(inputs, outputs, ctx); timer_->Pause(); VLOG(10) << "[IPU Run]: " << timer_->ElapsedMS() << " (ms)"; } -void IpuBackend::Prepare() { - if (!is_prepared_) { - executor_->Prepare(compiler_->GetModelProto()); - timer_.reset(new platform::Timer()); - is_prepared_ = true; - } -} +void IpuBackend::WeightsToHost() { executor_->WeightsToHost(); } void IpuBackend::Detach() { executor_->Detach(); } @@ -101,12 +97,10 @@ void IpuBackend::SetIpuStrategy(const IpuStrategy& strategy) { } void IpuBackend::SaveModelProto(const std::string& path) { - if (ipu_strategy_->is_training && is_prepared_) { + if (ipu_strategy_->is_training && is_compiled_) { executor_->SaveModelToHost(path); - } else if (is_compiled_) { - 
compiler_->SaveModelProtoNoCheck(path); } else { - LOG(WARNING) << "Model is empty"; + compiler_->SaveModelProtoNoCheck(path); } } diff --git a/paddle/fluid/platform/device/ipu/ipu_backend.h b/paddle/fluid/platform/device/ipu/ipu_backend.h index 1244192490..0578d9face 100644 --- a/paddle/fluid/platform/device/ipu/ipu_backend.h +++ b/paddle/fluid/platform/device/ipu/ipu_backend.h @@ -60,6 +60,9 @@ class IpuBackend { const std::vector &outputs, const framework::ExecutionContext &ctx); + // Sync weights from IPU while training + void WeightsToHost(); + // detach IPU manually void Detach(); @@ -76,22 +79,17 @@ class IpuBackend { void SaveModelProto(const std::string &path); private: - void Prepare(); - - private: - std::unique_ptr compiler_; - std::unique_ptr executor_; - bool is_compiled_ = false; - bool is_prepared_ = false; - // not own const Scope *scope_ = nullptr; const IpuStrategy *ipu_strategy_ = nullptr; - private: - // time record for IpuBackend::Run + // own + std::unique_ptr compiler_; + std::unique_ptr executor_; std::unique_ptr timer_; + bool is_compiled_ = false; + DISABLE_COPY_AND_ASSIGN(IpuBackend); }; diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.cc b/paddle/fluid/platform/device/ipu/ipu_compiler.cc index cdb3f6f9b3..1a3e600058 100644 --- a/paddle/fluid/platform/device/ipu/ipu_compiler.cc +++ b/paddle/fluid/platform/device/ipu/ipu_compiler.cc @@ -18,6 +18,7 @@ #include #include #include + #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/platform/device/ipu/ipu_utils.h" @@ -25,13 +26,20 @@ namespace paddle { namespace platform { namespace ipu { -popart::AdamMode AdamModeFromStr(const std::string& str) { +popart::AdamMode AdamModeFromStr(const std::string& str, + const bool& use_no_bias_optimizer) { if (str == "adam") { - return popart::AdamMode::Adam; + if (!use_no_bias_optimizer) + return popart::AdamMode::Adam; + else + return popart::AdamMode::AdamNoBias; } else if (str == "adamax") { return popart::AdamMode::AdaMax; } else if (str == "lamb") { - return popart::AdamMode::Lamb; + if (!use_no_bias_optimizer) + return popart::AdamMode::Lamb; + else + return popart::AdamMode::LambNoBias; } else { PADDLE_THROW(platform::errors::InvalidArgument( "Uknown AdamMode: %s, AdamMode must be one of these values: adam, " @@ -70,6 +78,17 @@ popart::WeightDecayMode WeightDecayModeFromStr(const std::string& str) { } } +popart::DataType DataTypeFromStr(const std::string& str) { + if (str == "FLOAT") { + return popart::DataType::FLOAT; + } else if (str == "FLOAT16") { + return popart::DataType::FLOAT16; + } else { + PADDLE_THROW( + platform::errors::Unimplemented("Unsupported DataType: %s", str)); + } +} + template T GetAttrAllowNull(std::string attr, OpDesc* op_desc) { if (op_desc->HasAttr(attr)) { @@ -122,6 +141,17 @@ void Compiler::Prepare(const Graph* graph) { builder_ = popart::Builder::create(); resources_ = std::make_unique(); graph_helper_ = std::make_unique(graph); + // Set the flag of set_amp_for_all_ + for (auto* node : graph_helper_->sorted_ops) { + auto* op_desc = node->Op(); + auto op_type = op_desc->Type(); + if (op_type == "popart_matmul") { + if (op_desc->HasAttr(sAvailMemAttribute)) { + set_amp_for_all_ = false; + return; + } + } + } } void Compiler::RegisterOpFunc() { @@ -155,7 +185,9 @@ void Compiler::RegisterOpFunc() { auto debug_context = BuildDebugContext(op_desc); \ auto aiGraphcoreOpset = builder_->aiGraphcoreOpset1(); \ auto aiOnnxOpset = builder_->aiOnnxOpset11(); \ + PushNameScope(op_desc); \ auto output_ids = OnnxImpl(inputs 
Args, debug_context); \ + PopNameScope(op_desc); \ SetIpuIndexStage(output_ids, op_desc); \ SetAMPAttributes(output_ids, op_desc); \ SetSerializeAttributes(output_ids, op_desc); \ @@ -241,7 +273,9 @@ void Compiler::LowerConstants(const Scope* scope) { popart::TensorInfo tensor_info(PdDataType2PopartType(tensor->dtype()), shape); const_data.reset(new popart::ConstVoidData(tensor->data(), tensor_info)); + PushNameScope(op_desc); popart::TensorId result = builder_->aiOnnxOpset11().constant(*const_data); + PopNameScope(op_desc); SetIpuIndexStage(result, op_desc); resources_->tensors.emplace(tensor_name, result); } @@ -261,6 +295,10 @@ void Compiler::LowerWeights(const Scope* scope) { VLOG(10) << "found existed one, skip lowering Weight: " << var_name; continue; } + if (var_name.rfind("learning_rate", 0) == 0) { + VLOG(10) << "skip learning_rate_var: " << var_name; + continue; + } VLOG(10) << "lowering weight: " << var_name; auto var = scope->FindVar(var_name); @@ -273,10 +311,15 @@ void Compiler::LowerWeights(const Scope* scope) { } popart::TensorInfo tensor_info(dtype, shape); popart::ConstVoidData const_data{tensor.data(), tensor_info}; - popart::TensorId result = - builder_->addInitializedInputTensor(const_data, var_name); - resources_->tensors.emplace(var_name, result); - resources_->weights.push_back(result); + if (!node->outputs.empty()) { + auto op_node = node->outputs[0]; + PushNameScope(op_node->Op()); + popart::TensorId result = + builder_->addInitializedInputTensor(const_data, var_name); + PopNameScope(op_node->Op()); + resources_->tensors.emplace(var_name, result); + resources_->weights.push_back(var_name); + } } } } @@ -298,7 +341,10 @@ void Compiler::LowerBody() { } else if (op_type == "popart_checkpointoutput") { auto inputs = GetOpInputs(op_desc); auto outputs = GetOpOutputs(op_desc); + PushNameScope(op_desc); auto output_ids = builder_->checkpointOutput(inputs); + PopNameScope(op_desc); + SetIpuIndexStage(output_ids, op_desc); InsertTensors(outputs, output_ids); } else if (op_type == "popart_custom_op") { auto inputs = GetOpInputs(op_desc); @@ -313,9 +359,11 @@ void Compiler::LowerBody() { BOOST_GET_CONST(std::string, op_desc->GetAttr("__op_type")); VLOG(10) << "Build graph from custom op: " << __op_type; auto it = custom_ops_.find(__op_type); + PushNameScope(op_desc); auto output_ids = builder_->customOp(it->second.popart_op, it->second.popart_op.version, inputs, outputs.size(), attributes, debug_context); + PopNameScope(op_desc); SetIpuIndexStage(output_ids, op_desc); InsertTensors(outputs, output_ids); } else if (op_type == "popart_printtensor") { @@ -325,8 +373,10 @@ void Compiler::LowerBody() { auto print_gradient = BOOST_GET_CONST(int64_t, op_desc->GetAttr("print_gradient")); auto title = BOOST_GET_CONST(std::string, op_desc->GetAttr("title")); + PushNameScope(op_desc); auto output_ids = builder_->aiGraphcoreOpset1().printtensor( inputs, print_gradient, debug_context, title); + PopNameScope(op_desc); SetIpuIndexStage(output_ids, op_desc); InsertTensors(outputs, output_ids); } else { @@ -367,8 +417,31 @@ void Compiler::LowerOptimizer(const Scope* scope) { resources_->with_lr_sched = false; } VLOG(10) << "Set initial lr: " << resources_->lr; - auto loss_scaling = ipu_strategy_->loss_scaling; + + // Get the type of optimizer auto type = BOOST_GET_CONST(std::string, op_desc->GetAttr("type")); + // Set weight decay by tensor names for Lamb + auto weight_decay_vars = BOOST_GET_CONST( + std::vector, op_desc->GetAttr("weight_decay_vars")); + auto weight_decay_values = 
BOOST_GET_CONST( + std::vector, op_desc->GetAttr("weight_decay_values")); + // Get the maximum permissible value for gradient clipping + std::vector clip_norm_settings = {}; + if (op_desc->HasAttr("clip_norm")) { + auto clip_norm = BOOST_GET_CONST(float, op_desc->GetAttr("clip_norm")); + clip_norm_settings.push_back( + popart::ClipNormSettings::clipAllWeights(clip_norm)); + VLOG(10) << "Set the global gradient clipping with the maximum " + "permissible value: " + << clip_norm; + } + + // Values from ipu_strategy + auto loss_scaling = ipu_strategy_->loss_scaling; + auto accl1_type = DataTypeFromStr(ipu_strategy_->accl1_type); + auto accl2_type = DataTypeFromStr(ipu_strategy_->accl2_type); + auto accl3_type = DataTypeFromStr(ipu_strategy_->accl3_type); + if (type == "sgd") { auto weight_decay = BOOST_GET_CONST(float, op_desc->GetAttr("weight_decay")); @@ -376,12 +449,18 @@ void Compiler::LowerOptimizer(const Scope* scope) { resources_->optimizer_fn = [=](float lr) { return std::make_unique( popart::OptimizerValue(lr, false), - popart::OptimizerValue(weight_decay, true), + popart::OptimizerValue(weight_decay, false), popart::OptimizerValue(momentum, true), popart::SGD::getUnsetDampening(), popart::SGD::getUnsetVelocityScaling(), - popart::OptimizerValue(loss_scaling, true)); + popart::OptimizerValue(loss_scaling, true), clip_norm_settings); }; + resources_->eval_optimizer = std::make_unique( + popart::OptimizerValue(0.0, false), + popart::OptimizerValue(0.0, false), + popart::OptimizerValue(0.0, true), popart::SGD::getUnsetDampening(), + popart::SGD::getUnsetVelocityScaling(), + popart::OptimizerValue(loss_scaling, true), clip_norm_settings); } else if (type == "adam") { auto weight_decay = BOOST_GET_CONST(float, op_desc->GetAttr("weight_decay")); @@ -392,22 +471,79 @@ void Compiler::LowerOptimizer(const Scope* scope) { VLOG(10) << "set max_weight_norm: " << mwn; auto adam_mode_ = BOOST_GET_CONST(std::string, op_desc->GetAttr("adam_mode")); - auto adam_mode = AdamModeFromStr(adam_mode_); - auto weight_decay_mode_ = - BOOST_GET_CONST(std::string, op_desc->GetAttr("weight_decay_mode")); + auto adam_mode = + AdamModeFromStr(adam_mode_, ipu_strategy_->use_no_bias_optimizer); + auto weight_decay_mode_ = ipu_strategy_->weight_decay_mode; + if (weight_decay_mode_.empty()) { + weight_decay_mode_ = BOOST_GET_CONST( + std::string, op_desc->GetAttr("weight_decay_mode")); + } auto weight_decay_mode = WeightDecayModeFromStr(weight_decay_mode_); resources_->optimizer_fn = [=](float lr) { - return std::make_unique( - popart::OptimizerValue(lr, false), - popart::OptimizerValue(weight_decay, true), - popart::OptimizerValue(beta1, true), - popart::OptimizerValue(beta2, true), + if (adam_mode == popart::AdamMode::Lamb || + adam_mode == popart::AdamMode::LambNoBias) { + const std::map> + optimizer_value = {{"defaultLearningRate", {lr, false}}, + {"defaultBeta1", {beta1, false}}, + {"defaultBeta2", {beta2, false}}, + {"defaultEps", {eps, true}}, + {"lossScaling", {loss_scaling, true}}, + {"defaultMaxWeightNorm", {mwn, true}}}; + auto optimizer_instance = std::make_unique( + optimizer_value, adam_mode, weight_decay_mode, + popart::DataType::UNDEFINED, accl1_type, accl2_type, + clip_norm_settings); + for (int i = 0; i < weight_decay_vars.size(); i++) { + optimizer_instance->insertSpecific( + weight_decay_vars[i], + {{"weightDecay", {weight_decay_values[i], false}}}); + VLOG(10) << "Set Tensor " << weight_decay_vars[i] + << " weight decay as " << weight_decay_values[i]; + } + return optimizer_instance; + } else { + 
return std::make_unique( + popart::OptimizerValue(lr, false), + popart::OptimizerValue(weight_decay, false), + popart::OptimizerValue(beta1, false), + popart::OptimizerValue(beta2, false), + popart::OptimizerValue(eps, true), + popart::OptimizerValue(loss_scaling, true), + popart::OptimizerValue(mwn, true), adam_mode, weight_decay_mode, + popart::DataType::UNDEFINED, accl1_type, accl2_type, + clip_norm_settings); + } + }; + if (adam_mode == popart::AdamMode::Lamb || + adam_mode == popart::AdamMode::LambNoBias) { + const std::map> optimizer_value = + {{"defaultLearningRate", {0.0, false}}, + {"defaultBeta1", {beta1, false}}, + {"defaultBeta2", {beta2, false}}, + {"defaultEps", {eps, true}}, + {"lossScaling", {loss_scaling, true}}, + {"defaultMaxWeightNorm", {mwn, true}}}; + auto eval_optimizer = std::make_unique( + optimizer_value, adam_mode, weight_decay_mode, + popart::DataType::UNDEFINED, popart::DataType::FLOAT, + popart::DataType::FLOAT, clip_norm_settings); + for (int i = 0; i < weight_decay_vars.size(); i++) { + eval_optimizer->insertSpecific(weight_decay_vars[i], + {{"weightDecay", {0.0, false}}}); + } + resources_->eval_optimizer = std::move(eval_optimizer); + } else { + resources_->eval_optimizer = std::make_unique( + popart::OptimizerValue(0.0, false), + popart::OptimizerValue(0.0, false), + popart::OptimizerValue(beta1, false), + popart::OptimizerValue(beta2, false), popart::OptimizerValue(eps, true), popart::OptimizerValue(loss_scaling, true), popart::OptimizerValue(mwn, true), adam_mode, weight_decay_mode, popart::DataType::UNDEFINED, popart::DataType::FLOAT, - popart::DataType::FLOAT); - }; + popart::DataType::FLOAT, clip_norm_settings); + } } else if (type == "adaptive") { auto alpha = BOOST_GET_CONST(float, op_desc->GetAttr("alpha")); auto momentum = BOOST_GET_CONST(float, op_desc->GetAttr("momentum")); @@ -417,21 +553,33 @@ void Compiler::LowerOptimizer(const Scope* scope) { auto adaptive_mode_ = BOOST_GET_CONST(std::string, op_desc->GetAttr("adaptive_mode")); auto adaptive_mode = AdaptiveModeFromStr(adaptive_mode_); - auto weight_decay_mode_ = - BOOST_GET_CONST(std::string, op_desc->GetAttr("weight_decay_mode")); + auto weight_decay_mode_ = ipu_strategy_->weight_decay_mode; + if (weight_decay_mode_.empty()) { + weight_decay_mode_ = BOOST_GET_CONST( + std::string, op_desc->GetAttr("weight_decay_mode")); + } auto weight_decay_mode = WeightDecayModeFromStr(weight_decay_mode_); resources_->optimizer_fn = [=](float lr) { return std::make_unique( popart::OptimizerValue(lr, false), - popart::OptimizerValue(weight_decay, true), + popart::OptimizerValue(weight_decay, false), popart::OptimizerValue(alpha, true), popart::OptimizerValue(momentum, true), popart::OptimizerValue(eps, true), popart::OptimizerValue(loss_scaling, true), adaptive_mode, - weight_decay_mode, popart::DataType::UNDEFINED, - popart::DataType::FLOAT, popart::DataType::FLOAT, - popart::DataType::FLOAT); + weight_decay_mode, popart::DataType::UNDEFINED, accl1_type, + accl2_type, accl3_type); }; + resources_->eval_optimizer = std::make_unique( + popart::OptimizerValue(0.0, false), + popart::OptimizerValue(0.0, false), + popart::OptimizerValue(alpha, true), + popart::OptimizerValue(momentum, true), + popart::OptimizerValue(eps, true), + popart::OptimizerValue(loss_scaling, true), adaptive_mode, + weight_decay_mode, popart::DataType::UNDEFINED, + popart::DataType::FLOAT, popart::DataType::FLOAT, + popart::DataType::UNDEFINED); } else { PADDLE_THROW(platform::errors::Unimplemented( "optimizer %s is not implemented", 
type)); @@ -510,9 +658,32 @@ void Compiler::SetAMPAttributes(const std::string& tensor_id, const OpDesc* op_desc) { VLOG(10) << "enter Compiler::SetAMPAttributes"; if (op_desc->Type() == "popart_matmul") { - auto amp = ipu_strategy_->available_memory_proportion; - if (amp > 0.0f && amp <= 1.0) { - builder_->setAvailableMemoryProportion(tensor_id, amp); + if (set_amp_for_all_) { + auto amp = ipu_strategy_->available_memory_proportion; + if (amp < 0.0f || amp > 1.0) { + PADDLE_THROW(platform::errors::InvalidArgument( + "AvailableMemoryProportion %f is invalid, which should be set 0 <= " + "amp <= 1", + amp)); + } + if (amp > 0.0f) { + builder_->setAvailableMemoryProportion(tensor_id, amp); + } + } else { + if (op_desc->HasAttr(sAvailMemAttribute)) { + auto amp = BOOST_GET_CONST(float, op_desc->GetAttr(sAvailMemAttribute)); + if (amp < 0.0f || amp > 1.0) { + PADDLE_THROW(platform::errors::InvalidArgument( + "AvailableMemoryProportion %f is invalid, which should be set 0 " + "<= amp <= 1", + amp)); + } + if (amp > 0.0f) { + builder_->setAvailableMemoryProportion(tensor_id, amp); + VLOG(10) << "set available_memory_proportion for tensor: " + << tensor_id << " as " << amp; + } + } } } VLOG(10) << "leave Compiler::SetAMPAttributes"; @@ -602,6 +773,29 @@ popart::DebugContext Compiler::BuildDebugContext(const OpDesc* op) { return popart::DebugContext(op_identify_id); } +void Compiler::PushNameScope(const OpDesc* op) { + auto op_namescope = BOOST_GET_CONST(std::string, op->GetAttr(sOpNamescope)); + if (op_namescope == "/") { + return; + } + if (!op_namescope.empty()) { + op_namescope.pop_back(); + } + if (!op_namescope.empty()) { + op_namescope.erase(op_namescope.begin()); + } + VLOG(10) << "name_scope is: " << op_namescope; + builder_->pushNameScope(op_namescope); +} + +void Compiler::PopNameScope(const OpDesc* op) { + auto op_namescope = BOOST_GET_CONST(std::string, op->GetAttr(sOpNamescope)); + if (op_namescope == "/") { + return; + } + builder_->popNameScope(); +} + } // namespace ipu } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.h b/paddle/fluid/platform/device/ipu/ipu_compiler.h index 5d1e8c2727..2d00970bf1 100644 --- a/paddle/fluid/platform/device/ipu/ipu_compiler.h +++ b/paddle/fluid/platform/device/ipu/ipu_compiler.h @@ -50,6 +50,8 @@ struct CompilerResources { using OptimizerFn = std::function(float lr)>; OptimizerFn optimizer_fn; + // The eval mode of optimizer in training + std::unique_ptr eval_optimizer; public: popart::Optimizer *Optimizer() { return optimizer.get(); } @@ -110,6 +112,7 @@ class Compiler { void RegisterOpFunc(); std::vector GetOpInputs(const OpDesc *op); const std::vector &GetOpOutputs(const OpDesc *op); + const std::string GetNameScope(const OpDesc *op); popart::DebugContext BuildDebugContext(const OpDesc *op); void InsertTensors(const std::vector &output_names, @@ -126,6 +129,8 @@ class Compiler { const OpDesc *op_desc); void SetSerializeAttributes(const std::string &tensor_id, const OpDesc *op_desc); + void PushNameScope(const OpDesc *op); + void PopNameScope(const OpDesc *op); private: std::unique_ptr builder_; @@ -137,6 +142,14 @@ class Compiler { const IpuStrategy *ipu_strategy_ = nullptr; std::map custom_ops_; + + // Used to choose the way to set amp for Ops + // If anyone op has the attr sAvailMemAttribute, the + // available_memory_proportion from ipu_strategy + // will be ignored and the Ops are set by their own sAvailMemAttribute. 
Else, + // all relevant Ops will be set by + // the available_memory_proportion from ipu_strategy. + bool set_amp_for_all_ = true; }; } // namespace ipu diff --git a/paddle/fluid/platform/device/ipu/ipu_executor.cc b/paddle/fluid/platform/device/ipu/ipu_executor.cc index c124d58957..649b291244 100644 --- a/paddle/fluid/platform/device/ipu/ipu_executor.cc +++ b/paddle/fluid/platform/device/ipu/ipu_executor.cc @@ -64,15 +64,10 @@ void Executor::Prepare(const std::string &proto) { WeightsFromPaddle(); VLOG(10) << "Copy weights from paddle to popart...done"; - VLOG(10) << "Copy weights from host to device..."; - session_->weightsFromHost(); - VLOG(10) << "Copy weights from host to device...done"; - - if (ipu_strategy_->save_init_onnx) { - session_->modelToHost("test_init.onnx"); + if (ipu_strategy_->random_seed != std::numeric_limits::max()) { + VLOG(10) << "Setting random seed to: " << ipu_strategy_->random_seed; + session_->setRandomSeed(ipu_strategy_->random_seed); } - // init run step - step_ = 0; } void Executor::Run(const std::vector &inputs, @@ -120,11 +115,17 @@ void Executor::Run(const std::vector &inputs, VLOG(10) << "Prepared inputs/anchors"; if (ipu_strategy_->is_training && compiler_resources_->with_lr_sched) { - VLOG(10) << "Update learning_rate"; - auto new_lr = - GetSingleVarFromScope(scope_, compiler_resources_->lr_var); - VLOG(10) << "New Lr: " << new_lr; - auto *optimizer = compiler_resources_->UpdateOptimizer(new_lr); + popart::Optimizer *optimizer; + if (ipu_strategy_->runtime_options.enable_eval) { + VLOG(10) << "Switch optimizer to eval mode"; + optimizer = compiler_resources_->eval_optimizer.get(); + } else { + VLOG(10) << "Update learning_rate"; + auto new_lr = + GetSingleVarFromScope(scope_, compiler_resources_->lr_var); + VLOG(10) << "New Lr: " << new_lr; + optimizer = compiler_resources_->UpdateOptimizer(new_lr); + } auto *session = dynamic_cast(session_.get()); session->updateOptimizerFromHost(optimizer); } @@ -133,15 +134,13 @@ void Executor::Run(const std::vector &inputs, VLOG(10) << "Running..."; session_->run(stepio); VLOG(10) << "Running...done"; +} - step_++; - if (ipu_strategy_->is_training && - step_ % ipu_strategy_->save_per_n_step == 0) { - session_->weightsToHost(); +void Executor::WeightsToHost() { + if (ipu_strategy_->is_training && session_) { WeightsToPaddle(); - if (ipu_strategy_->save_onnx_checkpoint) { - session_->modelToHost("test_last" + std::to_string(step_) + ".onnx"); - } + } else { + LOG(WARNING) << "For a non-trainning graph, cannot sync weights from IPU."; } } @@ -153,6 +152,7 @@ void Executor::AcquireDevice() { } bool use_ipu_model = GetBoolEnv("POPLAR_IPUMODEL"); + bool enable_distribution = ipu_strategy_->enable_distribution; if (use_ipu_model) { std::map deviceOpts{ { @@ -162,6 +162,16 @@ void Executor::AcquireDevice() { }; device_ = popart::DeviceManager::createDeviceManager().createIpuModelDevice( deviceOpts); + } else if (enable_distribution) { + auto ipus_per_replica = ipu_strategy_->num_ipus / + ipu_strategy_->popart_options.replicatedGraphCount; + auto device_id = popdist_get_device(ipus_per_replica); + device_ = popart::DeviceManager::createDeviceManager().acquireDeviceById( + device_id); + PADDLE_ENFORCE_NOT_NULL( + device_, platform::errors::Unavailable( + "Can't attach IPU in distribution, ipu_num = %d.", + RequestIpus(ipu_strategy_->num_ipus))); } else { device_ = popart::DeviceManager::createDeviceManager().acquireAvailableDevice( @@ -185,28 +195,29 @@ void Executor::SetWeightsIO() { auto opt_type = 
compiler_resources_->optimizer_type; VLOG(10) << "SetWeightsIO for " << opt_type; auto pre_post_fix = GetOptPrePostfix(opt_type); - for (const auto &weight_id : compiler_resources_->weights) { + for (const auto &weight_pd : compiler_resources_->weights) { for (const auto &pair : pre_post_fix) { // pair.first : popart prefix, pair.second : paddle postfix - auto popart_var_name = pair.first + weight_id; - auto paddle_var_name = weight_id + pair.second; + auto weight_pop = compiler_resources_->tensors[weight_pd]; + auto popart_var = pair.first + weight_pop; + auto paddle_var = weight_pd + pair.second; - if (scope_->FindVar(paddle_var_name) == nullptr) { + if (scope_->FindVar(paddle_var) == nullptr) { continue; } - - if (!session_->hasInfo(popart_var_name)) { + if (!session_->hasInfo(popart_var)) { continue; } - auto var = scope_->GetVar(paddle_var_name); + VLOG(10) << "Connect paddle weight: " << paddle_var + << " with popart weight: " << popart_var; + auto var = scope_->GetVar(paddle_var); auto data_ptr = var->GetMutable()->data(); - - auto tensor_info = session_->getInfo(popart_var_name); - executor_resources_->weights_io.insert(popart_var_name, + auto tensor_info = session_->getInfo(popart_var); + executor_resources_->weights_io.insert(popart_var, {data_ptr, tensor_info}); executor_resources_->weights_and_opt_state.emplace_back( - std::make_pair(popart_var_name, paddle_var_name)); + std::make_pair(popart_var, paddle_var)); } } } @@ -284,6 +295,7 @@ void Executor::ConvertWeights(bool align_to_popart) { void Executor::WeightsFromPaddle() { ConvertWeights(true); session_->writeWeights(executor_resources_->weights_io); + session_->weightsFromHost(); } // |-----------------------------------------------------| @@ -297,13 +309,13 @@ void Executor::WeightsFromPaddle() { // Paddle -> halfToFloat: cast then save to paddle // Popart -> Paddle: copy from paddle to popart void Executor::WeightsToPaddle() { + session_->weightsToHost(); session_->readWeights(executor_resources_->weights_io); ConvertWeights(false); } void Executor::SaveModelToHost(const std::string &path) { if (session_) { - session_->weightsToHost(); WeightsToPaddle(); session_->modelToHost(path); } else { diff --git a/paddle/fluid/platform/device/ipu/ipu_executor.h b/paddle/fluid/platform/device/ipu/ipu_executor.h index b08b94b45f..c59e623ab2 100644 --- a/paddle/fluid/platform/device/ipu/ipu_executor.h +++ b/paddle/fluid/platform/device/ipu/ipu_executor.h @@ -20,6 +20,7 @@ limitations under the License. 
*/ #include #include #include +#include #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/scope.h" @@ -36,8 +37,7 @@ struct ExecutorResources { // map popart::WeightsIO weights_io; // pairs, include weights and optimizer states - std::vector> - weights_and_opt_state; + std::vector> weights_and_opt_state; }; class Executor { @@ -53,14 +53,12 @@ class Executor { const std::vector &outputs, const framework::ExecutionContext &ctx); + // sync weights from popart to paddle + void WeightsToHost(); + // detach IPU void Detach(); - void SetWeightsIO(); - void ConvertWeights(bool align_to_popart); - void WeightsFromPaddle(); - void WeightsToPaddle(); - // Scope void SetScope(const Scope *scope) { scope_ = scope; } @@ -79,6 +77,10 @@ class Executor { private: void AcquireDevice(); + void SetWeightsIO(); + void ConvertWeights(bool); + void WeightsFromPaddle(); + void WeightsToPaddle(); private: // not own @@ -92,8 +94,6 @@ class Executor { std::unique_ptr session_; // one OneSession means a graph std::unique_ptr executor_resources_; - - int step_ = 0; }; } // namespace ipu diff --git a/paddle/fluid/platform/device/ipu/ipu_names.h b/paddle/fluid/platform/device/ipu/ipu_names.h index a809a8c6e5..b8a6ceffb5 100644 --- a/paddle/fluid/platform/device/ipu/ipu_names.h +++ b/paddle/fluid/platform/device/ipu/ipu_names.h @@ -24,6 +24,8 @@ static constexpr const char *sIpuIndexAttr = "ipu_index"; static constexpr const char *sIpuStageAttr = "ipu_stage"; static constexpr const char *sMatmulSerializeFactor = "serialize_factor"; static constexpr const char *sMatmulSerializeMode = "serialize_mode"; +static constexpr const char *sAvailMemAttribute = "__available_memory"; +static constexpr const char *sOpNamescope = "op_namescope"; static constexpr const char *sOpIdentifyIdAttr = "op_identify_id"; static constexpr const char *sDebugInfoId = "__debug_info_id"; diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.cc b/paddle/fluid/platform/device/ipu/ipu_strategy.cc index e806b0b30e..6172d4d7dc 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.cc +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.cc @@ -62,23 +62,40 @@ IpuStrategy::IpuStrategy() { [&]() { return name; }) ADD_BOOL_OPTION(is_training); - ADD_BOOL_OPTION(save_init_onnx); - ADD_BOOL_OPTION(save_onnx_checkpoint); ADD_BOOL_OPTION(need_avg_shard); ADD_BOOL_OPTION(enable_fp16); + ADD_BOOL_OPTION(transfer_cast_op); + ADD_BOOL_OPTION(use_no_bias_optimizer); + ADD_BOOL_OPTION(enable_distribution); ADD_UINT64_OPTION(num_ipus); ADD_UINT64_OPTION(batches_per_step); ADD_UINT64_OPTION(micro_batch_size); - ADD_UINT64_OPTION(save_per_n_step); + ADD_UINT64_OPTION(random_seed); ADD_DOUBLE_OPTION(available_memory_proportion); ADD_DOUBLE_OPTION(loss_scaling); ADD_DOUBLE_OPTION(max_weight_norm); + ADD_STRING_OPTION(accl1_type); + ADD_STRING_OPTION(accl2_type); + ADD_STRING_OPTION(accl3_type); + ADD_STRING_OPTION(onnx_dump_path); + ADD_STRING_OPTION(weight_decay_mode); #undef ADD_STRING_OPTION #undef ADD_DOUBLE_OPTION #undef ADD_UINT64_OPTION #undef ADD_BOOL_OPTION +#define ADD_RUNTIME_BOOL_OPTION(name, aliased_name) \ + RegisterSetter(bool_options, #name, \ + [&](bool value) { runtime_options.aliased_name = value; }); \ + RegisterGetter(options_getter, options_type, #name, "bool", [&]() { \ + return std::to_string(runtime_options.aliased_name); \ + }) + + ADD_RUNTIME_BOOL_OPTION(runtime_options.enable_eval, enable_eval); + +#undef ADD_RUNTIME_BOOL_OPTION + #define ADD_POPART_ENUM_OPTION_ALIAS(name, aliased_name, EnumType) \ 
RegisterSetter(uint64_options, #name, [&](std::uint64_t value) { \ PADDLE_ENFORCE_LT( \ @@ -171,6 +188,7 @@ IpuStrategy::IpuStrategy() { ADD_POPART_UINT64_OPTION_ALIAS(merge_var_update_mem_threshold, mergeVarUpdateMemThreshold); ADD_POPART_UINT64_OPTION_ALIAS(loose_threshold_at_peak, looseThresholdAtPeak); + ADD_POPART_UINT64_OPTION_ALIAS(replicated_graph_count, replicatedGraphCount); ADD_POPART_UINT64_OPTION_ALIAS(accumulation_factor, accumulationFactor); ADD_POPART_UINT64_OPTION_ALIAS(swap_limit_scheduler, swapLimitScheduler); ADD_POPART_UINT64_OPTION_ALIAS(global_replication_factor, @@ -462,12 +480,30 @@ void IpuStrategy::SetTensorLocation(const std::string& tensor, } else if (opt == "use_io_tiles_to_store") { settings->location.storageTileSet = value > 0 ? popart::TileSet::IO : popart::TileSet::Compute; + } else if (opt == "sharding_domain_with_all") { + settings->location.shardingDomain = + popart::CommGroup(popart::CommGroupType::All, value); + } else if (opt == "sharding_domain_with_consecutive") { + settings->location.shardingDomain = + popart::CommGroup(popart::CommGroupType::Consecutive, value); + } else if (opt == "sharding_domain_with_orthogonal") { + settings->location.shardingDomain = + popart::CommGroup(popart::CommGroupType::Orthogonal, value); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Unknown option ' %s' for tensor location: %s", opt, tensor)); } } +void IpuStrategy::SetAccumulateOuterFragmentSettings( + const std::uint64_t& schedule, const std::vector& values) { + VLOG(10) << "SetAccumulateOuterFragmentSettings schedule:" << schedule; + auto schedule_ = + static_cast(schedule); + popart_options.accumulateOuterFragmentSettings = + popart::AccumulateOuterFragmentSettings(schedule_, values); +} + void IpuStrategy::AddCustomOp(const std::string& paddle_op, const std::string& popart_op, const std::string& domain, int version) { diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.h b/paddle/fluid/platform/device/ipu/ipu_strategy.h index 571fb1e163..786e2419cc 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.h +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.h @@ -24,6 +24,11 @@ namespace paddle { namespace platform { namespace ipu { +struct RuntimeOptions { + // enable the eval mode in training by switching optimizers. 
+ bool enable_eval = false; +}; + class IpuStrategy { public: IpuStrategy(); @@ -32,19 +37,24 @@ class IpuStrategy { // training flag, true for training bool is_training = true; - // save the onnx model lowered by paddle program description - bool save_init_onnx = false; - - // save the trained model - bool save_onnx_checkpoint = false; - // average sharding, debugging used bool need_avg_shard = false; // flag for fp16, true for pure fp16 bool enable_fp16 = false; - // Number ipus total needed, replica * ipu_per_replica + // enable transfer cast Op target from fp32 to fp16 in fp16 mode + bool transfer_cast_op = true; + + // The mode of Adam/Lamb optimizer + // false: The standard Adam/Lamb optimizer + // true: The Adam_No_Bias/Lamb_No_Bias optimizer from PopART + bool use_no_bias_optimizer = false; + + // enable distributed computing for POD128 or POD256 + bool enable_distribution = false; + + // Number ipus total needed, local_replica * ipu_per_replica int num_ipus = 1; // batches per step @@ -53,8 +63,8 @@ class IpuStrategy { // micro batch-size int micro_batch_size = 1; - // save paddle model per n steps - int save_per_n_step = 1; + // random seed + std::uint64_t random_seed = std::numeric_limits::max(); // TODO(alleng) remove this param // available memory proportion, 0.0f for disable @@ -67,6 +77,29 @@ class IpuStrategy { // defaultMaxWeightNorm for adam optimizer float max_weight_norm = 65504.0f; + // file path for dumping compiled model in onnx format + std::string onnx_dump_path; + + // Data type to use for tensor that stores first-order momentum optimizer + // state. FLOAT or FLOAT16 + std::string accl1_type = "FLOAT"; + + // Data type to use for tensor that stores second-order momentum optimizer + // state. FLOAT or FLOAT16 + std::string accl2_type = "FLOAT"; + + // Data type to use for tensor that stores third-order momentum optimizer + // state. 
FLOAT or FLOAT16 + std::string accl3_type = "FLOAT"; + + // WeightDecayMode for setting the optimizer + // if set, it will override other settings + // value must be one of "decay" or "l2_regularization" or not set + std::string weight_decay_mode = ""; + + // Runtime Options + RuntimeOptions runtime_options; + // popart session option popart::SessionOptions popart_options; @@ -86,6 +119,8 @@ class IpuStrategy { const std::string &value); void SetTensorLocation(const std::string &tensor, const std::string &option, std::uint64_t value); + void SetAccumulateOuterFragmentSettings(const std::uint64_t &schedule, + const std::vector &values); void AddCustomOp(const std::string &paddle_op, const std::string &popart_op, const std::string &domain, int version); diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc index 3ec1999edc..0339097d58 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc @@ -32,30 +32,10 @@ const std::string GenerateOpName() { const std::string CreateOpIdentifyId(Node *node) { // format: - // if has custom op_namescope: - // {op_namescope}/op_type/_gen_* - // else: - // {op_type}/{out_var0}/{out_var1}/.../_gen_* + // op_type/_gen_* // this name will be used as op name when exporting onnx model from popart auto op_type = node->Name(); - std::string op_namescope; - if (node->Op()->HasAttr("op_namescope")) { - op_namescope = - BOOST_GET_CONST(std::string, node->Op()->GetAttr("op_namescope")); - } else { - op_namescope = "/"; - } - - if (op_namescope != "/") { - return {op_namescope + op_type + "/" + GenerateOpName()}; - } else { - std::string op_out = ""; - for (auto *out_node : node->outputs) { - op_out += "/"; - op_out += out_node->Name(); - } - return {op_type + op_out + "/" + GenerateOpName()}; - } + return {op_type + "/" + GenerateOpName()}; } Node *MakeVarNode(Graph *graph, Node *node) { @@ -122,6 +102,12 @@ Node *CreateBaseOp(Graph *graph, Node *node, const std::string &type, if (node->Op()->HasAttr(sMatmulSerializeMode)) { CopyOpAttr(sMatmulSerializeMode, node->Op(), new_node->Op()); } + if (node->Op()->HasAttr(sAvailMemAttribute)) { + CopyOpAttr(sAvailMemAttribute, node->Op(), new_node->Op()); + } + if (node->Op()->HasAttr(sOpNamescope)) { + CopyOpAttr(sOpNamescope, node->Op(), new_node->Op()); + } { new_node->Op()->SetAttr(sOpIdentifyIdAttr, CreateOpIdentifyId(node)); new_node->Op()->Flush(); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index ed42d0792e..bbaa7e3dd6 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -4264,6 +4264,7 @@ All parameter, weight, gradient are variables in Paddle. platform::ipu::IpuBackend::GetInstance()); }, py::return_value_policy::reference) + .def("weights_to_host", &platform::ipu::IpuBackend::WeightsToHost) .def("detach", &platform::ipu::IpuBackend::Detach) .def("reset", &platform::ipu::IpuBackend::Reset) .def("set_scope", &platform::ipu::IpuBackend::SetScope) @@ -4311,6 +4312,15 @@ All parameter, weight, gradient are variables in Paddle. 
option_name, option.first.cast(), option.second.cast()); } + } else if (option_name == "accumulate_outer_fragment") { + for (auto option : element.second.cast()) { + std::vector values; + for (auto value : option.second.cast()) { + values.push_back(value.cast()); + } + self.SetAccumulateOuterFragmentSettings( + option.first.cast(), values); + } } else if (option_name == "custom_op") { std::string paddle_op; std::string popart_op; diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py index f120f55949..debd9ed198 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py @@ -26,7 +26,13 @@ class TestIpuStrategy(unittest.TestCase): def test_set_options(self): ipu_strategy = paddle.static.IpuStrategy() all_option_names = ipu_strategy._ipu_strategy.get_all_option_names() + skip_options = [] + skip_options.append('random_seed') + for option_name in all_option_names: + if option_name in skip_options: + continue + option = ipu_strategy._ipu_strategy.get_option(option_name) option_type = option['type'] option_value = option['value'] @@ -38,9 +44,13 @@ class TestIpuStrategy(unittest.TestCase): set_value = not option_value else: continue - ipu_strategy.set_options({option_name: set_value}) - new_value = ipu_strategy.get_option(option_name) - assert new_value == set_value, f"set {option_name} to {set_value} failed" + + try: + ipu_strategy.set_options({option_name: set_value}) + new_value = ipu_strategy.get_option(option_name) + assert new_value == set_value, f"set {option_name} to {set_value} failed" + except: + raise Exception(f"set {option_name} to {set_value} failed") def test_set_string_options(self): ipu_strategy = paddle.static.IpuStrategy() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py index 3a69487306..ba6eb4d38b 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py @@ -95,12 +95,9 @@ class TestBase(IPUOpTest): is_training=self.attrs['is_training']) ipu_strategy.set_precision_config( enable_fp16=self.attrs['enable_fp16']) - ipu_strategy.set_options({ - 'save_per_n_step': self.attrs['save_at_step'] - }) - program = paddle.static.IpuCompiledProgram( - main_prog, ipu_strategy=ipu_strategy).compile( - self.feed_list, fetch_list) + ipu_program = paddle.static.IpuCompiledProgram( + main_prog, ipu_strategy=ipu_strategy) + program = ipu_program.compile(self.feed_list, fetch_list) result = [] run_steps = self.attrs['steps'] if save_otherwise_load \ @@ -111,10 +108,9 @@ class TestBase(IPUOpTest): for i in range(run_steps): tmp = exe.run(program, feed=feed, fetch_list=fetch_list) - # currently, we update opt state every sess.run, - # will optimize if save_otherwise_load and \ i == self.attrs['save_at_step'] - 1: + ipu_program._backend.weights_to_host() paddle.static.save(main_prog, self.attrs['model_path'].name) -- GitLab
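
For reference, a minimal Python usage sketch of the knobs added by this patch (onnx_dump_path, random_seed, runtime_options.enable_eval) and the new explicit weights_to_host() sync that replaces save_per_n_step. This is illustrative only, not part of the patch: it assumes a Paddle build compiled with WITH_IPU, and main_prog / feed_list / fetch_list / exe stand in for a model and executor defined elsewhere.

import paddle

paddle.enable_static()

# Build the strategy and set a few of the options registered in ipu_strategy.cc.
ipu_strategy = paddle.static.IpuStrategy()
ipu_strategy.set_options({
    'is_training': True,
    'random_seed': 42,                        # forwarded to session->setRandomSeed()
    'onnx_dump_path': 'compiled_model.onnx',  # dump the lowered graph after Compile()
    'runtime_options.enable_eval': False,     # switch to the eval optimizer at run time
})

# Compile and run as in test_save_load_ipu.py (placeholders, shown as comments):
# ipu_program = paddle.static.IpuCompiledProgram(main_prog, ipu_strategy=ipu_strategy)
# program = ipu_program.compile(feed_list, fetch_list)
# for feed in data_loader():
#     exe.run(program, feed=feed, fetch_list=fetch_list)
#
# Weights must now be synced back from the IPU explicitly before saving,
# since the old save_per_n_step / save_onnx_checkpoint options were removed:
# ipu_program._backend.weights_to_host()
# paddle.static.save(main_prog, 'ipu_model')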