From b2aee3e3391d692fa6f639bd87f72fd14ea5b3f8 Mon Sep 17 00:00:00 2001
From: Allen Guo
Date: Mon, 17 Jan 2022 17:09:39 +0800
Subject: [PATCH] [IPU] update ipu_backend p0 (#38854)

* update ipu_backend

* sync with paddle internal

Co-authored-by: Xiaobing Wang
Co-authored-by: Allen Guo
Co-authored-by: Zhixin Yao
Co-authored-by: Haicheng Jiang
Co-authored-by: Han Zhao

* apply comments 01

* update error message

* restore ipu_executor and ipu_optimizer

* add clang-format on

Co-authored-by: Xiaobing Wang
Co-authored-by: Zhixin Yao
Co-authored-by: Haicheng Jiang
Co-authored-by: Han Zhao
---
 .../fluid/platform/device/ipu/CMakeLists.txt  |  23 +-
 paddle/fluid/platform/device/ipu/device.cc    |  39 --
 .../fluid/platform/device/ipu/ipu_backend.cc  | 176 ++-----
 .../fluid/platform/device/ipu/ipu_backend.h   |  72 ++-
 .../fluid/platform/device/ipu/ipu_compiler.cc | 477 +++++++++++++-----
 .../fluid/platform/device/ipu/ipu_compiler.h  | 109 ++--
 .../fluid/platform/device/ipu/ipu_device.cc   |  55 ++
 .../device/ipu/{device.h => ipu_device.h}     |  22 +-
 paddle/fluid/platform/device/ipu/ipu_info.cc  |  13 +-
 paddle/fluid/platform/device/ipu/ipu_info.h   |   2 +
 .../device/ipu/{common.h => ipu_names.h}      |   3 +
 .../fluid/platform/device/ipu/ipu_strategy.cc |  21 +-
 .../fluid/platform/device/ipu/ipu_strategy.h  |  68 ++-
 paddle/fluid/platform/device/ipu/ipu_utils.cc |  37 +-
 paddle/fluid/platform/device/ipu/ipu_utils.h  | 159 +++++-
 .../device/ipu/supported_ops_custom.h         |  21 +
 16 files changed, 873 insertions(+), 424 deletions(-)
 delete mode 100644 paddle/fluid/platform/device/ipu/device.cc
 create mode 100644 paddle/fluid/platform/device/ipu/ipu_device.cc
 rename paddle/fluid/platform/device/ipu/{device.h => ipu_device.h} (65%)
 rename paddle/fluid/platform/device/ipu/{common.h => ipu_names.h} (85%)
 create mode 100644 paddle/fluid/platform/device/ipu/supported_ops_custom.h

diff --git a/paddle/fluid/platform/device/ipu/CMakeLists.txt b/paddle/fluid/platform/device/ipu/CMakeLists.txt
index 9be12cbf6d4..5f711937a80 100644
--- a/paddle/fluid/platform/device/ipu/CMakeLists.txt
+++ b/paddle/fluid/platform/device/ipu/CMakeLists.txt
@@ -1,12 +1,19 @@
 IF(WITH_IPU)
   FILE(GLOB POPART_CANONICALIZATION_SRC ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/device/ipu/popart_canonicalization/*.cc)
-  cc_library(ipu_device SRCS device.cc DEPS enforce popart)
-  cc_library(ipu_utils SRCS ipu_utils.cc DEPS memory framework_proto popart)
-  cc_library(ipu_strategy SRCS ipu_strategy.cc DEPS popart graph framework_proto enforce)
-  cc_library(ipu_optimizer SRCS ipu_optimizer.cc DEPS popart enforce)
-  cc_library(ipu_executor SRCS ipu_executor.cc DEPS ipu_optimizer ipu_utils popart graph framework_proto)
-  cc_library(popart_canonicalization_utils SRCS ${POPART_CANONICALIZATION_SRC} DEPS framework_proto enforce ipu_utils)
-  cc_library(ipu_compiler SRCS ipu_compiler.cc DEPS popart graph ipu_utils graph_helper)
-  cc_library(ipu_backend SRCS ipu_backend.cc DEPS popart ipu_compiler graph framework_proto enforce ipu_utils ipu_strategy ipu_device ipu_executor graph_helper)
+  list(APPEND PADDLE_IPU_SRC ${POPART_CANONICALIZATION_SRC})
+  set(PADDLE_IPU_LIB "${CMAKE_CURRENT_BINARY_DIR}/libpaddle_ipu.so" CACHE STRING "")
+  set(PADDLE_IPU_LIB_DIR "${CMAKE_CURRENT_BINARY_DIR}" CACHE STRING "")
+  set(IPU_BACKEND_SRC
+    "ipu_device.cc"
+    "ipu_strategy.cc"
+    "ipu_executor.cc"
+    "ipu_compiler.cc"
+    "ipu_backend.cc"
+    "ipu_utils.cc"
+  )
+
+  cc_library(ipu_backend SRCS ${IPU_BACKEND_SRC} DEPS popart graph framework_proto enforce graph_helper timer)
   cc_library(ipu_info SRCS ipu_info.cc DEPS 
ipu_backend) + cc_library(paddle_ipu SHARED SRCS ${PADDLE_IPU_SRC} DEPS popart) + add_dependencies(paddle_ipu ipu_backend) ENDIF() diff --git a/paddle/fluid/platform/device/ipu/device.cc b/paddle/fluid/platform/device/ipu/device.cc deleted file mode 100644 index 47e6475089d..00000000000 --- a/paddle/fluid/platform/device/ipu/device.cc +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/platform/device/ipu/device.h" - -namespace paddle { -namespace platform { -namespace ipu { - -Device::Device(const popart::DeviceInfo& device_info) - : id_(device_info.getId()), is_attached_(device_info.isAttached()) { - popart::DeviceType popart_device_type = device_info.getType(); - switch (popart_device_type) { - case popart::DeviceType::IpuModel: - device_type_ = DeviceType::IpuModel; - break; - case popart::DeviceType::Ipu: - device_type_ = DeviceType::Ipu; - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "popart::DeviceType:Unsupported type %d", popart_device_type)); - } -} - -} // namespace ipu -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_backend.cc b/paddle/fluid/platform/device/ipu/ipu_backend.cc index cd0f5ae554c..2471e15e09e 100644 --- a/paddle/fluid/platform/device/ipu/ipu_backend.cc +++ b/paddle/fluid/platform/device/ipu/ipu_backend.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/platform/ipu/ipu_backend.h" -#include "paddle/fluid/platform/ipu/ipu_utils.h" +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" +#include "paddle/fluid/platform/device/ipu/ipu_utils.h" #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/ir/graph.h" @@ -24,170 +24,92 @@ namespace paddle { namespace platform { namespace ipu { -std::shared_ptr IpuBackend::instance_ = nullptr; +IpuBackend* IpuBackend::GetInstance() { + static IpuBackend instance; + return &instance; +} IpuBackend::IpuBackend() { - compiler_ = std::make_shared(); + compiler_ = std::make_unique(); executor_ = std::make_unique(); } -void IpuBackend::Clear() { +IpuBackend::~IpuBackend() { + compiler_.reset(); executor_.reset(); - // detach device - if (device_ != nullptr && device_->isAttached()) { - device_->detach(); - device_.reset(); - device_ = nullptr; - } -} - -IpuBackend::~IpuBackend() { Clear(); } - -std::shared_ptr IpuBackend::GetInstance() { - if (!instance_) { - instance_.reset(new IpuBackend()); - } - return instance_; -} - -// This api should only call from python, always return a new object -std::shared_ptr IpuBackend::GetNewInstance() { - instance_.reset(new IpuBackend()); - return instance_; } -void IpuBackend::Compile(framework::ir::Graph* graph, +void IpuBackend::Compile(Graph* graph, const std::vector& feed_list, const std::vector& fetch_list) { VLOG(10) << "enter IpuBackend::Compile"; + compiler_->Prepare(); + executor_->SetCompilerResources(compiler_->GetResources()); + compiler_->InitInputs(graph, feed_list); + compiler_->LowerConstants(graph, scope_); compiler_->LowerWeights(graph, scope_); compiler_->LowerBody(graph); compiler_->InitOutputs(fetch_list); - executor_->SetWeights(compiler_->GetWeights()); + if (ipu_strategy_->is_training) { + compiler_->LowerOptimier(graph, scope_); + } + is_compiled_ = true; + // when call compile, means a new graph + is_prepared_ = false; VLOG(10) << "leave IpuBackend::Compile"; } -void IpuBackend::Run(const std::vector& inputs, - const std::vector& outputs, +void IpuBackend::Run(const std::vector& inputs, + const std::vector& outputs, const framework::ExecutionContext& ctx) { Prepare(); - auto inputs_id = compiler_->GetInputs(); - auto outputs_id = compiler_->GetOutputs(); - executor_->Run(inputs_id, inputs, outputs_id, outputs, ctx); + timer_->Start(); + executor_->Run(inputs, outputs, ctx); + timer_->Pause(); + VLOG(10) << "[IPU Run]: " << timer_->ElapsedMS() << " (ms)"; } void IpuBackend::Prepare() { - if (is_prepared_) { - return; - } else { + if (!is_prepared_) { + executor_->Prepare(compiler_->GetModelProto()); + timer_.reset(new platform::Timer()); is_prepared_ = true; } - // convert Model to fp16 - if (ipu_strategy_->enable_fp16) { - compiler_->ConvertProtoToFp16(); - } - auto proto = compiler_->GetModelProto(); - auto tensors = compiler_->GetTensors(); - auto outputs = compiler_->GetOutputs(); - executor_->Prepare(proto, tensors, outputs, device_); } -void IpuBackend::SetScope(const framework::Scope& scope) { +void IpuBackend::Detach() { executor_->Detach(); } + +void IpuBackend::Reset() { + executor_->Detach(); + compiler_.reset(); + executor_.reset(); +} + +void IpuBackend::SetScope(const Scope& scope) { scope_ = &scope; executor_->SetScope(&scope); } void IpuBackend::SetIpuStrategy(const IpuStrategy& strategy) { ipu_strategy_ = &strategy; - executor_->SetIpuStrategy(strategy); compiler_->SetIpuStrategy(strategy); + executor_->SetIpuStrategy(strategy); } -size_t IpuBackend::GetNumDevices() { - // 
IpuModel - bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL"); - if (ipu_model) return 1; - // Real dev - size_t num_devices = - popart::DeviceManager::createDeviceManager().enumerateDevices().size(); - PADDLE_ENFORCE_GT( - num_devices, 0, - platform::errors::Unavailable( - "Do not found any IPU devices, please make " - "sure Poplar sdk is enabled or enable ENV \"POPLAR_IPUMODEL=1\"")); - return num_devices; -} - -std::vector IpuBackend::GetDeviceIds() { - bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL"); - if (ipu_model) { - return {0}; - } - std::vector device_ids; - auto devices = - popart::DeviceManager::createDeviceManager().enumerateDevices(); - PADDLE_ENFORCE_GT( - devices.size(), 0, - platform::errors::Unavailable("Do not found any IPU devices, please make " - "sure Poplar sdk is enabled.")); - - for (auto device : devices) { - device_ids.push_back(device->getId()); - } - - return device_ids; -} - -Device IpuBackend::GetDevice(int id) { - bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL"); - if (ipu_model) { - std::map deviceOpts{{"numIPUs", "1 "}}; - device_ = popart::DeviceManager::createDeviceManager().createIpuModelDevice( - deviceOpts); - Device device(*device_.get()); - return device; - } - size_t num_devices = GetNumDevices(); - if (id < 0 || id >= num_devices) { - PADDLE_THROW(platform::errors::InvalidArgument( - "device id %d is invalid, number devices is %d", id, num_devices)); - } - std::shared_ptr popart_device_info = - popart::DeviceManager::createDeviceManager().getDevice( - popart::SyncPattern::Full, id); - Device device(*popart_device_info.get()); - return device; -} - -void IpuBackend::AttachDevice(int id) { - // trick here - // Compiler ipu is not same as the runtime ipu. - VLOG(10) << "comile ipu id = " << id; - bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL"); - if (ipu_model) { - return; - } - device_ = popart::DeviceManager::createDeviceManager().acquireAvailableDevice( - UpperIpuNum()); - PADDLE_ENFORCE_NOT_NULL( - device_, platform::errors::Unavailable("Can't attach IPU, ipu_num = %d.", - UpperIpuNum())); +void IpuBackend::SetCustomOps( + const std::vector& custom_ops) { + compiler_->SetCustomOps(custom_ops); } -bool IpuBackend::DeviceIsAttached() { return device_ != nullptr; } - -// num_ipus must be pow(2,n); -int IpuBackend::UpperIpuNum() { - PADDLE_ENFORCE_GT(ipu_strategy_->num_ipus, 0, - platform::errors::Unavailable( - "The ipu num get is wrong, please make sure the " - "sharding or pipline parameter is right.")); - int i = 0; - while (std::pow(2, i) < ipu_strategy_->num_ipus) { - i++; +void IpuBackend::SaveMoldeProto(const std::string& path) { + if (ipu_strategy_->is_training && is_prepared_) { + executor_->SaveModelToHost(path); + } else if (is_compiled_) { + compiler_->SaveModelProtoNoCheck(path); + } else { + LOG(WARNING) << "Model is empty"; } - return std::pow(2, i); } } // namespace ipu diff --git a/paddle/fluid/platform/device/ipu/ipu_backend.h b/paddle/fluid/platform/device/ipu/ipu_backend.h index 769a1b5b52a..122a3e08370 100644 --- a/paddle/fluid/platform/device/ipu/ipu_backend.h +++ b/paddle/fluid/platform/device/ipu/ipu_backend.h @@ -14,88 +14,86 @@ limitations under the License. 
*/ #pragma once -#include #include #include +#include -#include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device/ipu/ipu_compiler.h" +#include "paddle/fluid/platform/device/ipu/ipu_device.h" +#include "paddle/fluid/platform/device/ipu/ipu_executor.h" +#include "paddle/fluid/platform/device/ipu/ipu_strategy.h" +#include "paddle/fluid/platform/device/ipu/ipu_utils.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/ipu/device.h" -#include "paddle/fluid/platform/ipu/ipu_compiler.h" -#include "paddle/fluid/platform/ipu/ipu_executor.h" -#include "paddle/fluid/platform/ipu/ipu_strategy.h" +#include "paddle/fluid/platform/timer.h" namespace paddle { namespace platform { namespace ipu { +// IpuBackend is the center of paddle-ipu, its function include: +// 1. Compile paddle model to popart model +// 2. Run popart model, inference or training +// 3. Request and release device +// 4. Other helper function class IpuBackend { - // IpuBackend is the center of paddle-ipu, its function include: - // 1. Compile paddle model to popart model - // 2. Run popart model, inference or training - // 3. Request and release device - // 4. Other helper function + public: + static IpuBackend *GetInstance(); public: IpuBackend(); ~IpuBackend(); - void Clear(); - - // return if exsits, else create and return - static std::shared_ptr GetInstance(); - - // always return a new instance_ - static std::shared_ptr GetNewInstance(); - // what compile does include(call compiler_): // 1. map paddle-op -> poart op // 2. construct popart onnx compute graph - void Compile(framework::ir::Graph *graph, - const std::vector &feed_list, + void Compile(Graph *graph, const std::vector &feed_list, const std::vector &fetch_list); // what run does include: // 1. construct forward onnx graph // 2. graph-level optimization // 3. 
autodiff - void Run(const std::vector &inputs, - const std::vector &outputs, + void Run(const std::vector &inputs, + const std::vector &outputs, const framework::ExecutionContext &ctx); - Executor &GetExecutor() { return *executor_; } + // detach IPU manually + void Detach(); + + // reset manually + // call it before destruct works + void Reset(); - void SetScope(const framework::Scope &scope); - const framework::Scope *GetScope() { return scope_; } + void SetScope(const Scope &scope); + const Scope *GetScope() { return scope_; } void SetIpuStrategy(const IpuStrategy &strategy); const IpuStrategy *GetIpuStrategy() { return ipu_strategy_; } + void SetCustomOps(const std::vector &custom_ops); - // Device - size_t GetNumDevices(); - std::vector GetDeviceIds(); - Device GetDevice(int id); - void AttachDevice(int id); - bool DeviceIsAttached(); + // save compiled model to onnx + void SaveMoldeProto(const std::string &path); private: - int UpperIpuNum(); void Prepare(); private: - std::shared_ptr compiler_; + std::unique_ptr compiler_; std::unique_ptr executor_; - std::shared_ptr device_; + bool is_compiled_ = false; bool is_prepared_ = false; // not own - const framework::Scope *scope_ = nullptr; + const Scope *scope_ = nullptr; const IpuStrategy *ipu_strategy_ = nullptr; private: - static std::shared_ptr instance_; + // time record for IpuBackend::Run + std::unique_ptr timer_; + + DISABLE_COPY_AND_ASSIGN(IpuBackend); }; } // namespace ipu diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.cc b/paddle/fluid/platform/device/ipu/ipu_compiler.cc index 58f784fdbc9..8bedca5c0b8 100644 --- a/paddle/fluid/platform/device/ipu/ipu_compiler.cc +++ b/paddle/fluid/platform/device/ipu/ipu_compiler.cc @@ -12,17 +12,66 @@ // See the License for the specific language governing permissions and // limitations under the License. 
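(Editorial aside, not part of the patch: the header above is the whole public
surface of the reworked backend. A minimal sketch of the intended call
sequence, assuming graph, scope, strategy, feed_list, fetch_list, inputs,
outputs and ctx are all prepared by the caller:

    #include "paddle/fluid/platform/device/ipu/ipu_backend.h"

    auto* backend = paddle::platform::ipu::IpuBackend::GetInstance();
    backend->SetScope(scope);           // scope owning the weights
    backend->SetIpuStrategy(strategy);  // must precede Compile
    backend->Compile(graph, feed_list, fetch_list);
    backend->Run(inputs, outputs, ctx); // first Run() triggers Prepare()
    backend->Detach();                  // release the IPU when done

The raw singleton replaces the old shared_ptr instance_, so callers no longer
manage the backend's lifetime; Reset() is provided for manual teardown.)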
-#include "paddle/fluid/platform/ipu/ipu_compiler.h" +#include "paddle/fluid/platform/device/ipu/ipu_compiler.h" +#include +#include +#include +#include #include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/platform/ipu/ipu_utils.h" +#include "paddle/fluid/platform/device/ipu/ipu_utils.h" namespace paddle { namespace platform { namespace ipu { +popart::AdamMode AdamModeFromStr(const std::string& str) { + if (str == "adam") { + return popart::AdamMode::Adam; + } else if (str == "adamax") { + return popart::AdamMode::AdaMax; + } else if (str == "lamb") { + return popart::AdamMode::Lamb; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Uknown AdamMode: %s, AdamMode must be one of these values: adam, " + "adamax or lamb", + str)); + } +} + +popart::AdaptiveMode AdaptiveModeFromStr(const std::string& str) { + if (str == "adadelta") { + return popart::AdaptiveMode::AdaDelta; + } else if (str == "adagrad") { + return popart::AdaptiveMode::AdaGrad; + } else if (str == "rmsprop") { + return popart::AdaptiveMode::RMSProp; + } else if (str == "centered_rmsprop") { + return popart::AdaptiveMode::CenteredRMSProp; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Uknown AdaptiveMode: %s, AdaptiveMode must be one of these values: " + "adadelta, adagrad, rmsprop or centered_rmsprop", + str)); + } +} + +popart::WeightDecayMode WeightDecayModeFromStr(const std::string& str) { + if (str == "decay") { + return popart::WeightDecayMode::Decay; + } else if (str == "l2_regularization") { + return popart::WeightDecayMode::L2Regularization; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Uknown WeightDecayMode: %s, WeightDecayMode must be decay or " + "l2_regularization", + str)); + } +} + template -T GetAttrAllowNull(std::string attr, framework::OpDesc* op_desc) { +T GetAttrAllowNull(std::string attr, OpDesc* op_desc) { if (op_desc->HasAttr(attr)) { return BOOST_GET_CONST(T, op_desc->GetAttr(attr)); } else { @@ -31,8 +80,7 @@ T GetAttrAllowNull(std::string attr, framework::OpDesc* op_desc) { } template -nonstd::optional GetOptAttrAllowNull(std::string attr, - framework::OpDesc* op_desc) { +nonstd::optional GetOptAttrAllowNull(std::string attr, OpDesc* op_desc) { if (op_desc->HasAttr(attr)) { return BOOST_GET_CONST(T, op_desc->GetAttr(attr)); } else { @@ -40,19 +88,36 @@ nonstd::optional GetOptAttrAllowNull(std::string attr, } } -Compiler::Compiler() { - builder_ = popart::Builder::create(); - RegisterOpFunc(); +template +TO GetCastSigAttrAllowNull(std::string attr, OpDesc* op_desc) { + if (op_desc->HasAttr(attr)) { + auto x = BOOST_GET_CONST(TI, op_desc->GetAttr(attr)); + return static_cast(x); + } else { + return {}; + } +} + +Compiler::Compiler() { RegisterOpFunc(); } + +Compiler::~Compiler() { + builder_.reset(); + resources_.reset(); } -Compiler::~Compiler() {} +void Compiler::Prepare() { + builder_ = popart::Builder::create(); + resources_ = std::make_unique(); +} void Compiler::RegisterOpFunc() { VLOG(10) << "enter Compiler::RegisterOpFunc"; #define INT_VEC std::vector +#define INT32_VEC std::vector #define FLOAT_VEC std::vector #define FLOAT float #define INT std::int64_t +#define INT32 std::int32_t #define BOOL bool #define STRING std::string #define STRING_VEC std::vector @@ -60,6 +125,7 @@ void Compiler::RegisterOpFunc() { #define ARG(Type, Name) , GetAttrAllowNull(#Name, op_desc) #define OPT_ARG(Type, Name) , GetOptAttrAllowNull(#Name, op_desc) +#define SIG_ARG(TI, TO, Name) , GetCastSigAttrAllowNull(#Name, op_desc) #define 
POPART_CONST_ARG(Name) , const PopartConstant& Name #define HOST_SIDE_CONST_ARG(Name) , const HostSideConstant& Name #define POPART_ATTRIB_VEC_ARG(Name) @@ -67,7 +133,7 @@ void Compiler::RegisterOpFunc() { name_function_ = { #define OP_DECL(FuncName, OnnxImpl, Args) \ - {#FuncName, [&](framework::OpDesc* op_desc) { \ + {#FuncName, [&](OpDesc* op_desc) { \ auto op_type = op_desc->Type(); \ VLOG(10) << "build op:" << op_type << " args " << #Args; \ auto inputs = GetOpInputs(op_desc); \ @@ -77,9 +143,12 @@ void Compiler::RegisterOpFunc() { auto aiOnnxOpset = builder_->aiOnnxOpset11(); \ auto output_ids = OnnxImpl(inputs Args, debug_context); \ SetIpuIndexStage(output_ids, op_desc); \ + SetAMPAttributes(output_ids, op_desc); \ + SetSerializeAttributes(output_ids, op_desc); \ InsertTensors(output_names, output_ids); \ }}, // NOLINT -#include "paddle/fluid/platform/ipu/supported_ops_autogen.h" +#include "paddle/fluid/platform/device/ipu/supported_ops_autogen.h" +#include "paddle/fluid/platform/device/ipu/supported_ops_custom.h" }; #undef OP_DECL @@ -87,146 +156,99 @@ void Compiler::RegisterOpFunc() { #undef POPART_ATTRIB_VEC_ARG #undef HOST_SIDE_CONST_ARG #undef POPART_CONST_ARG +#undef SIG_ARG #undef OPT_ARG #undef ARG #undef NONE #undef STRING_VEC #undef STRING #undef BOOL +#undef INT32 #undef INT #undef FLOAT #undef FLOAT_VEC +#undef INT32_VEC #undef INT_VEC } -void Compiler::LowerBody(const framework::ir::Graph* graph) { +void Compiler::LowerBody(const Graph* graph) { VLOG(10) << "enter Compiler::LowerBody"; auto nodes = framework::ir::TopologySortOperations(*graph); for (auto* node : nodes) { auto* op_desc = node->Op(); auto op_type = op_desc->Type(); - VLOG(10) << "node->type: " << op_type; + VLOG(10) << "lowering op: " << op_type; if (op_type == "popart_constant") { - auto dims = - BOOST_GET_CONST(std::vector, op_desc->GetAttr("dims")); - auto dtype_ = BOOST_GET_CONST(int, op_desc->GetAttr("dtype")); - auto dtype = OnnxDtype2PopartType(dtype_); - popart::TensorInfo tensor_info{dtype, dims}; - auto value_attr = op_desc->GetAttr("value"); - auto const_data = std::unique_ptr{}; - switch (dtype) { - case popart::DataType::FLOAT: - const_data.reset(new popart::ConstVoidData( - BOOST_GET_CONST(std::vector, value_attr).data(), - tensor_info)); - break; - case popart::DataType::INT32: - const_data.reset(new popart::ConstVoidData( - BOOST_GET_CONST(std::vector, value_attr).data(), - tensor_info)); - break; - case popart::DataType::DOUBLE: - const_data.reset(new popart::ConstVoidData( - BOOST_GET_CONST(std::vector, value_attr).data(), - tensor_info)); - break; - case popart::DataType::INT64: - const_data.reset(new popart::ConstVoidData( - BOOST_GET_CONST(std::vector, value_attr).data(), - tensor_info)); - break; - default: - PADDLE_THROW(platform::errors::Unimplemented( - "The popart datatype is not supported, popart::DataType is %d", - dtype)); - } - popart::TensorId result = builder_->aiOnnxOpset11().constant(*const_data); - SetIpuIndexStage(result, op_desc); - InsertTensors(GetOpOutputs(op_desc), result); - } else if (op_type == "popart_batchnormalization") { + // pass + } else if (op_type == "popart_optimizer") { + // pass + } else if (op_type == "popart_checkpointoutput") { auto inputs = GetOpInputs(op_desc); auto outputs = GetOpOutputs(op_desc); - auto num_outputs = outputs.size(); - auto epsilon = BOOST_GET_CONST(float, op_desc->GetAttr("epsilon")); - auto momentum = BOOST_GET_CONST(float, op_desc->GetAttr("momentum")); - auto result = builder_->aiOnnxOpset11().batchnormalization( - inputs, 
num_outputs, epsilon, momentum); - SetIpuIndexStage(result, op_desc); - InsertTensors(GetOpOutputs(op_desc), result); - } else if (op_type == "popart_nllloss") { - auto inputs = GetOpInputs(op_desc); - auto ignoreIndex = BOOST_GET_CONST(int, op_desc->GetAttr("ignoreIndex")); - auto result = builder_->aiGraphcoreOpset1().nllloss( - inputs, popart::ReductionType::NoReduction, ignoreIndex); - SetIpuIndexStage(result, op_desc); - InsertTensors(GetOpOutputs(op_desc), result); - } else if (op_type == "popart_topk") { + auto output_ids = builder_->checkpointOutput(inputs); + InsertTensors(outputs, output_ids); + } else if (op_type == "popart_custom_op") { auto inputs = GetOpInputs(op_desc); auto outputs = GetOpOutputs(op_desc); - int64_t axis = BOOST_GET_CONST(int64_t, op_desc->GetAttr("axis")); - int sorted_INT32 = BOOST_GET_CONST(int, op_desc->GetAttr("sorted")); - int64_t sorted = int64_t{sorted_INT32}; - - auto aiOnnxOpset = builder_->aiOnnxOpset11(); - - popart::ConvInputs result; - if (inputs.size() == 2) { - VLOG(10) - << "[Compiler::LowerBody] size of inputs for is 2"; - result = aiOnnxOpset.topk(inputs, axis, sorted); - } else if (inputs.size() == 1) { - VLOG(10) - << "[Compiler::LowerBody] size of inputs for is 1"; - int64_t k = BOOST_GET_CONST(int64_t, op_desc->GetAttr("k")); - popart::TensorInfo kShape{"INT64", std::vector{1}}; - popart::ConstVoidData kData = {&k, kShape}; - auto K_t = aiOnnxOpset.constant(kData); - result = aiOnnxOpset.topk({inputs[0], K_t}, axis, sorted); + auto debug_context = BuildDebugContext(op_desc); + auto attributes = std::map{}; + for (auto& attr : op_desc->GetAttrMap()) { + CustomOpAttrVisitor visitor(&attributes, attr.first); + boost::apply_visitor(visitor, attr.second); } - result[1] = aiOnnxOpset.cast({result[1]}, "INT32"); - SetIpuIndexStage(result, op_desc); - VLOG(10) << "[Compiler::LowerBody] output[1]: " << outputs[1]; - VLOG(10) << "[Compiler::LowerBody] output[1]: " - << GetOpOutputs(op_desc)[1] << " -> " << result[1]; - tensors_.emplace(GetOpOutputs(op_desc)[1], result[1]); // topk indices - VLOG(10) << "[Compiler::LowerBody] output[0]: " << outputs[0]; - VLOG(10) << "[Compiler::LowerBody] output[0]: " - << GetOpOutputs(op_desc)[0] << " -> " << result[0]; - tensors_.emplace(GetOpOutputs(op_desc)[0], result[0]); // topk values + auto __op_type = + BOOST_GET_CONST(std::string, op_desc->GetAttr("__op_type")); + VLOG(10) << "Build graph from custom op: " << __op_type; + auto it = custom_ops_.find(__op_type); + auto output_ids = + builder_->customOp(it->second.popart_op, it->second.popart_op.version, + inputs, outputs.size(), attributes, debug_context); + SetIpuIndexStage(output_ids, op_desc); + InsertTensors(outputs, output_ids); + } else if (op_type == "popart_printtensor") { + auto inputs = GetOpInputs(op_desc); + auto outputs = GetOpOutputs(op_desc); + auto debug_context = BuildDebugContext(op_desc); + auto print_gradient = + BOOST_GET_CONST(int64_t, op_desc->GetAttr("print_gradient")); + auto title = BOOST_GET_CONST(std::string, op_desc->GetAttr("title")); + auto output_ids = builder_->aiGraphcoreOpset1().printtensor( + inputs, print_gradient, debug_context, title); + SetIpuIndexStage(output_ids, op_desc); + InsertTensors(outputs, output_ids); } else { auto itr = name_function_.find(op_type); if (itr != name_function_.end()) { itr->second(node->Op()); } else { PADDLE_THROW(platform::errors::NotFound( - "Op %s is not registered in popart canonicalization", op_type)); + "%s is not registered, please check for unsupported operators for " + "running 
on IPU", + op_type)); } } } VLOG(10) << "leave Compiler::LowerBody"; } -void Compiler::InitInputs(framework::ir::Graph* graph, +void Compiler::InitInputs(Graph* graph, const std::vector& feed_list) { for (const auto& feed_name : feed_list) { feed_list_.push_back(feed_name); - for (const framework::ir::Node* n : graph->Nodes()) { + for (const Node* n : graph->Nodes()) { if (n->IsVar()) { auto* var_desc = n->Var(); if (feed_name == var_desc->Name()) { VLOG(10) << "feed_name= " << var_desc->Name(); auto data_type = VarType2PopartType(var_desc->GetDataType()); - if (ipu_strategy_->enable_fp16) { - data_type = popart::DataType::FLOAT16; - } popart::TensorInfo input_info{data_type, var_desc->GetShape()}; VLOG(10) << "popart input_info = " << input_info; popart::TensorId tensor_id = builder_->addInputTensor(input_info, feed_name); VLOG(10) << "popart input tensor id = " << tensor_id; - inputs_.push_back(tensor_id); - tensors_.emplace(var_desc->Name(), tensor_id); + resources_->inputs.push_back(tensor_id); + resources_->tensors.emplace(var_desc->Name(), tensor_id); } } } @@ -236,20 +258,58 @@ void Compiler::InitInputs(framework::ir::Graph* graph, void Compiler::InitOutputs(const std::vector& fetch_list) { for (const auto& fetch_name : fetch_list) { fetch_list_.push_back(fetch_name); - auto tensor = tensors_.find(fetch_name); - PADDLE_ENFORCE_NE(tensor, tensors_.end(), - platform::errors::NotFound( - "output tensor %s does not exist.", fetch_name)); + auto tensor = resources_->tensors.find(fetch_name); + PADDLE_ENFORCE_NE( + tensor, resources_->tensors.end(), + platform::errors::NotFound( + "Output tensor %s is not found, please check the model.", + fetch_name)); VLOG(10) << "fetch_name= " << fetch_name; VLOG(10) << "popart output tensor id = " << tensor->second; builder_->addOutputTensor(tensor->second); - outputs_.push_back(tensor->second); + resources_->outputs.push_back(tensor->second); + } +} + +void Compiler::LowerConstants(const Graph* graph, const Scope* scope) { + auto& kid_scope = scope->NewScope(); + VLOG(10) << "enter Compiler::LowerConstants"; + for (auto* node : graph->Nodes()) { + if (!node->IsOp()) { + continue; + } + + auto* op_desc = node->Op(); + auto op_type = op_desc->Type(); + if (op_type == "popart_constant") { + auto shape = + BOOST_GET_CONST(std::vector, op_desc->GetAttr("dims")); + auto dtype_ = BOOST_GET_CONST(int, op_desc->GetAttr("dtype")); + auto dtype = PopartType2VarType(OnnxDtype2PopartType(dtype_)); + auto tensor_name = op_desc->Output("__outputs__")[0]; + auto* var = kid_scope.Var(tensor_name); + VLOG(10) << "lowering constant: " << tensor_name; + auto* tensor = var->GetMutable(); + ConstantOpAttrVisitor visitor(tensor, dtype); + auto value = op_desc->GetAttr("value"); + boost::apply_visitor(visitor, value); + auto ddim = framework::make_ddim(shape); + tensor->Resize(ddim); + + auto const_data = std::unique_ptr(); + popart::TensorInfo tensor_info(VarType2PopartType(tensor->type()), shape); + const_data.reset(new popart::ConstVoidData(tensor->data(), tensor_info)); + popart::TensorId result = builder_->aiOnnxOpset11().constant(*const_data); + SetIpuIndexStage(result, op_desc); + resources_->tensors.emplace(tensor_name, result); + } } + VLOG(10) << "leave Compiler::LowerConstants"; } -void Compiler::LowerWeights(const framework::ir::Graph* graph, - const framework::Scope* scope_) { - PADDLE_ENFORCE_NOT_NULL(scope_, +void Compiler::LowerWeights(const Graph* graph, const Scope* scope) { + VLOG(10) << "enter Compiler::LowerWeights"; + PADDLE_ENFORCE_NOT_NULL(scope, 
platform::errors::PreconditionNotMet( "You should call set_scope before LowerWeights")); // at this step, the graph doesn't contains optimizer related states @@ -257,12 +317,12 @@ void Compiler::LowerWeights(const framework::ir::Graph* graph, if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { if (node->Var()->Persistable() && node->inputs.empty()) { auto var_name = node->Var()->Name(); - // workround: https://github.com/graphcore/Paddle/issues/151 - if (tensors_.count(var_name) != 0) { + if (resources_->tensors.count(var_name) != 0) { continue; } + VLOG(10) << "lowering weight: " << var_name; - auto var = scope_->FindVar(var_name); + auto var = scope->FindVar(var_name); if (var) { auto tensor = var->Get(); auto dtype = VarType2PopartType(tensor.type()); @@ -274,12 +334,113 @@ void Compiler::LowerWeights(const framework::ir::Graph* graph, popart::ConstVoidData const_data{tensor.data(), tensor_info}; popart::TensorId result = builder_->addInitializedInputTensor(const_data, var_name); - tensors_.emplace(var_name, result); - weights_.push_back(result); + resources_->tensors.emplace(var_name, result); + resources_->weights.push_back(result); } } } } + VLOG(10) << "leave Compiler::LowerWeights"; +} + +void Compiler::LowerOptimier(const Graph* graph, const Scope* scope) { + for (auto* node : graph->Nodes()) { + if (!node->IsOp()) { + continue; + } + + auto* op_desc = node->Op(); + auto op_type = op_desc->Type(); + if (op_type == "popart_optimizer") { + auto raw_type = + BOOST_GET_CONST(std::string, op_desc->GetAttr("raw_type")); + resources_->optimizer_type = raw_type; + auto loss_var = + BOOST_GET_CONST(std::string, op_desc->GetAttr("loss_var")); + resources_->loss_var = resources_->tensors[loss_var]; + resources_->with_lr_sched = + BOOST_GET_CONST(bool, op_desc->GetAttr("with_lr_sched")); + if (op_desc->HasAttr("lr_var")) { + auto lr_var = BOOST_GET_CONST(std::string, op_desc->GetAttr("lr_var")); + resources_->lr_var = lr_var; + resources_->lr = GetSingleVarFromScope(scope, lr_var); + } else { + // adadelta has no lr + resources_->lr = 0.01f; + resources_->with_lr_sched = false; + } + VLOG(10) << "Set initial lr: " << resources_->lr; + auto loss_scaling = ipu_strategy_->loss_scaling; + auto type = BOOST_GET_CONST(std::string, op_desc->GetAttr("type")); + if (type == "sgd") { + auto weight_decay = + BOOST_GET_CONST(float, op_desc->GetAttr("weight_decay")); + auto momentum = BOOST_GET_CONST(float, op_desc->GetAttr("momentum")); + resources_->optimizer_fn = [=](float lr) { + return std::make_unique( + popart::OptimizerValue(lr, false), + popart::OptimizerValue(weight_decay, true), + popart::OptimizerValue(momentum, true), + popart::SGD::getUnsetDampening(), + popart::SGD::getUnsetVelocityScaling(), + popart::OptimizerValue(loss_scaling, true)); + }; + } else if (type == "adam") { + auto weight_decay = + BOOST_GET_CONST(float, op_desc->GetAttr("weight_decay")); + auto beta1 = BOOST_GET_CONST(float, op_desc->GetAttr("beta1")); + auto beta2 = BOOST_GET_CONST(float, op_desc->GetAttr("beta2")); + auto eps = BOOST_GET_CONST(float, op_desc->GetAttr("eps")); + auto mwn = ipu_strategy_->max_weight_norm; + VLOG(10) << "set max_weight_norm: " << mwn; + auto adam_mode_ = + BOOST_GET_CONST(std::string, op_desc->GetAttr("adam_mode")); + auto adam_mode = AdamModeFromStr(adam_mode_); + auto weight_decay_mode_ = + BOOST_GET_CONST(std::string, op_desc->GetAttr("weight_decay_mode")); + auto weight_decay_mode = WeightDecayModeFromStr(weight_decay_mode_); + resources_->optimizer_fn = [=](float lr) { + return 
std::make_unique<popart::Adam>(
+            popart::OptimizerValue(lr, false),
+            popart::OptimizerValue(weight_decay, true),
+            popart::OptimizerValue(beta1, true),
+            popart::OptimizerValue(beta2, true),
+            popart::OptimizerValue(eps, true),
+            popart::OptimizerValue(loss_scaling, true),
+            popart::OptimizerValue(mwn, true), adam_mode, weight_decay_mode,
+            popart::DataType::UNDEFINED, popart::DataType::FLOAT,
+            popart::DataType::FLOAT);
+        };
+      } else if (type == "adaptive") {
+        auto alpha = BOOST_GET_CONST(float, op_desc->GetAttr("alpha"));
+        auto momentum = BOOST_GET_CONST(float, op_desc->GetAttr("momentum"));
+        auto eps = BOOST_GET_CONST(float, op_desc->GetAttr("eps"));
+        auto weight_decay =
+            BOOST_GET_CONST(float, op_desc->GetAttr("weight_decay"));
+        auto adaptive_mode_ =
+            BOOST_GET_CONST(std::string, op_desc->GetAttr("adaptive_mode"));
+        auto adaptive_mode = AdaptiveModeFromStr(adaptive_mode_);
+        auto weight_decay_mode_ =
+            BOOST_GET_CONST(std::string, op_desc->GetAttr("weight_decay_mode"));
+        auto weight_decay_mode = WeightDecayModeFromStr(weight_decay_mode_);
+        resources_->optimizer_fn = [=](float lr) {
+          return std::make_unique<popart::Adaptive>(
+              popart::OptimizerValue(lr, false),
+              popart::OptimizerValue(weight_decay, true),
+              popart::OptimizerValue(alpha, true),
+              popart::OptimizerValue(momentum, true),
+              popart::OptimizerValue(eps, true),
+              popart::OptimizerValue(loss_scaling, true), adaptive_mode,
+              weight_decay_mode, popart::DataType::UNDEFINED,
+              popart::DataType::FLOAT, popart::DataType::FLOAT,
+              popart::DataType::FLOAT);
+        };
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "optimizer %s is not implemented", type));
+      }
+    }
+  }
 }
 
 void Compiler::InsertTensors(const std::vector<std::string>& output_names,
@@ -288,7 +449,7 @@
                     platform::errors::Fatal("InsertTensors size mismatch"));
   for (int i = 0; i < tensor_ids.size(); i++) {
     std::string tensor_id = tensor_ids[i];
-    tensors_.emplace(output_names[i], tensor_ids[i]);
+    resources_->tensors.emplace(output_names[i], tensor_ids[i]);
   }
 }
 
@@ -296,11 +457,11 @@
                                const std::string& tensor_id) {
   PADDLE_ENFORCE_EQ(output_names.size(), 1,
                     platform::errors::Fatal("InsertTensors size mismatch"));
-  tensors_.emplace(output_names[0], tensor_id);
+  resources_->tensors.emplace(output_names[0], tensor_id);
 }
 
 void Compiler::SetIpuIndexStage(const std::vector<std::string>& tensor_ids,
-                                const framework::OpDesc* op_desc) {
+                                const OpDesc* op_desc) {
   VLOG(10) << "enter Compiler::SetIpuIndexStage";
   auto tensor_ids_set =
       std::set<std::string>(tensor_ids.begin(), tensor_ids.end());
@@ -321,7 +482,7 @@ void Compiler::SetIpuIndexStage(const std::vector<std::string>& tensor_ids,
 }
 
 void Compiler::SetIpuIndexStage(const std::string& tensor_id,
-                                const framework::OpDesc* op_desc) {
+                                const OpDesc* op_desc) {
   VLOG(10) << "enter Compiler::SetIpuIndexStage";
 
   if (op_desc->HasAttr(sIpuIndexAttr)) {
@@ -339,20 +500,73 @@ void Compiler::SetIpuIndexStage(const std::string& tensor_id,
   VLOG(10) << "leave Compiler::SetIpuIndexStage";
 }
 
-std::vector<popart::TensorId>& Compiler::GetWeights() { return weights_; }
+void Compiler::SetAMPAttributes(const std::vector<std::string>& tensor_ids,
+                                const OpDesc* op_desc) {
+  if (op_desc->Type() == "popart_matmul") {
+    for (const auto& tensor_id : tensor_ids) {
+      SetAMPAttributes(tensor_id, op_desc);
+    }
+  }
+}
+
+void Compiler::SetAMPAttributes(const std::string& tensor_id,
+                                const OpDesc* op_desc) {
+  VLOG(10) << "enter Compiler::SetAMPAttributes";
+  if (op_desc->Type() == "popart_matmul") {
+    auto amp = 
ipu_strategy_->available_memory_proportion; + if (amp > 0.0f && amp <= 1.0) { + builder_->setAvailableMemoryProportion(tensor_id, amp); + } + } + VLOG(10) << "leave Compiler::SetAMPAttributes"; +} + +void Compiler::SetSerializeAttributes( + const std::vector& tensor_ids, const OpDesc* op_desc) { + VLOG(10) << "enter Compiler::SetSerializeAttributes"; + auto tensor_ids_set = + std::set(tensor_ids.begin(), tensor_ids.end()); + + if (op_desc->Type() == "popart_matmul") { + if (op_desc->HasAttr(sMatmulSerializeFactor)) { + auto factor = + BOOST_GET_CONST(int, op_desc->GetAttr(sMatmulSerializeFactor)); + std::string mode = "output_channels"; + if (op_desc->HasAttr(sMatmulSerializeMode)) { + mode = BOOST_GET_CONST(std::string, + op_desc->GetAttr(sMatmulSerializeMode)); + } + builder_->setSerializeMatMul(tensor_ids_set, mode, (int64_t)factor, true); + } + } + VLOG(10) << "leave Compiler::SetSerializeAttributes"; +} + +void Compiler::SetSerializeAttributes(const std::string& tensor_id, + const OpDesc* op_desc) { + std::vector tensor_ids = {tensor_id}; + SetSerializeAttributes(tensor_ids, op_desc); +} -// convertFloatsToHalfs -void Compiler::ConvertProtoToFp16() { +void Compiler::SetCustomOps( + const std::vector& custom_ops) { + for (auto x : custom_ops) { + custom_ops_.emplace(x.paddle_op, x); + } +} + +std::string Compiler::GetFP16ModelProto() { popart::GraphTransformer graph_transformer(builder_->getModelProto()); graph_transformer.convertFloatsToHalfs(); - converted_proto_ = graph_transformer.getModelProto(); + return graph_transformer.getModelProto(); } std::string Compiler::GetModelProto() { - if (converted_proto_.length()) { - return converted_proto_; + if (ipu_strategy_->enable_fp16) { + return GetFP16ModelProto(); + } else { + return builder_->getModelProto(); } - return builder_->getModelProto(); } void Compiler::SaveModelProto(const std::string& path) { @@ -366,12 +580,12 @@ void Compiler::SaveModelProtoNoCheck(const std::string& path) { onnxfile.close(); } -std::vector Compiler::GetOpInputs(const framework::OpDesc* op) { +std::vector Compiler::GetOpInputs(const OpDesc* op) { auto ins = op->Input("__inputs__"); std::vector inputs; for (const auto& in : ins) { - if (tensors_.find(in) != tensors_.end()) { - inputs.push_back(tensors_[in]); + if (resources_->tensors.find(in) != resources_->tensors.end()) { + inputs.push_back(resources_->tensors[in]); } else { inputs.push_back(in); } @@ -379,12 +593,11 @@ std::vector Compiler::GetOpInputs(const framework::OpDesc* op) { return inputs; } -const std::vector& Compiler::GetOpOutputs( - const framework::OpDesc* op) { +const std::vector& Compiler::GetOpOutputs(const OpDesc* op) { return op->Output("__outputs__"); } -popart::DebugContext Compiler::BuildDebugContext(const framework::OpDesc* op) { +popart::DebugContext Compiler::BuildDebugContext(const OpDesc* op) { auto op_identify_id = BOOST_GET_CONST(std::string, op->GetAttr(sOpIdentifyIdAttr)); VLOG(10) << "op_identify_id of op: " << op->Type() << " is " diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.h b/paddle/fluid/platform/device/ipu/ipu_compiler.h index ecee1595bb8..5576266b1a7 100644 --- a/paddle/fluid/platform/device/ipu/ipu_compiler.h +++ b/paddle/fluid/platform/device/ipu/ipu_compiler.h @@ -16,76 +16,119 @@ #include #include +#include #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/platform/ipu/common.h" -#include "paddle/fluid/platform/ipu/ipu_strategy.h" +#include "paddle/fluid/platform/device/ipu/ipu_names.h" 
+#include "paddle/fluid/platform/device/ipu/ipu_strategy.h" +#include "paddle/fluid/platform/device/ipu/ipu_utils.h" namespace paddle { namespace platform { namespace ipu { +struct CompilerResources { + // popart input tensor_ids + std::vector inputs; + // popart output tensor_ids + std::vector outputs; + // + std::map tensors; + // popart_weight_ids + std::vector weights; + // popart loss tensor_id + popart::TensorId loss_var; + // paddle lr var_name + std::string lr_var; + // lr value + float lr; + // flag for lr is constant or scheduling + bool with_lr_sched = false; + // paddle optimizer type, eg: momentum, lamb + std::string optimizer_type; + + using OptimizerFn = + std::function(float lr)>; + OptimizerFn optimizer_fn; + + public: + popart::Optimizer *Optimizer() { return optimizer.get(); } + + popart::Optimizer *NewOptimizer() { + optimizer = optimizer_fn(lr); + return optimizer.get(); + } + + popart::Optimizer *UpdateOptimizer(float lr_new) { + optimizer = optimizer_fn(lr_new); + return optimizer.get(); + } + + private: + std::unique_ptr optimizer; +}; + class Compiler { public: Compiler(); ~Compiler(); + void RegisterOpFunc(); - void LowerBody(const framework::ir::Graph *graph); - void InitInputs(framework::ir::Graph *graph, - const std::vector &feed_list); + void Prepare(); + void LowerBody(const Graph *graph); + void InitInputs(Graph *graph, const std::vector &feed_list); void InitOutputs(const std::vector &fetch_list); - void LowerWeights(const framework::ir::Graph *graph, - const framework::Scope *scope_); + void LowerConstants(const Graph *graph, const Scope *scope); + void LowerWeights(const Graph *graph, const Scope *scope); + void LowerOptimier(const Graph *graph, const Scope *scope); void InsertTensors(const std::vector &output_names, const std::vector &tensor_ids); void InsertTensors(const std::vector &output_names, const std::string &tensor_id); void SetIpuIndexStage(const std::vector &tensor_ids, - const framework::OpDesc *op_desc); - void SetIpuIndexStage(const std::string &tensor_id, - const framework::OpDesc *op_desc); - - std::vector GetInputs() { return inputs_; } - std::vector GetOutputs() { return outputs_; } - std::map GetTensors() { return tensors_; } - std::vector &GetWeights(); + const OpDesc *op_desc); + void SetIpuIndexStage(const std::string &tensor_id, const OpDesc *op_desc); + void SetAMPAttributes(const std::vector &tensor_ids, + const OpDesc *op_desc); + void SetAMPAttributes(const std::string &tensor_id, const OpDesc *op_desc); + void SetSerializeAttributes(const std::vector &tensor_ids, + const OpDesc *op_desc); + void SetSerializeAttributes(const std::string &tensor_id, + const OpDesc *op_desc); - std::string GetModelProto(); void SetIpuStrategy(const IpuStrategy &strategy) { ipu_strategy_ = &strategy; - }; + } + + void SetCustomOps(const std::vector &custom_ops); + + CompilerResources *GetResources() { return resources_.get(); } + + std::string GetModelProto(); + std::string GetFP16ModelProto(); + void SaveModelProto(const std::string &path); void SaveModelProtoNoCheck(const std::string &path); - void ConvertProtoToFp16(); private: - std::vector GetOpInputs(const framework::OpDesc *op); - const std::vector &GetOpOutputs(const framework::OpDesc *op); - popart::DebugContext BuildDebugContext(const framework::OpDesc *op); + std::vector GetOpInputs(const OpDesc *op); + const std::vector &GetOpOutputs(const OpDesc *op); + popart::DebugContext BuildDebugContext(const OpDesc *op); private: std::unique_ptr builder_; + std::unique_ptr resources_; - using 
OpFunc = std::function; + using OpFunc = std::function; std::unordered_map name_function_; - // stateful variable - std::map tensors_; - // feed_list_ & fetch_list save paddle tensor id std::vector feed_list_; std::vector fetch_list_; - // inputs_ & outputs_ save popart tensor id - std::vector inputs_; - std::vector outputs_; - - // weights info map - std::vector weights_; - - std::string converted_proto_ = ""; const IpuStrategy *ipu_strategy_ = nullptr; + std::map custom_ops_; }; } // namespace ipu diff --git a/paddle/fluid/platform/device/ipu/ipu_device.cc b/paddle/fluid/platform/device/ipu/ipu_device.cc new file mode 100644 index 00000000000..cd2a628c9ab --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_device.cc @@ -0,0 +1,55 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/device/ipu/ipu_device.h" +#include "paddle/fluid/platform/device/ipu/ipu_utils.h" + +namespace paddle { +namespace platform { +namespace ipu { + +int GetNumDevices() { + bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL"); + if (ipu_model) { + return 1; + } + int num_devices = + popart::DeviceManager::createDeviceManager().enumerateDevices().size(); + PADDLE_ENFORCE_GT(num_devices, 0, platform::errors::Unavailable( + "Do not found any IPU devices, please " + "make sure Poplar sdk is enabled")); + return num_devices; +} + +std::vector GetDeviceIds() { + bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL"); + if (ipu_model) { + return {0}; + } + std::vector device_ids; + auto devices = + popart::DeviceManager::createDeviceManager().enumerateDevices(); + PADDLE_ENFORCE_GT( + devices.size(), 0, + platform::errors::Unavailable("Do not found any IPU devices, please make " + "sure Poplar sdk is enabled.")); + for (auto device : devices) { + device_ids.push_back(device->getId()); + } + return device_ids; +} + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/device.h b/paddle/fluid/platform/device/ipu/ipu_device.h similarity index 65% rename from paddle/fluid/platform/device/ipu/device.h rename to paddle/fluid/platform/device/ipu/ipu_device.h index 24a8bdec308..3da13a522e1 100644 --- a/paddle/fluid/platform/device/ipu/device.h +++ b/paddle/fluid/platform/device/ipu/ipu_device.h @@ -21,23 +21,11 @@ namespace paddle { namespace platform { namespace ipu { -enum class DeviceType { IpuModel = 0, Cpu, Ipu, OfflineIpu, Sim }; - -class Device { - public: - Device() {} - explicit Device(const popart::DeviceInfo& device_info); - - int getId() const { return id_; } - bool isAttached() const { return is_attached_; } - DeviceType getType() const { return device_type_; } - - private: - int id_; - bool is_attached_; - DeviceType device_type_; - /* TODO:: Add more elements in the future */ -}; +// get the number of all avaliable IPUs +int GetNumDevices(); + +// get the device id of all avaliable IPUs +std::vector GetDeviceIds(); } // namespace ipu } // namespace platform diff --git 
a/paddle/fluid/platform/device/ipu/ipu_info.cc b/paddle/fluid/platform/device/ipu/ipu_info.cc index c184149a9d3..4506bfbf972 100644 --- a/paddle/fluid/platform/device/ipu/ipu_info.cc +++ b/paddle/fluid/platform/device/ipu/ipu_info.cc @@ -10,23 +10,18 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/device/ipu/ipu_info.h" -#include "paddle/fluid/platform/device/ipu/ipu_backend.h" +#include "paddle/fluid/platform/device/ipu/ipu_device.h" namespace paddle { namespace platform { //! Get a list of device ids from environment variable or use all. std::vector GetSelectedIPUDevices() { - std::shared_ptr ipu_backend = - platform::ipu::IpuBackend::GetInstance(); - return ipu_backend->GetDeviceIds(); + return platform::ipu::GetDeviceIds(); } //! Get the total number of IPU devices in system. -int GetIPUDeviceCount() { - std::shared_ptr ipu_backend = - platform::ipu::IpuBackend::GetInstance(); - return ipu_backend->GetNumDevices(); -} +int GetIPUDeviceCount() { return platform::ipu::GetNumDevices(); } + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_info.h b/paddle/fluid/platform/device/ipu/ipu_info.h index 3d032eeb4bf..fe7076e0b50 100644 --- a/paddle/fluid/platform/device/ipu/ipu_info.h +++ b/paddle/fluid/platform/device/ipu/ipu_info.h @@ -17,8 +17,10 @@ limitations under the License. */ namespace paddle { namespace platform { + std::vector GetSelectedIPUDevices(); int GetIPUDeviceCount(); + } // namespace platform } // namespace paddle #endif diff --git a/paddle/fluid/platform/device/ipu/common.h b/paddle/fluid/platform/device/ipu/ipu_names.h similarity index 85% rename from paddle/fluid/platform/device/ipu/common.h rename to paddle/fluid/platform/device/ipu/ipu_names.h index 7d62f10abd2..a809a8c6e5b 100644 --- a/paddle/fluid/platform/device/ipu/common.h +++ b/paddle/fluid/platform/device/ipu/ipu_names.h @@ -22,6 +22,8 @@ namespace ipu { static constexpr const char *sIpuIndexAttr = "ipu_index"; static constexpr const char *sIpuStageAttr = "ipu_stage"; +static constexpr const char *sMatmulSerializeFactor = "serialize_factor"; +static constexpr const char *sMatmulSerializeMode = "serialize_mode"; static constexpr const char *sOpIdentifyIdAttr = "op_identify_id"; static constexpr const char *sDebugInfoId = "__debug_info_id"; @@ -29,6 +31,7 @@ static constexpr const char *sBeta1 = "beta1"; static constexpr const char *sBeta2 = "beta2"; static constexpr const char *sBeta1Pow = "Beta1Pow"; static constexpr const char *sBeta2Pow = "Beta2Pow"; +static constexpr const char *sLossScaling = "LossScaling"; } // namespace ipu } // namespace platform diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.cc b/paddle/fluid/platform/device/ipu/ipu_strategy.cc index 47e7e332c8f..2ddead420d3 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.cc +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.cc @@ -12,10 +12,27 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/platform/ipu/ipu_strategy.h" +#include "paddle/fluid/platform/device/ipu/ipu_strategy.h" +#include namespace paddle { namespace platform { -namespace ipu {} // namespace ipu +namespace ipu { + +void IpuStrategy::enablePattern(const std::string& t) { + VLOG(10) << "enable popart pattern: " << t; + popart_patterns.enablePattern(t, true); +} + +void IpuStrategy::disablePattern(const std::string& t) { + VLOG(10) << "disable popart pattern: " << t; + popart_patterns.enablePattern(t, false); +} + +const bool IpuStrategy::isPatternEnabled(const std::string& t) { + return popart_patterns.isPatternEnabled(t); +} + +} // namespace ipu } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.h b/paddle/fluid/platform/device/ipu/ipu_strategy.h index 7e07d517e10..08f09b96cc0 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.h +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.h @@ -14,24 +14,86 @@ limitations under the License. */ #pragma once +#include #include +#include +#include "popart/patterns/patterns.hpp" namespace paddle { namespace platform { namespace ipu { using VirtualGraphMode = popart::VirtualGraphMode; +using RecomputationType = popart::RecomputationType; struct IpuStrategy { + IpuStrategy() { + // we always save optimizer state to OffChip and enable rts for saving + // memory + auto storage = popart::TensorLocation(popart::TensorStorage::OffChip, + popart::ReplicatedTensorSharding::On); + popart_options.optimizerStateTensorLocationSettings = + popart::TensorLocationSettings(storage); + + // We divide the accumulationFactor and replicatedGraphCount after all + // reduce + popart_options.accumulationAndReplicationReductionType = + popart::ReductionType::Mean; + popart_options.meanAccumulationAndReplicationReductionStrategy = + popart::MeanReductionStrategy::Post; + + popart_options.enableFloatingPointChecks = false; + + // A directory for log traces to be written into. 
+ popart_options.logDir = "popart_log"; + } + ~IpuStrategy() {} + + // Number ipus total needed, replica * ipu_per_replica int num_ipus = 1; + + // batches per step int batches_per_step = 1; - int batch_size = 1; + + // micro batch-size + int micro_batch_size = 1; + + // training flag, true for training bool is_training = true; + + // save the onnx model lowered by paddle program description bool save_init_onnx = false; - bool save_last_onnx = true; - popart::SessionOptions popart_options_; + + // save the trained model + bool save_onnx_checkpoint = false; + + // save paddle model per n steps + int save_per_n_step = 1; + + // average sharding, debugging used bool need_avg_shard = false; + + // flag for fp16, true for pure fp16 bool enable_fp16 = false; + + // available memory proportion, 0.0f for disable + float available_memory_proportion = 0.0f; + + // loss scaling, currently we can't get loss scaling from + // optimizer_extract_pass, so we have to set it here + float loss_scaling = 1.0f; + + // defaultMaxWeightNorm for adam optimizer + float max_weight_norm = 65504.0f; + + // popart session option + popart::SessionOptions popart_options; + popart::Patterns popart_patterns; + + public: + void enablePattern(const std::string& t); + void disablePattern(const std::string& t); + const bool isPatternEnabled(const std::string& t); }; } // namespace ipu diff --git a/paddle/fluid/platform/device/ipu/ipu_utils.cc b/paddle/fluid/platform/device/ipu/ipu_utils.cc index 4dfe8c4efbe..6e221fae84e 100644 --- a/paddle/fluid/platform/device/ipu/ipu_utils.cc +++ b/paddle/fluid/platform/device/ipu/ipu_utils.cc @@ -12,22 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/platform/ipu/ipu_utils.h" +#include "paddle/fluid/platform/device/ipu/ipu_utils.h" +#include namespace paddle { namespace platform { namespace ipu { -void* PaddleIArray::data() { return tensor_->data(); } +void* PaddleIArray::data() { return tensor_.data(); } popart::DataType PaddleIArray::dataType() const { - return VarType2PopartType(tensor_->type()); + return VarType2PopartType(tensor_.type()); } -std::size_t PaddleIArray::rank() const { return tensor_->dims().size(); } +std::size_t PaddleIArray::rank() const { return tensor_.dims().size(); } int64_t PaddleIArray::dim(size_t index) const { - return tensor_->dims().at(index); + return tensor_.dims().at(index); } std::size_t PaddleIArray::nelms() const { @@ -150,6 +151,32 @@ bool GetBoolEnv(std::string str) { } } +std::vector> GetOptPrePostfix( + const std::string& opt_type) { + // format: {popart_tensor_id, paddle_tensor_id}, ... 
+ std::vector> pre_post_fix; + + if (opt_type == "adam" || opt_type == "lamb") { + pre_post_fix.push_back(std::make_pair("", "")); + pre_post_fix.push_back(std::make_pair("Accl1___", "_moment1_0")); + pre_post_fix.push_back(std::make_pair("Accl2___", "_moment2_0")); + pre_post_fix.push_back(std::make_pair("Step___", "_beta1_pow_acc_0")); + } else if (opt_type == "sgd" || opt_type == "momentum") { + // sgd + pre_post_fix.push_back(std::make_pair("", "")); + } else { + pre_post_fix.push_back(std::make_pair("", "")); + // + } + + return pre_post_fix; +} + +int RequestIpus(const int num_ipus) { + // num_ipus must be pow(2, n); + return std::pow(2, ceil(log2(num_ipus))); +} + } // namespace ipu } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_utils.h b/paddle/fluid/platform/device/ipu/ipu_utils.h index 3a3b9c8ccc2..3cd7115b5eb 100644 --- a/paddle/fluid/platform/device/ipu/ipu_utils.h +++ b/paddle/fluid/platform/device/ipu/ipu_utils.h @@ -17,14 +17,27 @@ limitations under the License. */ #include #include #include +#include -#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/float16.h" namespace paddle { namespace platform { namespace ipu { +using float16 = platform::float16; +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using Scope = framework::Scope; +using OpDesc = framework::OpDesc; +using Graph = framework::ir::Graph; +using Node = framework::ir::Node; +using BlockDesc = framework::BlockDesc; + // onnx dtype // https://github.com/onnx/onnx/blob/master/onnx/onnx-ml.proto3 enum ONNXDataType : int { @@ -49,14 +62,15 @@ enum ONNXDataType : int { class PaddleIArray final : public popart::IArray { public: - explicit PaddleIArray(framework::Tensor *tensor) : tensor_(tensor) { + explicit PaddleIArray(const Tensor* tensor) { + tensor_.ShareDataWith(*tensor); for (int i = 0; i < tensor->dims().size(); ++i) { shape_.push_back(tensor->dims().at(i)); } } public: - void *data(); + void* data(); popart::DataType dataType() const; std::size_t rank() const; int64_t dim(size_t index) const; @@ -64,7 +78,7 @@ class PaddleIArray final : public popart::IArray { const popart::Shape shape() const; private: - framework::Tensor *tensor_; + Tensor tensor_; std::vector shape_; }; @@ -74,8 +88,7 @@ popart::DataType OnnxDtype2PopartType(const int type); bool GetBoolEnv(std::string str); template -std::unique_ptr> Tensor2IArray( - const framework::Tensor &tensor) { +std::unique_ptr> Tensor2IArray(const Tensor& tensor) { auto dtype = VarType2PopartType(tensor.type()); auto shape = std::vector(); for (size_t i = 0; i < tensor.dims().size(); ++i) { @@ -84,18 +97,140 @@ std::unique_ptr> Tensor2IArray( popart::TensorInfo tensor_info(dtype, shape); return std::make_unique>( - reinterpret_cast(tensor.data()), tensor_info); + reinterpret_cast(tensor.data()), tensor_info); } template std::unique_ptr> LoDTensor2IArray( - framework::LoDTensor const &lod_tensor) { - PADDLE_ENFORCE_EQ( - lod_tensor.lod().size(), 0UL, - platform::errors::InvalidArgument("LoDTensor2IArray is Unimplemented")); - return Tensor2IArray(lod_tensor); + LoDTensor const& lod_tensor) { + if (lod_tensor.lod().size() == 0) { + return Tensor2IArray(lod_tensor); + } else { + PADDLE_THROW( + platform::errors::Unimplemented("LoDTensor2IArray is Unimplemented")); + } +} + 
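// (Editorial worked example, not part of the patch.) RequestIpus() in
// ipu_utils.cc above rounds a request up to the next power of two, since
// multi-IPU devices are acquired in power-of-two groups:
//   RequestIpus(1) == 1, RequestIpus(2) == 2,
//   RequestIpus(3) == 4, RequestIpus(5) == 8.
// GetOptPrePostfix() pairs a popart state prefix with a paddle suffix: for
// adam/lamb, "Accl1___" + weight name on the popart side corresponds to
// weight name + "_moment1_0" in the paddle scope.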
+template <typename T>
+T GetSingleVarFromScope(const Scope* scope, const std::string& var_name) {
+  auto var = scope->GetVar(var_name);
+  auto tensor = var->Get<framework::LoDTensor>();
+  // note: assumes T matches the dtype of the variable
+  return tensor.data<T>()[0];
+}
+
+struct CustomOpAttrVisitor : public boost::static_visitor<void> {
+  explicit CustomOpAttrVisitor(std::map<std::string, popart::any>* attr,
+                               const std::string& attr_name)
+      : attrs_(attr), attr_name_(attr_name) {}
+  mutable std::map<std::string, popart::any>* attrs_;
+  std::string attr_name_;
+
+  void operator()(int v) const { attrs_->emplace(attr_name_, v); }
+  void operator()(float v) const { attrs_->emplace(attr_name_, v); }
+  void operator()(const std::string& v) const {
+    attrs_->emplace(attr_name_, v);
+  }
+  void operator()(const std::vector<int>& v) const {
+    attrs_->emplace(attr_name_, v);
+  }
+  void operator()(const std::vector<float>& v) const {
+    attrs_->emplace(attr_name_, v);
+  }
+  void operator()(const std::vector<std::string>& v) const {
+    attrs_->emplace(attr_name_, v);
+  }
+  void operator()(bool v) const { attrs_->emplace(attr_name_, v); }
+  void operator()(const std::vector<bool>& v) const {
+    attrs_->emplace(attr_name_, v);
+  }
+  void operator()(BlockDesc* desc) const {
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Unsupported calling method for `BlockDesc` type."));
+  }
+  void operator()(const std::vector<BlockDesc*>& v) const {
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Unsupported calling method for `BlockDesc` type."));
+  }
+  void operator()(int64_t v) const { attrs_->emplace(attr_name_, v); }
+  void operator()(const std::vector<int64_t>& v) const {
+    attrs_->emplace(attr_name_, v);
+  }
+  void operator()(const std::vector<double>& v) const {
+    attrs_->emplace(attr_name_, v);
+  }
+  void operator()(boost::blank) const {
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Unsupported calling method for `boost::blank` type."));
+  }
+};
+
+struct IpuCustomOpIdentifier {
+  IpuCustomOpIdentifier(const std::string& _paddle_op,
+                        const std::string& _popart_op,
+                        const std::string& _domain, unsigned int _version)
+      : paddle_op(_paddle_op), popart_op(_domain, _popart_op, _version) {}
+
+  std::string repr() {
+    std::ostringstream os;
+    os << "paddle_op: " << paddle_op << ", domain: " << popart_op.domain
+       << ", type: " << popart_op.type << ", version: " << popart_op.version;
+    return os.str();
+  }
+
+  std::string paddle_op;
+  popart::OperatorIdentifier popart_op;
+};
+
+struct ConstantOpAttrVisitor : public boost::static_visitor<void> {
+  explicit ConstantOpAttrVisitor(framework::LoDTensor* tensor,
+                                 framework::proto::VarType::Type dtype)
+      : tensor_(tensor), dtype_(dtype) {}
+  framework::LoDTensor* tensor_;
+  framework::proto::VarType::Type dtype_;
+
+  void operator()(const std::vector<int>& vec) const {
+    framework::TensorFromVector(vec, tensor_);
+  }
+  void operator()(const std::vector<float>& vec) const {
+    if (dtype_ == framework::proto::VarType::FP16) {
+      std::vector<float16> vec_fp16;
+      std::transform(vec.begin(), vec.end(), std::back_inserter(vec_fp16),
+                     [](float f) -> float16 { return float16(f); });
+      framework::TensorFromVector(vec_fp16, tensor_);
+    } else {
+      framework::TensorFromVector(vec, tensor_);
+    }
+  }
+  void operator()(const std::vector<bool>& vec) const {
+    framework::TensorFromVector(vec, tensor_);
+  }
+  void operator()(const std::vector<int64_t>& vec) const {
+    framework::TensorFromVector(vec, tensor_);
+  }
+  void operator()(const std::vector<double>& vec) const {
+    framework::TensorFromVector(vec, tensor_);
+  }
+  void RaiseError() const {
+    PADDLE_THROW(
+        platform::errors::InvalidArgument("Constant value must be a vector"));
+  }
+  void operator()(int v) const { RaiseError(); }
+  void operator()(float v) const { RaiseError(); }
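+  // note: a constant tensor can only be materialized from a vector
+  // attribute; the scalar and unsupported overloads here all raise the
+  // InvalidArgument error above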
+  void operator()(const std::string& v) const { RaiseError(); }
+  void operator()(const std::vector<std::string>& v) const { RaiseError(); }
+  void operator()(bool v) const { RaiseError(); }
+  void operator()(BlockDesc* desc) const { RaiseError(); }
+  void operator()(const std::vector<BlockDesc*>& v) const { RaiseError(); }
+  void operator()(int64_t v) const { RaiseError(); }
+  void operator()(boost::blank) const { RaiseError(); }
+};
+
+std::vector<std::pair<std::string, std::string>> GetOptPrePostfix(
+    const std::string& opt_type);
+
+int RequestIpus(const int num_ipus);
+
 }  // namespace ipu
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/device/ipu/supported_ops_custom.h b/paddle/fluid/platform/device/ipu/supported_ops_custom.h
new file mode 100644
index 00000000000..02d215433c5
--- /dev/null
+++ b/paddle/fluid/platform/device/ipu/supported_ops_custom.h
@@ -0,0 +1,21 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// clang-format off
+
+#pragma once
+
+OP_DECL(popart_nllloss_v2, aiGraphcoreOpset.nllloss, SIG_ARG(INT32,popart::ReductionType,reduction) OPT_ARG(INT32,ignoreIndex) ARG(BOOL,inputIsLogProbability) ) // NOLINT
+
+// clang-format on
--
GitLab
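
A minimal sketch of how the IpuCustomOpIdentifier helper introduced in
ipu_utils.h is expected to be used; the op and domain names below are
hypothetical, chosen for illustration only:

    #include <iostream>
    #include "paddle/fluid/platform/device/ipu/ipu_utils.h"

    int main() {
      using paddle::platform::ipu::IpuCustomOpIdentifier;
      // "custom_relu", "CustomRelu" and "custom.ops" are made-up names
      IpuCustomOpIdentifier id("custom_relu", "CustomRelu", "custom.ops", 1);
      // prints: paddle_op: custom_relu, domain: custom.ops,
      //         type: CustomRelu, version: 1
      std::cout << id.repr() << std::endl;
      return 0;
    }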