From 0b7536073f12dc47525265bbe31fdeffd2c69b4b Mon Sep 17 00:00:00 2001
From: zhupengyang <1165938320@qq.com>
Date: Mon, 23 Mar 2020 13:22:51 +0800
Subject: [PATCH] [NPU] cache om-models by inputs shape (#3242)

---
 lite/backends/npu/device.cc                  |  8 +--
 lite/backends/npu/device.h                   |  5 +-
 lite/core/mir/subgraph/subgraph_detector.cc  |  2 +-
 lite/core/mir/subgraph/subgraph_pass_test.cc | 41 +++------------
 lite/kernels/npu/subgraph_compute.cc         | 53 +++++++++++++++++---
 lite/kernels/npu/subgraph_compute.h          | 27 +++++++---
 6 files changed, 80 insertions(+), 56 deletions(-)

diff --git a/lite/backends/npu/device.cc b/lite/backends/npu/device.cc
index d62ac9cad3..345b239c32 100644
--- a/lite/backends/npu/device.cc
+++ b/lite/backends/npu/device.cc
@@ -19,8 +19,8 @@ namespace paddle {
 namespace lite {
 namespace npu {
 
-std::unique_ptr<hiai::AiModelMngerClient> Device::Build(
-    std::string& model_name,                 // NOLINT
+std::shared_ptr<hiai::AiModelMngerClient> Device::Build(
+    const std::string model_name,            // NOLINT
     std::vector<ge::Operator>& input_nodes,  // NOLINT
     std::vector<ge::Operator>& output_nodes  // NOLINT
     ) {
@@ -41,15 +41,15 @@ std::unique_ptr<hiai::AiModelMngerClient> Device::Build(
     ir_build.ReleaseModelBuff(om_model_buf);
     return nullptr;
   }
+
   // Create a HiAI model manager client to load the HiAI om model
-  std::unique_ptr<hiai::AiModelMngerClient> model_client(
+  std::shared_ptr<hiai::AiModelMngerClient> model_client(
       new hiai::AiModelMngerClient());
   if (model_client->Init(nullptr) != hiai::AI_SUCCESS) {
     LOG(WARNING) << "[NPU] AiModelMngerClient init failed)!";
     ir_build.ReleaseModelBuff(om_model_buf);
     return nullptr;
   }
-  model_name = "model_" + std::to_string(model_count_++) + ".om";
   auto model_desc = std::make_shared<hiai::AiModelDescription>(
       model_name, freq_level(), framework_type(), model_type(), device_type());
   model_desc->SetModelBuffer(om_model_buf.data, om_model_buf.length);
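Note on the device.cc hunks above: Build() previously generated a fresh om model name from an internal counter and handed back sole ownership as a std::unique_ptr. Once programs are cached per input shape, the same client object must stay alive in the cache map while the engine also holds it, so the return type becomes std::shared_ptr and the (now const) model name is chosen by the caller. The self-contained sketch below illustrates only that ownership pattern; Client and BuildClient are illustrative stand-ins, not HiAI APIs.

#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <utility>

// Stand-in for hiai::AiModelMngerClient (illustrative only).
struct Client {
  explicit Client(std::string name) : model_name(std::move(name)) {}
  std::string model_name;
};

// Stand-in for Device::Build(): the caller now picks the model name.
std::shared_ptr<Client> BuildClient(const std::string& model_name) {
  return std::make_shared<Client>(model_name);
}

int main() {
  std::map<std::string, std::shared_ptr<Client>> cache;
  // The cache and the currently running program share ownership of one
  // client, which is why unique_ptr no longer fits.
  auto client = BuildClient("model.om");
  cache["model.om"] = client;
  std::cout << client.use_count() << "\n";  // prints 2: cache + local handle
  return 0;
}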
diff --git a/lite/backends/npu/device.h b/lite/backends/npu/device.h
index 411600ae0a..6733a7f6df 100644
--- a/lite/backends/npu/device.h
+++ b/lite/backends/npu/device.h
@@ -40,8 +40,8 @@ class Device {
   // Build the HiAI IR graph to om model, return HiAI model manager client to
   // load om model and run inference.
-  std::unique_ptr<hiai::AiModelMngerClient> Build(
-      std::string& model_name,                 // NOLINT
+  std::shared_ptr<hiai::AiModelMngerClient> Build(
+      const std::string model_name,            // NOLINT
       std::vector<ge::Operator>& input_nodes,  // NOLINT
       std::vector<ge::Operator>& output_nodes  // NOLINT
       );                                       // NOLINT
 
@@ -51,7 +51,6 @@ class Device {
   int framework_type_{0};
   int model_type_{0};
   int device_type_{0};
-  int model_count_{0};
 };
 
 }  // namespace npu
diff --git a/lite/core/mir/subgraph/subgraph_detector.cc b/lite/core/mir/subgraph/subgraph_detector.cc
index 994f346ced..c976a57c33 100644
--- a/lite/core/mir/subgraph/subgraph_detector.cc
+++ b/lite/core/mir/subgraph/subgraph_detector.cc
@@ -407,7 +407,7 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph,
   cpp::OpDesc subgraph_op_desc;
   subgraph_op_desc.SetType("subgraph");
-  // Create a new sub block desc for storing all of Ops an Vars of the target
+  // Create a new sub block desc for storing all of Ops and Vars of the target
   // subgraph and sub_block_idx is set as a attribute of subgraph op,
   // sub_block_idx < 0 means it's a new subgraph op
   int sub_block_idx = -(subgraph_idx + 1);
diff --git a/lite/core/mir/subgraph/subgraph_pass_test.cc b/lite/core/mir/subgraph/subgraph_pass_test.cc
index 7cf65a23b8..7117e1b339 100644
--- a/lite/core/mir/subgraph/subgraph_pass_test.cc
+++ b/lite/core/mir/subgraph/subgraph_pass_test.cc
@@ -17,6 +17,7 @@
 #include "lite/api/paddle_api.h"
 #include "lite/api/test_helper.h"
 #include "lite/utils/cp_logging.h"
+#include "lite/utils/string.h"
 
 DEFINE_string(model_file, "", "model file path of combined protobuf model");
 DEFINE_string(params_file, "", "params file path of combined protobuf model");
@@ -31,43 +32,17 @@ namespace lite {
 // The helper functions for loading and running model from command line and
 // verifying output data
 std::vector<std::string> TypeParsing(std::string text) {
-  std::vector<std::string> types;
-  while (!text.empty()) {
-    size_t index = text.find_first_of(":");
-    std::string type = text.substr(0, index);
-    VLOG(3) << type;
-    types.push_back(type);
-    if (index == std::string::npos) {
-      break;
-    } else {
-      text = text.substr(index + 1);
-    }
-  }
-  return types;
+  return Split(text, ":");
 }
 
 std::vector<std::vector<int64_t>> ShapeParsing(std::string text) {
   std::vector<std::vector<int64_t>> shapes;
-  while (!text.empty()) {
-    size_t index = text.find_first_of(":");
-    std::string slice = text.substr(0, index);
-    std::vector<int64_t> shape;
-    while (!slice.empty()) {
-      size_t index = slice.find_first_of(",");
-      int d = atoi(slice.substr(0, index).c_str());
-      VLOG(3) << d;
-      shape.push_back(d);
-      if (index == std::string::npos) {
-        break;
-      } else {
-        slice = slice.substr(index + 1);
-      }
-    }
-    shapes.push_back(shape);
-    if (index == std::string::npos) {
-      break;
-    } else {
-      text = text.substr(index + 1);
+  std::vector<std::string> shape_strings = Split(text, ":");
+  shapes.resize(shape_strings.size());
+  for (int i = 0; i < shape_strings.size(); i++) {
+    std::vector<std::string> shape_nums = Split(shape_strings[i], ",");
+    for (auto shape_num : shape_nums) {
+      shapes[i].push_back(atoi(shape_num.c_str()));
     }
   }
   return shapes;
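The test-helper rewrite above delegates tokenizing to Split() from lite/utils/string.h instead of two hand-rolled find_first_of loops. Below is a self-contained sketch of the same parsing, with a local Split stand-in so it only assumes the standard library; the real helper's exact signature may differ.

#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <string>
#include <vector>

// Local stand-in for lite's Split (illustrative only): split on a
// separator character, keeping the non-empty pieces.
std::vector<std::string> Split(const std::string& text, const std::string& sep) {
  std::vector<std::string> pieces;
  size_t begin = 0;
  while (begin < text.size()) {
    size_t end = text.find_first_of(sep, begin);
    if (end == std::string::npos) end = text.size();
    if (end > begin) pieces.push_back(text.substr(begin, end - begin));
    begin = end + 1;
  }
  return pieces;
}

// Mirrors ShapeParsing(): "1,3,224,224:1,10" -> {{1,3,224,224},{1,10}}.
std::vector<std::vector<int64_t>> ShapeParsing(const std::string& text) {
  std::vector<std::vector<int64_t>> shapes;
  for (const auto& shape_string : Split(text, ":")) {
    std::vector<int64_t> shape;
    for (const auto& num : Split(shape_string, ",")) {
      shape.push_back(atoi(num.c_str()));
    }
    shapes.push_back(shape);
  }
  return shapes;
}

int main() {
  for (const auto& shape : ShapeParsing("1,3,224,224:1,10")) {
    for (auto d : shape) std::cout << d << " ";
    std::cout << "\n";
  }
  return 0;
}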
diff --git a/lite/kernels/npu/subgraph_compute.cc b/lite/kernels/npu/subgraph_compute.cc
index 770ea345b6..d7b14a9319 100644
--- a/lite/kernels/npu/subgraph_compute.cc
+++ b/lite/kernels/npu/subgraph_compute.cc
@@ -85,22 +85,31 @@ int SubgraphEngine::BuildDeviceProgram() {
       << "[NPU] No input nodes found for building NPU model";
   CHECK(!device_onames_.empty())
       << "[NPU] No output nodes found for building NPU model";
+
   // Build the HiAI IR graph to HiAI om model as the device program
-  device_program_ = lite::npu::Device::Global().Build(
+  if (device_program_map_.count(inputs_shape_) > 0) {
+    return status;
+  }
+  auto device_client = lite::npu::Device::Global().Build(
       model_name_, device_inodes, device_onodes);
-  if (device_program_ == nullptr) {
+  if (device_client == nullptr) {
     LOG(WARNING) << "[NPU] Build model failed!";
     return subgraph::FAILED;
   }
+  auto device_program = std::make_shared<device_program_t>(device_client);
+  device_program_map_[inputs_shape_] = device_program;
   // Query and check the dimensions of valid input and output tensors
   std::vector<hiai::TensorDimension> device_idims, device_odims;
-  if (device_program_->GetModelIOTensorDim(
+  if (device_program->client->GetModelIOTensorDim(
           model_name_, device_idims, device_odims) != hiai::AI_SUCCESS) {
     LOG(WARNING)
         << "[NPU] Get the dimensions of input and output tensors failed!";
     return subgraph::FAILED;
   }
+  device_program->device_idims = device_idims;
+  device_program->device_odims = device_odims;
+
   CHECK_EQ(device_idims.size(), device_inames_.size());
   CHECK_EQ(device_odims.size(), device_onames_.size());
   origin_idims_.resize(device_inames_.size());
@@ -109,6 +118,7 @@ int SubgraphEngine::BuildDeviceProgram() {
   origin_odims_.resize(device_onames_.size());
   origin_otensors_.resize(device_onames_.size());
   device_otensors_.resize(device_onames_.size());
+
   for (int i = 0; i < device_inames_.size(); i++) {
     auto node = graph.Get(device_inames_[i]);
     auto precision = node->precision();
@@ -130,6 +140,8 @@ int SubgraphEngine::BuildDeviceProgram() {
     device_itensors_[i].reset(new hiai::AiTensor);
     device_itensors_[i]->Init(&(device_idims[i]));
   }
+  device_program->origin_idims = origin_idims_;
+
   for (int i = 0; i < device_onames_.size(); i++) {
     auto node = graph.Get(device_onames_[i]);
     auto precision = node->precision();
@@ -170,6 +182,8 @@ int SubgraphEngine::BuildDeviceProgram() {
            << PrecisionToStr(precision);
         break;
     }
+    device_program->origin_odims = origin_odims_;
+
     CHECK_EQ(origin_odims_[i].production(),
              device_odims[i].GetNumber() * device_odims[i].GetChannel() *
                  device_odims[i].GetHeight() * device_odims[i].GetWidth());
@@ -181,14 +195,25 @@ int SubgraphEngine::LaunchDeviceProgram() {
   // Copy the data of origin input tensors to the buffer of input HiAI tensors
+  // init device_itensors_, device_otensors_, origin_otensors_
+  auto device_program = device_program_map_[inputs_shape_];
   for (size_t i = 0; i < device_itensors_.size(); i++) {
+    device_itensors_[i]->Init(&(device_program->device_idims[i]));
     std::memcpy(device_itensors_[i]->GetBuffer(),
                 origin_itensors_[i]->raw_data(),
                 origin_itensors_[i]->memory_size());
   }
+  for (size_t i = 0; i < device_otensors_.size(); i++) {
+    device_otensors_[i]->Init(&(device_program->device_odims[i]));
+  }
+  for (size_t i = 0; i < origin_otensors_.size(); i++) {
+    origin_otensors_[i]->Resize(device_program->origin_odims[i]);
+  }
+
   // Run the HiAI model by name
   std::string key = "model_name";  // Note: key seems must be model_name
-  model_context_.AddPara(key, model_name_);
+  hiai::AiContext model_context;
+  model_context.AddPara(key, model_name_);
   auto GetCurrentUS = []() -> double {
     struct timeval time;
     gettimeofday(&time, NULL);
@@ -196,11 +221,11 @@ int SubgraphEngine::LaunchDeviceProgram() {
   };
   int istamp;
   auto start_time = GetCurrentUS();
-  CHECK_EQ(
-      device_program_->Process(
-          model_context_, device_itensors_, device_otensors_, 1000, istamp),
-      hiai::AI_SUCCESS);
+  CHECK_EQ(device_program->client->Process(
+               model_context, device_itensors_, device_otensors_, 1000, istamp),
+           hiai::AI_SUCCESS);
   VLOG(3) << "[NPU] Process cost " << GetCurrentUS() - start_time << " us";
+
   // Copy the data of output HiAI tensor to the buffer of origin output tensors
   for (size_t i = 0; i < device_otensors_.size(); i++) {
     std::memcpy(const_cast<void*>(origin_otensors_[i]->raw_data()),
@@ -210,6 +235,18 @@ int SubgraphEngine::LaunchDeviceProgram() {
   return 0;
 }
 
+bool SubgraphEngine::InputShapeChanged() {
+  std::vector<std::vector<int64_t>> new_shape;
+  for (auto origin_itensor : origin_itensors_) {
+    new_shape.push_back(origin_itensor->dims().Vectorize());
+  }
+  inputs_shape_ = new_shape;
+  if (device_program_map_.count(inputs_shape_) > 0) {
+    return false;
+  }
+  return true;
+}
+
 void SubgraphCompute::PrepareForRun() {
   auto& param = this->Param<param_t>();
   engine_.reset(new SubgraphEngine(ctx_.get(),
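The two functions above carry the core of the change: BuildDeviceProgram() returns early when a program for the current inputs_shape_ is already cached, and LaunchDeviceProgram() re-initializes the HiAI tensors from the dimensions stored in the cached device_program_t before every run. The self-contained sketch below reproduces that cache discipline with standard-library types only; Engine, DeviceProgram, and the build step are simplified stand-ins for the patch's SubgraphEngine, device_program_t, and the om build.

#include <cstdint>
#include <iostream>
#include <map>
#include <memory>
#include <vector>

using Shapes = std::vector<std::vector<int64_t>>;

struct DeviceProgram {  // stand-in for device_program_t
  Shapes shapes;
};

struct Engine {  // stand-in for SubgraphEngine
  Shapes inputs_shape_;
  std::map<Shapes, std::shared_ptr<DeviceProgram>> device_program_map_;

  // Mirrors InputShapeChanged(): refresh the key, then probe the cache.
  bool InputShapeChanged(const Shapes& new_shape) {
    inputs_shape_ = new_shape;
    return device_program_map_.count(inputs_shape_) == 0;
  }

  // Mirrors BuildDeviceProgram(): build only on a cache miss.
  void BuildDeviceProgram() {
    if (device_program_map_.count(inputs_shape_) > 0) return;  // cached
    auto program = std::make_shared<DeviceProgram>();
    program->shapes = inputs_shape_;  // the expensive om build happens here
    device_program_map_[inputs_shape_] = program;
    std::cout << "built program #" << device_program_map_.size() << "\n";
  }
};

int main() {
  Engine engine;
  Shapes a = {{1, 3, 224, 224}}, b = {{2, 3, 224, 224}};
  for (const auto& s : {a, b, a}) {  // the third run reuses the first program
    if (engine.InputShapeChanged(s)) engine.BuildDeviceProgram();
  }
  return 0;
}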
diff --git a/lite/kernels/npu/subgraph_compute.h b/lite/kernels/npu/subgraph_compute.h
index 29aeb01cdb..801f61b036 100644
--- a/lite/kernels/npu/subgraph_compute.h
+++ b/lite/kernels/npu/subgraph_compute.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <map>
 #include <memory>
 #include <string>
 #include <vector>
@@ -38,17 +39,29 @@ class SubgraphEngine : public subgraph::Engine {
       : subgraph::Engine(
             ctx, block_idx, block_desc, input_names, output_names, scope) {}
 
+  struct device_program_t {
+    explicit device_program_t(std::shared_ptr<hiai::AiModelMngerClient> _client)
+        : client(_client) {}
+    std::shared_ptr<hiai::AiModelMngerClient> client{nullptr};
+    std::vector<DDim> origin_idims{};
+    std::vector<DDim> origin_odims{};
+    std::vector<hiai::TensorDimension> device_idims{};
+    std::vector<hiai::TensorDimension> device_odims{};
+  };
+
  protected:
   int BuildDeviceProgram() override;
   int LaunchDeviceProgram() override;
+  bool InputShapeChanged() override;
 
-  std::string model_name_;
-  hiai::AiContext model_context_;
-  std::vector<std::string> device_inames_;
-  std::vector<std::string> device_onames_;
-  std::vector<std::shared_ptr<hiai::AiTensor>> device_itensors_;
-  std::vector<std::shared_ptr<hiai::AiTensor>> device_otensors_;
-  std::unique_ptr<hiai::AiModelMngerClient> device_program_{nullptr};
+  std::string model_name_{"model.om"};
+  std::vector<std::vector<int64_t>> inputs_shape_{};
+  std::map<std::vector<std::vector<int64_t>>, std::shared_ptr<device_program_t>>
+      device_program_map_{};
+  std::vector<std::string> device_inames_{};
+  std::vector<std::string> device_onames_{};
+  std::vector<std::shared_ptr<hiai::AiTensor>> device_itensors_{};
+  std::vector<std::shared_ptr<hiai::AiTensor>> device_otensors_{};
 };
 
 class SubgraphCompute : public KernelLite<TARGET(kNPU), PRECISION(kFloat)> {
-- 
GitLab
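A closing note on subgraph_compute.h: device_program_t bundles the loaded client together with the input and output dimensions recorded at build time, precisely so that a later run with the same shapes can re-arm the reusable tensor vectors without touching the om builder again. Below is a minimal sketch of that per-run re-initialization, once more with simplified stand-ins (Dim, Tensor, DeviceProgram) in place of hiai::TensorDimension, hiai::AiTensor, DDim, and the patch's device_program_t.

#include <cstdint>
#include <iostream>
#include <memory>
#include <vector>

struct Dim { int64_t n, c, h, w; };  // stand-in for hiai::TensorDimension

struct Tensor {  // stand-in for hiai::AiTensor
  Dim dim{};
  void Init(const Dim* d) { dim = *d; }  // (re)allocate for this shape
};

struct DeviceProgram {  // stand-in for device_program_t
  std::vector<Dim> device_idims, device_odims;
};

// Mirrors the LaunchDeviceProgram() prologue: before every run, size the
// engine's reusable tensors from the dims cached when the program was built.
void ReArm(const DeviceProgram& program,
           std::vector<std::shared_ptr<Tensor>>* itensors,
           std::vector<std::shared_ptr<Tensor>>* otensors) {
  for (size_t i = 0; i < itensors->size(); i++)
    (*itensors)[i]->Init(&program.device_idims[i]);
  for (size_t i = 0; i < otensors->size(); i++)
    (*otensors)[i]->Init(&program.device_odims[i]);
}

int main() {
  DeviceProgram program{{{1, 3, 224, 224}}, {{1, 10, 1, 1}}};
  std::vector<std::shared_ptr<Tensor>> ins{std::make_shared<Tensor>()};
  std::vector<std::shared_ptr<Tensor>> outs{std::make_shared<Tensor>()};
  ReArm(program, &ins, &outs);
  std::cout << ins[0]->dim.h << "x" << ins[0]->dim.w << "\n";  // 224x224
  return 0;
}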