Unverified commit 97b54fbe authored by Qi Li, committed by GitHub

[NPU] enhance cache offline model, test=develop (#3805)

* [NPU] enhance cache offline model, test=develop
Parent be7cc8f8
...@@ -120,3 +120,6 @@ metal/MobileNetDemo/MobileNetDemo/Resources
 lite/model_parser/flatbuffers/framework_generated.h
 build*
+# hiai libs
+ai_ddk_lib*
...@@ -35,7 +35,11 @@ endif()
 if(NOT DEFINED ANDROID_API_LEVEL)
   set(ANDROID_API_LEVEL "23")
   if(ARM_TARGET_ARCH_ABI STREQUAL "armv7")
-    set(ANDROID_API_LEVEL "22")
+    if(LITE_WITH_NPU AND NOT LITE_ON_TINY_PUBLISH)
+      set(ANDROID_API_LEVEL "24") # HIAI DDK depends on android-24
+    else()
+      set(ANDROID_API_LEVEL "22")
+    endif()
   endif()
 endif()
......
...@@ -73,6 +73,10 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
   }
   mode_ = config.power_mode();
   threads_ = config.threads();
+#ifdef LITE_WITH_NPU
+  Context<TargetType::kNPU>::SetSubgraphModelCacheDir(
+      config.subgraph_model_cache_dir());
+#endif
 #if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \
     !(defined LITE_ON_MODEL_OPTIMIZE_TOOL)
   int num_threads = config.x86_math_library_num_threads();
......
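A minimal usage sketch of the new cache option from the user side (not part of this commit; the setter name `set_subgraph_model_cache_dir()` is assumed to mirror the getter used above, and the model dir and cache path are illustrative):

```cpp
#include "paddle_api.h"  // NOLINT

// Sketch only: enable NPU subgraph model caching via the C++ full API.
void RunWithNpuModelCache() {
  paddle::lite_api::CxxConfig config;
  config.set_model_dir("./mobilenet_v1");
  config.set_valid_places(
      {paddle::lite_api::Place{TARGET(kNPU), PRECISION(kFloat)},
       paddle::lite_api::Place{TARGET(kARM), PRECISION(kFloat)}});
  // The generated HiAI .om models and .cfg files are cached in this directory
  // and reused on the next run instead of being rebuilt online.
  config.set_subgraph_model_cache_dir("/data/local/tmp/npu_cache");
  auto predictor =
      paddle::lite_api::CreatePaddlePredictor<paddle::lite_api::CxxConfig>(
          config);
  predictor->Run();
}
```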
...@@ -20,96 +20,122 @@ namespace paddle { ...@@ -20,96 +20,122 @@ namespace paddle {
namespace lite { namespace lite {
namespace npu { namespace npu {
bool WriteToOMFile(const domi::ModelBufferData& om_model_buff, std::shared_ptr<hiai::AiModelMngerClient> Device::Load(
std::string om_file_path) { const std::string& model_name,
FILE* fp; std::vector<char>* model_buffer,
fp = fopen(om_file_path.c_str(), "wb"); bool* model_comp) {
CHECK(fp != nullptr) << om_file_path << " open failed!";
uint32_t write_size =
(uint32_t)fwrite(om_model_buff.data, 1, om_model_buff.length, fp);
CHECK_EQ(write_size, om_model_buff.length) << "write om file failed !";
fclose(fp);
return true;
}
bool ReadFromOMFile(domi::ModelBufferData* om_model_buff,
std::string om_file_path) {
FILE* fp;
fp = fopen(om_file_path.c_str(), "rb");
CHECK(fp != nullptr) << om_file_path << " open failed!";
fseek(fp, 0, SEEK_END);
uint32_t model_length = (uint32_t)ftell(fp);
fseek(fp, 0, SEEK_SET);
om_model_buff->data = malloc(model_length);
om_model_buff->length = model_length;
uint32_t read_size =
(uint32_t)fread(om_model_buff->data, 1, model_length, fp);
CHECK_EQ(read_size, model_length) << "read om file failed !";
fclose(fp);
return true;
}
std::shared_ptr<hiai::AiModelMngerClient> Device::Build(
const std::string model_name, // NOLINT
std::vector<ge::Operator>& input_nodes, // NOLINT
std::vector<ge::Operator>& output_nodes, // NOLINT
const std::string model_cache_full_dir = "" // NOLINT
) {
VLOG(3) << "[NPU] Build model";
// Build the HiAI IR graph to the HiAI om model
ge::Graph ir_graph("graph");
ir_graph.SetInputs(input_nodes).SetOutputs(output_nodes);
ge::Model om_model("model", "model");
om_model.SetGraph(ir_graph);
domi::HiaiIrBuild ir_build;
domi::ModelBufferData om_model_buf;
if (!model_cache_full_dir.empty() && IsFileExists(model_cache_full_dir)) {
VLOG(3) << "Will read om model from " << model_cache_full_dir;
ReadFromOMFile(&om_model_buf, model_cache_full_dir);
} else {
if (!ir_build.CreateModelBuff(om_model, om_model_buf)) {
LOG(WARNING) << "[NPU] CreateModelBuff failed!";
return nullptr;
}
if (!ir_build.BuildIRModel(om_model, om_model_buf)) {
LOG(WARNING) << "[NPU] BuildIRModel failed!";
ir_build.ReleaseModelBuff(om_model_buf);
return nullptr;
}
if (!model_cache_full_dir.empty()) {
VLOG(3) << "Will write om model to " << model_cache_full_dir;
WriteToOMFile(om_model_buf, model_cache_full_dir);
}
}
// Create a HiAI model manager client to load the HiAI om model // Create a HiAI model manager client to load the HiAI om model
std::shared_ptr<hiai::AiModelMngerClient> model_client( auto model_client = std::make_shared<hiai::AiModelMngerClient>();
new hiai::AiModelMngerClient());
if (model_client->Init(nullptr) != hiai::AI_SUCCESS) { if (model_client->Init(nullptr) != hiai::AI_SUCCESS) {
LOG(WARNING) << "[NPU] AiModelMngerClient init failed)!"; LOG(WARNING) << "[NPU] Init hiai model client failed!";
ir_build.ReleaseModelBuff(om_model_buf);
return nullptr; return nullptr;
} }
// Check HiAI DDK version
const char* ddk_version = model_client->GetVersion();
if (ddk_version) {
LOG(INFO) << "[NPU] HiAI DDK version: " << ddk_version;
} else {
LOG(WARNING) << "[NPU] Unable to get HiAI DDK version!";
}
// Check model compatibility
auto model_desc = std::make_shared<hiai::AiModelDescription>( auto model_desc = std::make_shared<hiai::AiModelDescription>(
model_name, freq_level(), framework_type(), model_type(), device_type()); model_name, freq_level(), framework_type(), model_type(), device_type());
model_desc->SetModelBuffer(om_model_buf.data, om_model_buf.length); model_desc->SetModelBuffer(
std::vector<std::shared_ptr<hiai::AiModelDescription>> model_descs; reinterpret_cast<const void*>(model_buffer->data()),
model_descs.push_back(model_desc); model_buffer->size());
if (!*model_comp &&
model_client->CheckModelCompatibility(*model_desc, *model_comp) !=
hiai::AI_SUCCESS) {
*model_comp = false;
VLOG(3) << "[NPU] model is NOT compatiblitiable, setting model_comp to "
<< *model_comp;
} else {
*model_comp = true;
VLOG(3) << "[NPU] model is compatiblitiable, setting model_comp to "
<< *model_comp;
}
// Rebuild and write the data of the compatible model to the model buffer
if (!*model_comp) {
std::shared_ptr<hiai::AiModelBuilder> model_builder =
std::make_shared<hiai::AiModelBuilder>(model_client);
hiai::MemBuffer* org_model_buffer = model_builder->InputMemBufferCreate(
reinterpret_cast<void*>(model_buffer->data()), model_buffer->size());
if (org_model_buffer) {
std::vector<hiai::MemBuffer*> org_model_buffers;
org_model_buffers.push_back(org_model_buffer);
hiai::MemBuffer* new_model_buffer = model_builder->OutputMemBufferCreate(
framework_type(), org_model_buffers);
// VLOG(3) << "[NPU] new model buffer memeory size is " <<
// new_model_buffer->GetMemBufferSize();
if (new_model_buffer) {
uint32_t new_model_size = 0;
if (model_builder->BuildModel(org_model_buffers,
new_model_buffer,
new_model_size) == hiai::AI_SUCCESS) {
// need to change to new_model_size as GetMemBufferSize is not
// correct.
model_buffer->resize(new_model_size);
memcpy(reinterpret_cast<void*>(model_buffer->data()),
new_model_buffer->GetMemBufferData(),
new_model_size);
// Reset the model buffer
model_desc->SetModelBuffer(
reinterpret_cast<const void*>(model_buffer->data()),
model_buffer->size());
VLOG(3) << "[NPU] Rebuild the compatible model done.";
} else {
LOG(WARNING) << "[NPU] Rebuild the compatible model failed!";
}
model_builder->MemBufferDestroy(new_model_buffer);
} else {
LOG(WARNING) << "[NPU] OutputMemBufferCreate failed!";
}
model_builder->MemBufferDestroy(org_model_buffer);
} else {
LOG(WARNING) << "[NPU] InputMemBufferCreate failed!";
}
}
// Load the compatible model
std::vector<std::shared_ptr<hiai::AiModelDescription>> model_descs{
model_desc};
if (model_client->Load(model_descs) != hiai::AI_SUCCESS) { if (model_client->Load(model_descs) != hiai::AI_SUCCESS) {
LOG(WARNING) << "[NPU] AiModelMngerClient load model failed!"; LOG(WARNING) << "[NPU] AiModelMngerClient load model failed!";
ir_build.ReleaseModelBuff(om_model_buf);
return nullptr; return nullptr;
} }
ir_build.ReleaseModelBuff(om_model_buf); VLOG(3) << "[NPU] Load model done.";
VLOG(3) << "[NPU] Build done";
return model_client; return model_client;
} }
bool Device::Build(std::vector<ge::Operator>& input_nodes, // NOLINT
std::vector<ge::Operator>& output_nodes, // NOLINT
std::vector<char>* model_buffer) {
// Convert the HiAI IR graph to the HiAI om model
ge::Graph ir_graph("graph");
ir_graph.SetInputs(input_nodes).SetOutputs(output_nodes);
ge::Model om_model("model", "model");
om_model.SetGraph(ir_graph);
// Build the HiAI om model, serialize and output it to the om buffer
domi::HiaiIrBuild ir_build;
domi::ModelBufferData om_buffer;
if (!ir_build.CreateModelBuff(om_model, om_buffer)) {
LOG(WARNING) << "[NPU] CreateModelBuff failed!";
return false;
}
if (!ir_build.BuildIRModel(om_model, om_buffer)) {
LOG(WARNING) << "[NPU] BuildIRModel failed!";
ir_build.ReleaseModelBuff(om_buffer);
return false;
}
model_buffer->resize(om_buffer.length);
memcpy(reinterpret_cast<void*>(model_buffer->data()),
reinterpret_cast<void*>(om_buffer.data),
om_buffer.length);
ir_build.ReleaseModelBuff(om_buffer);
VLOG(3) << "[NPU] Build model done.";
return true;
}
} // namespace npu } // namespace npu
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
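The refactoring splits the old all-in-one Build() into a pure build step that produces a serializable byte buffer and a Load() step that handles compatibility checking and client creation. A minimal sketch of how a caller is expected to chain them (mirroring the usage in subgraph_compute.cc further down; the model name and cache path are illustrative, and the HiAI DDK plus a real device are required to actually run this):

```cpp
#include <memory>
#include <string>
#include <vector>
#include "lite/backends/npu/device.h"
#include "lite/utils/io.h"

// Sketch only, not part of the commit.
std::shared_ptr<hiai::AiModelMngerClient> BuildAndCache(
    std::vector<ge::Operator>& input_nodes,   // NOLINT
    std::vector<ge::Operator>& output_nodes,  // NOLINT
    const std::string& cache_path) {
  std::vector<char> model_buffer;
  // 1. Lower the HiAI IR graph into a serialized om model held in a byte buffer.
  if (!paddle::lite::npu::Device::Global().Build(
          input_nodes, output_nodes, &model_buffer)) {
    return nullptr;
  }
  // 2. Load the buffer. A freshly built model is assumed compatible, so
  //    model_comp starts as true; when a cached file is loaded instead, it
  //    starts as false and Load() rewrites model_buffer with a compatible model.
  bool model_comp = true;
  auto client = paddle::lite::npu::Device::Global().Load(
      "cached_model_0", &model_buffer, &model_comp);
  // 3. Persist the (possibly rewritten) buffer for the next run.
  if (client && !cache_path.empty()) {
    paddle::lite::WriteFile(cache_path, model_buffer);
  }
  return client;
}
```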
...@@ -38,14 +38,18 @@ class Device {
   int model_type() { return model_type_; }
   int device_type() { return device_type_; }
 
+  // Load the HiAI om model from buffer, rebuild the model if it's incompatible
+  // with the current device, then create a HiAI model manager client(from HiAI
+  // Server) to run inference
+  std::shared_ptr<hiai::AiModelMngerClient> Load(
+      const std::string& model_name,
+      std::vector<char>* model_buffer,
+      bool* model_comp);
   // Build the HiAI IR graph to om model, return HiAI model manager client to
   // load om model and run inference.
-  std::shared_ptr<hiai::AiModelMngerClient> Build(
-      const std::string model_name,             // NOLINT
-      std::vector<ge::Operator>& input_nodes,   // NOLINT
-      std::vector<ge::Operator>& output_nodes,  // NOLINT
-      const std::string model_cache_name        // NOLINT
-      );                                        // NOLINT
+  bool Build(std::vector<ge::Operator>& input_nodes,   // NOLINT
+             std::vector<ge::Operator>& output_nodes,  // NOLINT
+             std::vector<char>* model_buffer);
 
  private:
   int freq_level_{3};
......
...@@ -425,42 +425,51 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph, ...@@ -425,42 +425,51 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph,
subgraph_op_desc.SetAttr<int32_t>("sub_block", sub_block_idx); subgraph_op_desc.SetAttr<int32_t>("sub_block", sub_block_idx);
// Extract input and output nodes from the target subgraph // Extract input and output nodes from the target subgraph
std::set<Node *> input_var_nodes; std::set<Node *> idata_var_nodes;
std::set<Node *> weight_var_nodes; std::set<Node *> weight_var_nodes;
std::set<Node *> output_var_nodes; std::set<Node *> odata_var_nodes;
std::set<Node *> local_var_nodes; std::set<Node *> local_var_nodes;
std::set<Node *> unused_var_nodes; std::set<Node *> unused_var_nodes;
ExtractInputsOutputs(subgraph_nodes, ExtractInputsOutputs(subgraph_nodes,
&input_var_nodes, &idata_var_nodes,
&weight_var_nodes, &weight_var_nodes,
&output_var_nodes, &odata_var_nodes,
&local_var_nodes, &local_var_nodes,
&unused_var_nodes); &unused_var_nodes);
// A simplified model without the original weight/local/unused nodes on the
// subgraph ops will be saved only if 'SUBGRAPH_DISABLE_ONLINE_MODE' is set to
// true and Predictor->Run(...), Predictor->Save(...) is called.
std::set<Node *> input_var_nodes(idata_var_nodes.begin(),
idata_var_nodes.end());
std::set<Node *> output_var_nodes(odata_var_nodes.begin(),
odata_var_nodes.end());
if (!GetBoolFromEnv(SUBGRAPH_DISABLE_ONLINE_MODE)) {
input_var_nodes.insert(weight_var_nodes.begin(), weight_var_nodes.end());
output_var_nodes.insert(local_var_nodes.begin(), local_var_nodes.end());
output_var_nodes.insert(unused_var_nodes.begin(), unused_var_nodes.end());
}
// Set input and output name mapping which stores the real inputs and // Set input and output name mapping which stores the real inputs and
// outputs // outputs
std::vector<std::string> input_var_names; std::vector<std::string> idata_var_names;
std::vector<std::string> output_var_names; std::vector<std::string> odata_var_names;
for (auto &var_node : input_var_nodes) { for (auto &var_node : idata_var_nodes) {
input_var_names.push_back(var_node->AsArg().name); idata_var_names.push_back(var_node->AsArg().name);
} }
for (auto &var_node : output_var_nodes) { for (auto &var_node : odata_var_nodes) {
output_var_names.push_back(var_node->AsArg().name); odata_var_names.push_back(var_node->AsArg().name);
} }
subgraph_op_desc.SetAttr<std::vector<std::string>>("input_data_names", subgraph_op_desc.SetAttr<std::vector<std::string>>("input_data_names",
input_var_names); idata_var_names);
subgraph_op_desc.SetAttr<std::vector<std::string>>("output_data_names", subgraph_op_desc.SetAttr<std::vector<std::string>>("output_data_names",
output_var_names); odata_var_names);
// Set all of the inputs and outputs to the target subgraph op // Set all of the inputs and outputs to the target subgraph op
// To prevent vars are removed in RuntimeProgram::UpdateVarsOfProgram() // To prevent vars are removed in RuntimeProgram::UpdateVarsOfProgram()
for (auto &var_node : weight_var_nodes) { std::vector<std::string> input_var_names;
std::vector<std::string> output_var_names;
for (auto &var_node : input_var_nodes) {
input_var_names.push_back(var_node->AsArg().name); input_var_names.push_back(var_node->AsArg().name);
} }
for (auto &var_node : local_var_nodes) { for (auto &var_node : output_var_nodes) {
output_var_names.push_back(var_node->AsArg().name);
}
for (auto &var_node : unused_var_nodes) {
output_var_names.push_back(var_node->AsArg().name); output_var_names.push_back(var_node->AsArg().name);
} }
subgraph_op_desc.SetInput("Inputs", input_var_names); subgraph_op_desc.SetInput("Inputs", input_var_names);
...@@ -500,26 +509,13 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph, ...@@ -500,26 +509,13 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph,
for (auto &var_node : input_var_nodes) { for (auto &var_node : input_var_nodes) {
IR_NODE_LINK_TO(var_node, subgraph_op_node); IR_NODE_LINK_TO(var_node, subgraph_op_node);
} }
for (auto &var_node : weight_var_nodes) {
IR_NODE_LINK_TO(var_node, subgraph_op_node);
}
for (auto &var_node : output_var_nodes) { for (auto &var_node : output_var_nodes) {
IR_OP_VAR_LINK(subgraph_op_node, var_node); IR_OP_VAR_LINK(subgraph_op_node, var_node);
} }
for (auto &var_node : local_var_nodes) {
IR_OP_VAR_LINK(subgraph_op_node, var_node);
}
for (auto &var_node : unused_var_nodes) {
IR_OP_VAR_LINK(subgraph_op_node, var_node);
}
// Remove subgraph nodes and unused var nodes // Remove subgraph nodes and unused var nodes
auto nodes2rm = GetNodes2RM(subgraph_nodes, auto nodes2rm =
{input_var_nodes, GetNodes2RM(subgraph_nodes, {input_var_nodes, output_var_nodes});
weight_var_nodes,
output_var_nodes,
local_var_nodes,
unused_var_nodes});
GraphSafeRemoveNodes(graph, nodes2rm); GraphSafeRemoveNodes(graph, nodes2rm);
} }
...@@ -594,7 +590,17 @@ std::set<const Node *> GetNodes2RM( ...@@ -594,7 +590,17 @@ std::set<const Node *> GetNodes2RM(
std::set<const Node *> nodes2rm(op_nodes.begin(), op_nodes.end()); std::set<const Node *> nodes2rm(op_nodes.begin(), op_nodes.end());
for (auto &op_node : op_nodes) { for (auto &op_node : op_nodes) {
for (auto &var_node : op_node->inlinks) { for (auto &var_node : op_node->inlinks) {
if (!nodes2rm.count(var_node)) { bool skip = false;
// skip the var node which is used by any other ops that doesn't belong to
// the subgraph ops.
for (auto &out_op_node : var_node->outlinks) {
if (std::find(op_nodes.begin(), op_nodes.end(), out_op_node) !=
op_nodes.end()) {
skip = true;
break;
}
}
if (!skip && !nodes2rm.count(var_node)) {
nodes2rm.insert(var_node); nodes2rm.insert(var_node);
} }
} }
......
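With this change, the fuser keeps the weight/local/unused var nodes attached to the subgraph op only when online mode is enabled; setting the new environment switch drops them so a simplified model can be saved alongside the cached offline om model. A short sketch of how a user would toggle it before building the predictor (the predictor setup itself is elided, and `setenv` is the POSIX call, which is what Android/Linux targets provide):

```cpp
#include <cstdlib>

int main() {
  // "true": save a simplified model that drops the weight/local/unused vars
  // from the subgraph ops (used together with the cached offline om model).
  // Unset or "false": keep them so the subgraph can still be built online.
  // GetBoolFromEnv(SUBGRAPH_DISABLE_ONLINE_MODE) in the pass above reads this.
  setenv("SUBGRAPH_DISABLE_ONLINE_MODE", "true", /*overwrite=*/1);
  // ... create CxxConfig, then call Predictor->Run() and Predictor->Save() ...
  return 0;
}
```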
...@@ -15,6 +15,7 @@
 #include "lite/kernels/npu/bridges/engine.h"
 #include <sys/time.h>
 #include <time.h>
+#include <algorithm>
 #include <utility>
 #include "lite/kernels/npu/bridges/registry.h"
...@@ -22,11 +23,50 @@ namespace paddle {
 namespace lite {
 namespace subgraph {
int Engine::BuildDeviceProgram() { return FAILED; } Engine::Engine(KernelContext *ctx,
int block_idx,
cpp::BlockDesc *block_desc,
const std::vector<std::string> &input_names,
const std::vector<std::string> &output_names,
lite::Scope *scope)
: ctx_(ctx), block_idx_(block_idx), block_desc_(block_desc), scope_(scope) {
input_names_ = input_names;
output_names_ = output_names;
// Sort the name of input and output tensors, it's convenient for us to get
// the info of input and output tensors in the same order from the device
// program, because the result of subgraph division may be different but right
// at each call of the subgraph pass.
std::stable_sort(input_names_.begin(), input_names_.end());
std::stable_sort(output_names_.begin(), output_names_.end());
}
int Engine::LaunchDeviceProgram() { return 0; } bool Engine::Run() {
if (is_first_epoch_) {
PrepareWorkspaceForDeviceProgram();
is_first_epoch_ = false;
}
if (InputShapeChanged()) {
BuildDeviceProgram();
}
return LaunchDeviceProgram();
}
int Engine::BuildOriginProgram() { bool Engine::PrepareWorkspaceForOriginProgram() {
origin_idims_.resize(input_names_.size());
origin_itensors_.resize(input_names_.size());
for (int i = 0; i < input_names_.size(); i++) {
origin_itensors_[i] = scope_->FindMutableTensor(input_names_[i]);
CHECK(origin_itensors_[i]);
}
origin_otensors_.resize(output_names_.size());
for (int i = 0; i < output_names_.size(); i++) {
origin_otensors_[i] = scope_->FindMutableTensor(output_names_[i]);
CHECK(origin_otensors_[i]);
}
return true;
}
bool Engine::BuildOriginProgram() {
// TODO(hong19860320) The block_desc need to be divided into subgraphs during // TODO(hong19860320) The block_desc need to be divided into subgraphs during
// the exection time. But only see them as a subgraph now. // the exection time. But only see them as a subgraph now.
origin_program_.clear(); origin_program_.clear();
...@@ -34,11 +74,14 @@ int Engine::BuildOriginProgram() { ...@@ -34,11 +74,14 @@ int Engine::BuildOriginProgram() {
auto op_desc = block_desc_->GetOp<cpp::OpDesc>(op_idx); auto op_desc = block_desc_->GetOp<cpp::OpDesc>(op_idx);
CHECK(op_desc); CHECK(op_desc);
std::string op_type = op_desc->Type(); std::string op_type = op_desc->Type();
// Create op and pick up the best kernel
auto op = LiteOpRegistry::Global().Create(op_desc->Type()); auto op = LiteOpRegistry::Global().Create(op_desc->Type());
CHECK(op) << "no Op found for " << op_type;
op->Attach(*op_desc, scope_); op->Attach(*op_desc, scope_);
std::unique_ptr<KernelBase> picked_kernel; std::unique_ptr<KernelBase> picked_kernel;
if (op_desc->HasAttr(kKernelTypeAttr)) { if (op_desc->HasAttr(kKernelTypeAttr)) {
// Create op and pick up kernel according to the kKernelTypeAttr attribute // Create op and pick up the best kernel according to the
// kKernelTypeAttr attribute
auto kernel_type = op_desc->GetAttr<std::string>(kKernelTypeAttr); auto kernel_type = op_desc->GetAttr<std::string>(kKernelTypeAttr);
std::string alias; std::string alias;
Place place; Place place;
...@@ -48,12 +91,14 @@ int Engine::BuildOriginProgram() { ...@@ -48,12 +91,14 @@ int Engine::BuildOriginProgram() {
auto kernels = op->CreateKernels({place}); auto kernels = op->CreateKernels({place});
CHECK_GT(kernels.size(), 0u) << "No kernels found for " << op_type; CHECK_GT(kernels.size(), 0u) << "No kernels found for " << op_type;
auto it = std::find_if( auto it = std::find_if(
kernels.begin(), kernels.end(), [&](std::unique_ptr<KernelBase>& it) { kernels.begin(), kernels.end(), [&](std::unique_ptr<KernelBase> &it) {
return it->alias() == alias; return it->alias() == alias;
}); });
CHECK(it != kernels.end()); CHECK(it != kernels.end());
picked_kernel = std::move(*it); picked_kernel = std::move(*it);
} else { } else {
// TODO(hong19860320) add kernel picking according to the type of input
// and output tensors
VLOG(3) << "The attr '" << kKernelTypeAttr VLOG(3) << "The attr '" << kKernelTypeAttr
<< "' not found, pick the first kernel for " << op_type; << "' not found, pick the first kernel for " << op_type;
std::vector<std::unique_ptr<KernelBase>> kernels; std::vector<std::unique_ptr<KernelBase>> kernels;
...@@ -74,52 +119,41 @@ int Engine::BuildOriginProgram() { ...@@ -74,52 +119,41 @@ int Engine::BuildOriginProgram() {
} }
origin_program_.emplace_back(std::move(op), std::move(picked_kernel)); origin_program_.emplace_back(std::move(op), std::move(picked_kernel));
} }
return 0; CHECK(!origin_program_.empty()) << "no instructions";
return true;
} }
int Engine::LaunchOriginProgram() { bool Engine::LaunchOriginProgram() {
for (auto& inst : origin_program_) { if (origin_program_.empty()) {
auto op_type = inst.op()->op_info()->Type(); BuildOriginProgram();
if (op_type == "feed" || op_type == "fetch") continue; }
inst.Run(); if (!origin_program_.empty()) {
for (auto &inst : origin_program_) {
auto op_type = inst.op()->op_info()->Type();
if (op_type == "feed" || op_type == "fetch") continue;
inst.Run();
}
return true;
} }
return 0; return false;
} }
int Engine::Build() { bool Engine::PrepareWorkspaceForDeviceProgram() {
// In order to attach all of the ops of the block desc, we need to build the return PrepareWorkspaceForOriginProgram();
// original program firstly.
BuildOriginProgram();
// Run InferShape() of all of ops, and convert Paddle ops to NPU/XPU IR graph
build_device_program_status_ = BuildDeviceProgram();
return build_device_program_status_;
} }
void Engine::InitDeviceTensor() { return; } bool Engine::BuildDeviceProgram() { return BuildOriginProgram(); }
bool Engine::LaunchDeviceProgram() { return LaunchOriginProgram(); }
bool Engine::InputShapeChanged() { bool Engine::InputShapeChanged() {
bool changed = false;
for (size_t i = 0; i < origin_itensors_.size(); i++) { for (size_t i = 0; i < origin_itensors_.size(); i++) {
if (origin_itensors_[i]->dims() != origin_idims_[i]) { auto origin_idim = origin_itensors_[i]->dims().Vectorize();
return true; changed |= origin_idim != origin_idims_[i];
} origin_idims_[i] = origin_idim;
}
return false;
}
int Engine::Launch() {
// Rebuild device program when the shapes of input tensors have been changed.
if (CHECK_SUCCESS(build_device_program_status_) &&
CHECK_REBUILD_WHEN_SHAPE_CHANGED(build_device_program_status_) &&
InputShapeChanged()) {
Build();
InitDeviceTensor();
}
if (CHECK_FAILED(build_device_program_status_)) {
LaunchOriginProgram();
} else {
LaunchDeviceProgram();
} }
return 0; return changed;
} }
} // namespace subgraph } // namespace subgraph
......
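A minimal sketch of what a backend-specific engine looks like after this refactoring: every hook now returns bool, and the base class drives the first-epoch preparation and shape-change rebuild from Engine::Run(). This is illustrative only; the real NPU implementation lives in subgraph_compute.cc below, and the fallbacks used here simply reuse the base-class defaults:

```cpp
#include "lite/kernels/npu/bridges/engine.h"

class DummyEngine : public paddle::lite::subgraph::Engine {
 public:
  // Reuse the new 6-argument constructor of the base class.
  using paddle::lite::subgraph::Engine::Engine;

 protected:
  bool PrepareWorkspaceForDeviceProgram() override {
    // Bind origin_itensors_/origin_otensors_ from the scope.
    return PrepareWorkspaceForOriginProgram();
  }
  bool BuildDeviceProgram() override {
    // No device program for this dummy backend, just attach the origin ops.
    return BuildOriginProgram();
  }
  bool LaunchDeviceProgram() override {
    // Roll back to running the origin (CPU) instructions.
    return LaunchOriginProgram();
  }
};
```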
...@@ -33,49 +33,36 @@ class Engine { ...@@ -33,49 +33,36 @@ class Engine {
cpp::BlockDesc *block_desc, cpp::BlockDesc *block_desc,
const std::vector<std::string> &input_names, const std::vector<std::string> &input_names,
const std::vector<std::string> &output_names, const std::vector<std::string> &output_names,
lite::Scope *scope, lite::Scope *scope);
std::string model_cache_dir = "")
: ctx_(ctx),
block_idx_(block_idx),
block_desc_(block_desc),
input_names_(input_names),
output_names_(output_names),
scope_(scope),
model_cache_dir_(model_cache_dir) {}
virtual ~Engine() = default; virtual ~Engine() = default;
virtual int Build(); virtual bool Run();
virtual int Launch();
private: private:
Engine(const Engine &) = delete; Engine(const Engine &) = delete;
protected: protected:
virtual int BuildDeviceProgram(); virtual bool PrepareWorkspaceForOriginProgram();
virtual int LaunchDeviceProgram(); virtual bool BuildOriginProgram();
virtual bool LaunchOriginProgram();
virtual int BuildOriginProgram(); virtual bool PrepareWorkspaceForDeviceProgram();
virtual int LaunchOriginProgram(); virtual bool BuildDeviceProgram();
virtual bool LaunchDeviceProgram();
virtual void InitDeviceTensor();
virtual bool InputShapeChanged(); virtual bool InputShapeChanged();
KernelContext *ctx_{nullptr}; KernelContext *ctx_{nullptr};
int block_idx_; int block_idx_{-1};
cpp::BlockDesc *block_desc_; cpp::BlockDesc *block_desc_{nullptr};
std::vector<std::string> input_names_; std::vector<std::string> input_names_;
std::vector<std::string> output_names_; std::vector<std::string> output_names_;
Scope *scope_{nullptr}; Scope *scope_{nullptr};
// SUCCESS: device program build successed. FAILED: device program build bool is_first_epoch_{true};
// failed. REBUILD_WHEN_SHAPE_CHANGED: device program build successed but need std::vector<std::vector<int64_t>> origin_idims_;
// to rebuild when input shape changed.
int build_device_program_status_{0};
std::vector<DDim> origin_idims_;
std::vector<DDim> origin_odims_;
std::vector<Tensor *> origin_itensors_; std::vector<Tensor *> origin_itensors_;
std::vector<Tensor *> origin_otensors_; std::vector<Tensor *> origin_otensors_;
std::vector<Instruction> origin_program_; std::vector<Instruction> origin_program_;
std::string model_cache_dir_{""};
}; };
} // namespace subgraph } // namespace subgraph
......
...@@ -19,7 +19,7 @@
 #include <string>
 #include <utility>
 #include <vector>
-#include "graph/op/all_ops.h"
+#include "graph/compatible/all_ops.h"
 #include "lite/core/op_lite.h"
 #include "lite/core/tensor.h"
......
...@@ -94,10 +94,10 @@ int MatMulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   } else {
     matmul_node = graph->Add<ge::op::BatchMatMul>(out_name);
     auto matmul_op = matmul_node->data<ge::op::BatchMatMul>();
-    matmul_op->set_input_x(*x_node->data());
-    matmul_op->set_input_y(*y_node->data());
-    matmul_op->set_attr_adj_x(transpose_x);
-    matmul_op->set_attr_adj_y(transpose_y);
+    matmul_op->set_input_x1(*x_node->data());
+    matmul_op->set_input_x2(*y_node->data());
+    matmul_op->set_attr_adj_x1(transpose_x);
+    matmul_op->set_attr_adj_x2(transpose_y);
   }
   if (fabs(alpha - 1.f) > 1e-6f) {
......
...@@ -20,11 +20,11 @@
 #include <string>
 #include <vector>
 #include "graph/buffer.h"
+#include "graph/compatible/operator_reg.h"
 #include "graph/graph.h"
 #include "graph/model.h"
 #include "graph/op/all_ops.h"
 #include "graph/operator.h"
-#include "graph/operator_reg.h"
 #include "lite/core/op_lite.h"
 #include "lite/utils/macros.h"
...@@ -97,25 +97,26 @@ REG_OP(Pad)
 /*
  * Multiplies slices of two tensors in batches.
  * <Input>
- *    x : The input tensor
- *    y : The input tensor
+ *    x1 : The input tensor
+ *    x2 : The input tensor
  * <Output>
- *    z : The output tensor
+ *    y : The output tensor
  * <Attr>
- *    adj_x : adj_x is true, the input tensor x is transposed, otherwise
- *            it will not be transposed. Default is false (The current version only
- *            supports false).
- *    adj_y : adj_y is true, the input tensor y is transposed, otherwise
- *            it will not be transposed. Default is false.
+ *    adj_x1 : adj_x1 is true, the input tensor x1 is transposed,
+ *             otherwise it will not be transposed.
+ *             Default is false (The current version only supports false).
+ *    adj_x2 : adj_x2 is true, the input tensor x2 is transposed,
+ *             otherwise it will not be transposed.
+ *             Default is false.
  * <Added in HiAI version>
  *    100.320.010.010
  */
 REG_OP(BatchMatMul)
-    .INPUT(x, TensorType({DT_FLOAT}))
-    .INPUT(y, TensorType({DT_FLOAT}))
-    .OUTPUT(z, TensorType({DT_FLOAT}))
-    .ATTR(adj_x, AttrValue::BOOL{false})
-    .ATTR(adj_y, AttrValue::BOOL{false})
+    .INPUT(x1, TensorType({DT_FLOAT}))
+    .INPUT(x2, TensorType({DT_FLOAT}))
+    .OUTPUT(y, TensorType({DT_FLOAT}))
+    .ATTR(adj_x1, AttrValue::BOOL{false})
+    .ATTR(adj_x2, AttrValue::BOOL{false})
     .OP_END()
 }  // namespace ge
......
...@@ -16,6 +16,7 @@
 #include <sys/time.h>
 #include <time.h>
 #include <algorithm>
+#include <functional>
 #include <utility>
 #include "hiai_ir_build.h"  // NOLINT
 #include "lite/backends/npu/device.h"
...@@ -24,205 +25,275 @@
 #include "lite/kernels/npu/bridges/paddle_use_bridges.h"
 #include "lite/kernels/npu/bridges/utility.h"
 #include "lite/utils/io.h"
+#include "lite/utils/md5.h"
 
 namespace paddle {
 namespace lite {
 namespace kernels {
 namespace npu {
std::string SubgraphEngine::GenerateModelCacheName() const { // Generate the model name by using md5 hashes based on:
auto inames = device_inames_; // 1. the sorted variable input names
auto onames = device_onames_; // 2. the shapes of the origin input tensors
std::stable_sort(inames.begin(), inames.end()); // 3. the sorted variable output names
std::string DeviceProgram::GenerateModelName(
std::string model_cache_name = "subgraph_" + std::to_string(block_idx_); const std::vector<std::string>& input_names,
for (auto iname : inames) { const std::vector<std::string>& output_names,
model_cache_name += "_"; const std::vector<std::vector<int64_t>>& origin_idims) {
auto itensor = scope_->FindTensor(iname); std::ostringstream os;
int tmp = 0; CHECK_EQ(input_names.size(), origin_idims.size());
for (auto i : itensor->dims().Vectorize()) { for (int i = 0; i < input_names.size(); i++) {
tmp += i * i; os << input_names[i];
for (auto dim : origin_idims[i]) {
os << dim;
} }
model_cache_name += std::to_string(tmp % 1999);
} }
model_cache_name += "_.om"; for (auto output_name : output_names) {
os << output_name;
}
return MD5(os.str());
}
return model_cache_name; // Deserialize the generated model, the precisions and dimensions of the origin
// output tensors of the subgraph op into files
bool DeviceProgram::LoadFromCacheFile(
const std::vector<std::string>& input_names,
const std::vector<std::string>& output_names,
const std::vector<std::vector<int64_t>>& origin_idims,
const std::string& model_cache_dir) {
// Generate the model name if not initialized
if (model_name_.empty()) {
model_name_ = GenerateModelName(input_names, output_names, origin_idims);
}
// Load from the cached model file, return a HiAI model manager client for
// inference
auto model_path = model_cache_dir + "/" + model_name_ + ".om";
VLOG(3) << "[NPU] Load model from " << model_path;
std::vector<char> model_buffer;
if (!ReadFile(model_path, &model_buffer)) {
LOG(WARNING) << "[NPU] read from " << model_path << " failed!";
return false;
}
bool model_comp = false;
model_client_ =
lite::npu::Device::Global().Load(model_name_, &model_buffer, &model_comp);
if (!model_client_) {
LOG(WARNING) << "[NPU] Load model failed!";
return false;
}
// Rewrite with the compatible model data if the cached
// model file is incompatible with the current device
if (!model_comp) {
VLOG(3) << "[NPU] Export the compatible model to " << model_path;
if (!WriteFile(model_path, model_buffer)) {
LOG(WARNING) << "[NPU] Open " << model_path << " for writting failed!";
}
}
// Deserialize the precisions and shapes of the origin output tensors from the
// cached configuration file
auto config_path = model_cache_dir + "/" + model_name_ + ".cfg";
VLOG(3) << "[NPU] Load configuration from " << config_path;
std::vector<char> config_buffer;
if (!ReadFile(config_path, &config_buffer)) {
LOG(WARNING) << "[NPU] read from " << config_path << " failed!";
return false;
}
std::string config_str(config_buffer.begin(), config_buffer.end());
// Parse the precision and shapes of the output tensors
auto output_options = Split<std::string>(config_str, ";");
CHECK_EQ(output_options.size(), output_names.size());
origin_otypes_.resize(output_names.size());
origin_odims_.resize(output_names.size());
for (int i = 0; i < output_names.size(); i++) {
auto items = Split<std::string>(output_options[i], ":");
CHECK_EQ(items.size(), 2); // precision and shapes
origin_otypes_[i] = static_cast<PrecisionType>(std::stoi(items[0]));
origin_odims_[i] = Split<int64_t>(items[1], ",");
}
return true;
} }
int SubgraphEngine::BuildDeviceProgram() { bool DeviceProgram::BuildGraphAndCacheToFile(
const std::vector<Instruction>& origin_program,
const std::vector<std::string>& input_names,
const std::vector<std::string>& output_names,
const std::vector<std::vector<int64_t>>& origin_idims,
const std::vector<Tensor*>& origin_otensors,
const std::string& model_cache_dir) {
// Generate the model name if not initialized
if (model_name_.empty()) {
model_name_ = GenerateModelName(input_names, output_names, origin_idims);
}
// Convert all of ops and their input vars and weights to HiAI IR nodes,
// then added them into the HiAI IR graph
int status = 0; int status = 0;
// Convert all of ops and their input vars and weights and added into the NPU CHECK(!origin_program.empty()) << "no instructions";
// HiAI IR graph
subgraph::npu::Graph graph; subgraph::npu::Graph graph;
const auto& bridges = subgraph::Registry::Instance(); const auto& bridges = subgraph::Registry::Instance();
for (auto& inst : origin_program_) { for (auto& inst : origin_program) {
auto op = const_cast<OpLite*>(inst.op()); auto op = const_cast<OpLite*>(inst.op());
CHECK(op); CHECK(op);
op->CheckShape(); op->CheckShape();
op->InferShape(); op->InferShape();
std::string op_type = op->op_info()->Type(); std::string op_type = op->op_info()->Type();
if (!bridges.Exists(op_type, TARGET(kNPU))) { if (!bridges.Exists(op_type, TARGET(kNPU))) {
return subgraph::FAILED; return false;
} }
auto kernel = inst.kernel(); auto kernel = inst.kernel();
status |= bridges.Select(op_type, TARGET(kNPU))( status |= bridges.Select(op_type, TARGET(kNPU))(
reinterpret_cast<void*>(&graph), op, const_cast<KernelBase*>(kernel)); reinterpret_cast<void*>(&graph), op, const_cast<KernelBase*>(kernel));
if (subgraph::CHECK_FAILED(status)) { if (subgraph::CHECK_FAILED(status)) {
return subgraph::FAILED; return false;
} }
} }
// Collect the valid input and output nodes in the HiAI IR graph and update // Collect the input and output nodes of the HiAI IR graph
// the input and output names
device_inames_.clear();
device_onames_.clear();
std::vector<ge::Operator> device_inodes; std::vector<ge::Operator> device_inodes;
for (size_t i = 0; i < input_names.size(); i++) {
CHECK(graph.Has(input_names[i]) && graph.Get(input_names[i])->is_data());
device_inodes.push_back(*graph.Get(input_names[i])->data());
}
std::vector<ge::Operator> device_onodes; std::vector<ge::Operator> device_onodes;
for (auto& input_name : input_names_) { for (size_t i = 0; i < output_names.size(); i++) {
if (graph.Has(input_name)) { CHECK(graph.Has(output_names[i]));
if (graph.Get(input_name)->is_data()) { device_onodes.push_back(*graph.Get(output_names[i])->data());
device_inodes.push_back(*graph.Get(input_name)->data());
device_inames_.push_back(input_name);
} else {
LOG(WARNING) << "[NPU] Input node " << input_name
<< " is ignored because it is not a data node.";
}
} else {
LOG(WARNING) << "[NPU] Input node " << input_name
<< " is ignored because it does not exist.";
}
} }
for (auto& output_name : output_names_) { // Build the HiAI IR graph to the HiAI om model
if (graph.Has(output_name)) { std::vector<char> model_buffer;
device_onodes.push_back(*graph.Get(output_name)->data()); if (!lite::npu::Device::Global().Build(
device_onames_.push_back(output_name); device_inodes, device_onodes, &model_buffer)) {
} else { LOG(WARNING) << "[NPU] Build model failed!";
LOG(WARNING) << "[NPU] Output node " << output_name return false;
<< " is ignored because it does not exist.";
}
} }
CHECK(!device_inames_.empty()) // Load the HiAI om model and create a HiAI model manager client(from HiAI
<< "[NPU] No input nodes found for building NPU model"; // Service) to run inference.
CHECK(!device_onames_.empty()) bool model_comp = true;
<< "[NPU] No output nodes found for building NPU model"; model_client_ =
lite::npu::Device::Global().Load(model_name_, &model_buffer, &model_comp);
// Build the HiAI IR graph to HiAI om model as the device program if (!model_client_) {
if (device_program_map_.count(inputs_shape_) > 0) { LOG(WARNING) << "[NPU] Load model failed!";
return status; return false;
} }
std::string model_cache_full_dir = // Update the precison and dimensions of the origin output tensors
model_cache_dir_.empty() ? "" : model_cache_dir_ + "/" + CHECK_EQ(origin_otensors.size(), output_names.size());
GenerateModelCacheName(); origin_otypes_.resize(output_names.size());
auto device_client = lite::npu::Device::Global().Build( origin_odims_.resize(output_names.size());
model_name_, device_inodes, device_onodes, model_cache_full_dir); for (size_t i = 0; i < output_names.size(); i++) {
if (device_client == nullptr) { origin_otypes_[i] = graph.Get(output_names[i])->precision();
LOG(WARNING) << "[NPU] Build model failed!"; origin_odims_[i] = origin_otensors[i]->dims().Vectorize();
return subgraph::FAILED;
} }
auto device_program = std::make_shared<device_program_t>(device_client); if (!model_cache_dir.empty()) {
if (!inputs_shape_.empty()) { // Save the generated model to file, used for the model caching or the
device_program_map_[inputs_shape_] = device_program; // offline model generation
auto model_path = model_cache_dir + "/" + model_name_ + ".om";
VLOG(3) << "[NPU] Save model to " << model_path;
if (!WriteFile(model_path, model_buffer)) {
LOG(WARNING) << "[NPU] Open " << model_path << " for writting failed!";
}
// Serialize the precisions and shapes of the origin output tensors into the
// configuration file
std::ostringstream os;
for (int i = 0; i < output_names.size(); i++) {
os << static_cast<int32_t>(origin_otypes_[i]) << ":";
for (auto dim : origin_odims_[i]) {
os << dim << ",";
}
os << ";";
}
auto str = os.str();
std::vector<char> config_buffer(str.begin(), str.end());
auto config_path = model_cache_dir + "/" + model_name_ + ".cfg";
VLOG(3) << "[NPU] Save configuration to " << config_path;
if (!WriteFile(config_path, config_buffer)) {
LOG(WARNING) << "[NPU] Open " << config_path << " for writting failed!";
}
} }
return true;
}
// Query and check the dimensions of valid input and output tensors bool DeviceProgram::ShareBufferWithOriginTensors(
std::vector<hiai::TensorDimension> device_idims, device_odims; const std::vector<std::string>& input_names,
if (device_program->client->GetModelIOTensorDim( const std::vector<std::string>& output_names,
model_name_, device_idims, device_odims) != hiai::AI_SUCCESS) { std::vector<Tensor*>* origin_itensors,
LOG(WARNING) std::vector<Tensor*>* origin_otensors,
<< "[NPU] Get the dimensions of input and output tensors failed!"; std::vector<std::shared_ptr<hiai::AiTensor>>* device_itensors,
return subgraph::FAILED; std::vector<std::shared_ptr<hiai::AiTensor>>* device_otensors) {
CHECK(!model_name_.empty() && model_client_);
// Query the dimensions of the device input and output tensors if not
// initialized
if (device_idims_.empty() || device_odims_.empty()) {
if (model_client_->GetModelIOTensorDim(
model_name_, device_idims_, device_odims_) != hiai::AI_SUCCESS) {
LOG(WARNING)
<< "[NPU] Get the dimensions of input and output tensors failed!";
return false;
}
} }
device_program->device_idims = device_idims; // Check the dimensions of the device tensors and the origin tensors
device_program->device_odims = device_odims; CHECK_EQ(device_itensors->size(), input_names.size());
CHECK_EQ(device_otensors->size(), output_names.size());
CHECK_EQ(origin_otypes_.size(), output_names.size());
CHECK_EQ(origin_odims_.size(), output_names.size());
CHECK_EQ(device_idims_.size(), input_names.size());
CHECK_EQ(device_odims_.size(), output_names.size());
for (int i = 0; i < input_names.size(); i++) {
VLOG(3) << "[NPU] Inputs[" << i << "] name: " << input_names[i]
<< " origin dims:" << (*origin_itensors)[i]->dims().repr()
<< " device dims: {" << device_idims_[i].GetNumber() << ","
<< device_idims_[i].GetChannel() << ","
<< device_idims_[i].GetHeight() << ","
<< device_idims_[i].GetWidth() << "}";
CHECK_EQ((*origin_itensors)[i]->dims().production(),
device_idims_[i].GetNumber() * device_idims_[i].GetChannel() *
device_idims_[i].GetHeight() * device_idims_[i].GetWidth());
VLOG(3) << "[NPU] Init the input tensors for the device program and share "
"their buffers with the origin input tensors";
// reinit device tensor will free shared buffer, so copy data to a tmp
// tensor
Tensor tmp;
tmp.CopyDataFrom(*(*origin_itensors)[i]);
(*device_itensors)[i]->Init(&(device_idims_[i]));
CHECK_EQ(device_idims.size(), device_inames_.size()); std::memcpy(
CHECK_EQ(device_odims.size(), device_onames_.size()); (*device_itensors)[i]->GetBuffer(), tmp.raw_data(), tmp.memory_size());
origin_idims_.resize(device_inames_.size());
origin_itensors_.resize(device_inames_.size());
device_itensors_.resize(device_inames_.size());
origin_odims_.resize(device_onames_.size());
origin_otensors_.resize(device_onames_.size());
device_otensors_.resize(device_onames_.size());
for (int i = 0; i < device_inames_.size(); i++) { // Share data buf between device_itensor and origin_itensor
auto node = graph.Get(device_inames_[i]); std::shared_ptr<Buffer> buffer =
auto precision = node->precision(); std::make_shared<Buffer>((*device_itensors)[i]->GetBuffer(),
auto layout = node->layout(); lite_api::TargetType::kHost,
origin_itensors_[i] = scope_->FindMutableTensor(device_inames_[i]); (*device_itensors)[i]->GetSize());
CHECK(origin_itensors_[i]); (*origin_itensors)[i]->ResetBuffer(buffer,
origin_idims_[i] = origin_itensors_[i]->dims(); (*device_itensors)[i]->GetSize());
VLOG(3) << "[NPU] Inputs[" << i << "] name: " << device_inames_[i]
<< " precision: " << PrecisionToStr(precision)
<< " layout: " << DataLayoutToStr(layout) << " dims: {"
<< device_idims[i].GetNumber() << ","
<< device_idims[i].GetChannel() << ","
<< device_idims[i].GetHeight() << "," << device_idims[i].GetWidth()
<< "}";
// Prepare the device input tensors
CHECK_EQ(origin_idims_[i].production(),
device_idims[i].GetNumber() * device_idims[i].GetChannel() *
device_idims[i].GetHeight() * device_idims[i].GetWidth());
device_itensors_[i].reset(new hiai::AiTensor);
device_itensors_[i]->Init(&(device_idims[i]));
} }
device_program->origin_idims = origin_idims_; for (int i = 0; i < output_names.size(); i++) {
(*origin_otensors)[i]->set_precision(origin_otypes_[i]);
for (int i = 0; i < device_onames_.size(); i++) { (*origin_otensors)[i]->Resize(origin_odims_[i]);
auto node = graph.Get(device_onames_[i]); VLOG(3) << "[NPU] Outputs[" << i << "] name: " << output_names[i]
auto precision = node->precision(); << " origin dims:" << (*origin_otensors)[i]->dims().repr()
auto layout = node->layout(); << " device dims: {" << device_odims_[i].GetNumber() << ","
origin_otensors_[i] = scope_->FindMutableTensor(device_onames_[i]); << device_odims_[i].GetChannel() << ","
CHECK(origin_otensors_[i]); << device_odims_[i].GetHeight() << ","
origin_odims_[i] = origin_otensors_[i]->dims(); << device_odims_[i].GetWidth() << "}";
VLOG(3) << "[NPU] Outputs[" << i << "] name: " << device_onames_[i] CHECK_EQ((*origin_otensors)[i]->dims().production(),
<< " precision: " << PrecisionToStr(precision) device_odims_[i].GetNumber() * device_odims_[i].GetChannel() *
<< " layout: " << DataLayoutToStr(layout) << " dims: {" device_odims_[i].GetHeight() * device_odims_[i].GetWidth());
<< device_odims[i].GetNumber() << "," (*device_otensors)[i]->Init(&(device_odims_[i]));
<< device_odims[i].GetChannel() << "," VLOG(3) << "[NPU] Init the output tensors for the device program and share "
<< device_odims[i].GetHeight() << "," << device_odims[i].GetWidth() "their buffers with the origin output tensors";
<< "}"; // Share data buf between device_itensor and origin_itensor
// Prepare the device output tensors std::shared_ptr<Buffer> buffer =
switch (precision) { std::make_shared<Buffer>((*device_otensors)[i]->GetBuffer(),
case PRECISION(kFloat): lite_api::TargetType::kHost,
origin_otensors_[i]->mutable_data<float>(); (*device_otensors)[i]->GetSize());
break; (*origin_otensors)[i]->ResetBuffer(buffer,
case PRECISION(kBool): (*device_otensors)[i]->GetSize());
origin_otensors_[i]->mutable_data<bool>();
break;
case PRECISION(kInt8):
origin_otensors_[i]->mutable_data<int8_t>();
break;
case PRECISION(kInt16):
origin_otensors_[i]->mutable_data<int16_t>();
break;
case PRECISION(kInt32):
origin_otensors_[i]->mutable_data<int32_t>();
break;
case PRECISION(kInt64):
origin_otensors_[i]->mutable_data<int64_t>();
break;
default:
LOG(FATAL) << "[NPU] " << device_onames_[i]
<< " can't mutable data with precision type "
<< PrecisionToStr(precision);
break;
}
device_program->origin_odims = origin_odims_;
CHECK_EQ(origin_odims_[i].production(),
device_odims[i].GetNumber() * device_odims[i].GetChannel() *
device_odims[i].GetHeight() * device_odims[i].GetWidth());
device_otensors_[i].reset(new hiai::AiTensor);
device_otensors_[i]->Init(&(device_odims[i]));
} }
return status; return true;
} }
int SubgraphEngine::LaunchDeviceProgram() { bool DeviceProgram::ZeroCopyRun(
// Copy the data of origin input tensors to the buffer of input HiAI tensors std::vector<std::shared_ptr<hiai::AiTensor>>* device_itensors,
// init device_itensors_, device_otensors_, origin_otensors_ std::vector<std::shared_ptr<hiai::AiTensor>>* device_otensors) {
auto device_program = device_program_map_[inputs_shape_]; CHECK(!model_name_.empty() && model_client_);
// Run the HiAI model by name // Run the HiAI model by name
std::string key = "model_name"; // Note: key seems must be model_name std::string key = "model_name"; // Note: key seems must be model_name
hiai::AiContext model_context; hiai::AiContext model_context;
...@@ -234,70 +305,87 @@ int SubgraphEngine::LaunchDeviceProgram() { ...@@ -234,70 +305,87 @@ int SubgraphEngine::LaunchDeviceProgram() {
}; };
int istamp; int istamp;
auto start_time = GetCurrentUS(); auto start_time = GetCurrentUS();
CHECK_EQ(device_program->client->Process( CHECK_EQ(model_client_->Process(
model_context, device_itensors_, device_otensors_, 1000, istamp), model_context, *device_itensors, *device_otensors, 1000, istamp),
hiai::AI_SUCCESS); hiai::AI_SUCCESS);
VLOG(3) << "[NPU] Process cost " << GetCurrentUS() - start_time << " us"; VLOG(3) << "[NPU] Process cost " << GetCurrentUS() - start_time << " us";
return true;
return 0;
} }
int SubgraphEngine::Build() { bool SubgraphEngine::PrepareWorkspaceForDeviceProgram() {
if (device_program_map_.count(inputs_shape_) > 0) { // Obtain the origin input tensors, and create the origin output
return subgraph::SUCCESS; // tensors(Don't try to access them before launch the device program or the
// origin program)
PrepareWorkspaceForOriginProgram();
// Create the device input and output tensors, but don't initialize them
// with the dimensions
device_itensors_.resize(input_names_.size());
for (int i = 0; i < input_names_.size(); i++) {
device_itensors_[i].reset(new hiai::AiTensor);
CHECK(device_itensors_[i]);
}
device_otensors_.resize(output_names_.size());
for (int i = 0; i < output_names_.size(); i++) {
device_otensors_[i].reset(new hiai::AiTensor);
CHECK(device_otensors_[i]);
} }
// In order to attach all of the ops of the block desc, we need to build the return true;
// original program firstly.
BuildOriginProgram();
// Run InferShape() of all of ops, and convert Paddle ops to NPU/XPU IR graph
build_device_program_status_ = BuildDeviceProgram();
return build_device_program_status_;
} }
void SubgraphEngine::InitDeviceTensor() { bool SubgraphEngine::BuildDeviceProgram() {
auto device_program = device_program_map_[inputs_shape_]; // Check if the cache device program exists
for (size_t i = 0; i < device_itensors_.size(); i++) { if (!device_programs_.count(origin_idims_)) {
if (device_itensors_[i]->GetBuffer() != origin_itensors_[i]->raw_data()) { auto device_program = std::make_shared<DeviceProgram>();
VLOG(3) << "init device_itensors and share input tensor buf between " // Obtain the model cache dir from the NPU Context of the subgraph op
"device and host"; auto model_cache_dir = ctx_->As<NPUContext>().SubgraphModelCacheDir();
device_itensors_[i]->Init(&(device_program->device_idims[i])); VLOG(3) << "[NPU] Getting subgraph model_cache_dir is: " << model_cache_dir;
std::memcpy(device_itensors_[i]->GetBuffer(), // Check and load if the cached model and configuration file exists
origin_itensors_[i]->raw_data(), if (model_cache_dir.empty() ||
origin_itensors_[i]->memory_size()); !device_program->LoadFromCacheFile(
// share data buf between device_itensor and origin_itensor input_names_, output_names_, origin_idims_, model_cache_dir)) {
std::shared_ptr<Buffer> buffer = // Build the model online, including converting the paddle ops to the HiAI
std::make_shared<Buffer>(device_itensors_[i]->GetBuffer(), // IR nodes, building the HiAI IR graph to the om model, then load it as a
lite_api::TargetType::kHost, // new HiAI model manager client for inference.
device_itensors_[i]->GetSize()); if (origin_program_.empty()) {
origin_itensors_[i]->ResetBuffer(buffer, device_itensors_[i]->GetSize()); BuildOriginProgram();
}
CHECK(!origin_program_.empty()) << "no instructions";
if (!device_program->BuildGraphAndCacheToFile(origin_program_,
input_names_,
output_names_,
origin_idims_,
origin_otensors_,
model_cache_dir)) {
return false;
}
} }
} if (device_program->model_client_ == nullptr) {
for (size_t i = 0; i < device_otensors_.size(); i++) { return false;
if (device_otensors_[i]->GetBuffer() != origin_otensors_[i]->raw_data()) {
VLOG(3) << "init device_otensors and share output tensor buf between "
"device and host";
device_otensors_[i]->Init(&(device_program->device_odims[i]));
// share data buf between device_itensor and origin_itensor
origin_otensors_[i]->Resize(device_program->origin_odims[i]);
std::shared_ptr<Buffer> buffer =
std::make_shared<Buffer>(device_otensors_[i]->GetBuffer(),
lite_api::TargetType::kHost,
device_otensors_[i]->GetSize());
origin_otensors_[i]->ResetBuffer(buffer, device_otensors_[i]->GetSize());
} }
device_programs_[origin_idims_] = device_program;
} }
auto device_program = device_programs_[origin_idims_];
CHECK(device_program && device_program->model_client_);
return device_program->ShareBufferWithOriginTensors(input_names_,
output_names_,
&origin_itensors_,
&origin_otensors_,
&device_itensors_,
&device_otensors_);
} }
bool SubgraphEngine::InputShapeChanged() { bool SubgraphEngine::LaunchDeviceProgram() {
std::vector<std::vector<int64_t>> new_shape; // Roll back to launch the origin program if the device program can't be
for (auto origin_itensor : origin_itensors_) { // found or the model client isn't initialized.
new_shape.push_back(origin_itensor->dims().Vectorize()); if (device_programs_.count(origin_idims_) == 0 ||
device_programs_[origin_idims_]->model_client_ == nullptr) {
return LaunchOriginProgram();
} }
if (inputs_shape_ == new_shape) { auto device_program = device_programs_[origin_idims_];
return false; if (!device_program->model_client_) {
return LaunchOriginProgram();
} }
inputs_shape_ = new_shape; return device_program->ZeroCopyRun(&device_itensors_, &device_otensors_);
return true;
} }
void SubgraphCompute::PrepareForRun() { void SubgraphCompute::PrepareForRun() {
...@@ -307,15 +395,13 @@ void SubgraphCompute::PrepareForRun() { ...@@ -307,15 +395,13 @@ void SubgraphCompute::PrepareForRun() {
param.sub_block_desc, param.sub_block_desc,
param.input_data_names, param.input_data_names,
param.output_data_names, param.output_data_names,
param.scope, param.scope));
NPUContext::SubgraphModelCacheDir()));
CHECK(engine_); CHECK(engine_);
engine_->Build();
} }
void SubgraphCompute::Run() { void SubgraphCompute::Run() {
CHECK(engine_); CHECK(engine_);
engine_->Launch(); engine_->Run();
} }
} // namespace npu } // namespace npu
......
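Alongside the cached `.om` model, BuildGraphAndCacheToFile() writes a small `.cfg` file with one `precision:dim,dim,...;` record per output tensor, and LoadFromCacheFile() parses it back with the templated Split() helpers. A standalone illustration of the format using only the STL (precision value 1 is assumed to stand for kFloat here, purely for illustration):

```cpp
#include <cstdint>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

int main() {
  // Two outputs: precision 1 with shape {1,1000} and precision 1 with {1,4,1,1}.
  const std::string cfg = "1:1,1000,;1:1,4,1,1,;";
  std::stringstream records(cfg);
  std::string record;
  while (std::getline(records, record, ';')) {
    const auto colon = record.find(':');
    const int precision = std::stoi(record.substr(0, colon));
    std::vector<int64_t> dims;
    std::stringstream dim_stream(record.substr(colon + 1));
    std::string dim;
    while (std::getline(dim_stream, dim, ',')) {
      if (!dim.empty()) dims.push_back(std::stoll(dim));
    }
    std::cout << "precision=" << precision << " rank=" << dims.size() << "\n";
  }
  return 0;
}
```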
...@@ -28,52 +28,65 @@ namespace lite { ...@@ -28,52 +28,65 @@ namespace lite {
namespace kernels { namespace kernels {
namespace npu { namespace npu {
class SubgraphEngine : public subgraph::Engine { class DeviceProgram {
public: public:
SubgraphEngine(KernelContext *ctx, DeviceProgram() {}
int block_idx, ~DeviceProgram() {}
cpp::BlockDesc *block_desc, std::string GenerateModelName(
const std::vector<std::string> &input_names, const std::vector<std::string>& input_names,
const std::vector<std::string> &output_names, const std::vector<std::string>& output_names,
Scope *scope, const std::vector<std::vector<int64_t>>& origin_idims);
std::string model_cache_dir = "") bool LoadFromCacheFile(const std::vector<std::string>& input_names,
: subgraph::Engine(ctx, const std::vector<std::string>& output_names,
block_idx, const std::vector<std::vector<int64_t>>& origin_idims,
block_desc, const std::string& model_cache_dir);
input_names, bool BuildGraphAndCacheToFile(
output_names, const std::vector<Instruction>& origin_program,
scope, const std::vector<std::string>& input_names,
model_cache_dir) {} const std::vector<std::string>& output_names,
const std::vector<std::vector<int64_t>>& origin_idims,
const std::vector<Tensor*>& origin_otensors,
const std::string& model_cache_dir);
bool ShareBufferWithOriginTensors(
const std::vector<std::string>& input_names,
const std::vector<std::string>& output_names,
std::vector<Tensor*>* origin_itensors,
std::vector<Tensor*>* origin_otensors,
std::vector<std::shared_ptr<hiai::AiTensor>>* device_itensors,
std::vector<std::shared_ptr<hiai::AiTensor>>* device_otensors);
bool ZeroCopyRun(
std::vector<std::shared_ptr<hiai::AiTensor>>* device_itensors,
std::vector<std::shared_ptr<hiai::AiTensor>>* device_otensors);
struct device_program_t { public:
explicit device_program_t(std::shared_ptr<hiai::AiModelMngerClient> _client) std::string model_name_{""};
: client(_client) {} std::shared_ptr<hiai::AiModelMngerClient> model_client_{nullptr};
std::shared_ptr<hiai::AiModelMngerClient> client{nullptr}; std::vector<std::vector<int64_t>> origin_odims_;
std::vector<DDim> origin_idims{}; std::vector<PrecisionType> origin_otypes_;
std::vector<DDim> origin_odims{}; std::vector<hiai::TensorDimension> device_idims_{};
std::vector<hiai::TensorDimension> device_idims{}; std::vector<hiai::TensorDimension> device_odims_{};
std::vector<hiai::TensorDimension> device_odims{}; };
};
int Build() override; class SubgraphEngine : public subgraph::Engine {
public:
SubgraphEngine(KernelContext* ctx,
int block_idx,
cpp::BlockDesc* block_desc,
const std::vector<std::string>& input_names,
const std::vector<std::string>& output_names,
Scope* scope)
: subgraph::Engine(
ctx, block_idx, block_desc, input_names, output_names, scope) {}
protected: protected:
int BuildDeviceProgram() override; bool PrepareWorkspaceForDeviceProgram() override;
int LaunchDeviceProgram() override; bool BuildDeviceProgram() override;
bool LaunchDeviceProgram() override;
void InitDeviceTensor() override;
bool InputShapeChanged() override;
std::string GenerateModelCacheName() const;
std::string model_name_{"model.om"};
std::vector<std::vector<int64_t>> inputs_shape_{};
std::map<std::vector<std::vector<int64_t>>, std::shared_ptr<device_program_t>>
device_program_map_{};
std::vector<std::string> device_inames_{};
std::vector<std::string> device_onames_{};
std::vector<std::shared_ptr<hiai::AiTensor>> device_itensors_{}; std::vector<std::shared_ptr<hiai::AiTensor>> device_itensors_{};
std::vector<std::shared_ptr<hiai::AiTensor>> device_otensors_{}; std::vector<std::shared_ptr<hiai::AiTensor>> device_otensors_{};
std::map<std::vector<std::vector<int64_t>>, std::shared_ptr<DeviceProgram>>
device_programs_;
}; };
class SubgraphCompute : public KernelLite<TARGET(kNPU), PRECISION(kAny)> { class SubgraphCompute : public KernelLite<TARGET(kNPU), PRECISION(kAny)> {
......
...@@ -22,6 +22,8 @@
 #define SUBGRAPH_CUSTOM_PARTITION_CONFIG_FILE \
   "SUBGRAPH_CUSTOM_PARTITION_CONFIG_FILE"
+#define SUBGRAPH_DISABLE_ONLINE_MODE "SUBGRAPH_DISABLE_ONLINE_MODE"
+
 namespace paddle {
 namespace lite {
......
...@@ -120,5 +120,40 @@ static std::vector<std::string> ListDir(const std::string& path,
  return paths;
}
static bool ReadFile(const std::string& filename, std::vector<char>* contents) {
FILE* fp = fopen(filename.c_str(), "rb");
if (!fp) return false;
fseek(fp, 0, SEEK_END);
size_t size = ftell(fp);
fseek(fp, 0, SEEK_SET);
contents->clear();
contents->resize(size);
size_t offset = 0;
char* ptr = reinterpret_cast<char*>(&(contents->at(0)));
while (offset < size) {
size_t already_read = fread(ptr, 1, size - offset, fp);
offset += already_read;
ptr += already_read;
}
fclose(fp);
return true;
}
static bool WriteFile(const std::string& filename,
const std::vector<char>& contents) {
FILE* fp = fopen(filename.c_str(), "wb");
if (!fp) return false;
size_t size = contents.size();
size_t offset = 0;
const char* ptr = reinterpret_cast<const char*>(&(contents.at(0)));
while (offset < size) {
size_t already_written = fwrite(ptr, 1, size - offset, fp);
offset += already_written;
ptr += already_written;
}
fclose(fp);
return true;
}
}  // namespace lite
}  // namespace paddle
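A minimal round-trip sketch for the new helpers: WriteFile() dumps the generated `.om`/`.cfg` buffers and ReadFile() loads them back on the next run (standalone example, not part of the commit; the path is made up for illustration):

```cpp
#include <cassert>
#include <vector>
#include "lite/utils/io.h"

int main() {
  const std::vector<char> out = {'o', 'm', '\0', '\x7f'};
  assert(paddle::lite::WriteFile("/tmp/demo.om", out));
  std::vector<char> in;
  assert(paddle::lite::ReadFile("/tmp/demo.om", &in));
  assert(in == out);  // byte-exact round trip, binary-safe
  return 0;
}
```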
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <string>
namespace paddle {
namespace lite {
std::string MD5(std::string message) {
const uint32_t shiftAmounts[] = {
7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22,
5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20,
4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23,
6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21};
const uint32_t partsOfSines[] = {
0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, 0xf57c0faf, 0x4787c62a,
0xa8304613, 0xfd469501, 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821, 0xf61e2562, 0xc040b340,
0x265e5a51, 0xe9b6c7aa, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed, 0xa9e3e905, 0xfcefa3f8,
0x676f02d9, 0x8d2a4c8a, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70, 0x289b7ec6, 0xeaa127fa,
0xd4ef3085, 0x04881d05, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, 0x655b59c3, 0x8f0ccc92,
0xffeff47d, 0x85845dd1, 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391};
uint32_t state[4];
state[0] = 0x67452301;
state[1] = 0xefcdab89;
state[2] = 0x98badcfe;
state[3] = 0x10325476;
// Pad with zeros
int size = ((((message.length() + 8) / 64) + 1) * 64) - 8;
uint8_t *buf = reinterpret_cast<uint8_t *>(calloc(size + 64, 1));
memcpy(buf, message.c_str(), message.length());
buf[message.length()] = 128;
uint32_t bits = 8 * message.length();
memcpy(buf + size, &bits, 4);
// Process at each 512-bit(64 bytes) chunk
#define LEFTROTATE(x, c) (((x) << (c)) | ((x) >> (32 - (c))))
for (int offset = 0; offset < size; offset += 64) {
uint32_t A = state[0];
uint32_t B = state[1];
uint32_t C = state[2];
uint32_t D = state[3];
uint32_t *W = reinterpret_cast<uint32_t *>(buf + offset);
for (uint32_t i = 0; i < 64; i++) {
uint32_t F, g;
if (i < 16) {
F = (B & C) | ((~B) & D);
g = i;
} else if (i < 32) {
F = (D & B) | ((~D) & C);
g = (5 * i + 1) % 16;
} else if (i < 48) {
F = B ^ C ^ D;
g = (3 * i + 5) % 16;
} else {
F = C ^ (B | (~D));
g = (7 * i) % 16;
}
uint32_t T = D;
D = C;
C = B;
B = B + LEFTROTATE((A + F + partsOfSines[i] + W[g]), shiftAmounts[i]);
A = T;
}
state[0] += A;
state[1] += B;
state[2] += C;
state[3] += D;
}
#undef LEFTROTATE
free(buf);
// Convert digest to string
std::string res;
res.reserve(16 << 1);
const uint8_t *digest = reinterpret_cast<uint8_t *>(state);
char hex[3];
for (size_t i = 0; i < 16; i++) {
snprintf(hex, sizeof(hex), "%02x", digest[i]);
res.append(hex);
}
return res;
}
} // namespace lite
} // namespace paddle
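A quick sanity check for the helper above against the well-known MD5 test vectors from RFC 1321 (a sketch, not part of the commit's test suite):

```cpp
#include <cassert>
#include <iostream>
#include "lite/utils/md5.h"

int main() {
  assert(paddle::lite::MD5("") == "d41d8cd98f00b204e9800998ecf8427e");
  assert(paddle::lite::MD5("abc") == "900150983cd24fb0d6963f7d28e17f72");
  std::cout << "md5 ok" << std::endl;
  return 0;
}
```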
...@@ -67,6 +67,31 @@ static std::string to_string(int index) {
   return std::string(buffer);
 }
 
+template <typename T = std::string>
+static T parse_string(const std::string& v) {
+  return v;
+}
+
+template <>
+int32_t parse_string<int32_t>(const std::string& v) {
+  return std::stoi(v);
+}
+
+template <>
+int64_t parse_string<int64_t>(const std::string& v) {
+  return std::stoll(v);
+}
+
+template <>
+float parse_string<float>(const std::string& v) {
+  return std::stof(v);
+}
+
+template <>
+double parse_string<double>(const std::string& v) {
+  return std::stod(v);
+}
+
 template <typename T>
 std::string Join(const std::vector<T>& vec, const std::string& delim) {
   if (vec.empty()) return "";
...@@ -91,19 +116,20 @@ static std::string Repr(const std::vector<std::string>& v) {
   return "{" + Join(tmp, ",") + "}";
 }
 
-static std::vector<std::string> Split(const std::string& original,
-                                      const std::string& separator) {
-  std::vector<std::string> results;
+template <class T = std::string>
+static std::vector<T> Split(const std::string& original,
+                            const std::string& separator) {
+  std::vector<T> results;
   std::string::size_type pos1, pos2;
   pos2 = original.find(separator);
   pos1 = 0;
   while (std::string::npos != pos2) {
-    results.push_back(original.substr(pos1, pos2 - pos1));
+    results.push_back(parse_string<T>(original.substr(pos1, pos2 - pos1)));
     pos1 = pos2 + separator.size();
     pos2 = original.find(separator, pos1);
   }
   if (pos1 != original.length()) {
-    results.push_back(original.substr(pos1));
+    results.push_back(parse_string<T>(original.substr(pos1)));
   }
   return results;
 }
......
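A short usage sketch of the templated Split(): the NPU cache loader above relies on Split<int64_t>(...) to turn the `.cfg` shape strings back into dimension vectors (standalone example, not part of the commit; it assumes these helpers live in lite/utils/string.h under namespace paddle::lite):

```cpp
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>
#include "lite/utils/string.h"

int main() {
  auto dims = paddle::lite::Split<int64_t>("1,3,224,224", ",");
  auto names = paddle::lite::Split<std::string>("x:y", ":");  // default T
  std::cout << dims.size() << " dims, first=" << dims[0] << "; "
            << names.size() << " names" << std::endl;  // 4 dims, first=1; 2 names
  return 0;
}
```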