From 0c25428c89a77251846feaaa204e897b9ab3d342 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 29 Aug 2019 12:01:15 +0800 Subject: [PATCH] [NPU] refine npu subgraph and clean code (#1902) * add npu script and tester * fix npu armv7 so and refine tests test=develop * update fix and refine log test=develop * refine npu generate api * refine npu subgraph * refine npu gen and clean code * fix model load * refine node2rm in subgraph * refine the build npu functions test=develop --- lite/api/cxx_api.cc | 6 - lite/api/cxx_api.h | 2 - lite/api/mobilenetv1_int8_test.cc | 7 +- lite/api/mobilenetv1_test.cc | 7 - lite/api/mobilenetv2_test.cc | 7 - lite/core/device_info.cc | 2 +- .../mir/subgraph/generate_npu_program_pass.cc | 258 ++++++++++-------- .../mir/subgraph/generate_npu_program_pass.h | 46 ++-- .../generate_npu_program_pass_test.cc | 14 +- .../mir/subgraph/subgraph_program_pass.cc | 102 ++++++- .../core/mir/subgraph/subgraph_program_pass.h | 25 ++ lite/core/optimizer.h | 40 +-- lite/model_parser/model_parser.cc | 1 + lite/npu/bridge/utils.cc | 42 --- lite/npu/bridge/utils.h | 5 - lite/tools/build_npu.sh | 1 + 16 files changed, 323 insertions(+), 242 deletions(-) diff --git a/lite/api/cxx_api.cc b/lite/api/cxx_api.cc index 6ad6e29104..5f160b6f79 100644 --- a/lite/api/cxx_api.cc +++ b/lite/api/cxx_api.cc @@ -135,12 +135,6 @@ void Predictor::GenRuntimeProgram() { program_generated_ = true; } -void Predictor::GenNPURuntimeProgram() { - program_ = optimizer_.GenNPURuntimeProgram(); - CHECK_EQ(exec_scope_, program_->exec_scope()); - program_generated_ = true; -} - const lite::Tensor *Predictor::GetTensor(const std::string &name) const { auto *var = exec_scope_->FindVar(name); return &var->Get(); diff --git a/lite/api/cxx_api.h b/lite/api/cxx_api.h index b3f0472d0b..c12f6996f4 100644 --- a/lite/api/cxx_api.h +++ b/lite/api/cxx_api.h @@ -55,8 +55,6 @@ class LITE_API Predictor { void GenRuntimeProgram(); - void GenNPURuntimeProgram(); - // Run the predictor for a single
batch of data. void Run() { if (!program_generated_) { diff --git a/lite/api/mobilenetv1_int8_test.cc b/lite/api/mobilenetv1_int8_test.cc index 1f93c1e6b7..769f195d19 100644 --- a/lite/api/mobilenetv1_int8_test.cc +++ b/lite/api/mobilenetv1_int8_test.cc @@ -26,8 +26,7 @@ namespace paddle { namespace lite { void TestModel(const std::vector& valid_places, - const Place& preferred_place, - bool use_npu = false) { + const Place& preferred_place) { DeviceInfo::Init(); DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads); lite::Predictor predictor; @@ -42,10 +41,6 @@ void TestModel(const std::vector& valid_places, data[i] = 1; } - if (use_npu) { - predictor.GenNPURuntimeProgram(); - } - for (int i = 0; i < FLAGS_warmup; ++i) { predictor.Run(); } diff --git a/lite/api/mobilenetv1_test.cc b/lite/api/mobilenetv1_test.cc index b87f9a23f4..91d1828a94 100644 --- a/lite/api/mobilenetv1_test.cc +++ b/lite/api/mobilenetv1_test.cc @@ -30,7 +30,6 @@ namespace lite { void TestModel(const std::vector& valid_places, const Place& preferred_place, const std::string& model_dir = FLAGS_model_dir, - bool gen_npu = false, bool save_model = false) { DeviceInfo::Init(); DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads); @@ -46,10 +45,6 @@ void TestModel(const std::vector& valid_places, data[i] = 1; } - if (gen_npu) { - predictor.GenNPURuntimeProgram(); - } - for (int i = 0; i < FLAGS_warmup; ++i) { predictor.Run(); } @@ -116,13 +111,11 @@ TEST(MobileNetV1, test_npu) { TestModel(valid_places, Place({TARGET(kARM), PRECISION(kFloat)}), FLAGS_model_dir, - true /* gen_npu */, true /* save_model*/); TestModel(valid_places, Place({TARGET(kARM), PRECISION(kFloat)}), FLAGS_optimized_model, - false /* gen_npu */, false /* save model */); } #endif // LITE_WITH_NPU diff --git a/lite/api/mobilenetv2_test.cc b/lite/api/mobilenetv2_test.cc index f27197c5df..ca36943cb9 100644 --- a/lite/api/mobilenetv2_test.cc +++ b/lite/api/mobilenetv2_test.cc @@ -31,7 +31,6 @@ 
namespace lite { void TestModel(const std::vector& valid_places, const Place& preferred_place, const std::string& model_dir = FLAGS_model_dir, - bool gen_npu = false, bool save_model = false) { DeviceInfo::Init(); DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads); @@ -47,10 +46,6 @@ void TestModel(const std::vector& valid_places, data[i] = 1; } - if (gen_npu) { - predictor.GenNPURuntimeProgram(); - } - for (int i = 0; i < FLAGS_warmup; ++i) { predictor.Run(); } @@ -116,13 +111,11 @@ TEST(MobileNetV2, test_npu) { TestModel(valid_places, Place({TARGET(kARM), PRECISION(kFloat)}), FLAGS_model_dir, - true /* gen_npu */, true /* save_model*/); TestModel(valid_places, Place({TARGET(kARM), PRECISION(kFloat)}), FLAGS_optimized_model, - false /* gen_npu */, false /* save model */); } #endif // LITE_WITH_NPU diff --git a/lite/core/device_info.cc b/lite/core/device_info.cc index 7b33f7cc6d..432f968945 100644 --- a/lite/core/device_info.cc +++ b/lite/core/device_info.cc @@ -958,7 +958,7 @@ void DeviceInfo::RequestPowerRandLowMode(int shift_num, int thread_num) { int DeviceInfo::Setup() { core_num_ = get_cpu_num(); - printf("core number: %d\n", core_num_); + LOG(INFO) << " CPU core number: " << core_num_; mem_size_ = get_mem_size(); get_cpu_arch(&archs_, core_num_); // set defalut CPU info diff --git a/lite/core/mir/subgraph/generate_npu_program_pass.cc b/lite/core/mir/subgraph/generate_npu_program_pass.cc index 6e54bd0785..5c04e1651f 100644 --- a/lite/core/mir/subgraph/generate_npu_program_pass.cc +++ b/lite/core/mir/subgraph/generate_npu_program_pass.cc @@ -37,127 +37,92 @@ namespace lite { namespace mir { namespace subgraph { -void GenerateNPUProgramPass::SubgraphSortHelper( - Node* node, - const std::unordered_set& nodes_all, - std::unordered_set* visited_nodes, - std::vector* ret) { - for (auto& var_node : node->inlinks) { - if (var_node->inlinks.empty()) continue; - auto* op_node = var_node->inlinks.front(); - if (nodes_all.count(op_node) && 
!visited_nodes->count(op_node)) { - SubgraphSortHelper(op_node, nodes_all, visited_nodes, ret); - } +std::shared_ptr GenerateNPUProgramPass::CvtVarNode( + lite::mir::Node* var_node, const Scope* scope) { + CHECK(var_node->IsArg()); + const auto& arg = var_node->AsArg(); + VLOG(4) << "Convert var node " << arg.name; + + auto* var = scope->FindVar(arg.name); + CHECK(var); + auto* tensor = var->GetMutable(); + CHECK(tensor); + auto dims = tensor->dims(); + if (arg.is_weight) { + auto wgt = std::make_shared(arg.name); + LOG(INFO) << "in convert const:" << arg.name; + VLOG(4) << dims; + wgt->set_attr_value(lite::npu::bridge::CvtFromLiteTensor(tensor)); + return wgt; + } else { + CHECK_EQ(dims.size(), 4); + LOG(INFO) << "in convert data:" << arg.name; + LOG(INFO) << dims; + // TODO(xxx): support more types and dims size + ge::TensorDesc desc(ge::Shape(dims.Vectorize()), + ge::Format::FORMAT_NCHW, + ge::DataType::DT_FLOAT); + + // auto size = desc.GetShape().GetShapeSize(); + // ge::TensorUtils::SetSize(desc, size*sizeof(float)); + // ge::TensorUtils::SetRealDimCnt(desc, 4); + auto data = std::make_shared(arg.name); + data->update_input_desc_x(desc); + return data; } - ret->push_back(node); - visited_nodes->insert(node); + return nullptr; } -void GenerateNPUProgramPass::CvtOpNodes( +void GenerateNPUProgramPass::CvtAllOpNodes( const std::vector& nodes2cvt, - lite::npu::bridge::node_map_type* cvted_vars) { + lite::npu::bridge::node_map_type* converted_vars) { const auto& bridges = lite::npu::bridge::Factory::Instance(); const auto& cvtfunc_map = bridges.AllFunctions(); - // record all converted vars - // op node's inputs must be found in cvted_vars + // return record all converted vars + // op node's inputs must be found in converted_vars for (auto& node : nodes2cvt) { lite::npu::bridge::node_map_type node_inputs; auto& stmt = node->AsStmt(); for (auto& var_node : node->inlinks) { auto& arg = var_node->AsArg(); - if (arg.is_weight) continue; - auto var_name = arg.name; - if 
(!cvted_vars->count(var_name)) { - cvted_vars->insert(std::make_pair( - var_name, - lite::npu::bridge::CvtNode(var_node, stmt.op()->scope()))); - } - node_inputs.insert(*cvted_vars->find(var_name)); - } - auto node_outputs = cvtfunc_map.at(stmt.op_type())(stmt.op(), node_inputs); - cvted_vars->insert(node_outputs.begin(), node_outputs.end()); - } -} - -void GenerateNPUProgramPass::GetIOVars( - const std::vector& nodes2cvt, - const lite::npu::bridge::node_map_type& cvted_vars, - std::unordered_set* nodes2rm, - std::vector* in_vars, - std::vector* out_vars, - lite::npu::bridge::node_map_type* in_cvted_vars, - lite::npu::bridge::node_map_type* out_cvted_vars) { - std::unordered_set op_nodes_all(nodes2cvt.begin(), nodes2cvt.end()); - for (auto& op_node : nodes2cvt) { - for (auto& in_var : op_node->inlinks) { - if (in_var->AsArg().is_weight) continue; - auto* pre_op_node = in_var->inlinks.front(); - if (op_nodes_all.count(pre_op_node)) { - nodes2rm->insert(in_var); - continue; - } - in_vars->push_back(in_var); - auto arg_name = in_var->AsArg().name; - in_cvted_vars->insert(std::make_pair(arg_name, cvted_vars.at(arg_name))); - } - for (auto& out_var : op_node->outlinks) { - if (out_var->outlinks.empty()) { - nodes2rm->insert(out_var); + // weight should be handled in the converter, so skip here + if (arg.is_weight) { continue; } - auto* next_op_node = out_var->outlinks.front(); - - if (op_nodes_all.count(next_op_node)) { - nodes2rm->insert(out_var); - continue; + auto var_name = arg.name; + if (!converted_vars->count(var_name)) { + converted_vars->insert( + std::make_pair(var_name, CvtVarNode(var_node, stmt.op()->scope()))); } - out_vars->push_back(out_var); - auto arg_name = out_var->AsArg().name; - out_cvted_vars->insert(std::make_pair(arg_name, cvted_vars.at(arg_name))); + node_inputs.insert(*converted_vars->find(var_name)); } + auto node_outputs = cvtfunc_map.at(stmt.op_type())(stmt.op(), node_inputs); + converted_vars->insert(node_outputs.begin(), 
node_outputs.end()); } - nodes2rm->insert(nodes2cvt.begin(), nodes2cvt.end()); } -void GenerateNPUProgramPass::GenNPUGraphOpNode( - const std::unique_ptr& graph, - int sub_id, - const std::unordered_set& nodes_all) { - std::unordered_set visited_nodes; - std::vector ret; - for (auto& node : nodes_all) { - if (!node->IsStmt()) continue; - if (visited_nodes.count(node)) continue; - SubgraphSortHelper(node, nodes_all, &visited_nodes, &ret); - } +std::string GenerateNPUProgramPass::BuildNPUGraph( + const std::unordered_set& op_nodes, + const std::unordered_set& in_data_vars, + const std::unordered_set& out_data_vars, + int sub_id) { + auto ordered_nodes = GetTopologicalOrder(op_nodes); + lite::npu::bridge::node_map_type converted_vars; + CvtAllOpNodes(ordered_nodes, &converted_vars); - lite::npu::bridge::node_map_type cvted_vars; - CvtOpNodes(ret, &cvted_vars); - - std::unordered_set nodes2rm; - std::vector in_vars; - std::vector out_vars; - lite::npu::bridge::node_map_type in_cvted_vars; - lite::npu::bridge::node_map_type out_cvted_vars; - GetIOVars(ret, - cvted_vars, - &nodes2rm, - &in_vars, - &out_vars, - &in_cvted_vars, - &out_cvted_vars); - - std::vector in_vars_name; - std::vector out_vars_name; + std::vector in_var_names; + std::vector out_var_names; std::vector inputs; std::vector outputs; - for (auto i : in_cvted_vars) { - in_vars_name.push_back(i.first); - inputs.push_back(*i.second); + for (auto i : in_data_vars) { + auto argname = i->AsArg().name; + in_var_names.push_back(argname); + inputs.push_back(*converted_vars.at(argname)); } - for (auto i : out_cvted_vars) { - out_vars_name.push_back(i.first); - outputs.push_back(*i.second); + for (auto i : out_data_vars) { + auto argname = i->AsArg().name; + out_var_names.push_back(argname); + outputs.push_back(*converted_vars.at(argname)); } std::string model_name("hiai_npu_client_" + std::to_string(sub_id) + ".om"); @@ -165,27 +130,55 @@ void GenerateNPUProgramPass::GenNPUGraphOpNode( LOG(FATAL) << "Build NPU 
failed subgraph " << sub_id; } LOG(INFO) << "[NPU] Build NPU Client success subgraph " << sub_id; + return model_name; +} +cpp::OpDesc GenerateNPUProgramPass::GenGraphOpDesc( + const std::string& model_name, + const std::vector& in_var_names, + const std::vector& out_var_names) { cpp::OpDesc op_desc; op_desc.SetType("graph_op"); + op_desc.SetInput("Inputs", in_var_names); + op_desc.SetOutput("Outputs", out_var_names); + op_desc.SetAttr("model_name", model_name); + return op_desc; +} + +void GenerateNPUProgramPass::InsertNewNode( + const std::unique_ptr& graph, + const std::string& model_name, + Scope* scope, + const std::vector& valid_places, + std::unordered_set in_data_vars, + std::unordered_set in_wgt_vars, + std::unordered_set out_data_vars, + std::unordered_set out_unused_vars) { std::vector in_var_names; + std::vector out_var_names; + for (auto i : in_data_vars) { + in_var_names.push_back(i->AsArg().name); + } + for (auto i : out_data_vars) { + out_var_names.push_back(i->AsArg().name); + } - op_desc.SetInput("Inputs", in_vars_name); - op_desc.SetOutput("Outputs", out_vars_name); - op_desc.SetAttr("model_name", model_name); - auto graph_op = LiteOpRegistry::Global().Create("graph_op"); + auto op_desc = GenGraphOpDesc(model_name, in_var_names, out_var_names); - auto any_op = ret.front()->AsStmt().op(); - auto* scope = any_op->scope(); + auto graph_op = LiteOpRegistry::Global().Create("graph_op"); graph_op->Attach(op_desc, scope); - - auto valid_places = any_op->valid_places(); auto* new_op_node = graph->GraphCreateInstructNode(graph_op, valid_places); - for (auto& in_var : in_vars) { + for (auto& in_var : in_data_vars) { IR_NODE_LINK_TO(in_var, new_op_node); } - for (auto& out_var : out_vars) { + for (auto& in_var : in_wgt_vars) { + IR_NODE_LINK_TO(in_var, new_op_node); + } + for (auto& out_var : out_data_vars) { + IR_OP_VAR_LINK(new_op_node, out_var); + } + for (auto& out_var : out_unused_vars) { IR_OP_VAR_LINK(new_op_node, out_var); } @@ -193,6 +186,34 @@ 
void GenerateNPUProgramPass::GenNPUGraphOpNode( auto& inst = new_op_node->AsStmt(); inst.picked_kernel().SetContext( ContextScheduler::Global().NewContext(inst.picked_kernel().target())); +} + +void GenerateNPUProgramPass::GenNPUGraphOpNode( + const std::unique_ptr& graph, + const std::unordered_set& op_nodes, + int sub_id) { + std::unordered_set in_data_vars; + std::unordered_set in_wgt_vars; + std::unordered_set out_data_vars; + std::unordered_set out_unused_vars; + FindInputOutputVars( + op_nodes, &in_data_vars, &in_wgt_vars, &out_data_vars, &out_unused_vars); + + auto nodes2rm = GetNode2rm( + op_nodes, {in_data_vars, in_wgt_vars, out_data_vars, out_unused_vars}); + + auto model_name = + BuildNPUGraph(op_nodes, in_data_vars, out_data_vars, sub_id); + + auto any_op = (*op_nodes.begin())->AsStmt().op(); + InsertNewNode(graph, + model_name, + any_op->scope(), + any_op->valid_places(), + in_data_vars, + in_wgt_vars, + out_data_vars, + out_unused_vars); GraphSafeRemoveNodes(graph.get(), nodes2rm); } @@ -215,7 +236,7 @@ void GenerateNPUProgramPass::ConvertSubgraph( for (int id = 1; id <= sub_num; ++id) { LOG(INFO) << "Converting subgraph_id:" << id; - GenNPUGraphOpNode(graph, id, nodes_all.at(id)); + GenNPUGraphOpNode(graph, nodes_all.at(id), id); } } @@ -226,14 +247,21 @@ void GenerateNPUProgramPass::Apply(const std::unique_ptr& graph) { const auto& op_map = bridges.AllFunctions(); std::vector supported_op_types; for (auto& i : op_map) { - LOG(INFO) << i.first; + LOG(INFO) << "Supported type: " << i.first; supported_op_types.push_back(i.first); } - int num_subgraph = FuseSubgraph(graph, supported_op_types); - LOG(INFO) << "detected " << num_subgraph << " NPU subgraph"; - InferOnce(graph); - ConvertSubgraph(graph, num_subgraph); + try { + int num_subgraph = FuseSubgraph(graph, supported_op_types); + LOG(INFO) << "detected " << num_subgraph << " NPU subgraph"; + + InferOnce(graph); + ConvertSubgraph(graph, num_subgraph); + } catch (...) 
{ + // exception = true; + LOG(WARNING) << "Build NPU graph failed"; + } + LOG(INFO) << "After NPU Pass \n" << Visualize(graph.get()); for (auto& item : graph->StmtTopologicalOrder()) { diff --git a/lite/core/mir/subgraph/generate_npu_program_pass.h b/lite/core/mir/subgraph/generate_npu_program_pass.h index 151138476e..d8bb45927c 100644 --- a/lite/core/mir/subgraph/generate_npu_program_pass.h +++ b/lite/core/mir/subgraph/generate_npu_program_pass.h @@ -38,31 +38,35 @@ class GenerateNPUProgramPass : public SubgraphProgramPass { std::unique_ptr GenProgram(); protected: - // sort nodes to operational sequence - void SubgraphSortHelper(Node* node, - const std::unordered_set& nodes_all, - std::unordered_set* visited_nodes, - std::vector* ret); - // nodes2cvt: op nodes to convert - // cvted_vars: converted var nodes - // nodes2rm: op nodes and var nodes that need to be removed - void CvtOpNodes(const std::vector& nodes2cvt, - lite::npu::bridge::node_map_type* cvted_vars); + // return cvted_vars: converted var nodes + void CvtAllOpNodes(const std::vector& nodes2cvt, + lite::npu::bridge::node_map_type* cvted_vars); + + std::shared_ptr CvtVarNode(lite::mir::Node* var_node, + const Scope* scope); + + std::string BuildNPUGraph(const std::unordered_set& op_nodes, + const std::unordered_set& in_data_vars, + const std::unordered_set& out_data_vars, + int sub_id); + + cpp::OpDesc GenGraphOpDesc(const std::string& model_name, + const std::vector& in_var_names, + const std::vector& out_var_names); - // achieve input and output vars/cvted_vars; - // achieve all nodes to remove - void GetIOVars(const std::vector& nodes2cvt, - const lite::npu::bridge::node_map_type& cvted_vars, - std::unordered_set* nodes2rm, - std::vector* in_vars, - std::vector* out_vars, - lite::npu::bridge::node_map_type* in_cvted_vars, - lite::npu::bridge::node_map_type* out_cvted_vars); + void InsertNewNode(const std::unique_ptr& graph, + const std::string& model_name, + Scope* scope, + const std::vector& 
valid_places, + std::unordered_set in_data_vars, + std::unordered_set in_wgt_vars, + std::unordered_set out_data_vars, + std::unordered_set out_unused_vars); void GenNPUGraphOpNode(const std::unique_ptr& graph, - int sub_id, - const std::unordered_set& nodes_all); + const std::unordered_set& nodes_all, + int sub_id); void ConvertSubgraph(const std::unique_ptr& graph, int sub_num); diff --git a/lite/core/mir/subgraph/generate_npu_program_pass_test.cc b/lite/core/mir/subgraph/generate_npu_program_pass_test.cc index d29ce5dee5..8bfdb7381b 100644 --- a/lite/core/mir/subgraph/generate_npu_program_pass_test.cc +++ b/lite/core/mir/subgraph/generate_npu_program_pass_test.cc @@ -39,8 +39,11 @@ namespace lite { void TestModel(lite::Predictor* predictor, const std::vector& valid_places, const std::string& model_dir) { - predictor->Build( - model_dir, "", "", Place{TARGET(kARM), PRECISION(kFloat)}, valid_places); + predictor->Build(model_dir, + model_dir + "/model", + model_dir + "/params", + Place{TARGET(kARM), PRECISION(kFloat)}, + valid_places); auto* input_tensor = predictor->GetInput(0); input_tensor->Resize(DDim(std::vector( @@ -51,13 +54,6 @@ void TestModel(lite::Predictor* predictor, data[i] = 1; } - if (std::find(valid_places.begin(), - valid_places.end(), - Place{TARGET(kNPU), PRECISION(kFloat)}) != valid_places.end()) { - // TODO(TJ): change if valid npu so try use it, add rollback and move to api - predictor->GenNPURuntimeProgram(); - } - predictor->Run(); if (model_dir != FLAGS_optimized_model && std::find(valid_places.begin(), diff --git a/lite/core/mir/subgraph/subgraph_program_pass.cc b/lite/core/mir/subgraph/subgraph_program_pass.cc index 91edadf895..3947d3b582 100644 --- a/lite/core/mir/subgraph/subgraph_program_pass.cc +++ b/lite/core/mir/subgraph/subgraph_program_pass.cc @@ -26,6 +26,105 @@ namespace lite { namespace mir { namespace subgraph { +void SubgraphProgramPass::SortHelper( + Node* node, + const std::unordered_set& nodes_all, + std::unordered_set* 
visited_nodes, + std::vector* ret) { + for (auto& var_node : node->inlinks) { + if (var_node->inlinks.empty()) continue; + auto* op_node = var_node->inlinks.front(); + if (nodes_all.count(op_node) && !visited_nodes->count(op_node)) { + SortHelper(op_node, nodes_all, visited_nodes, ret); + } + } + ret->push_back(node); + visited_nodes->insert(node); +} + +std::vector SubgraphProgramPass::GetTopologicalOrder( + const std::unordered_set& nodes) { + std::unordered_set visited; + std::vector ret; + for (auto& node : nodes) { + if (!node->IsStmt()) continue; + if (visited.count(node)) continue; + SortHelper(node, nodes, &visited, &ret); + } + return ret; +} + +void SubgraphProgramPass::FindInputOutputVars( + const std::unordered_set& op_nodes, + std::unordered_set* in_data_vars, + std::unordered_set* in_wgt_vars, + std::unordered_set* out_data_vars, + std::unordered_set* out_unused_vars) { + for (auto& op_node : op_nodes) { + for (auto& in_var : op_node->inlinks) { + if (in_var->AsArg().is_weight) { + in_wgt_vars->insert(in_var); + continue; + } + if (!in_var->inlinks.empty()) { + // var can only come from one op node, so use front + auto* pre_op_node = in_var->inlinks.front(); + if (op_nodes.count(pre_op_node)) { + continue; + } + } + in_data_vars->insert(in_var); + } + for (auto& out_var : op_node->outlinks) { + if (out_var->outlinks.empty()) { + // the next op is empty so this var is actually unused + out_unused_vars->insert(out_var); + continue; + } + // var can have more than one next op node + // so, if any one in the op_nodes then continue + bool next_op_in_nodes = false; + for (auto& next_op_node : out_var->outlinks) { + if (op_nodes.count(next_op_node)) { + next_op_in_nodes = true; + } + } + if (next_op_in_nodes) { + continue; + } + + out_data_vars->insert(out_var); + } + } +} + +std::unordered_set SubgraphProgramPass::GetNode2rm( + const std::unordered_set& op_nodes, + const std::vector>& excluded_nodes) { + std::unordered_set nodes2rm(op_nodes.begin(), 
op_nodes.end()); + for (auto& op_node : op_nodes) { + for (auto& in_var : op_node->inlinks) { + if (!nodes2rm.count(in_var)) { + nodes2rm.insert(in_var); + } + } + for (auto& out_var : op_node->outlinks) { + if (!nodes2rm.count(out_var)) { + nodes2rm.insert(out_var); + } + } + } + // some nodes should not be removed + for (auto& e : excluded_nodes) { + for (auto& i : e) { + if (nodes2rm.count(i)) { + nodes2rm.erase(i); + } + } + } + return nodes2rm; +} + void SubgraphProgramPass::InferOnce(const std::unique_ptr& graph) { for (auto& item : graph->StmtTopologicalOrder()) { if (!item->IsStmt()) continue; @@ -127,13 +226,10 @@ int SubgraphProgramPass::FuseSubgraphID( for (auto& j : i->outlinks) { if (j->IsStmt()) { auto& jstmt = j->AsStmt(); - // LOG(INFO) << "initial: "<outlinks) i_nodes_[sub_id].insert(i); } diff --git a/lite/core/mir/subgraph/subgraph_program_pass.h b/lite/core/mir/subgraph/subgraph_program_pass.h index e80f87333e..5bf477544d 100644 --- a/lite/core/mir/subgraph/subgraph_program_pass.h +++ b/lite/core/mir/subgraph/subgraph_program_pass.h @@ -54,7 +54,32 @@ class SubgraphProgramPass : public ProgramPass { // std::unique_ptr& graph, int sub_num); void ChangeAllOutConnectedID(Node* node, int to_id, int from_id = 0); + // Below functions could be useful in child classes // + + // Sort and return the topology order of nodes set + std::vector GetTopologicalOrder( + const std::unordered_set& nodes); + + // find all input data vars, input weight vars, + // output data vars and output vars from the nodes + void FindInputOutputVars(const std::unordered_set& op_nodes, + std::unordered_set* in_data_vars, + std::unordered_set* in_wgt_vars, + std::unordered_set* out_data_vars, + std::unordered_set* out_unused_vars); + + // return the node to remove in the subgraph + std::unordered_set GetNode2rm( + const std::unordered_set& op_nodes, + const std::vector>& excluded_nodes); + private: + // sort nodes to operational sequence + void SortHelper(Node* node, + const
std::unordered_set& nodes_all, + std::unordered_set* visited_nodes, + std::vector* ret); + // {1: {nodes2rm_in_subgraph1, ...}, // 2: {nodes2rm_in_subgraph2, ...}} // delete nodes diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h index fcc3470525..4670458952 100644 --- a/lite/core/optimizer.h +++ b/lite/core/optimizer.h @@ -109,6 +109,28 @@ class Optimizer { // Generate a new program based on the mir graph. std::unique_ptr GenRuntimeProgram() { +#ifdef LITE_WITH_NPU + if (std::find(valid_places_.begin(), + valid_places_.end(), + Place{TARGET(kNPU), PRECISION(kFloat)}) != + valid_places_.end()) { + CheckInputDimsNotEmpty(exec_scope_); + auto pass = mir::PassManager::Global() + .LookUp( + "generate_npu_program_pass"); + pass->Apply(graph_); + + auto program = pass->GenProgram(); + if (program) { + CHECK(exec_scope_); + program->set_exec_scope(exec_scope_); + return program; + } else { + LOG(WARNING) << "Build NPU graph failed."; + } + } + +#endif auto pass = mir::PassManager::Global().LookUp( "generate_program_pass"); pass->Apply(graph_); @@ -131,24 +153,6 @@ class Optimizer { } } - std::unique_ptr GenNPURuntimeProgram() { -#ifdef LITE_WITH_NPU - CheckInputDimsNotEmpty(exec_scope_); - auto pass = mir::PassManager::Global() - .LookUp( - "generate_npu_program_pass"); - pass->Apply(graph_); - - auto program = pass->GenProgram(); - CHECK(exec_scope_); - program->set_exec_scope(exec_scope_); - return program; -#else - LOG(WARNING) << "Not compiled with NPU but use it!"; - return GenRuntimeProgram(); -#endif - } - void InitTargetTypeTransformPass() { auto* pass = mir::PassManager::Global().LookUp( diff --git a/lite/model_parser/model_parser.cc b/lite/model_parser/model_parser.cc index 7393db08ab..ed8f5a96f0 100644 --- a/lite/model_parser/model_parser.cc +++ b/lite/model_parser/model_parser.cc @@ -247,6 +247,7 @@ void LoadModelPb(const std::string &model_dir, } #ifdef LITE_WITH_NPU + auto main_block = pb_proto_prog.blocks(0); for (auto &op : main_block.ops()) { 
LOG(INFO) << "op type:" << op.type(); if (op.type() != "graph_op") { diff --git a/lite/npu/bridge/utils.cc b/lite/npu/bridge/utils.cc index 7837fdccb1..3f42395488 100644 --- a/lite/npu/bridge/utils.cc +++ b/lite/npu/bridge/utils.cc @@ -113,48 +113,6 @@ ge::TensorPtr CvtFromLiteTensor(lite::Tensor* in_tensor, return out_tensor; } -std::shared_ptr CvtNode(lite::mir::Node* var_node, - const Scope* scope) { - CHECK(var_node->IsArg()); - const auto& arg = var_node->AsArg(); - VLOG(4) << "Convert var node " << arg.name; - - auto* var = scope->FindVar(arg.name); - CHECK(var); - auto* tensor = var->GetMutable(); - CHECK(tensor); - auto dims = tensor->dims(); - if (arg.is_weight) { - auto wgt = std::make_shared(arg.name); - LOG(INFO) << "in convert const:" << arg.name; - LOG(INFO) << dims; - wgt->set_attr_value(CvtFromLiteTensor(tensor)); - - auto odesc = wgt->GetOutputDesc(0); - LOG(INFO) << "const ----"; - for (auto i : odesc.GetShape().GetDims()) { - LOG(INFO) << ";;;;;;;;;------: " << i; - } - return wgt; - } else { - CHECK_EQ(dims.size(), 4); - LOG(INFO) << "in convert data:" << arg.name; - LOG(INFO) << dims; - // TODO(TJ): support more types and dims size - ge::TensorDesc desc(ge::Shape(dims.Vectorize()), - ge::Format::FORMAT_NCHW, - ge::DataType::DT_FLOAT); - - // auto size = desc.GetShape().GetShapeSize(); - // ge::TensorUtils::SetSize(desc, size*sizeof(float)); - // ge::TensorUtils::SetRealDimCnt(desc, 4); - auto data = std::make_shared(arg.name); - data->update_input_desc_x(desc); - return data; - } - return nullptr; -} - bool HasInputArg(const OpInfo* op_info, const Scope* scope, const std::string& argname) { diff --git a/lite/npu/bridge/utils.h b/lite/npu/bridge/utils.h index 2bccbccb07..169b7ca80c 100644 --- a/lite/npu/bridge/utils.h +++ b/lite/npu/bridge/utils.h @@ -84,11 +84,6 @@ ge::TensorPtr CreateTensorAndFillData(T value, return CreateTensorAndFillData(data, shape, format); } -std::shared_ptr CvtNode2Tensor(const lite::mir::Node* arg_node); - 
-std::shared_ptr CvtNode(lite::mir::Node* var_node, - const Scope* scope); - bool HasInputArg(const OpInfo* op_info, const Scope* scope, const std::string& argname); diff --git a/lite/tools/build_npu.sh b/lite/tools/build_npu.sh index 600569fdd7..eb96508283 100755 --- a/lite/tools/build_npu.sh +++ b/lite/tools/build_npu.sh @@ -62,6 +62,7 @@ function cmake_npu { -DWITH_LITE=ON \ -DLITE_WITH_CUDA=OFF \ -DLITE_WITH_X86=OFF \ + -DLITE_BUILD_EXTRA=ON \ -DLITE_WITH_ARM=ON \ -DWITH_ARM_DOTPROD=ON \ -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \ -- GitLab