Unverified · Commit 0c25428c · authored by tensor-tang · committed by GitHub

[NPU] refine npu subgraph and clean code (#1902)

* add npu script and tester

* fix npu armv7 so and refine tests

test=develop

* update fix and refine log

test=develop

* refine npu generate api

* refine npu subgraph

* refine npu gen and clean code

* fix model load

* refine node2rm in subgraph

* refine the build npu functions

test=develop
Parent 0cfbd266
@@ -135,12 +135,6 @@ void Predictor::GenRuntimeProgram() {
   program_generated_ = true;
 }
 
-void Predictor::GenNPURuntimeProgram() {
-  program_ = optimizer_.GenNPURuntimeProgram();
-  CHECK_EQ(exec_scope_, program_->exec_scope());
-  program_generated_ = true;
-}
-
 const lite::Tensor *Predictor::GetTensor(const std::string &name) const {
   auto *var = exec_scope_->FindVar(name);
   return &var->Get<lite::Tensor>();
...
@@ -55,8 +55,6 @@ class LITE_API Predictor {
   void GenRuntimeProgram();
 
-  void GenNPURuntimeProgram();
-
   // Run the predictor for a single batch of data.
   void Run() {
     if (!program_generated_) {
...
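Note: `Run()` already generates the program lazily (context above), which is what makes deleting the explicit `GenNPURuntimeProgram()` entry point safe. A minimal sketch of that pattern, assuming the guard simply delegates to `GenRuntimeProgram()` and then executes the program:

```cpp
// Sketch only, not verbatim repo code: the predictor builds its runtime
// program on first Run(), so callers need no NPU-specific call anymore.
void Run() {
  if (!program_generated_) {
    GenRuntimeProgram();  // now chooses the NPU subgraph path internally
  }
  program_->Run();  // hypothetical executor invocation
}
```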
@@ -26,8 +26,7 @@ namespace paddle {
 namespace lite {
 
 void TestModel(const std::vector<Place>& valid_places,
-               const Place& preferred_place,
-               bool use_npu = false) {
+               const Place& preferred_place) {
   DeviceInfo::Init();
   DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads);
   lite::Predictor predictor;
@@ -42,10 +41,6 @@ void TestModel(const std::vector<Place>& valid_places,
     data[i] = 1;
   }
 
-  if (use_npu) {
-    predictor.GenNPURuntimeProgram();
-  }
-
   for (int i = 0; i < FLAGS_warmup; ++i) {
     predictor.Run();
   }
...
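Note: with `use_npu` here (and `gen_npu` in the two MobileNet tests below) removed, the NPU path is selected purely by the contents of `valid_places`. A hedged sketch of a caller after this change; the exact `Place` list is illustrative:

```cpp
// Sketch: listing kNPU among valid_places is now the only switch needed;
// the optimizer detects it and builds the NPU program itself.
std::vector<Place> valid_places({
    Place{TARGET(kARM), PRECISION(kFloat)},
    Place{TARGET(kNPU), PRECISION(kFloat)},  // presence of kNPU enables the pass
});
TestModel(valid_places, Place({TARGET(kARM), PRECISION(kFloat)}));
```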
@@ -30,7 +30,6 @@ namespace lite {
 void TestModel(const std::vector<Place>& valid_places,
                const Place& preferred_place,
                const std::string& model_dir = FLAGS_model_dir,
-               bool gen_npu = false,
                bool save_model = false) {
   DeviceInfo::Init();
   DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads);
@@ -46,10 +45,6 @@ void TestModel(const std::vector<Place>& valid_places,
     data[i] = 1;
   }
 
-  if (gen_npu) {
-    predictor.GenNPURuntimeProgram();
-  }
-
   for (int i = 0; i < FLAGS_warmup; ++i) {
     predictor.Run();
   }
@@ -116,13 +111,11 @@ TEST(MobileNetV1, test_npu) {
   TestModel(valid_places,
             Place({TARGET(kARM), PRECISION(kFloat)}),
             FLAGS_model_dir,
-            true /* gen_npu */,
             true /* save_model*/);
   TestModel(valid_places,
             Place({TARGET(kARM), PRECISION(kFloat)}),
             FLAGS_optimized_model,
-            false /* gen_npu */,
             false /* save model */);
 }
 #endif  // LITE_WITH_NPU
...
@@ -31,7 +31,6 @@ namespace lite {
 void TestModel(const std::vector<Place>& valid_places,
                const Place& preferred_place,
                const std::string& model_dir = FLAGS_model_dir,
-               bool gen_npu = false,
                bool save_model = false) {
   DeviceInfo::Init();
   DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads);
@@ -47,10 +46,6 @@ void TestModel(const std::vector<Place>& valid_places,
     data[i] = 1;
   }
 
-  if (gen_npu) {
-    predictor.GenNPURuntimeProgram();
-  }
-
   for (int i = 0; i < FLAGS_warmup; ++i) {
     predictor.Run();
   }
@@ -116,13 +111,11 @@ TEST(MobileNetV2, test_npu) {
   TestModel(valid_places,
             Place({TARGET(kARM), PRECISION(kFloat)}),
             FLAGS_model_dir,
-            true /* gen_npu */,
             true /* save_model*/);
   TestModel(valid_places,
             Place({TARGET(kARM), PRECISION(kFloat)}),
             FLAGS_optimized_model,
-            false /* gen_npu */,
             false /* save model */);
 }
 #endif  // LITE_WITH_NPU
...
@@ -958,7 +958,7 @@ void DeviceInfo::RequestPowerRandLowMode(int shift_num, int thread_num) {
 int DeviceInfo::Setup() {
   core_num_ = get_cpu_num();
-  printf("core number: %d\n", core_num_);
+  LOG(INFO) << " CPU core number: " << core_num_;
   mem_size_ = get_mem_size();
   get_cpu_arch(&archs_, core_num_);
   // set defalut CPU info
...
@@ -37,127 +37,92 @@ namespace lite {
 namespace mir {
 namespace subgraph {
 
-void GenerateNPUProgramPass::SubgraphSortHelper(
-    Node* node,
-    const std::unordered_set<Node*>& nodes_all,
-    std::unordered_set<const Node*>* visited_nodes,
-    std::vector<Node*>* ret) {
-  for (auto& var_node : node->inlinks) {
-    if (var_node->inlinks.empty()) continue;
-    auto* op_node = var_node->inlinks.front();
-    if (nodes_all.count(op_node) && !visited_nodes->count(op_node)) {
-      SubgraphSortHelper(op_node, nodes_all, visited_nodes, ret);
-    }
-  }
-  ret->push_back(node);
-  visited_nodes->insert(node);
-}
+std::shared_ptr<ge::Operator> GenerateNPUProgramPass::CvtVarNode(
+    lite::mir::Node* var_node, const Scope* scope) {
+  CHECK(var_node->IsArg());
+  const auto& arg = var_node->AsArg();
+  VLOG(4) << "Convert var node " << arg.name;
+
+  auto* var = scope->FindVar(arg.name);
+  CHECK(var);
+  auto* tensor = var->GetMutable<lite::Tensor>();
+  CHECK(tensor);
+  auto dims = tensor->dims();
+  if (arg.is_weight) {
+    auto wgt = std::make_shared<ge::op::Const>(arg.name);
+    LOG(INFO) << "in convert const:" << arg.name;
+    VLOG(4) << dims;
+    wgt->set_attr_value(lite::npu::bridge::CvtFromLiteTensor(tensor));
+    return wgt;
+  } else {
+    CHECK_EQ(dims.size(), 4);
+    LOG(INFO) << "in convert data:" << arg.name;
+    LOG(INFO) << dims;
+    // TODO(xxx): support more types and dims size
+    ge::TensorDesc desc(ge::Shape(dims.Vectorize()),
+                        ge::Format::FORMAT_NCHW,
+                        ge::DataType::DT_FLOAT);
+    // auto size = desc.GetShape().GetShapeSize();
+    // ge::TensorUtils::SetSize(desc, size*sizeof(float));
+    // ge::TensorUtils::SetRealDimCnt(desc, 4);
+    auto data = std::make_shared<ge::op::Data>(arg.name);
+    data->update_input_desc_x(desc);
+    return data;
+  }
+  return nullptr;
+}
 
-void GenerateNPUProgramPass::CvtOpNodes(
+void GenerateNPUProgramPass::CvtAllOpNodes(
     const std::vector<Node*>& nodes2cvt,
-    lite::npu::bridge::node_map_type* cvted_vars) {
+    lite::npu::bridge::node_map_type* converted_vars) {
   const auto& bridges = lite::npu::bridge::Factory::Instance();
   const auto& cvtfunc_map = bridges.AllFunctions();
-  // record all converted vars
-  // op node's inputs must be found in cvted_vars
+  // return record all converted vars
+  // op node's inputs must be found in converted_vars
   for (auto& node : nodes2cvt) {
     lite::npu::bridge::node_map_type node_inputs;
     auto& stmt = node->AsStmt();
     for (auto& var_node : node->inlinks) {
       auto& arg = var_node->AsArg();
-      if (arg.is_weight) continue;
-      auto var_name = arg.name;
-      if (!cvted_vars->count(var_name)) {
-        cvted_vars->insert(std::make_pair(
-            var_name,
-            lite::npu::bridge::CvtNode(var_node, stmt.op()->scope())));
-      }
-      node_inputs.insert(*cvted_vars->find(var_name));
-    }
-    auto node_outputs = cvtfunc_map.at(stmt.op_type())(stmt.op(), node_inputs);
-    cvted_vars->insert(node_outputs.begin(), node_outputs.end());
-  }
-}
-
-void GenerateNPUProgramPass::GetIOVars(
-    const std::vector<Node*>& nodes2cvt,
-    const lite::npu::bridge::node_map_type& cvted_vars,
-    std::unordered_set<const Node*>* nodes2rm,
-    std::vector<Node*>* in_vars,
-    std::vector<Node*>* out_vars,
-    lite::npu::bridge::node_map_type* in_cvted_vars,
-    lite::npu::bridge::node_map_type* out_cvted_vars) {
-  std::unordered_set<Node*> op_nodes_all(nodes2cvt.begin(), nodes2cvt.end());
-  for (auto& op_node : nodes2cvt) {
-    for (auto& in_var : op_node->inlinks) {
-      if (in_var->AsArg().is_weight) continue;
-      auto* pre_op_node = in_var->inlinks.front();
-      if (op_nodes_all.count(pre_op_node)) {
-        nodes2rm->insert(in_var);
+      // weight should be handled in the converter, so skip here
+      if (arg.is_weight) {
         continue;
       }
-      in_vars->push_back(in_var);
-      auto arg_name = in_var->AsArg().name;
-      in_cvted_vars->insert(std::make_pair(arg_name, cvted_vars.at(arg_name)));
-    }
-    for (auto& out_var : op_node->outlinks) {
-      if (out_var->outlinks.empty()) {
-        nodes2rm->insert(out_var);
-        continue;
-      }
-      auto* next_op_node = out_var->outlinks.front();
-      if (op_nodes_all.count(next_op_node)) {
-        nodes2rm->insert(out_var);
-        continue;
+      auto var_name = arg.name;
+      if (!converted_vars->count(var_name)) {
+        converted_vars->insert(
+            std::make_pair(var_name, CvtVarNode(var_node, stmt.op()->scope())));
       }
-      out_vars->push_back(out_var);
-      auto arg_name = out_var->AsArg().name;
-      out_cvted_vars->insert(std::make_pair(arg_name, cvted_vars.at(arg_name)));
+      node_inputs.insert(*converted_vars->find(var_name));
     }
+    auto node_outputs = cvtfunc_map.at(stmt.op_type())(stmt.op(), node_inputs);
+    converted_vars->insert(node_outputs.begin(), node_outputs.end());
   }
-  nodes2rm->insert(nodes2cvt.begin(), nodes2cvt.end());
 }
 
-void GenerateNPUProgramPass::GenNPUGraphOpNode(
-    const std::unique_ptr<SSAGraph>& graph,
-    int sub_id,
-    const std::unordered_set<Node*>& nodes_all) {
-  std::unordered_set<const Node*> visited_nodes;
-  std::vector<Node*> ret;
-  for (auto& node : nodes_all) {
-    if (!node->IsStmt()) continue;
-    if (visited_nodes.count(node)) continue;
-    SubgraphSortHelper(node, nodes_all, &visited_nodes, &ret);
-  }
-
-  lite::npu::bridge::node_map_type cvted_vars;
-  CvtOpNodes(ret, &cvted_vars);
-
-  std::unordered_set<const Node*> nodes2rm;
-  std::vector<Node*> in_vars;
-  std::vector<Node*> out_vars;
-  lite::npu::bridge::node_map_type in_cvted_vars;
-  lite::npu::bridge::node_map_type out_cvted_vars;
-  GetIOVars(ret,
-            cvted_vars,
-            &nodes2rm,
-            &in_vars,
-            &out_vars,
-            &in_cvted_vars,
-            &out_cvted_vars);
-
-  std::vector<std::string> in_vars_name;
-  std::vector<std::string> out_vars_name;
+std::string GenerateNPUProgramPass::BuildNPUGraph(
+    const std::unordered_set<Node*>& op_nodes,
+    const std::unordered_set<Node*>& in_data_vars,
+    const std::unordered_set<Node*>& out_data_vars,
+    int sub_id) {
+  auto ordered_nodes = GetTopologicalOrder(op_nodes);
+  lite::npu::bridge::node_map_type converted_vars;
+  CvtAllOpNodes(ordered_nodes, &converted_vars);
+
+  std::vector<std::string> in_var_names;
+  std::vector<std::string> out_var_names;
   std::vector<ge::Operator> inputs;
   std::vector<ge::Operator> outputs;
-  for (auto i : in_cvted_vars) {
-    in_vars_name.push_back(i.first);
-    inputs.push_back(*i.second);
+  for (auto i : in_data_vars) {
+    auto argname = i->AsArg().name;
+    in_var_names.push_back(argname);
+    inputs.push_back(*converted_vars.at(argname));
   }
-  for (auto i : out_cvted_vars) {
-    out_vars_name.push_back(i.first);
-    outputs.push_back(*i.second);
+  for (auto i : out_data_vars) {
+    auto argname = i->AsArg().name;
+    out_var_names.push_back(argname);
+    outputs.push_back(*converted_vars.at(argname));
   }
 
   std::string model_name("hiai_npu_client_" + std::to_string(sub_id) + ".om");
@@ -165,27 +130,55 @@ void GenerateNPUProgramPass::GenNPUGraphOpNode(
     LOG(FATAL) << "Build NPU failed subgraph " << sub_id;
   }
   LOG(INFO) << "[NPU] Build NPU Client success subgraph " << sub_id;
+  return model_name;
+}
 
+cpp::OpDesc GenerateNPUProgramPass::GenGraphOpDesc(
+    const std::string& model_name,
+    const std::vector<std::string>& in_var_names,
+    const std::vector<std::string>& out_var_names) {
   cpp::OpDesc op_desc;
   op_desc.SetType("graph_op");
+  op_desc.SetInput("Inputs", in_var_names);
+  op_desc.SetOutput("Outputs", out_var_names);
+  op_desc.SetAttr("model_name", model_name);
+  return op_desc;
+}
+
+void GenerateNPUProgramPass::InsertNewNode(
+    const std::unique_ptr<SSAGraph>& graph,
+    const std::string& model_name,
+    Scope* scope,
+    const std::vector<Place>& valid_places,
+    std::unordered_set<Node*> in_data_vars,
+    std::unordered_set<Node*> in_wgt_vars,
+    std::unordered_set<Node*> out_data_vars,
+    std::unordered_set<Node*> out_unused_vars) {
   std::vector<std::string> in_var_names;
+  std::vector<std::string> out_var_names;
+  for (auto i : in_data_vars) {
+    in_var_names.push_back(i->AsArg().name);
+  }
+  for (auto i : out_data_vars) {
+    out_var_names.push_back(i->AsArg().name);
+  }
 
-  op_desc.SetInput("Inputs", in_vars_name);
-  op_desc.SetOutput("Outputs", out_vars_name);
-  op_desc.SetAttr("model_name", model_name);
-  auto graph_op = LiteOpRegistry::Global().Create("graph_op");
+  auto op_desc = GenGraphOpDesc(model_name, in_var_names, out_var_names);
 
-  auto any_op = ret.front()->AsStmt().op();
-  auto* scope = any_op->scope();
+  auto graph_op = LiteOpRegistry::Global().Create("graph_op");
   graph_op->Attach(op_desc, scope);
-  auto valid_places = any_op->valid_places();
   auto* new_op_node = graph->GraphCreateInstructNode(graph_op, valid_places);
 
-  for (auto& in_var : in_vars) {
+  for (auto& in_var : in_data_vars) {
+    IR_NODE_LINK_TO(in_var, new_op_node);
+  }
+  for (auto& in_var : in_wgt_vars) {
     IR_NODE_LINK_TO(in_var, new_op_node);
   }
-  for (auto& out_var : out_vars) {
+  for (auto& out_var : out_data_vars) {
+    IR_OP_VAR_LINK(new_op_node, out_var);
+  }
+  for (auto& out_var : out_unused_vars) {
     IR_OP_VAR_LINK(new_op_node, out_var);
   }
@@ -193,6 +186,34 @@ void GenerateNPUProgramPass::GenNPUGraphOpNode(
   auto& inst = new_op_node->AsStmt();
   inst.picked_kernel().SetContext(
       ContextScheduler::Global().NewContext(inst.picked_kernel().target()));
+}
+
+void GenerateNPUProgramPass::GenNPUGraphOpNode(
+    const std::unique_ptr<SSAGraph>& graph,
+    const std::unordered_set<Node*>& op_nodes,
+    int sub_id) {
+  std::unordered_set<Node*> in_data_vars;
+  std::unordered_set<Node*> in_wgt_vars;
+  std::unordered_set<Node*> out_data_vars;
+  std::unordered_set<Node*> out_unused_vars;
+  FindInputOutputVars(
+      op_nodes, &in_data_vars, &in_wgt_vars, &out_data_vars, &out_unused_vars);
+
+  auto nodes2rm = GetNode2rm(
+      op_nodes, {in_data_vars, in_wgt_vars, out_data_vars, out_unused_vars});
+
+  auto model_name =
+      BuildNPUGraph(op_nodes, in_data_vars, out_data_vars, sub_id);
+
+  auto any_op = (*op_nodes.begin())->AsStmt().op();
+  InsertNewNode(graph,
+                model_name,
+                any_op->scope(),
+                any_op->valid_places(),
+                in_data_vars,
+                in_wgt_vars,
+                out_data_vars,
+                out_unused_vars);
 
   GraphSafeRemoveNodes(graph.get(), nodes2rm);
 }
@@ -215,7 +236,7 @@ void GenerateNPUProgramPass::ConvertSubgraph(
   for (int id = 1; id <= sub_num; ++id) {
     LOG(INFO) << "Converting subgraph_id:" << id;
-    GenNPUGraphOpNode(graph, id, nodes_all.at(id));
+    GenNPUGraphOpNode(graph, nodes_all.at(id), id);
   }
 }
@@ -226,14 +247,21 @@ void GenerateNPUProgramPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
   const auto& op_map = bridges.AllFunctions();
   std::vector<std::string> supported_op_types;
   for (auto& i : op_map) {
-    LOG(INFO) << i.first;
+    LOG(INFO) << "Supported type: " << i.first;
     supported_op_types.push_back(i.first);
   }
 
+  try {
     int num_subgraph = FuseSubgraph(graph, supported_op_types);
     LOG(INFO) << "detected " << num_subgraph << " NPU subgraph";
 
     InferOnce(graph);
     ConvertSubgraph(graph, num_subgraph);
+  } catch (...) {
+    // exception = true;
+    LOG(WARNING) << "Build NPU graph failed";
+  }
 
   LOG(INFO) << "After NPU Pass \n" << Visualize(graph.get());
   for (auto& item : graph->StmtTopologicalOrder()) {
...
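Note: every fused subgraph is now lowered to a single `graph_op` whose descriptor comes from `GenGraphOpDesc` above. A hedged sketch of the descriptor it emits; the variable names here are made up:

```cpp
// Sketch: the graph_op wraps the whole NPU subgraph. "x" and "fc_out" are
// hypothetical boundary var names; the attr names the built HiAI .om blob.
cpp::OpDesc op_desc;
op_desc.SetType("graph_op");
op_desc.SetInput("Inputs", {"x"});
op_desc.SetOutput("Outputs", {"fc_out"});
op_desc.SetAttr("model_name", std::string("hiai_npu_client_1.om"));
```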
@@ -38,31 +38,35 @@ class GenerateNPUProgramPass : public SubgraphProgramPass {
   std::unique_ptr<RuntimeProgram> GenProgram();
 
  protected:
-  // sort nodes to operational sequence
-  void SubgraphSortHelper(Node* node,
-                          const std::unordered_set<Node*>& nodes_all,
-                          std::unordered_set<const Node*>* visited_nodes,
-                          std::vector<Node*>* ret);
-
   // nodes2cvt: op nodes to convert
-  // cvted_vars: converted var nodes
-  // nodes2rm: op nodes and var nodes that need to be removed
-  void CvtOpNodes(const std::vector<Node*>& nodes2cvt,
-                  lite::npu::bridge::node_map_type* cvted_vars);
+  // return cvted_vars: converted var nodes
+  void CvtAllOpNodes(const std::vector<Node*>& nodes2cvt,
+                     lite::npu::bridge::node_map_type* cvted_vars);
 
-  // achieve input and output vars/cvted_vars;
-  // achieve all nodes to remove
-  void GetIOVars(const std::vector<Node*>& nodes2cvt,
-                 const lite::npu::bridge::node_map_type& cvted_vars,
-                 std::unordered_set<const Node*>* nodes2rm,
-                 std::vector<Node*>* in_vars,
-                 std::vector<Node*>* out_vars,
-                 lite::npu::bridge::node_map_type* in_cvted_vars,
-                 lite::npu::bridge::node_map_type* out_cvted_vars);
+  std::shared_ptr<ge::Operator> CvtVarNode(lite::mir::Node* var_node,
+                                           const Scope* scope);
+
+  std::string BuildNPUGraph(const std::unordered_set<Node*>& op_nodes,
+                            const std::unordered_set<Node*>& in_data_vars,
+                            const std::unordered_set<Node*>& out_data_vars,
+                            int sub_id);
+
+  cpp::OpDesc GenGraphOpDesc(const std::string& model_name,
+                             const std::vector<std::string>& in_var_names,
+                             const std::vector<std::string>& out_var_names);
+
+  void InsertNewNode(const std::unique_ptr<SSAGraph>& graph,
+                     const std::string& model_name,
+                     Scope* scope,
+                     const std::vector<Place>& valid_places,
+                     std::unordered_set<Node*> in_data_vars,
+                     std::unordered_set<Node*> in_wgt_vars,
+                     std::unordered_set<Node*> out_data_vars,
+                     std::unordered_set<Node*> out_unused_vars);
 
   void GenNPUGraphOpNode(const std::unique_ptr<SSAGraph>& graph,
-                         int sub_id,
-                         const std::unordered_set<Node*>& nodes_all);
+                         const std::unordered_set<Node*>& nodes_all,
+                         int sub_id);
 
   void ConvertSubgraph(const std::unique_ptr<SSAGraph>& graph, int sub_num);
...
@@ -39,8 +39,11 @@ namespace lite {
 void TestModel(lite::Predictor* predictor,
                const std::vector<Place>& valid_places,
                const std::string& model_dir) {
-  predictor->Build(
-      model_dir, "", "", Place{TARGET(kARM), PRECISION(kFloat)}, valid_places);
+  predictor->Build(model_dir,
+                   model_dir + "/model",
+                   model_dir + "/params",
+                   Place{TARGET(kARM), PRECISION(kFloat)},
+                   valid_places);
 
   auto* input_tensor = predictor->GetInput(0);
   input_tensor->Resize(DDim(std::vector<DDim::value_type>(
@@ -51,13 +54,6 @@ void TestModel(lite::Predictor* predictor,
     data[i] = 1;
   }
 
-  if (std::find(valid_places.begin(),
-                valid_places.end(),
-                Place{TARGET(kNPU), PRECISION(kFloat)}) != valid_places.end()) {
-    // TODO(TJ): change if valid npu so try use it, add rollback and move to api
-    predictor->GenNPURuntimeProgram();
-  }
-
   predictor->Run();
 
   if (model_dir != FLAGS_optimized_model &&
       std::find(valid_places.begin(),
...
@@ -26,6 +26,105 @@ namespace lite {
 namespace mir {
 namespace subgraph {
 
+void SubgraphProgramPass::SortHelper(
+    Node* node,
+    const std::unordered_set<Node*>& nodes_all,
+    std::unordered_set<const Node*>* visited_nodes,
+    std::vector<Node*>* ret) {
+  for (auto& var_node : node->inlinks) {
+    if (var_node->inlinks.empty()) continue;
+    auto* op_node = var_node->inlinks.front();
+    if (nodes_all.count(op_node) && !visited_nodes->count(op_node)) {
+      SortHelper(op_node, nodes_all, visited_nodes, ret);
+    }
+  }
+  ret->push_back(node);
+  visited_nodes->insert(node);
+}
+
+std::vector<Node*> SubgraphProgramPass::GetTopologicalOrder(
+    const std::unordered_set<Node*>& nodes) {
+  std::unordered_set<const Node*> visited;
+  std::vector<Node*> ret;
+  for (auto& node : nodes) {
+    if (!node->IsStmt()) continue;
+    if (visited.count(node)) continue;
+    SortHelper(node, nodes, &visited, &ret);
+  }
+  return ret;
+}
+
+void SubgraphProgramPass::FindInputOutputVars(
+    const std::unordered_set<Node*>& op_nodes,
+    std::unordered_set<Node*>* in_data_vars,
+    std::unordered_set<Node*>* in_wgt_vars,
+    std::unordered_set<Node*>* out_data_vars,
+    std::unordered_set<Node*>* out_unused_vars) {
+  for (auto& op_node : op_nodes) {
+    for (auto& in_var : op_node->inlinks) {
+      if (in_var->AsArg().is_weight) {
+        in_wgt_vars->insert(in_var);
+        continue;
+      }
+      if (!in_var->inlinks.empty()) {
+        // var can only come from one op node, so use front
+        auto* pre_op_node = in_var->inlinks.front();
+        if (op_nodes.count(pre_op_node)) {
+          continue;
+        }
+      }
+      in_data_vars->insert(in_var);
+    }
+    for (auto& out_var : op_node->outlinks) {
+      if (out_var->outlinks.empty()) {
+        // the next op is empty so this var is actually unused
+        out_unused_vars->insert(out_var);
+        continue;
+      }
+      // var can have more than one next op node
+      // so, if any one in the op_nodes then continue
+      bool next_op_in_nodes = false;
+      for (auto& next_op_node : out_var->outlinks) {
+        if (op_nodes.count(next_op_node)) {
+          next_op_in_nodes = true;
+        }
+      }
+      if (next_op_in_nodes) {
+        continue;
+      }
+      out_data_vars->insert(out_var);
+    }
+  }
+}
+
+std::unordered_set<const Node*> SubgraphProgramPass::GetNode2rm(
+    const std::unordered_set<Node*>& op_nodes,
+    const std::vector<std::unordered_set<Node*>>& excluded_nodes) {
+  std::unordered_set<const Node*> nodes2rm(op_nodes.begin(), op_nodes.end());
+  for (auto& op_node : op_nodes) {
+    for (auto& in_var : op_node->inlinks) {
+      if (!nodes2rm.count(in_var)) {
+        nodes2rm.insert(in_var);
+      }
+    }
+    for (auto& out_var : op_node->outlinks) {
+      if (!nodes2rm.count(out_var)) {
+        nodes2rm.insert(out_var);
+      }
+    }
+  }
+  // some nodes should not be removed
+  for (auto& e : excluded_nodes) {
+    for (auto& i : e) {
+      if (nodes2rm.count(i)) {
+        nodes2rm.erase(i);
+      }
+    }
+  }
+  return nodes2rm;
+}
+
 void SubgraphProgramPass::InferOnce(const std::unique_ptr<SSAGraph>& graph) {
   for (auto& item : graph->StmtTopologicalOrder()) {
     if (!item->IsStmt()) continue;
@@ -127,13 +226,10 @@ int SubgraphProgramPass::FuseSubgraphID(
       for (auto& j : i->outlinks) {
         if (j->IsStmt()) {
           auto& jstmt = j->AsStmt();
-          // LOG(INFO) << "initial: "<<jstmt.op_type()<<"
-          // :"<<jstmt.subgraph_id();
           if (jstmt.subgraph_id() == 0) inputvar = 1;
         }
       }
     }
-    // LOG(INFO) << "initial: "<<stmt.op_type()<<" :"<<stmt.subgraph_id();
     if (inputvar == 1) {
       for (auto& i : item->outlinks) i_nodes_[sub_id].insert(i);
     }
...
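Note: the four helpers added above are designed to be composed by child passes. Roughly how the NPU pass strings them together for one fused subgraph (a sketch mirroring `GenNPUGraphOpNode`; `op_nodes` is assumed to hold the stmt nodes carrying this subgraph's id):

```cpp
// Sketch: composing the new SubgraphProgramPass helpers for one subgraph.
std::unordered_set<Node*> in_data, in_wgt, out_data, out_unused;
FindInputOutputVars(op_nodes, &in_data, &in_wgt, &out_data, &out_unused);

// Ops of the subgraph in executable (topological) order.
auto ordered_ops = GetTopologicalOrder(op_nodes);

// Everything except the boundary vars can be deleted once the fused
// graph_op node replaces the subgraph.
auto nodes2rm = GetNode2rm(op_nodes, {in_data, in_wgt, out_data, out_unused});
```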
@@ -54,7 +54,32 @@ class SubgraphProgramPass : public ProgramPass {
   // std::unique_ptr<SSAGraph>& graph, int sub_num);
   void ChangeAllOutConnectedID(Node* node, int to_id, int from_id = 0);
 
-  // Below function cloud be useful in child classes
+  //
+  // Sort and return the topology order of nodes set
+  std::vector<Node*> GetTopologicalOrder(
+      const std::unordered_set<Node*>& nodes);
+
+  // find all input data vars, input weight vars,
+  // output data vars and output vars from the nodes
+  void FindInputOutputVars(const std::unordered_set<Node*>& op_nodes,
+                           std::unordered_set<Node*>* in_data_vars,
+                           std::unordered_set<Node*>* in_wgt_vars,
+                           std::unordered_set<Node*>* out_data_vars,
+                           std::unordered_set<Node*>* out_unused_vars);
+
+  // return the node to remove in the subgraph
+  std::unordered_set<const Node*> GetNode2rm(
+      const std::unordered_set<Node*>& op_nodes,
+      const std::vector<std::unordered_set<Node*>>& excluded_nodes);
 
  private:
+  // sort nodes to operational sequence
+  void SortHelper(Node* node,
+                  const std::unordered_set<Node*>& nodes_all,
+                  std::unordered_set<const Node*>* visited_nodes,
+                  std::vector<Node*>* ret);
+
   // {1: {nodes2rm_in_subgraph1, ...},
   //  2: {nodes2rm_in_subgraph2, ...}}
   // delete nodes
...
@@ -109,6 +109,28 @@ class Optimizer {
 
   // Generate a new program based on the mir graph.
   std::unique_ptr<RuntimeProgram> GenRuntimeProgram() {
+#ifdef LITE_WITH_NPU
+    if (std::find(valid_places_.begin(),
+                  valid_places_.end(),
+                  Place{TARGET(kNPU), PRECISION(kFloat)}) !=
+        valid_places_.end()) {
+      CheckInputDimsNotEmpty(exec_scope_);
+      auto pass = mir::PassManager::Global()
+                      .LookUp<mir::subgraph::GenerateNPUProgramPass>(
+                          "generate_npu_program_pass");
+      pass->Apply(graph_);
+      auto program = pass->GenProgram();
+      if (program) {
+        CHECK(exec_scope_);
+        program->set_exec_scope(exec_scope_);
+        return program;
+      } else {
+        LOG(WARNING) << "Build NPU graph failed.";
+      }
+    }
+#endif
     auto pass = mir::PassManager::Global().LookUp<mir::GenerateProgramPass>(
         "generate_program_pass");
     pass->Apply(graph_);
@@ -131,24 +153,6 @@ class Optimizer {
     }
   }
 
-  std::unique_ptr<RuntimeProgram> GenNPURuntimeProgram() {
-#ifdef LITE_WITH_NPU
-    CheckInputDimsNotEmpty(exec_scope_);
-    auto pass = mir::PassManager::Global()
-                    .LookUp<mir::subgraph::GenerateNPUProgramPass>(
-                        "generate_npu_program_pass");
-    pass->Apply(graph_);
-    auto program = pass->GenProgram();
-    CHECK(exec_scope_);
-    program->set_exec_scope(exec_scope_);
-    return program;
-#else
-    LOG(WARNING) << "Not compiled with NPU but use it!";
-    return GenRuntimeProgram();
-#endif
-  }
-
   void InitTargetTypeTransformPass() {
     auto* pass =
         mir::PassManager::Global().LookUp<mir::TypeTargetTransformPass>(
...
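Note: folding the NPU branch into `GenRuntimeProgram()` collapses the caller side to the plain flow, with an automatic fallback when the NPU build fails. A hedged sketch of a caller after this change; the `Build` arguments are illustrative:

```cpp
// Sketch: no NPU-specific API remains. If kNPU appears in valid_places the
// optimizer tries generate_npu_program_pass first and falls back to the
// regular generate_program_pass on failure.
lite::Predictor predictor;
predictor.Build(FLAGS_model_dir,
                "" /* model_file, illustrative */,
                "" /* param_file, illustrative */,
                Place{TARGET(kARM), PRECISION(kFloat)},
                valid_places);
predictor.Run();  // lazily invokes GenRuntimeProgram() on first use
```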
@@ -247,6 +247,7 @@ void LoadModelPb(const std::string &model_dir,
   }
 
 #ifdef LITE_WITH_NPU
+  auto main_block = pb_proto_prog.blocks(0);
   for (auto &op : main_block.ops()) {
     LOG(INFO) << "op type:" << op.type();
     if (op.type() != "graph_op") {
...
@@ -113,48 +113,6 @@ ge::TensorPtr CvtFromLiteTensor(lite::Tensor* in_tensor,
   return out_tensor;
 }
 
-std::shared_ptr<ge::Operator> CvtNode(lite::mir::Node* var_node,
-                                      const Scope* scope) {
-  CHECK(var_node->IsArg());
-  const auto& arg = var_node->AsArg();
-  VLOG(4) << "Convert var node " << arg.name;
-
-  auto* var = scope->FindVar(arg.name);
-  CHECK(var);
-  auto* tensor = var->GetMutable<lite::Tensor>();
-  CHECK(tensor);
-  auto dims = tensor->dims();
-  if (arg.is_weight) {
-    auto wgt = std::make_shared<ge::op::Const>(arg.name);
-    LOG(INFO) << "in convert const:" << arg.name;
-    LOG(INFO) << dims;
-    wgt->set_attr_value(CvtFromLiteTensor(tensor));
-    auto odesc = wgt->GetOutputDesc(0);
-    LOG(INFO) << "const ----";
-    for (auto i : odesc.GetShape().GetDims()) {
-      LOG(INFO) << ";;;;;;;;;------: " << i;
-    }
-    return wgt;
-  } else {
-    CHECK_EQ(dims.size(), 4);
-    LOG(INFO) << "in convert data:" << arg.name;
-    LOG(INFO) << dims;
-    // TODO(TJ): support more types and dims size
-    ge::TensorDesc desc(ge::Shape(dims.Vectorize()),
-                        ge::Format::FORMAT_NCHW,
-                        ge::DataType::DT_FLOAT);
-    // auto size = desc.GetShape().GetShapeSize();
-    // ge::TensorUtils::SetSize(desc, size*sizeof(float));
-    // ge::TensorUtils::SetRealDimCnt(desc, 4);
-    auto data = std::make_shared<ge::op::Data>(arg.name);
-    data->update_input_desc_x(desc);
-    return data;
-  }
-  return nullptr;
-}
-
 bool HasInputArg(const OpInfo* op_info,
                  const Scope* scope,
                  const std::string& argname) {
...
@@ -84,11 +84,6 @@ ge::TensorPtr CreateTensorAndFillData(T value,
   return CreateTensorAndFillData(data, shape, format);
 }
 
-std::shared_ptr<ge::Operator> CvtNode2Tensor(const lite::mir::Node* arg_node);
-
-std::shared_ptr<ge::Operator> CvtNode(lite::mir::Node* var_node,
-                                      const Scope* scope);
-
 bool HasInputArg(const OpInfo* op_info,
                  const Scope* scope,
                  const std::string& argname);
...
@@ -62,6 +62,7 @@ function cmake_npu {
         -DWITH_LITE=ON \
         -DLITE_WITH_CUDA=OFF \
         -DLITE_WITH_X86=OFF \
+        -DLITE_BUILD_EXTRA=ON \
        -DLITE_WITH_ARM=ON \
        -DWITH_ARM_DOTPROD=ON \
        -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \
...