diff --git a/src/framework/operator.h b/src/framework/operator.h index 2de3a9536702ee17daabae969d81e979fc66b0c8..cb27985244a1dd9e92a54edce9f15fd3d8defaad 100644 --- a/src/framework/operator.h +++ b/src/framework/operator.h @@ -151,7 +151,7 @@ class FusionOpMatcher : PaddleMobileObject { virtual Node &BeginNode() { return node_; } - std::string BeginType() { return node_.BeginType(); } + std::string BeginType() { return node_.Type(); } protected: Node node_; diff --git a/src/framework/program/block_desc.cpp b/src/framework/program/block_desc.cpp index 7342abe284406a66b654639a171e23f9d6d680d7..21322f0825636a321b022220e535cad0e4b8cf41 100644 --- a/src/framework/program/block_desc.cpp +++ b/src/framework/program/block_desc.cpp @@ -25,13 +25,7 @@ std::vector> BlockDesc::Vars() const { return res; } -std::vector> BlockDesc::Ops() const { - std::vector> res; - for (const auto &op : ops_) { - res.push_back(op); - } - return res; -} +std::vector> BlockDesc::Ops() const { return ops_; } BlockDesc::BlockDesc(PaddleMobile__Framework__Proto__BlockDesc *desc) : index_(desc->idx), parent_index_(desc->idx) { diff --git a/src/framework/program/block_desc.h b/src/framework/program/block_desc.h index 1a22714b52a9454a561db1d6ae0cd1f4c215633a..84d7a90fc11ddf360eacb01be9456ced4a30dad8 100644 --- a/src/framework/program/block_desc.h +++ b/src/framework/program/block_desc.h @@ -26,6 +26,7 @@ class BlockDesc : PaddleMobileObject { public: friend class Node; friend class ProgramOptimize; + BlockDesc() {} BlockDesc(PaddleMobile__Framework__Proto__BlockDesc *desc); BlockDesc(const BlockDesc &block_desc) : index_(block_desc.index_), parent_index_(block_desc.parent_index_) { @@ -43,6 +44,8 @@ class BlockDesc : PaddleMobileObject { const int &ID() const { return index_; } + const bool &MultiThread() const { return multi_thread_; } + const int &Parent() const { return parent_index_; } bool operator==(const paddle_mobile::framework::BlockDesc &in_block) const { @@ -58,6 +61,7 @@ class BlockDesc : PaddleMobileObject { private: int index_; + bool multi_thread_; int parent_index_; std::vector> ops_; std::unordered_map> vars_; diff --git a/src/framework/program/program-optimize/node.cpp b/src/framework/program/program-optimize/node.cpp index 820fa6a443c62c4cfdb38f4d42e6d7805371c2d3..31377222db86bb47ac09efd989fa3a512ca08a8e 100644 --- a/src/framework/program/program-optimize/node.cpp +++ b/src/framework/program/program-optimize/node.cpp @@ -45,17 +45,6 @@ bool Node::operator==(const Node &in) { return true; } -// std::shared_ptr Node::MatchTheFirstNode(std::string type){ -// -// for (const auto &node : outputs_){ -// if (node->type_ == type){ -// return node; -// }else{ -// -// } -// } -//} - std::vector> Node::OpDescs(uint size) { std::vector> op_descs; OpDescs(size - 1, &op_descs); @@ -75,21 +64,40 @@ void Node::OpDescs(uint index, void Node::OpDescs(std::vector> *op_desc, Node *node, bool adding_thread, int thread_num) { - bool can_add_split = false; if (outputs_.size() > 1) { + adding_thread = false; + } + + bool can_add_split = false; + // 如果当前节点有多个输出 并且 只有当前节点对应的 op_desc_ 输出数为 1 时支持 + if (outputs_.size() > 1 && + op_input_output_key[op_desc_->type_].second.size() == 1) { can_add_split = true; - if (op_input_output_key[op_desc_->type_].second.size() != 1) { - DLOG << "当前 op desc 输出数不为 1 "; - can_add_split = false; - } + + // 遍历当前节点的 output 节点 for (const auto &output : outputs_) { - if (op_input_output_key.find(output->op_desc_->type_) != - op_input_output_key.end()) { - auto inputs_and_outputs = op_input_output_key[output->op_desc_->type_]; - auto outputs_of_output = - output->op_desc_->Output(inputs_and_outputs.second[0]); - auto inputs_of_output = - output->op_desc_->Input(inputs_and_outputs.first[0]); + // 不支持 output 有多个 output 的情况 + if (output->outputs_.size() > 0) { + can_add_split = false; + break; + } + + //与节点关联的 OpDesc + std::shared_ptr &op_desc = output->op_desc_; + + //获取这个 op 的 inputs key 和 outputs key + auto inputs_and_outputs = op_input_output_key[op_desc->type_]; + + //判断现在 是否存在这个 op + //判断这个 output 和 input key 的 size 等于 1 + if (op_input_output_key.find(op_desc->type_) != + op_input_output_key.end() && + inputs_and_outputs.first.size() == 1 && + inputs_and_outputs.second.size() == 1) { + auto inputs_of_output = op_desc->Input(inputs_and_outputs.first[0]); + auto outputs_of_output = op_desc->Output(inputs_and_outputs.second[0]); + + // 判断一下, 如果输入和输出没有同名, 是支持的 for (int i = 0; i < inputs_of_output.size(); ++i) { std::string input_of_output = inputs_of_output[i]; for (int j = 0; j < outputs_of_output.size(); ++j) { @@ -101,7 +109,7 @@ void Node::OpDescs(std::vector> *op_desc, } } } - } else { + } else { // 如果模型中包含没有的 op, 则不支持添加 split DLOG << "找不到 这个 op 类型: " << output->op_desc_->type_; can_add_split = false; } @@ -124,12 +132,10 @@ void Node::OpDescs(std::vector> *op_desc, if (can_add_split) { adding_thread = true; - std::shared_ptr split_op_desc = - std::make_shared(); + std::shared_ptr split_op_desc = std::make_shared(); split_op_desc->type_ = G_OP_TYPE_SPLIT; auto outputs = this->op_desc_->Output( op_input_output_key[this->op_desc_->Type()].second[0]); - split_op_desc->inputs_ = { {op_input_output_key[G_OP_TYPE_SPLIT].first[0], outputs}}; auto &split_outputs = @@ -157,41 +163,12 @@ std::vector> Node::OpDescs() { return op_descs; } -std::string Node::ToString(std::string blank, const Node *node) const { - std::stringstream ss; - ss << type_ << "-> \n"; - - if (inputs_.size() > 1 && node != inputs_.back()) { - return ss.str(); - } else if (inputs_.size() > 1 && node == inputs_.back()) { - ss << "\n" << blank << type_ << "\n"; - } - - for (int i = 0; i < outputs_.size(); ++i) { - ss << blank << outputs_[i]->ToString(blank + " ", this) << ""; - } - return ss.str(); -} - -std::string Node::ToString() const { return this->ToString(" ", this); } - std::shared_ptr Node::To(int size) { std::shared_ptr node = std::make_shared(); this->To(size - 1, node); return node; } -// Node &Node::To(int size) { -// if (size == 1) { -// this->outputs_.clear(); -// } -// -// for (int j = 0; j < this->outputs_.size(); ++j) { -// outputs_[j]->To(size - 1); -// } -// return *this; -//} - void Node::To(int index, std::shared_ptr node) { node->type_ = this->type_; if (index != 0) { @@ -268,6 +245,24 @@ void Node::Folder( } } +std::string Node::ToString(std::string blank, const Node *node) const { + std::stringstream ss; + ss << type_ << "-> \n"; + + if (inputs_.size() > 1 && node != inputs_.back()) { + return ss.str(); + } else if (inputs_.size() > 1 && node == inputs_.back()) { + ss << "\n" << blank << type_ << "\n"; + } + + for (int i = 0; i < outputs_.size(); ++i) { + ss << blank << outputs_[i]->ToString(blank + " ", this) << ""; + } + return ss.str(); +} + +std::string Node::ToString() const { return this->ToString(" ", this); } + void Node::Description() { if (op_desc_.get()) { DLOG << *op_desc_; diff --git a/src/framework/program/program-optimize/node.h b/src/framework/program/program-optimize/node.h index 5dd1a3acbf5e662901bf7591de5f12cc7f47ef76..da7e26a9ac06abecaa0fb34fb57b5d24f1eced7e 100644 --- a/src/framework/program/program-optimize/node.h +++ b/src/framework/program/program-optimize/node.h @@ -27,6 +27,8 @@ namespace paddle_mobile { namespace framework { class Node : PaddleMobileObject { + friend class ProgramOptimize; + public: Node() {} explicit Node(const std::string &type) : type_(type) {} @@ -42,8 +44,8 @@ class Node : PaddleMobileObject { std::map> change_map); std::vector> OpDescs(uint size); std::vector> OpDescs(); - std::shared_ptr OpDesc() { return op_desc_; } - std::string BeginType() { return type_; } + std::shared_ptr OpDescOfNode() { return op_desc_; } + std::string Type() { return type_; } void Description(); private: diff --git a/src/framework/program/program-optimize/program_optimize.cpp b/src/framework/program/program-optimize/program_optimize.cpp index 737fed9bd56bdec92774ba364e035ba581258e57..8b0bf295262e51b07134d9b9f010c56fea62cce9 100644 --- a/src/framework/program/program-optimize/program_optimize.cpp +++ b/src/framework/program/program-optimize/program_optimize.cpp @@ -19,11 +19,12 @@ namespace paddle_mobile { namespace framework { -// std::shared_ptr ProgramOptimize::Optimize() {} - std::shared_ptr ProgramOptimize::FushionOptimize( - std::shared_ptr ori_des) { - ProgramDesc *optimize_program = new ProgramDesc(*ori_des); + std::shared_ptr ori_des, bool add_split) { + // ProgramDesc *optimize_program = new ProgramDesc(*ori_des); + std::shared_ptr optimize_program = + std::make_shared(*ori_des); + current_block_ = optimize_program->Blocks().size(); for (int i = 0; i < optimize_program->Blocks().size(); ++i) { std::unordered_map> output_nodes; @@ -96,10 +97,145 @@ std::shared_ptr ProgramOptimize::FushionOptimize( } // DLOG << "node: \n" << *begin_node; - block->ops_ = begin_node->OpDescs(); + + std::vector> op_descs; + GenerateOps(&op_descs, begin_node.get()); + block->ops_ = op_descs; + } + + for (int m = 0; m < new_blocks_.size(); ++m) { + std::shared_ptr new_block = new_blocks_[m]; + new_block->index_ = m + ori_des->blocks_.size(); + optimize_program->blocks_.push_back(new_block); + } + return optimize_program; +} + +void ProgramOptimize::GenerateOps( + std::vector> *op_desc, Node *input_node, + Node *current_node, bool adding_thread, int thread_num, + std::shared_ptr new_block) { + if (current_node->outputs_.size() > 1) { + adding_thread = false; + } + + bool can_add_split = false; + // 如果当前节点有多个输出 并且 只有当前节点对应的 op_desc_ 输出数为 1 时支持 + if (current_node->outputs_.size() > 1 && + op_input_output_key[current_node->op_desc_->type_].second.size() == 1) { + can_add_split = true; + + // 遍历当前节点的 output 节点 + for (const auto &output : current_node->outputs_) { + // 不支持 output 有多个 output 的情况 + if (output->outputs_.size() > 1) { + DLOG << "don't support multi output of output"; + can_add_split = false; + break; + } + + //与节点关联的 OpDesc + std::shared_ptr &op_desc = output->op_desc_; + + //获取这个 op 的 inputs key 和 outputs key + auto inputs_and_outputs = op_input_output_key[op_desc->type_]; + + //判断现在 是否存在这个 op + //判断这个 output 和 input key 的 size 等于 1 + if (op_input_output_key.find(op_desc->type_) != + op_input_output_key.end() && + inputs_and_outputs.first.size() == 1 && + inputs_and_outputs.second.size() == 1) { + auto inputs_of_output = op_desc->Input(inputs_and_outputs.first[0]); + auto outputs_of_output = op_desc->Output(inputs_and_outputs.second[0]); + + // 判断一下, 如果输入和输出没有同名, 是支持的 + for (int i = 0; i < inputs_of_output.size(); ++i) { + std::string input_of_output = inputs_of_output[i]; + for (int j = 0; j < outputs_of_output.size(); ++j) { + std::string output_of_output = outputs_of_output[j]; + if (input_of_output == output_of_output) { + DLOG << "output的 output 包含 input" << input_of_output; + can_add_split = false; + break; + } + } + } + } else { // 如果模型中包含没有的 op, 则不支持添加 split + DLOG << "找不到 这个 op 类型: " << output->op_desc_->type_; + can_add_split = false; + } + } + } + + if (current_node->inputs_.size() > 1 && + input_node != current_node->inputs_.back()) { + return; + } else if (current_node->inputs_.size() > 1 && + input_node == current_node->inputs_.back()) { + new_block.reset(); + adding_thread = false; + op_desc->push_back(current_node->op_desc_); + } else { + if (new_block.get() && adding_thread) { + new_block->ops_.push_back(current_node->op_desc_); + } else { + op_desc->push_back(current_node->op_desc_); + } + } + if (adding_thread) { + Attribute attr; + attr.Set(thread_num); + current_node->op_desc_->attrs_["thread"] = attr; + } + + if (can_add_split) { + new_block = std::make_shared(); + new_block->multi_thread_ = true; + new_block->index_ = current_block_; + new_blocks_.push_back(new_block); + + adding_thread = true; + std::shared_ptr split_op_desc = std::make_shared(); + split_op_desc->type_ = G_OP_TYPE_SPLIT; + auto outputs = current_node->op_desc_->Output( + op_input_output_key[current_node->op_desc_->Type()].second[0]); + split_op_desc->inputs_ = { + {op_input_output_key[G_OP_TYPE_SPLIT].first[0], outputs}}; + auto &split_outputs = + split_op_desc->outputs_[op_input_output_key[G_OP_TYPE_SPLIT].second[0]]; + for (const auto &output : current_node->outputs_) { + split_outputs.push_back(outputs[0]); + } + + Attribute attr; + attr.Set(current_block_); + split_op_desc->attrs_["block_id"] = attr; + + op_desc->push_back(split_op_desc); + current_block_++; + } + + for (int i = 0; i < current_node->outputs_.size(); ++i) { + auto &output = current_node->outputs_[i]; + if (can_add_split) { + GenerateOps(op_desc, current_node, output.get(), adding_thread, i, + new_block); + } else { + GenerateOps(op_desc, current_node, output.get(), adding_thread, + thread_num, new_block); + } } - std::shared_ptr shared_optimzie(optimize_program); - return shared_optimzie; } + +void ProgramOptimize::GenerateOps( + std::vector> *op_descs, + Node *begin_node) { + // std::vector> *op_desc, + // Node *input_node, Node *current_node, bool adding_thread, int + // thread_num + this->GenerateOps(op_descs, begin_node, begin_node, false, -1, nullptr); +} + } // namespace framework } // namespace paddle_mobile diff --git a/src/framework/program/program-optimize/program_optimize.h b/src/framework/program/program-optimize/program_optimize.h index 3839fa1e36ba0bbe580dac05af2c7ba6185f9b6c..32d8d1fa9141f14e58494e7c83f032f0f09ad8e2 100644 --- a/src/framework/program/program-optimize/program_optimize.h +++ b/src/framework/program/program-optimize/program_optimize.h @@ -28,12 +28,17 @@ class ProgramOptimize { public: ProgramOptimize() {} std::shared_ptr FushionOptimize( - std::shared_ptr ori_des); + std::shared_ptr ori_des, bool add_split = false); private: - // std::shared_ptr ori_desc_; - std::vector>> - outputs_nodes_; + int current_block_; + std::vector> new_blocks_; + + void GenerateOps(std::vector> *op_descs, + Node *begin_node); + void GenerateOps(std::vector> *op_desc, + Node *input_node, Node *current_node, bool adding_thread, + int thread_num, std::shared_ptr new_block); }; } // namespace framework } // namespace paddle_mobile diff --git a/src/framework/program/program_desc.cpp b/src/framework/program/program_desc.cpp index 071f5cf5719853d60e0345cb1c3ceff3761b8401..8483e1e5d68eaf27c84ca62fee1a571dca42a65a 100644 --- a/src/framework/program/program_desc.cpp +++ b/src/framework/program/program_desc.cpp @@ -32,11 +32,13 @@ void ProgramDesc::Description(std::string header) { if (header.size()) { LOG(kLOG_INFO) << header; } - for (const auto &block : this->blocks_) { + + for (int i = 0; i < this->blocks_.size(); ++i) { + auto block = this->blocks_[i]; LOG(kLOG_DEBUG) << "block: " << block->ID(); LOG(kLOG_INFO) << "block ops size: " << block->Ops().size(); for (int j = 0; j < block->Ops().size(); ++j) { - const auto &op = block->Ops()[j]; + auto op = block->Ops()[j]; LOG(kLOG_DEBUG1) << "op: " << op->Type(); for (auto &input : op->GetInputs()) { LOG(kLOG_DEBUG2) << "input parameter: " << input.first; @@ -71,6 +73,9 @@ void ProgramDesc::Description(std::string header) { } } } + + for (const auto &block : this->blocks_) { + } #endif } diff --git a/src/operators/kernel/arm/depthwise_conv_kernel.cpp b/src/operators/kernel/arm/depthwise_conv_kernel.cpp index 73aa9953cfcbc8efe0ed9d3bf094455cfbb4fe6c..1da52fa8d469bd81d043843d7bcca3a7b01f6663 100644 --- a/src/operators/kernel/arm/depthwise_conv_kernel.cpp +++ b/src/operators/kernel/arm/depthwise_conv_kernel.cpp @@ -32,7 +32,7 @@ void DepthwiseConvKernel::Compute(const ConvParam ¶m) const { std::vector paddings = param.Paddings(); std::vector dilations = param.Dilations(); - DLOG << " compute end get Attrs " << strides[0]; + // DLOG << " compute end get Attrs " << strides[0]; const int batch_size = static_cast(input->dims()[0]); @@ -59,17 +59,17 @@ void DepthwiseConvKernel::Compute(const ConvParam ¶m) const { col_matrix.ShareDataWith(col); col_matrix.Resize(col_matrix_shape); } - DLOG << " col_shape = " << col_shape; - DLOG << " col_matrix_shape = " << col_matrix_shape; + // DLOG << " col_shape = " << col_shape; + // DLOG << " col_matrix_shape = " << col_matrix_shape; framework::DDim input_shape = framework::slice_ddim( input->dims(), 1, static_cast(input->dims().size())); - DLOG << " input_shape = " << input_shape; + // DLOG << " input_shape = " << input_shape; framework::DDim filter_matrix_shape = {filter.dims()[0], filter.numel() / filter.dims()[0]}; filter.Resize(filter_matrix_shape); - DLOG << " filter.dims() = " << filter.dims(); + // DLOG << " filter.dims() = " << filter.dims(); framework::DDim output_matrix_shape = { output->dims()[1], @@ -85,8 +85,8 @@ void DepthwiseConvKernel::Compute(const ConvParam ¶m) const { for (int i = 0; i < batch_size; i++) { Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); - DLOG << " in_batch.dims() = " << in_batch.dims(); - DLOG << " out_batch.dims() = " << out_batch.dims(); + // DLOG << " in_batch.dims() = " << in_batch.dims(); + // DLOG << " out_batch.dims() = " << out_batch.dims(); for (int g = 0; g < groups; g++) { Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); @@ -109,9 +109,9 @@ void DepthwiseConvKernel::Compute(const ConvParam ¶m) const { // gemm Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - DLOG << " out_slice " << out_slice.dims(); - DLOG << " filter_slice " << filter_slice.dims(); - DLOG << " col_matrix " << col_matrix.dims(); + // DLOG << " out_slice " << out_slice.dims(); + // DLOG << " filter_slice " << filter_slice.dims(); + // DLOG << " col_matrix " << col_matrix.dims(); math::matmul(filter_slice, false, col_matrix, false, static_cast(1), &out_slice, static_cast(0));