PaddlePaddle / Paddle-Lite
Commit 5a9fe9f4
Authored May 31, 2018 by wangliu
Merge remote-tracking branch 'upstream/develop' into develop
Parents: 0d53906f 8a6f5a88
Showing 9 changed files with 230 additions and 89 deletions (+230 -89)
src/framework/operator.h                                      +1   -1
src/framework/program/block_desc.cpp                          +1   -7
src/framework/program/block_desc.h                            +4   -0
src/framework/program/program-optimize/node.cpp               +51  -56
src/framework/program/program-optimize/node.h                 +4   -2
src/framework/program/program-optimize/program_optimize.cpp   +143 -7
src/framework/program/program-optimize/program_optimize.h     +9   -4
src/framework/program/program_desc.cpp                        +7   -2
src/operators/kernel/arm/depthwise_conv_kernel.cpp            +10  -10
src/framework/operator.h

@@ -151,7 +151,7 @@ class FusionOpMatcher : PaddleMobileObject {
   virtual Node &BeginNode() { return node_; }
 
-  std::string BeginType() { return node_.BeginType(); }
+  std::string BeginType() { return node_.Type(); }
 
  protected:
   Node node_;
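The change keeps FusionOpMatcher::BeginType() in step with the Node API rename that lands in node.h below (Node::BeginType() becomes Node::Type()). For context on why a matcher exposes the type of its first node at all: fusion matchers are typically looked up by the type of the op that starts the pattern they fuse. The sketch below illustrates that idiom only; FakeMatcher and the registry map are hypothetical, not Paddle-Lite code.

#include <map>
#include <string>

// Hypothetical stand-in for FusionOpMatcher: matchers are commonly registered
// under the type of the first op of the pattern they fuse.
struct FakeMatcher {
  std::string begin_type;
  std::string BeginType() const { return begin_type; }
};

int main() {
  std::map<std::string, FakeMatcher> registry;
  FakeMatcher conv_add{"conv2d"};
  registry[conv_add.BeginType()] = conv_add;  // keyed by the pattern's first op
  return registry.count("conv2d") == 1 ? 0 : 1;
}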
src/framework/program/block_desc.cpp

@@ -25,13 +25,7 @@ std::vector<std::shared_ptr<VarDesc>> BlockDesc::Vars() const {
   return res;
 }
 
-std::vector<std::shared_ptr<OpDesc>> BlockDesc::Ops() const {
-  std::vector<std::shared_ptr<OpDesc>> res;
-  for (const auto &op : ops_) {
-    res.push_back(op);
-  }
-  return res;
-}
+std::vector<std::shared_ptr<OpDesc>> BlockDesc::Ops() const { return ops_; }
 
 BlockDesc::BlockDesc(PaddleMobile__Framework__Proto__BlockDesc *desc)
     : index_(desc->idx), parent_index_(desc->idx) {
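The rewritten Ops() is behavior-preserving: returning the ops_ member by value already copies the vector of shared_ptrs, which is exactly what the removed push_back loop built by hand. A minimal sketch of that equivalence, using hypothetical names (Holder, items_) rather than the real BlockDesc:

#include <memory>
#include <vector>

// Hypothetical sketch: returning the member by value copies the vector of
// shared_ptrs, so the element-by-element push_back loop was redundant.
struct Holder {
  std::vector<std::shared_ptr<int>> items_;
  std::vector<std::shared_ptr<int>> ItemsCopyLoop() const {
    std::vector<std::shared_ptr<int>> res;
    for (const auto &item : items_) res.push_back(item);
    return res;
  }
  std::vector<std::shared_ptr<int>> Items() const { return items_; }  // same result
};

int main() {
  Holder h;
  h.items_.push_back(std::make_shared<int>(7));
  auto a = h.ItemsCopyLoop();
  auto b = h.Items();
  return (a[0] == b[0]) ? 0 : 1;  // both copies share ownership of the element
}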
src/framework/program/block_desc.h

@@ -26,6 +26,7 @@ class BlockDesc : PaddleMobileObject {
  public:
   friend class Node;
+  friend class ProgramOptimize;
   BlockDesc() {}
   BlockDesc(PaddleMobile__Framework__Proto__BlockDesc *desc);
   BlockDesc(const BlockDesc &block_desc)
       : index_(block_desc.index_), parent_index_(block_desc.parent_index_) {
@@ -43,6 +44,8 @@ class BlockDesc : PaddleMobileObject {
   const int &ID() const { return index_; }
 
+  const bool &MultiThread() const { return multi_thread_; }
+
   const int &Parent() const { return parent_index_; }
 
   bool operator==(const paddle_mobile::framework::BlockDesc &in_block) const {
@@ -58,6 +61,7 @@ class BlockDesc : PaddleMobileObject {
  private:
   int index_;
+  bool multi_thread_;
   int parent_index_;
   std::vector<std::shared_ptr<OpDesc>> ops_;
   std::unordered_map<std::string, std::shared_ptr<VarDesc>> vars_;
src/framework/program/program-optimize/node.cpp

@@ -45,17 +45,6 @@ bool Node::operator==(const Node &in) {
   return true;
 }
 
-// std::shared_ptr<Node> Node::MatchTheFirstNode(std::string type){
-//
-//  for (const auto &node : outputs_){
-//    if (node->type_ == type){
-//      return node;
-//    }else{
-//
-//    }
-//  }
-//}
-
 std::vector<std::shared_ptr<framework::OpDesc>> Node::OpDescs(uint size) {
   std::vector<std::shared_ptr<framework::OpDesc>> op_descs;
   OpDescs(size - 1, &op_descs);
@@ -75,21 +64,40 @@ void Node::OpDescs(uint index,
 void Node::OpDescs(std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
                    Node *node, bool adding_thread, int thread_num) {
-  bool can_add_split = false;
+  if (outputs_.size() > 1) {
+    adding_thread = false;
+  }
+
+  bool can_add_split = false;
+  // Supported only when the current node has multiple outputs and its
+  // op_desc_ has exactly one output
   if (outputs_.size() > 1 &&
       op_input_output_key[op_desc_->type_].second.size() == 1) {
     can_add_split = true;
-    if (op_input_output_key[op_desc_->type_].second.size() != 1) {
-      DLOG << "the current op desc does not have exactly one output";
-      can_add_split = false;
-    }
+    // Walk the current node's output nodes
     for (const auto &output : outputs_) {
-      if (op_input_output_key.find(output->op_desc_->type_) !=
-          op_input_output_key.end()) {
-        auto inputs_and_outputs = op_input_output_key[output->op_desc_->type_];
-        auto outputs_of_output =
-            output->op_desc_->Output(inputs_and_outputs.second[0]);
-        auto inputs_of_output =
-            output->op_desc_->Input(inputs_and_outputs.first[0]);
+      // An output that itself has outputs is not supported
+      if (output->outputs_.size() > 0) {
+        can_add_split = false;
+        break;
+      }
+
+      // The OpDesc attached to this output node
+      std::shared_ptr<framework::OpDesc> &op_desc = output->op_desc_;
+      // Fetch this op's input keys and output keys
+      auto inputs_and_outputs = op_input_output_key[op_desc->type_];
+      // Check that this op type is known and that both its input key list
+      // and output key list have size 1
+      if (op_input_output_key.find(op_desc->type_) !=
+              op_input_output_key.end() &&
+          inputs_and_outputs.first.size() == 1 &&
+          inputs_and_outputs.second.size() == 1) {
+        auto inputs_of_output = op_desc->Input(inputs_and_outputs.first[0]);
+        auto outputs_of_output = op_desc->Output(inputs_and_outputs.second[0]);
+
+        // Supported as long as no input shares a name with an output
         for (int i = 0; i < inputs_of_output.size(); ++i) {
           std::string input_of_output = inputs_of_output[i];
           for (int j = 0; j < outputs_of_output.size(); ++j) {
@@ -101,7 +109,7 @@ void Node::OpDescs(std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
             }
           }
         }
-        } else {
+      } else {
         // If the model contains an op we do not know about, adding a split
         // is not supported
         DLOG << "can not find this op type: " << output->op_desc_->type_;
         can_add_split = false;
       }
@@ -124,12 +132,10 @@ void Node::OpDescs(std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
   if (can_add_split) {
     adding_thread = true;
-    std::shared_ptr<class OpDesc> split_op_desc =
-        std::make_shared<class OpDesc>();
+    std::shared_ptr<OpDesc> split_op_desc = std::make_shared<OpDesc>();
     split_op_desc->type_ = G_OP_TYPE_SPLIT;
     auto outputs = this->op_desc_->Output(
         op_input_output_key[this->op_desc_->Type()].second[0]);
     split_op_desc->inputs_ = {
         {op_input_output_key[G_OP_TYPE_SPLIT].first[0], outputs}};
     auto &split_outputs =
@@ -157,41 +163,12 @@ std::vector<std::shared_ptr<framework::OpDesc>> Node::OpDescs() {
   return op_descs;
 }
 
-std::string Node::ToString(std::string blank, const Node *node) const {
-  std::stringstream ss;
-  ss << type_ << "->\n";
-
-  if (inputs_.size() > 1 && node != inputs_.back()) {
-    return ss.str();
-  } else if (inputs_.size() > 1 && node == inputs_.back()) {
-    ss << "\n" << blank << type_ << "\n";
-  }
-
-  for (int i = 0; i < outputs_.size(); ++i) {
-    ss << blank << outputs_[i]->ToString(blank + " ", this) << "";
-  }
-
-  return ss.str();
-}
-
-std::string Node::ToString() const { return this->ToString(" ", this); }
-
 std::shared_ptr<Node> Node::To(int size) {
   std::shared_ptr<Node> node = std::make_shared<Node>();
   this->To(size - 1, node);
   return node;
 }
 
-// Node &Node::To(int size) {
-//  if (size == 1) {
-//    this->outputs_.clear();
-//  }
-//
-//  for (int j = 0; j < this->outputs_.size(); ++j) {
-//    outputs_[j]->To(size - 1);
-//  }
-//  return *this;
-//}
-
 void Node::To(int index, std::shared_ptr<Node> node) {
   node->type_ = this->type_;
   if (index != 0) {
@@ -268,6 +245,24 @@ void Node::Folder(
   }
 }
 
+std::string Node::ToString(std::string blank, const Node *node) const {
+  std::stringstream ss;
+  ss << type_ << "->\n";
+
+  if (inputs_.size() > 1 && node != inputs_.back()) {
+    return ss.str();
+  } else if (inputs_.size() > 1 && node == inputs_.back()) {
+    ss << "\n" << blank << type_ << "\n";
+  }
+
+  for (int i = 0; i < outputs_.size(); ++i) {
+    ss << blank << outputs_[i]->ToString(blank + " ", this) << "";
+  }
+
+  return ss.str();
+}
+
+std::string Node::ToString() const { return this->ToString(" ", this); }
+
 void Node::Description() {
   if (op_desc_.get()) {
     DLOG << *op_desc_;
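Taken together, the new checks in OpDescs decide whether a split op may be inserted after a node whose output feeds several consumers: each consumer must be a known op with exactly one input key and one output key, must have no outputs of its own at this point in the walk, and none of its input names may reappear among its output names (that is, it must not operate in place). The helper below is a hypothetical distillation of the last rule only, not repository code:

#include <iostream>
#include <string>
#include <vector>

// Hypothetical sketch of the in-place check: a split can only be inserted
// when none of a consumer's input names reappear among its output names.
bool NoInPlaceNames(const std::vector<std::string> &inputs,
                    const std::vector<std::string> &outputs) {
  for (const auto &in : inputs) {
    for (const auto &out : outputs) {
      if (in == out) return false;  // input and output share a name
    }
  }
  return true;
}

int main() {
  std::cout << NoInPlaceNames({"x"}, {"y"}) << "\n";  // 1: split is allowed
  std::cout << NoInPlaceNames({"x"}, {"x"}) << "\n";  // 0: in-place, rejected
  return 0;
}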
src/framework/program/program-optimize/node.h

@@ -27,6 +27,8 @@ namespace paddle_mobile {
 namespace framework {
 
 class Node : PaddleMobileObject {
+  friend class ProgramOptimize;
+
  public:
   Node() {}
   explicit Node(const std::string &type) : type_(type) {}
@@ -42,8 +44,8 @@ class Node : PaddleMobileObject {
       std::map<std::string, std::pair<std::string, std::string>> change_map);
   std::vector<std::shared_ptr<framework::OpDesc>> OpDescs(uint size);
   std::vector<std::shared_ptr<framework::OpDesc>> OpDescs();
-  std::shared_ptr<framework::OpDesc> OpDesc() { return op_desc_; }
-  std::string BeginType() { return type_; }
+  std::shared_ptr<framework::OpDesc> OpDescOfNode() { return op_desc_; }
+  std::string Type() { return type_; }
   void Description();
 
  private:
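The rename of OpDesc() to OpDescOfNode() is more than cosmetic. A member function named OpDesc hides the class name OpDesc everywhere inside Node's scope, which is what forced the old node.cpp to write std::make_shared<class OpDesc>() with an elaborated type specifier; the node.cpp hunk above drops that workaround in the same commit. A minimal sketch of the name-hiding effect, with hypothetical NodeBefore/NodeAfter types:

#include <memory>

struct OpDesc {};

// Before the rename: a member function named OpDesc hides the class name
// OpDesc inside this scope, so the type must be spelled "class OpDesc".
struct NodeBefore {
  std::shared_ptr<class OpDesc> OpDesc() { return op_desc_; }
  std::shared_ptr<class OpDesc> op_desc_;
};

// After the rename there is no hiding, and plain OpDesc works again.
struct NodeAfter {
  std::shared_ptr<OpDesc> OpDescOfNode() { return op_desc_; }
  std::shared_ptr<OpDesc> op_desc_;
};

int main() {
  NodeAfter node;
  return node.OpDescOfNode() == nullptr ? 0 : 1;
}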
src/framework/program/program-optimize/program_optimize.cpp

@@ -19,11 +19,12 @@ namespace paddle_mobile {
 namespace framework {
 
-// std::shared_ptr<ProgramDesc> ProgramOptimize::Optimize() {}
-
 std::shared_ptr<ProgramDesc> ProgramOptimize::FushionOptimize(
-    std::shared_ptr<ProgramDesc> ori_des) {
-  ProgramDesc *optimize_program = new ProgramDesc(*ori_des);
-
+    std::shared_ptr<ProgramDesc> ori_des, bool add_split) {
+  //  ProgramDesc *optimize_program = new ProgramDesc(*ori_des);
+  std::shared_ptr<ProgramDesc> optimize_program =
+      std::make_shared<ProgramDesc>(*ori_des);
+  current_block_ = optimize_program->Blocks().size();
 
   for (int i = 0; i < optimize_program->Blocks().size(); ++i) {
     std::unordered_map<std::string, std::shared_ptr<Node>> output_nodes;
@@ -96,10 +97,145 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FushionOptimize(
     }
 
     //    DLOG << "node: \n" << *begin_node;
-    block->ops_ = begin_node->OpDescs();
+    std::vector<std::shared_ptr<framework::OpDesc>> op_descs;
+    GenerateOps(&op_descs, begin_node.get());
+    block->ops_ = op_descs;
   }
 
-  std::shared_ptr<ProgramDesc> shared_optimzie(optimize_program);
-  return shared_optimzie;
+  for (int m = 0; m < new_blocks_.size(); ++m) {
+    std::shared_ptr<BlockDesc> new_block = new_blocks_[m];
+    new_block->index_ = m + ori_des->blocks_.size();
+    optimize_program->blocks_.push_back(new_block);
+  }
+  return optimize_program;
+}
+
+void ProgramOptimize::GenerateOps(
+    std::vector<std::shared_ptr<framework::OpDesc>> *op_desc, Node *input_node,
+    Node *current_node, bool adding_thread, int thread_num,
+    std::shared_ptr<BlockDesc> new_block) {
+  if (current_node->outputs_.size() > 1) {
+    adding_thread = false;
+  }
+
+  bool can_add_split = false;
+  // Supported only when the current node has multiple outputs and its
+  // op_desc_ has exactly one output
+  if (current_node->outputs_.size() > 1 &&
+      op_input_output_key[current_node->op_desc_->type_].second.size() == 1) {
+    can_add_split = true;
+
+    // Walk the current node's output nodes
+    for (const auto &output : current_node->outputs_) {
+      // An output that itself has multiple outputs is not supported
+      if (output->outputs_.size() > 1) {
+        DLOG << "don't support multi output of output";
+        can_add_split = false;
+        break;
+      }
+
+      // The OpDesc attached to this output node
+      std::shared_ptr<framework::OpDesc> &op_desc = output->op_desc_;
+      // Fetch this op's input keys and output keys
+      auto inputs_and_outputs = op_input_output_key[op_desc->type_];
+      // Check that this op type is known and that both its input key list
+      // and output key list have size 1
+      if (op_input_output_key.find(op_desc->type_) !=
+              op_input_output_key.end() &&
+          inputs_and_outputs.first.size() == 1 &&
+          inputs_and_outputs.second.size() == 1) {
+        auto inputs_of_output = op_desc->Input(inputs_and_outputs.first[0]);
+        auto outputs_of_output = op_desc->Output(inputs_and_outputs.second[0]);
+
+        // Supported as long as no input shares a name with an output
+        for (int i = 0; i < inputs_of_output.size(); ++i) {
+          std::string input_of_output = inputs_of_output[i];
+          for (int j = 0; j < outputs_of_output.size(); ++j) {
+            std::string output_of_output = outputs_of_output[j];
+            if (input_of_output == output_of_output) {
+              DLOG << "output's output contains input " << input_of_output;
+              can_add_split = false;
+              break;
+            }
+          }
+        }
+      } else {
+        // If the model contains an op we do not know about, adding a split
+        // is not supported
+        DLOG << "can not find this op type: " << output->op_desc_->type_;
+        can_add_split = false;
+      }
+    }
+  }
+
+  if (current_node->inputs_.size() > 1 &&
+      input_node != current_node->inputs_.back()) {
+    return;
+  } else if (current_node->inputs_.size() > 1 &&
+             input_node == current_node->inputs_.back()) {
+    new_block.reset();
+    adding_thread = false;
+    op_desc->push_back(current_node->op_desc_);
+  } else {
+    if (new_block.get() && adding_thread) {
+      new_block->ops_.push_back(current_node->op_desc_);
+    } else {
+      op_desc->push_back(current_node->op_desc_);
+    }
+  }
+  if (adding_thread) {
+    Attribute attr;
+    attr.Set<int>(thread_num);
+    current_node->op_desc_->attrs_["thread"] = attr;
+  }
+
+  if (can_add_split) {
+    new_block = std::make_shared<BlockDesc>();
+    new_block->multi_thread_ = true;
+    new_block->index_ = current_block_;
+    new_blocks_.push_back(new_block);
+
+    adding_thread = true;
+    std::shared_ptr<OpDesc> split_op_desc = std::make_shared<OpDesc>();
+    split_op_desc->type_ = G_OP_TYPE_SPLIT;
+    auto outputs = current_node->op_desc_->Output(
+        op_input_output_key[current_node->op_desc_->Type()].second[0]);
+    split_op_desc->inputs_ = {
+        {op_input_output_key[G_OP_TYPE_SPLIT].first[0], outputs}};
+    auto &split_outputs =
+        split_op_desc
+            ->outputs_[op_input_output_key[G_OP_TYPE_SPLIT].second[0]];
+    for (const auto &output : current_node->outputs_) {
+      split_outputs.push_back(outputs[0]);
+    }
+    Attribute attr;
+    attr.Set<int>(current_block_);
+    split_op_desc->attrs_["block_id"] = attr;
+    op_desc->push_back(split_op_desc);
+    current_block_++;
+  }
+
+  for (int i = 0; i < current_node->outputs_.size(); ++i) {
+    auto &output = current_node->outputs_[i];
+    if (can_add_split) {
+      GenerateOps(op_desc, current_node, output.get(), adding_thread, i,
+                  new_block);
+    } else {
+      GenerateOps(op_desc, current_node, output.get(), adding_thread,
+                  thread_num, new_block);
+    }
+  }
+}
+
+void ProgramOptimize::GenerateOps(
+    std::vector<std::shared_ptr<framework::OpDesc>> *op_descs,
+    Node *begin_node) {
+  //  std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
+  //  Node *input_node, Node *current_node, bool adding_thread, int
+  //  thread_num
+  this->GenerateOps(op_descs, begin_node, begin_node, false, -1, nullptr);
 }
 
 }  // namespace framework
 }  // namespace paddle_mobile
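GenerateOps is a recursive walk over the fused graph. When a node's single output tensor feeds several consumers and the checks above pass, it synthesizes a split op whose output list repeats that tensor once per consumer, opens a new multi_thread_ block for the fan-out, and tags each branch's ops with a thread attribute. The fragment below mocks only the fan-out bookkeeping; MiniOpDesc and the "X"/"Out" keys are assumptions for illustration, not the real OpDesc API or the real op_input_output_key entries:

#include <cstddef>
#include <map>
#include <string>
#include <vector>

// Hypothetical mock of the split fan-out: one producer tensor repeated once
// per consumer in the split op's output list.
struct MiniOpDesc {
  std::string type;
  std::map<std::string, std::vector<std::string>> inputs, outputs;
};

int main() {
  const std::string producer_out = "conv_out";
  const std::size_t num_consumers = 3;

  MiniOpDesc split;
  split.type = "split";                // stands in for G_OP_TYPE_SPLIT
  split.inputs["X"] = {producer_out};  // "X" is an assumed input key
  for (std::size_t i = 0; i < num_consumers; ++i) {
    // mirrors split_outputs.push_back(outputs[0]) in the hunk above
    split.outputs["Out"].push_back(producer_out);
  }
  return split.outputs["Out"].size() == num_consumers ? 0 : 1;
}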
src/framework/program/program-optimize/program_optimize.h

@@ -28,12 +28,17 @@ class ProgramOptimize {
  public:
   ProgramOptimize() {}
   std::shared_ptr<ProgramDesc> FushionOptimize(
-      std::shared_ptr<ProgramDesc> ori_des);
+      std::shared_ptr<ProgramDesc> ori_des, bool add_split = false);
 
  private:
   //  std::shared_ptr<ProgramDesc> ori_desc_;
   std::vector<std::unordered_map<std::string, std::shared_ptr<Node>>>
       outputs_nodes_;
+  int current_block_;
+  std::vector<std::shared_ptr<BlockDesc>> new_blocks_;
+
+  void GenerateOps(std::vector<std::shared_ptr<framework::OpDesc>> *op_descs,
+                   Node *begin_node);
+  void GenerateOps(std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
+                   Node *input_node, Node *current_node, bool adding_thread,
+                   int thread_num, std::shared_ptr<BlockDesc> new_block);
 };
 
 }  // namespace framework
 }  // namespace paddle_mobile
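Giving the new add_split parameter a default of false keeps every existing FushionOptimize call site compiling unchanged while letting new callers opt in to split insertion. A tiny sketch of the idiom; Desc and the free function here are hypothetical:

#include <memory>

// Sketch: a defaulted trailing parameter keeps old call sites compiling.
struct Desc {};
std::shared_ptr<Desc> FushionOptimize(std::shared_ptr<Desc> ori_des,
                                      bool add_split = false) {
  (void)add_split;
  return ori_des;
}

int main() {
  auto d = std::make_shared<Desc>();
  FushionOptimize(d);        // old call site, add_split defaults to false
  FushionOptimize(d, true);  // new call site opting in to split insertion
  return 0;
}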
src/framework/program/program_desc.cpp

@@ -32,11 +32,13 @@ void ProgramDesc::Description(std::string header) {
   if (header.size()) {
     LOG(kLOG_INFO) << header;
   }
-  for (const auto &block : this->blocks_) {
+
+  for (int i = 0; i < this->blocks_.size(); ++i) {
+    auto block = this->blocks_[i];
     LOG(kLOG_DEBUG) << "block: " << block->ID();
+    LOG(kLOG_INFO) << "block ops size: " << block->Ops().size();
     for (int j = 0; j < block->Ops().size(); ++j) {
-      const auto &op = block->Ops()[j];
+      auto op = block->Ops()[j];
       LOG(kLOG_DEBUG1) << "op: " << op->Type();
       for (auto &input : op->GetInputs()) {
         LOG(kLOG_DEBUG2) << "input parameter: " << input.first;
@@ -71,6 +73,9 @@ void ProgramDesc::Description(std::string header) {
       }
     }
   }
+
+  for (const auto &block : this->blocks_) {
+  }
+
 #endif
 }
src/operators/kernel/arm/depthwise_conv_kernel.cpp

@@ -32,7 +32,7 @@ void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const {
   std::vector<int> paddings = param.Paddings();
   std::vector<int> dilations = param.Dilations();
 
-  DLOG << " compute end get Attrs " << strides[0];
+  //  DLOG << " compute end get Attrs " << strides[0];
 
   const int batch_size = static_cast<int>(input->dims()[0]);
@@ -59,17 +59,17 @@ void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const {
     col_matrix.ShareDataWith(col);
     col_matrix.Resize(col_matrix_shape);
   }
-  DLOG << " col_shape = " << col_shape;
-  DLOG << " col_matrix_shape = " << col_matrix_shape;
+  //  DLOG << " col_shape = " << col_shape;
+  //  DLOG << " col_matrix_shape = " << col_matrix_shape;
   framework::DDim input_shape = framework::slice_ddim(
       input->dims(), 1, static_cast<int>(input->dims().size()));
-  DLOG << " input_shape = " << input_shape;
+  //  DLOG << " input_shape = " << input_shape;
 
   framework::DDim filter_matrix_shape = {filter.dims()[0],
                                          filter.numel() / filter.dims()[0]};
   filter.Resize(filter_matrix_shape);
-  DLOG << " filter.dims() = " << filter.dims();
+  //  DLOG << " filter.dims() = " << filter.dims();
 
   framework::DDim output_matrix_shape = {output->dims()[1],
@@ -85,8 +85,8 @@ void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const {
   for (int i = 0; i < batch_size; i++) {
     Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
     Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
-    DLOG << " in_batch.dims() = " << in_batch.dims();
-    DLOG << " out_batch.dims() = " << out_batch.dims();
+    //    DLOG << " in_batch.dims() = " << in_batch.dims();
+    //    DLOG << " out_batch.dims() = " << out_batch.dims();
 
     for (int g = 0; g < groups; g++) {
       Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
@@ -109,9 +109,9 @@ void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const {
       // gemm
       Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
       Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-      DLOG << " out_slice " << out_slice.dims();
-      DLOG << " filter_slice " << filter_slice.dims();
-      DLOG << " col_matrix " << col_matrix.dims();
+      //      DLOG << " out_slice " << out_slice.dims();
+      //      DLOG << " filter_slice " << filter_slice.dims();
+      //      DLOG << " col_matrix " << col_matrix.dims();
       math::matmul<float>(filter_slice, false, col_matrix, false,
                           static_cast<float>(1), &out_slice,
                           static_cast<float>(0));
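For orientation, the code around these silenced logs implements depthwise convolution as im2col followed by a per-group GEMM: with groups equal to the channel count, each iteration slices out one channel via Slice(g * in_step, (g + 1) * in_step). The standalone sketch below replays just that slicing arithmetic; the channel and group counts are made-up values for illustration:

#include <cstdio>

// Hypothetical walk-through of the per-group slicing above: for a depthwise
// convolution, groups == channels, so each group covers exactly one channel.
int main() {
  const int channels = 4;
  const int groups = 4;                   // depthwise: one group per channel
  const int in_step = channels / groups;  // mirrors the kernel's step width
  for (int g = 0; g < groups; ++g) {
    // mirrors in_batch.Slice(g * in_step, (g + 1) * in_step)
    std::printf("group %d -> input channels [%d, %d)\n", g, g * in_step,
                (g + 1) * in_step);
  }
  return 0;
}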