Commit c9c6562e authored by liuruilong

add split, format codes

Parent 636fbf0e
@@ -14,6 +14,10 @@ limitations under the License. */
#pragma once
#include <string>
#include <utility>
#include <unordered_map>
namespace paddle_mobile {
enum class Precision : int { FP32 = 0 };
@@ -67,4 +71,41 @@ enum PMStatus {
  PMUnImplError = 0x07, /*!< Unimplemented error. */
  PMWrongDevice = 0x08  /*!< Incorrect device. */
};
static const std::string G_OP_TYPE_CONV = "conv2d";
static const std::string G_OP_TYPE_BATCHNORM = "batch_norm";
static const std::string G_OP_TYPE_BOX_CODER = "box_coder";
static const std::string G_OP_TYPE_CONCAT = "concat";
static const std::string G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
static const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU = "FusionConvAddRelu";
static const std::string G_OP_TYPE_FC = "fc";
static const std::string G_OP_TYPE_LRN = "lrn";
static const std::string G_OP_TYPE_MUL = "mul";
static const std::string G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms";
static const std::string G_OP_TYPE_POOL2D = "pool2d";
static const std::string G_OP_TYPE_PRIOR_BOX = "prior_box";
static const std::string G_OP_TYPE_RELU = "relu";
static const std::string G_OP_TYPE_RESHAPE = "reshape";
static const std::string G_OP_TYPE_SIGMOID = "sigmoid";
static const std::string G_OP_TYPE_SOFTMAX = "softmax";
static const std::string G_OP_TYPE_TRANSPOSE = "transpose";
static const std::string G_OP_TYPE_SPLIT = "split";
static const std::string G_OP_TYPE_FEED = "feed";
static const std::string G_OP_TYPE_FETCH = "fetch";
static std::unordered_map<
std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
op_input_output_key = {{G_OP_TYPE_CONV, {{"Input"}, {"Output"}}},
{G_OP_TYPE_RELU, {{"X"}, {"Out"}}},
{G_OP_TYPE_SOFTMAX, {{"X"}, {"Out"}}},
{G_OP_TYPE_MUL, {{"X"}, {"Out"}}},
{G_OP_TYPE_ELEMENTWISE_ADD, {{"X", "Y"}, {"Out"}}},
{G_OP_TYPE_POOL2D, {{"X"}, {"Out"}}},
{G_OP_TYPE_BATCHNORM, {{"X"}, {"Y"}}},
{G_OP_TYPE_LRN, {{"X"}, {"Out"}}},
{G_OP_TYPE_CONCAT, {{"X"}, {"Out"}}},
{G_OP_TYPE_SPLIT, {{"X"}, {"Out"}}},
{G_OP_TYPE_FEED, {{"X"}, {"Out"}}},
{G_OP_TYPE_FETCH, {{"X"}, {"Out"}}}};
} // namespace paddle_mobile
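Note (editor's addition, not part of the commit): op_input_output_key is what lets generic graph passes stay op-agnostic: given an op's type string, a pass can look up which slot names carry its inputs and outputs. A minimal, self-contained usage sketch; the table is inlined here rather than pulled from this header, and everything beyond the table shape is an assumption:

#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

int main() {
  // Same shape as op_input_output_key above: op type -> (input keys, output keys).
  std::unordered_map<std::string, std::pair<std::vector<std::string>,
                                            std::vector<std::string>>>
      table = {{"conv2d", {{"Input"}, {"Output"}}},
               {"elementwise_add", {{"X", "Y"}, {"Out"}}}};
  for (const auto &in_key : table["elementwise_add"].first) {
    std::cout << "input slot: " << in_key << "\n";  // prints X, then Y
  }
  std::cout << "output slot: " << table["elementwise_add"].second[0] << "\n";
  return 0;
}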
@@ -19,61 +19,64 @@ limitations under the License. */
#include <utility>
#include <vector>

#include "common/enforce.h"
#include "common/type_define.h"
#include "common/types.h"
#include "common/variant.h"
#include "framework/attribute.h"
#include "framework/op_info.h"
#include "framework/op_kernel_type.h"
#include "framework/op_registry.h"
#include "framework/paddle_mobile_object.h"
#include "framework/program/block_desc.h"
#include "framework/program/program-optimize/node.h"
#include "framework/scope.h"
#include "framework/tensor.h"
#include "framework/variable.h"
namespace paddle_mobile {
namespace framework {
using std::string;
using std::vector;
template <typename Dtype>
class OperatorBase : PaddleMobileObject {
 public:
  /*
   * @b Constructor of the op base class; the op receives its inputs,
   *    attributes, and pre-allocated output tensors here.
   * */
  OperatorBase(const std::string &type, const VariableNameMap &inputs,
               const VariableNameMap &outputs, const AttributeMap &attrs,
               std::shared_ptr<Scope> scope);
  virtual ~OperatorBase() {}
  void Run() const;
  std::vector<string> GetOutKeys() const;
  virtual void RunImpl() const = 0;
  /*
   * @b Inputs the op needs for its computation, e.g. the previous layer's
   *    output or the convolution filters.
   * */
  const VariableNameMap &Inputs() const { return inputs_; }
  /*
   * @b Outputs of the op; their memory is allocated ahead of time, and the
   *    results of the computation are written into it.
   * */
  const VariableNameMap &Outputs() const { return outputs_; }
  /*
   * @b The op type.
   * */
  const std::string &Type() const { return type_; }
  /*
   * @b Attributes the op needs for its computation, e.g. the stride of a
   *    conv op.
   * */
  const AttributeMap &Attrs() const { return attrs_; }
  void ClearVariables(const std::vector<std::string> &var_names) const {
    if (this->scope_) {
      this->scope_->EraseVars(var_names);
    }
  }
  /*
   * @b Computes the output shape from the input shapes and the attributes.
   * */
  virtual void InferShape() const = 0;

 protected:
  std::shared_ptr<Scope> scope_;
  std::string type_;
@@ -85,6 +88,9 @@ class OperatorBase : PaddleMobileObject {
  void CheckAllInputOutputSet() const;
};

/*
 * @b Parent class of all ops that perform computation; it inherits from
 *    OperatorBase.
 * */
template <typename Dtype>
class OperatorWithKernel : public OperatorBase<Dtype> {
 public:
@@ -97,11 +103,18 @@ class OperatorWithKernel : public OperatorBase<Dtype> {
  virtual void InferShape() const = 0;
};
/*
 * @b Parent class of all kernels.
 * */
template <typename Dtype, typename P>
class OpKernelBase : PaddleMobileObject {
 public:
  /*
   * @b Every kernel must implement the Compute method.
   * @p para is a struct holding the parameters the kernel needs for its
   *    computation; all such structs live in
   *    paddle-mobile/src/operators/op_param.h.
   * */
  virtual void Compute(const P &para) const = 0;
  virtual ~OpKernelBase() = default;
};
@@ -118,8 +131,8 @@ class FusionOpMatcher : PaddleMobileObject {
  virtual std::string Type() = 0;
  virtual void FolderNodes(Node *node) {
    node->Folder(node_.Depth(), Type(), {});
  }
  virtual Node &BeginNode() { return node_; }
......
@@ -14,6 +14,7 @@ limitations under the License. */
#include <sstream>
#include "framework/operator.h"
#include "framework/program/program-optimize/node.h"

namespace paddle_mobile {
@@ -73,24 +74,79 @@ void Node::OpDescs(uint index,
}

void Node::OpDescs(std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
                   Node *node, bool adding_thread, int thread_num) {
  bool can_add_split = false;
  if (outputs_.size() > 1) {
    can_add_split = true;
    if (op_input_output_key[op_desc_->type_].second.size() != 1) {
      DLOG << "current op desc's output count is not 1";
      can_add_split = false;
    }
    for (const auto &output : outputs_) {
      if (op_input_output_key.find(output->op_desc_->type_) !=
          op_input_output_key.end()) {
        auto inputs_and_outputs = op_input_output_key[output->op_desc_->type_];
        auto outputs_of_output =
            output->op_desc_->Output(inputs_and_outputs.second[0]);
        auto inputs_of_output =
            output->op_desc_->Input(inputs_and_outputs.first[0]);
        for (int i = 0; i < inputs_of_output.size(); ++i) {
          std::string input_of_output = inputs_of_output[i];
          for (int j = 0; j < outputs_of_output.size(); ++j) {
            std::string output_of_output = outputs_of_output[j];
            if (input_of_output == output_of_output) {
              DLOG << "the consumer's outputs contain its input "
                   << input_of_output;
              can_add_split = false;
              break;
            }
          }
        }
      } else {
        DLOG << "unknown op type: " << output->op_desc_->type_;
        can_add_split = false;
      }
    }
  }
  if (inputs_.size() > 1 && node != inputs_.back()) {
    return;
  } else if (inputs_.size() > 1 && node == inputs_.back()) {
    adding_thread = false;
    op_desc->push_back(this->op_desc_);
  } else {
    op_desc->push_back(this->op_desc_);
  }
  if (adding_thread) {
    Attribute attr;
    attr.Set<int>(thread_num);
    this->op_desc_->attrs_["thread"] = attr;
  }
  if (can_add_split) {
    adding_thread = true;
    std::shared_ptr<class OpDesc> split_op_desc =
        std::make_shared<class OpDesc>();
    split_op_desc->type_ = G_OP_TYPE_SPLIT;
    auto outputs = this->op_desc_->Output(
        op_input_output_key[this->op_desc_->Type()].second[0]);
    split_op_desc->inputs_ = {
        {op_input_output_key[G_OP_TYPE_SPLIT].first[0], outputs}};
    auto &split_outputs =
        split_op_desc->outputs_[op_input_output_key[G_OP_TYPE_SPLIT].second[0]];
    for (const auto &output : outputs_) {
      split_outputs.push_back(outputs[0]);
    }
    DLOG << "add split";
    op_desc->push_back(split_op_desc);
  }
  for (int i = 0; i < outputs_.size(); ++i) {
    auto &output = outputs_[i];
    if (can_add_split) {
      output->OpDescs(op_desc, this, adding_thread, i);
    } else {
      output->OpDescs(op_desc, this, adding_thread, thread_num);
    }
  }
}
std::vector<std::shared_ptr<framework::OpDesc>> Node::OpDescs() {
  std::vector<std::shared_ptr<framework::OpDesc>> op_descs;
  OpDescs(&op_descs, this, false, 0);
  return op_descs;
}
......
@@ -42,13 +42,13 @@ class Node : PaddleMobileObject {
      std::map<std::string, std::pair<std::string, std::string>> change_map);
  std::vector<std::shared_ptr<framework::OpDesc>> OpDescs(uint size);
  std::vector<std::shared_ptr<framework::OpDesc>> OpDescs();
  std::shared_ptr<framework::OpDesc> OpDesc() { return op_desc_; }
  std::string BeginType() { return type_; }
  void Description();

 private:
  void OpDescs(std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
               Node *node, bool adding_thread, int thread_num);
  void OpDescs(uint size,
               std::vector<std::shared_ptr<framework::OpDesc>> *op_desc);
  void To(int index, std::shared_ptr<Node>);
......
@@ -19,7 +19,7 @@ namespace paddle_mobile {
namespace framework {

// std::shared_ptr<ProgramDesc> ProgramOptimize::Optimize() {}

std::shared_ptr<ProgramDesc> ProgramOptimize::FushionOptimize(
    std::shared_ptr<ProgramDesc> ori_des) {
@@ -86,7 +86,7 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FushionOptimize(
  // DLOG << " match success " << " fusion node: \n" <<
  // matcher->BeginNode() << "\nsub node: \n" << *sub_node;
  // DLOG << "match node\n"<< *match_node;
  matcher->FolderNodes(match_node.get());
  // DLOG << " after match node\n"<< *match_node;
  // match_node->Description();
......
@@ -27,7 +27,6 @@ namespace framework {
class ProgramOptimize {
 public:
  ProgramOptimize() {}
  std::shared_ptr<ProgramDesc> FushionOptimize(
      std::shared_ptr<ProgramDesc> ori_des);
......
@@ -15,15 +15,18 @@ limitations under the License. */
#include "io.h"

#include <fstream>
#include <vector>

#include "common/enforce.h"
#include "common/log.h"
#include "framework/framework.pb-c.h"
#include "framework/lod_tensor.h"
#include "framework/operator.h"
#include "framework/program/program-optimize/program_optimize.h"
#include "framework/program/program_desc.h"
#include "framework/program/var_desc.h"
#include "framework/scope.h"
#include "framework/tensor.h"
namespace paddle_mobile {
using framework::Variable;
@@ -166,7 +169,7 @@ void Loader<Dtype, P>::LoadVar(framework::Variable *variable,
template <typename Dtype, Precision P>
const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
    const std::string &dirname, bool optimize) {
  std::string model_filename = dirname + "/__model__";
  PaddleMobile__Framework__Proto__ProgramDesc *c_program;
  uint8_t *buf = NULL;
@@ -199,11 +202,11 @@ const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
// DLOG << "var name-- " << var_desc->Name(); // DLOG << "var name-- " << var_desc->Name();
auto var = scope->Var(var_desc->Name()); auto var = scope->Var(var_desc->Name());
if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
if (var_desc->Persistable() && if (var_desc->Persistable() &&
var_desc->Type() != framework::VARTYPE_TYPE_FEED_MINIBATCH && var_desc->Type() != framework::VARTYPE_TYPE_FEED_MINIBATCH &&
var_desc->Type() != framework::VARTYPE_TYPE_FETCH_LIST) { var_desc->Type() != framework::VARTYPE_TYPE_FETCH_LIST) {
// DLOG << "to load var ";
auto dim = var_desc->Tensor_desc().Dims(); auto dim = var_desc->Tensor_desc().Dims();
auto tensor = var->GetMutable<framework::LoDTensor>(); auto tensor = var->GetMutable<framework::LoDTensor>();
tensor->Resize(framework::make_ddim(dim)); tensor->Resize(framework::make_ddim(dim));
@@ -219,8 +222,12 @@ const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
      }
    }
  }

  // originProgramDesc->Description("program: ");

  if (optimize) {
    framework::ProgramOptimize program_optimize;
    program.optimizeProgram =
        program_optimize.FushionOptimize(originProgramDesc);
  }

  paddle_mobile__framework__proto__program_desc__free_unpacked(c_program,
                                                               NULL);
  return program;
@@ -231,33 +238,8 @@ template class Loader<CPU, Precision::FP32>;
#pragma mark - executor
template <typename Dtype, Precision P>
Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
                             bool use_optimize)
    : program_(p), batch_size_(batch_size), use_optimize_(use_optimize) {
  if (use_optimize_) {
    to_predict_program_ = program_.optimizeProgram;
  } else {
@@ -389,7 +371,7 @@ void Executor<Dtype, P>::InitMemory() {
}

template <typename Dtype, Precision P>
void Executor<Dtype, P>::Predict(const framework::Tensor &t, int block_id) {
  framework::Variable *g_feed_value = program_.scope->Var("feed");
  framework::Tensor *feed_tensor =
      g_feed_value->GetMutable<framework::LoDTensor>();
@@ -404,11 +386,11 @@ void Executor<Dtype, P>::predict(const framework::Tensor &t, int block_id) {
}

template <typename Dtype, Precision P>
std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
    const std::vector<Ptype> &input, const std::vector<int64_t> &dims) {
  framework::Tensor tensor(input, framework::make_ddim(dims));
  Predict(tensor, 0);
  framework::Variable *g_feed_value = program_.scope->Var("col");
  auto feed_tensor = g_feed_value->GetMutable<framework::Tensor>();
......
@@ -30,7 +30,7 @@ namespace paddle_mobile {
template <typename Dtype, Precision P = Precision::FP32>
class Loader : PaddleMobileObject {
 public:
  const framework::Program<Dtype, P> Load(const std::string &dirname,
                                          bool optimize = true);

 private:
  void LoadVar(framework::Variable *variable,
@@ -45,13 +45,11 @@ class Executor {
  Executor() = default;
  Executor(const framework::Program<Dtype> p, int batch_size = 1,
           bool use_optimize = true);
  // std::shared_ptr<framework::Tensor> Predict(framework::Tensor &t);
  std::vector<Ptype> Predict(const std::vector<Ptype> &input,
                             const std::vector<int64_t> &dims);

 protected:
@@ -61,7 +59,7 @@ class Executor {
  framework::Program<Dtype> program_;
  int batch_size_ = 1;
  std::shared_ptr<framework::ProgramDesc> to_predict_program_;
  void Predict(const framework::Tensor &t, int block_id);
  std::map<framework::BlockDesc,
           std::vector<std::shared_ptr<framework::OperatorBase<Dtype>>>>
      ops_of_block_;
......
@@ -23,18 +23,17 @@ namespace operators {
class FushionConvAddReluOpMatcher : public framework::FusionOpMatcher {
 public:
  FushionConvAddReluOpMatcher() {
    node_ = framework::Node(G_OP_TYPE_CONV);
    node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
        std::make_shared<framework::Node>(G_OP_TYPE_RELU);
  }
  void FolderNodes(framework::Node &node) {
    std::vector<std::shared_ptr<framework::OpDesc>> origin_descs =
        node.OpDescs(node_.Depth());
    node.Folder(node_.Depth(), Type(),
                {{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Z"}}});
  }
  std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_RELU; }
};
class FusionFcOp {
......
@@ -28,17 +28,17 @@ using std::vector;
class FusionFcMatcher : public framework::FusionOpMatcher {
 public:
  FusionFcMatcher() {
    node_ = framework::Node(G_OP_TYPE_MUL);
    node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD);
  }
  void FolderNodes(framework::Node &node) {
    vector<std::shared_ptr<framework::OpDesc>> origin_descs =
        node.OpDescs(node_.Depth());
    node.Folder(node_.Depth(), Type(),
                {{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Z"}}});
  }
  std::string Type() { return G_OP_TYPE_FC; }
};
template <typename DeviceType, typename T>
......
@@ -20,11 +20,15 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {

template <typename T>
struct ReluFunctor {
  inline T operator()(T in) const { return in > 0 ? in : 0; }
};
/*
 * @b Implementation specialized for a concrete platform; the param is passed
 *    in from the op layer.
 * */
template <>
void ReluKernel<CPU, float>::Compute(const ReluParam &param) const {
  const auto *input_x = param.InputX();
......
@@ -696,6 +696,9 @@ class ReshapeParam : public OpParam {
  bool inplace_;
};

/*
 * @b The op layer instantiates this param and passes it to the kernel layer.
 * */
class ReluParam : public OpParam {
 public:
  ReluParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
@@ -725,7 +728,6 @@ class FushionFcParam : public OpParam {
    y_num_col_dims_ = GetAttr<int>("y_num_col_dims", attrs);
    axis_ = GetAttr<int>("axis", attrs);
  }
  const Tensor *InputX() const { return input_x_; }
  const Tensor *InputY() const { return input_y_; }
......
@@ -25,6 +25,10 @@ template class ReluOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile

/*
 * @b Every op must be registered. The argument of USE_OP and the first
 *    argument of REGISTER_OPERATOR must both match the op type used in the
 *    model.
 * */
namespace ops = paddle_mobile::operators;
USE_OP(relu);
REGISTER_OPERATOR(relu, ops::ReluOp);
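Note (editor's addition): conceptually, USE_OP / REGISTER_OPERATOR insert a factory into a name-keyed table at static-initialization time, so the loader can instantiate ops from the type strings found in the model. A toy illustration of that mechanism; the names below are hypothetical, not the real macros from framework/op_registry.h:

#include <functional>
#include <map>
#include <memory>
#include <string>

struct OpBaseSketch {
  virtual ~OpBaseSketch() = default;
};
using Factory = std::function<std::unique_ptr<OpBaseSketch>()>;

std::map<std::string, Factory> &Registry() {
  static std::map<std::string, Factory> r;  // constructed on first use
  return r;
}

struct ReluSketch : OpBaseSketch {};

// Roughly what REGISTER_OPERATOR(relu, ops::ReluOp) boils down to:
const bool relu_registered = [] {
  Registry()["relu"] = [] {
    return std::unique_ptr<OpBaseSketch>(new ReluSketch());
  };
  return true;
}();

int main() {
  auto op = Registry()["relu"]();  // loader-side lookup by model type string
  return op != nullptr ? 0 : 1;
}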
@@ -28,6 +28,9 @@ using paddle_mobile::framework::Tensor;
template <typename DeviceType, typename T>
class ReluOp : public framework::OperatorWithKernel<DeviceType> {
 public:
  /*
   * @b Constructor of the op; it must call the parent constructor and
   *    instantiate the op's own param struct.
   * */
  ReluOp(const std::string &type, const VariableNameMap &inputs,
         const VariableNameMap &outputs, const framework::AttributeMap attrs,
         std::shared_ptr<framework::Scope> scope)
@@ -35,6 +38,9 @@ class ReluOp : public framework::OperatorWithKernel<DeviceType> {
            scope),
        param_(inputs, outputs, attrs, *scope) {}

  /*
   * @b Runs the op by invoking the corresponding kernel.
   * */
  void RunImpl() const {
    operators::ReluKernel<DeviceType, T> kernel;
    kernel.Compute(param_);
@@ -44,6 +50,10 @@ class ReluOp : public framework::OperatorWithKernel<DeviceType> {
  void InferShape() const override;

 protected:
  /*
   * @b Struct holding the parameters the Relu kernel needs for its
   *    computation; it is defined in paddle-mobile/src/operators/op_param.h.
   * */
  ReluParam param_;
};
......
@@ -17,7 +17,7 @@ limitations under the License. */
#include <string>
#include <vector>
#include "io.h"
#include "common/log.h"
#include "framework/op_registry.h"
#include "operators/conv_op.h"
@@ -77,7 +77,7 @@ class Executor4Test : public Executor<DeviceType> {
  }

  template <typename T = LoDTensor>
  vector<std::shared_ptr<Tensor>> Predict(const vector<Tensor> &ts,
                                          const vector<string> &input_names,
                                          const vector<string> &output_names,
                                          const vector<DDim> &ddims) {
@@ -116,7 +116,7 @@ class Executor4Test : public Executor<DeviceType> {
    return output_tensor_sptrs;
  }

  std::shared_ptr<Tensor> Predict(const Tensor &t, string input, string output,
                                  const DDim &dDim) {
    auto scope = this->program_.scope;
    Variable *g_feed_value = scope->Var(input);
......
@@ -12,13 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "io.h"
#include "../test_helper.h"

int main() {
  paddle_mobile::Loader<paddle_mobile::CPU> loader;
  // ../../../test/models/googlenet
  // ../../../test/models/mobilenet
  auto program = loader.Load(g_googlenet);
  program.optimizeProgram->Description("program desc: ");
  return 0;
}
@@ -12,16 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "io.h"
#include "../test_helper.h"
#include "framework/program/program-optimize/node.h"
#include "framework/program/program-optimize/program_optimize.h"

int main() {
  paddle_mobile::Loader<paddle_mobile::CPU> loader;
  // "../../../test/models/googlenet"
  auto program = loader.Load(g_googlenet);
  paddle_mobile::framework::ProgramOptimize optimize;
  // program.originProgram->Description("origin");
  auto optimize_program = optimize.FushionOptimize(program.originProgram);
  if (optimize_program != nullptr) {
    optimize_program->Description("optimize");
......
@@ -21,16 +21,16 @@ int main() {
  // ../../../test/models/googlenet
  // ../../../test/models/mobilenet
  auto time1 = time();
  auto program = loader.Load(g_googlenet, false);
  auto time2 = time();
  DLOG << "load cost :" << time_diff(time1, time2) << "ms";
  paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, false);

  std::vector<float> input;
  std::vector<int64_t> dims{1, 3, 224, 224};
  GetInput<float>(g_test_image_1x3x224x224, &input, dims);
  auto time3 = time();
  executor.Predict(input, dims);
  auto time4 = time();
  DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
  return 0;
......
@@ -129,7 +129,7 @@ int main() {
  DLOG << "begin to run BatchNormOp Test";
  paddle_mobile::Loader<paddle_mobile::CPU> loader;
  auto program = loader.Load(std::string(g_resnet));
  /// input x (4,10,2,2)
  paddle_mobile::framework::Tensor inputx1;
......
@@ -116,7 +116,7 @@ int main() {
  DLOG << "----------**********----------";
  DLOG << "begin to run BoxCoderOp Test";
  paddle_mobile::Loader<paddle_mobile::CPU> loader;
  auto program = loader.Load(std::string(g_mobilenet_ssd));

  paddle_mobile::framework::Tensor priorbox;
  SetupTensor<float>(&priorbox, {1917, 4}, static_cast<float>(0),
......
@@ -57,7 +57,7 @@ int main() {
  auto out_ddim = paddle_mobile::framework::make_ddim({3, 100, 2, 2});
  out_ddims.push_back(out_ddim);

  auto output = executor.Predict<LoDTensor>(input_tensors, input_names,
                                            output_names, out_ddims);

  auto output0_data = output[0]->data<float>();
......
@@ -34,7 +34,7 @@ int main() {
  //                    static_cast<float>(1));
  auto out_ddim = paddle_mobile::framework::make_ddim({1, 64, 112, 112});
  auto output = executor.Predict(input, "data", "conv2d_0.tmp_0", out_ddim);
  auto output_ptr = output->data<float>();

  for (int j = 0; j < output->numel(); ++j) {
......
@@ -50,7 +50,7 @@ int main() {
  auto out_ddim = paddle_mobile::framework::make_ddim({1, 3, 224, 224});
  out_ddims.push_back(out_ddim);

  auto output = executor.Predict<LoDTensor>(input_tensors, input_names,
                                            output_names, out_ddims);

  auto output0_data = output[0]->data<float>();
......
@@ -116,7 +116,7 @@ int main() {
  DLOG << "begin to run Fc Test";
  paddle_mobile::Loader<paddle_mobile::CPU> loader;
  // "../../../test/models/googlenet"
  auto program = loader.Load(g_googlenet);
  paddle_mobile::framework::ProgramOptimize optimize;
  // program.originProgram->Description("origin");
  auto optimize_program = optimize.FushionOptimize(program.originProgram);
......
@@ -46,7 +46,7 @@ int main() {
  auto out_ddim = paddle_mobile::framework::make_ddim({3, 4, 2, 2});
  out_ddims.push_back(out_ddim);

  auto output = executor.Predict<LoDTensor>(input_tensors, input_names,
                                            output_names, out_ddims);

  auto output0_data = output[0]->data<float>();
......
@@ -50,7 +50,7 @@ int main() {
  auto out_ddim = paddle_mobile::framework::make_ddim({3, 3});
  out_ddims.push_back(out_ddim);

  auto output = executor.Predict<LoDTensor>(input_tensors, input_names,
                                            output_names, out_ddims);

  auto output0_data = output[0]->data<float>();
......
@@ -14,11 +14,11 @@ limitations under the License. */
#include "../executor_for_test.h"
#include "../test_helper.h"
#include "io.h"

int main() {
  paddle_mobile::Loader<paddle_mobile::CPU> loader;
  auto program = loader.Load(std::string(g_googlenet));
  if (program.originProgram == nullptr) {
    DLOG << "program read file";
  }
@@ -32,7 +32,7 @@ int main() {
                       static_cast<float>(1));
  auto out_ddim = paddle_mobile::framework::make_ddim({1, 64, 56, 56});
  auto output =
      executor.Predict(input, "conv2d_0.tmp_1", "pool2d_0.tmp_0", out_ddim);
  float *output_ptr = output->data<float>();

  for (int j = 0; j < output->numel(); ++j) {
......
@@ -127,7 +127,7 @@ int main() {
  DLOG << "----------**********----------";
  DLOG << "begin to run PriorBoxOp Test";
  paddle_mobile::Loader<paddle_mobile::CPU> loader;
  auto program = loader.Load(std::string(g_mobilenet_ssd));

  /// input x (1,3,300,300)
  paddle_mobile::framework::Tensor input_image;
......
@@ -46,7 +46,7 @@ int main() {
  auto out_ddim = paddle_mobile::framework::make_ddim({1, 2, 3, 4});
  out_ddims.push_back(out_ddim);

  auto output = executor.Predict<LoDTensor>(input_tensors, input_names,
                                            output_names, out_ddims);

  auto output0_data = output[0]->data<float>();
......
@@ -14,11 +14,11 @@ limitations under the License. */
#include "../executor_for_test.h"
#include "../test_helper.h"
#include "io.h"

int main() {
  paddle_mobile::Loader<paddle_mobile::CPU> loader;
  auto program = loader.Load(std::string(g_mobilenet_ssd));
  if (program.originProgram == nullptr) {
    DLOG << "program read file";
  }
@@ -31,7 +31,7 @@ int main() {
  auto input_ptr = input.data<float>();
  auto out_ddim = paddle_mobile::framework::make_ddim({2, 9, 2});
  auto output =
      executor.Predict(input, "transpose_0.tmp_0", "reshape_0.tmp_0", out_ddim);
  auto *output_ptr = output->data<float>();

  DLOG << "input : ";
......
@@ -14,7 +14,7 @@ limitations under the License. */
#include "../../src/operators/kernel/sigmoid_kernel.h"
#include "../test_helper.h"
#include "io.h"

int main() {
  paddle_mobile::framework::Tensor input;
......
@@ -14,11 +14,11 @@ limitations under the License. */
#include "../executor_for_test.h"
#include "../test_helper.h"
#include "io.h"

int main() {
  paddle_mobile::Loader<paddle_mobile::CPU> loader;
  auto program = loader.Load(std::string(g_mobilenet));
  if (program.originProgram == nullptr) {
    DLOG << "program read file";
  }
@@ -30,7 +30,7 @@ int main() {
                       static_cast<float>(1));
  auto out_ddim = paddle_mobile::framework::make_ddim({1, 1000});
  auto output =
      executor.Predict(input, "reshape_0.tmp_0", "softmax_0.tmp_0", out_ddim);
  auto *output_ptr = output->data<float>();
  for (int j = 0; j < output->numel(); ++j) {
    DLOG << " value of output: " << output_ptr[j];
......
@@ -14,11 +14,11 @@ limitations under the License. */
#include "../executor_for_test.h"
#include "../test_helper.h"
#include "io.h"

int main() {
  paddle_mobile::Loader<paddle_mobile::CPU> loader;
  auto program = loader.Load(std::string(g_mobilenet_ssd));
  if (program.originProgram == nullptr) {
    DLOG << "program read file";
  }
@@ -31,7 +31,7 @@ int main() {
  auto input_ptr = input.data<float>();
  auto out_ddim = paddle_mobile::framework::make_ddim({1, 3, 4, 2});
  auto output =
      executor.Predict(input, "conv2d_22.tmp_1", "transpose_0.tmp_0", out_ddim);
  auto *output_ptr = output->data<float>();

  DLOG << "input : ";
......
@@ -15,8 +15,8 @@ limitations under the License. */
#pragma once

#include <chrono>
#include <fstream>
#include <random>

#include "common/log.h"
#include "framework/ddim.h"
......
@@ -20,7 +20,7 @@ limitations under the License. */
#include "./test_helper.h"
#include "common/enforce.h"
#include "io.h"
#include "common/log.h"
#include "framework/lod_tensor.h"
#include "framework/operator.h"
......