Commit 410e5bdf authored by: E eclipsess

conflict

@@ -14,6 +14,10 @@ limitations under the License. */
 #pragma once
+#include <string>
+#include <unordered_map>
+#include <utility>
 namespace paddle_mobile {
 enum class Precision : int { FP32 = 0 };
@@ -67,4 +71,41 @@ enum PMStatus {
   PMUnImplError = 0x07, /*!< Unimplemented error. */
   PMWrongDevice = 0x08  /*!< Incorrect device. */
 };
+static const std::string G_OP_TYPE_CONV = "conv2d";
+static const std::string G_OP_TYPE_BATCHNORM = "batch_norm";
+static const std::string G_OP_TYPE_BOX_CODER = "box_coder";
+static const std::string G_OP_TYPE_CONCAT = "concat";
+static const std::string G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
+static const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU = "FusionConvAddRelu";
+static const std::string G_OP_TYPE_FC = "fc";
+static const std::string G_OP_TYPE_LRN = "lrn";
+static const std::string G_OP_TYPE_MUL = "mul";
+static const std::string G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms";
+static const std::string G_OP_TYPE_POOL2D = "pool2d";
+static const std::string G_OP_TYPE_PRIOR_BOX = "prior_box";
+static const std::string G_OP_TYPE_RELU = "relu";
+static const std::string G_OP_TYPE_RESHAPE = "reshape";
+static const std::string G_OP_TYPE_SIGMOID = "sigmoid";
+static const std::string G_OP_TYPE_SOFTMAX = "softmax";
+static const std::string G_OP_TYPE_TRANSPOSE = "transpose";
+static const std::string G_OP_TYPE_SPLIT = "split";
+static const std::string G_OP_TYPE_FEED = "feed";
+static const std::string G_OP_TYPE_FETCH = "fetch";
+static std::unordered_map<
+    std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
+    op_input_output_key = {{G_OP_TYPE_CONV, {{"Input"}, {"Output"}}},
+                           {G_OP_TYPE_RELU, {{"X"}, {"Out"}}},
+                           {G_OP_TYPE_SOFTMAX, {{"X"}, {"Out"}}},
+                           {G_OP_TYPE_MUL, {{"X"}, {"Out"}}},
+                           {G_OP_TYPE_ELEMENTWISE_ADD, {{"X", "Y"}, {"Out"}}},
+                           {G_OP_TYPE_POOL2D, {{"X"}, {"Out"}}},
+                           {G_OP_TYPE_BATCHNORM, {{"X"}, {"Y"}}},
+                           {G_OP_TYPE_LRN, {{"X"}, {"Out"}}},
+                           {G_OP_TYPE_CONCAT, {{"X"}, {"Out"}}},
+                           {G_OP_TYPE_SPLIT, {{"X"}, {"Out"}}},
+                           {G_OP_TYPE_FEED, {{"X"}, {"Out"}}},
+                           {G_OP_TYPE_FETCH, {{"X"}, {"Out"}}},
+                           {G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}}};
 }  // namespace paddle_mobile
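For reference, a minimal sketch of how this registry is meant to be consumed (hypothetical usage; it assumes the `G_OP_TYPE_*` constants and `op_input_output_key` defined above are in scope):

#include <iostream>
int main() {
  using namespace paddle_mobile;
  // Look up the canonical input/output keys of elementwise_add.
  auto it = op_input_output_key.find(G_OP_TYPE_ELEMENTWISE_ADD);
  if (it != op_input_output_key.end()) {
    for (const auto &in : it->second.first) {    // "X", "Y"
      std::cout << "input key: " << in << std::endl;
    }
    for (const auto &out : it->second.second) {  // "Out"
      std::cout << "output key: " << out << std::endl;
    }
  }
  return 0;
}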
@@ -23,6 +23,7 @@ vector<string> OperatorBase<Dtype>::GetOutKeys() const {
   auto it = op_input_output_key.find(type_);
   if (it == op_input_output_key.end()) {
     DLOG << type_ << " has no outputs";
+    return {};
   }
   return it->second.second;
 }
......
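The added `return {};` is the actual fix here: previously an unregistered op type fell through and dereferenced the `end()` iterator, which is undefined behavior. A caller-side sketch (the `op` pointer is hypothetical):

// GetOutKeys() now returns an empty vector for unknown op types.
std::vector<std::string> keys = op->GetOutKeys();
if (keys.empty()) {
  // nothing to fetch for this op type
}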
@@ -38,42 +38,46 @@ namespace paddle_mobile {
 namespace framework {
 using std::string;
 using std::vector;
-static std::unordered_map<
-    std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
-    op_input_output_key = {{"conv2d", {{"Input"}, {"Output"}}},
-                           {"relu", {{"X"}, {"Out"}}},
-                           {"softmax", {{"X"}, {"Out"}}},
-                           {"mul", {{"X"}, {"Out"}}},
-                           {"elementwise_add", {{"X", "Y"}, {"Out"}}},
-                           {"pool2d", {{"X"}, {"Out"}}},
-                           {"batch_norm", {{"X"}, {"Y"}}},
-                           {"lrn", {{"X"}, {"Out"}}},
-                           {"concat", {{"X"}, {"Out"}}},
-                           {"feed", {{"X"}, {"Out"}}},
-                           {"fetch", {{"X"}, {"Out"}}},
-                           {"reshape", {{"X"}, {"Out"}}}};
 template <typename Dtype>
 class OperatorBase : PaddleMobileObject {
  public:
+  /*
+   * @b Constructor for the op base class; the op receives its inputs,
+   *    attributes, and the pre-allocated output tensors.
+   * */
   OperatorBase(const std::string &type, const VariableNameMap &inputs,
                const VariableNameMap &outputs, const AttributeMap &attrs,
                std::shared_ptr<Scope> scope);
   virtual ~OperatorBase() {}
   void Run() const;
-  vector<string> GetOutKeys() const;
+  std::vector<string> GetOutKeys() const;
   virtual void RunImpl() const = 0;
-  virtual void InferShape() const = 0;
+  /*
+   * @b Inputs the op's computation needs, e.g. the previous layer's output
+   *    or the convolution filters.
+   * */
   const VariableNameMap &Inputs() const { return inputs_; }
+  /*
+   * @b Outputs of the op; their memory is allocated in advance and the
+   *    results are written into it.
+   * */
   const VariableNameMap &Outputs() const { return outputs_; }
+  /*
+   * @b The op type.
+   * */
   const std::string &Type() const { return type_; }
+  /*
+   * @b Attributes the op's computation needs, e.g. the stride of a conv.
+   * */
   const AttributeMap &Attrs() const { return attrs_; }
   void ClearVariables(const std::vector<std::string> &var_names) const {
     if (this->scope_) {
       this->scope_->EraseVars(var_names);
     }
   }
+  /*
+   * @b Computes the output shape from the input shapes and attributes.
+   * */
+  virtual void InferShape() const = 0;
  protected:
   std::shared_ptr<Scope> scope_;
@@ -86,6 +90,9 @@ class OperatorBase : PaddleMobileObject {
   void CheckAllInputOutputSet() const;
 };
+/*
+ * @b Parent class for all ops that perform computation; it inherits from
+ *    OperatorBase.
+ * */
 template <typename Dtype>
 class OperatorWithKernel : public OperatorBase<Dtype> {
  public:
@@ -98,11 +105,18 @@ class OperatorWithKernel : public OperatorBase<Dtype> {
   virtual void InferShape() const = 0;
 };
+/*
+ * @b Parent class for all kernels.
+ * */
 template <typename Dtype, typename P>
 class OpKernelBase : PaddleMobileObject {
  public:
+  /*
+   * @b Every kernel must implement the Compute method.
+   * @p para a struct holding the parameters the kernel needs for its
+   *    computation; all of these structs live in
+   *    paddle-mobile/src/operators/op_param.h
+   * */
   virtual void Compute(const P &para) const = 0;
   virtual ~OpKernelBase() = default;
 };
@@ -119,8 +133,8 @@ class FusionOpMatcher : PaddleMobileObject {
   virtual std::string Type() = 0;
-  virtual void FolderNodes(Node &node) {
-    node.Folder(node_.Depth(), Type(), {});
+  virtual void FolderNodes(Node *node) {
+    node->Folder(node_.Depth(), Type(), {});
   }
   virtual Node &BeginNode() { return node_; }
......
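A minimal sketch of how a `FusionOpMatcher` subclass is written against this interface (hypothetical fused type name; it mirrors the conv+add+relu and fc matchers further down in this commit):

class DemoAddReluMatcher : public framework::FusionOpMatcher {
 public:
  DemoAddReluMatcher() {
    // Describe the pattern: elementwise_add followed by relu.
    node_ = framework::Node(G_OP_TYPE_ELEMENTWISE_ADD);
    node_ > std::make_shared<framework::Node>(G_OP_TYPE_RELU);
  }
  std::string Type() { return "DemoAddRelu"; }  // hypothetical fused op type
};

// The optimizer then collapses a matched subgraph in place, using the new
// pointer-based signature:
//   matcher->FolderNodes(match_node.get());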
@@ -14,6 +14,7 @@ limitations under the License. */
 #include <sstream>
+#include "framework/operator.h"
 #include "framework/program/program-optimize/node.h"
 namespace paddle_mobile {
@@ -73,24 +74,86 @@ void Node::OpDescs(uint index,
 }
 void Node::OpDescs(std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
-                   Node *node) {
-  auto iter = std::find(op_desc->begin(), op_desc->end(), this->op_desc_);
+                   Node *node, bool adding_thread, int thread_num) {
+  bool can_add_split = false;
+  if (outputs_.size() > 1) {
+    can_add_split = true;
+    if (op_input_output_key[op_desc_->type_].second.size() != 1) {
+      DLOG << "current op desc does not have exactly 1 output";
+      can_add_split = false;
+    }
+    for (const auto &output : outputs_) {
+      if (op_input_output_key.find(output->op_desc_->type_) !=
+          op_input_output_key.end()) {
+        auto inputs_and_outputs = op_input_output_key[output->op_desc_->type_];
+        auto outputs_of_output =
+            output->op_desc_->Output(inputs_and_outputs.second[0]);
+        auto inputs_of_output =
+            output->op_desc_->Input(inputs_and_outputs.first[0]);
+        for (int i = 0; i < inputs_of_output.size(); ++i) {
+          std::string input_of_output = inputs_of_output[i];
+          for (int j = 0; j < outputs_of_output.size(); ++j) {
+            std::string output_of_output = outputs_of_output[j];
+            if (input_of_output == output_of_output) {
+              DLOG << "the output's outputs contain input " << input_of_output;
+              can_add_split = false;
+              break;
+            }
+          }
+        }
+      } else {
+        DLOG << "cannot find this op type: " << output->op_desc_->type_;
+        can_add_split = false;
+      }
+    }
+  }
   if (inputs_.size() > 1 && node != inputs_.back()) {
     return;
   } else if (inputs_.size() > 1 && node == inputs_.back()) {
+    adding_thread = false;
     op_desc->push_back(this->op_desc_);
   } else {
     op_desc->push_back(this->op_desc_);
   }
+  if (adding_thread) {
+    Attribute attr;
+    attr.Set<int>(thread_num);
+    this->op_desc_->attrs_["thread"] = attr;
+  }
-  for (auto &output : outputs_) {
-    output->OpDescs(op_desc, this);
+  if (can_add_split) {
+    adding_thread = true;
+    std::shared_ptr<class OpDesc> split_op_desc =
+        std::make_shared<class OpDesc>();
+    split_op_desc->type_ = G_OP_TYPE_SPLIT;
+    auto outputs = this->op_desc_->Output(
+        op_input_output_key[this->op_desc_->Type()].second[0]);
+    split_op_desc->inputs_ = {
+        {op_input_output_key[G_OP_TYPE_SPLIT].first[0], outputs}};
+    auto &split_outputs =
+        split_op_desc->outputs_[op_input_output_key[G_OP_TYPE_SPLIT].second[0]];
+    for (const auto &output : outputs_) {
+      split_outputs.push_back(outputs[0]);
+    }
+    DLOG << "add split";
+    op_desc->push_back(split_op_desc);
+  }
+  for (int i = 0; i < outputs_.size(); ++i) {
+    auto &output = outputs_[i];
+    if (can_add_split) {
+      output->OpDescs(op_desc, this, adding_thread, i);
+    } else {
+      output->OpDescs(op_desc, this, adding_thread, thread_num);
+    }
   }
 }
 std::vector<std::shared_ptr<framework::OpDesc>> Node::OpDescs() {
   std::vector<std::shared_ptr<framework::OpDesc>> op_descs;
-  OpDescs(&op_descs, this);
+  OpDescs(&op_descs, this, false, 0);
   return op_descs;
 }
......
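In effect, when a node's single output tensor feeds several consumers, this pass splices in a `split` op and then tags every op on branch `i` with an int attribute `"thread" = i`. A sketch of the rewrite it performs (the tensor name is illustrative):

//  Before:  conv2d ──> {branch_a, branch_b}      (one tensor, two readers)
//  After:   conv2d ──> split ──> {branch_a, branch_b}
auto split_op_desc = std::make_shared<framework::OpDesc>();
split_op_desc->type_ = G_OP_TYPE_SPLIT;
split_op_desc->inputs_ = {
    {op_input_output_key[G_OP_TYPE_SPLIT].first[0],  // "X"
     {"conv2d_0.tmp_0"}}};                           // illustrative tensor name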
@@ -42,13 +42,13 @@ class Node : PaddleMobileObject {
       std::map<std::string, std::pair<std::string, std::string>> change_map);
   std::vector<std::shared_ptr<framework::OpDesc>> OpDescs(uint size);
   std::vector<std::shared_ptr<framework::OpDesc>> OpDescs();
-  void OpDescs(std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
-               Node *node);
   std::shared_ptr<framework::OpDesc> OpDesc() { return op_desc_; }
   std::string BeginType() { return type_; }
   void Description();
  private:
+  void OpDescs(std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
+               Node *node, bool adding_thread, int thread_num);
   void OpDescs(uint size,
                std::vector<std::shared_ptr<framework::OpDesc>> *op_desc);
   void To(int index, std::shared_ptr<Node>);
......
@@ -19,7 +19,7 @@ namespace paddle_mobile {
 namespace framework {
-std::shared_ptr<ProgramDesc> ProgramOptimize::Optimize() {}
+// std::shared_ptr<ProgramDesc> ProgramOptimize::Optimize() {}
 std::shared_ptr<ProgramDesc> ProgramOptimize::FushionOptimize(
     std::shared_ptr<ProgramDesc> ori_des) {
@@ -86,7 +86,7 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FushionOptimize(
   //          DLOG << " match success " << " fusion node: \n" <<
   //          matcher->BeginNode() << "\nsub node: \n" << *sub_node;
   //          DLOG << "match node\n"<< *match_node;
-          matcher->FolderNodes(*match_node);
+          matcher->FolderNodes(match_node.get());
   //          DLOG << " after match node\n"<< *match_node;
   //          match_node->Description();
......
@@ -27,7 +27,6 @@ namespace framework {
 class ProgramOptimize {
  public:
   ProgramOptimize() {}
-  std::shared_ptr<ProgramDesc> Optimize();
   std::shared_ptr<ProgramDesc> FushionOptimize(
       std::shared_ptr<ProgramDesc> ori_des);
......
@@ -15,11 +15,13 @@ limitations under the License. */
 #include "io.h"
 #include <fstream>
 #include <vector>
+#include "common/enforce.h"
 #include "common/log.h"
-#include "common/enforce.h"
 #include "framework/framework.pb-c.h"
 #include "framework/lod_tensor.h"
 #include "framework/operator.h"
+#include "framework/program/program-optimize/program_optimize.h"
 #include "framework/program/program_desc.h"
 #include "framework/program/var_desc.h"
 #include "framework/scope.h"
@@ -166,7 +168,7 @@ void Loader<Dtype, P>::LoadVar(framework::Variable *variable,
 template <typename Dtype, Precision P>
 const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
-    const std::string &dirname) {
+    const std::string &dirname, bool optimize) {
   std::string model_filename = dirname + "/__model__";
   PaddleMobile__Framework__Proto__ProgramDesc *c_program;
   uint8_t *buf = NULL;
@@ -203,7 +205,6 @@ const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
       if (var_desc->Persistable() &&
           var_desc->Type() != framework::VARTYPE_TYPE_FEED_MINIBATCH &&
           var_desc->Type() != framework::VARTYPE_TYPE_FETCH_LIST) {
-        // DLOG << "to load var ";
         auto dim = var_desc->Tensor_desc().Dims();
         auto tensor = var->GetMutable<framework::LoDTensor>();
         tensor->Resize(framework::make_ddim(dim));
@@ -219,8 +220,13 @@ const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
       }
     }
   }
-  originProgramDesc->Description("program: ");
+  //  originProgramDesc->Description("program: ");
+  if (optimize) {
+    framework::ProgramOptimize program_optimize;
+    program.optimizeProgram =
+        program_optimize.FushionOptimize(originProgramDesc);
+  }
   paddle_mobile__framework__proto__program_desc__free_unpacked(c_program, NULL);
   return program;
@@ -231,33 +237,9 @@ template class Loader<CPU, Precision::FP32>;
 #pragma mark - executor
 template <typename Dtype, Precision P>
-Executor<Dtype, P>::Executor(const framework::Program<Dtype> p) : program_(p) {
-  if (use_optimize_) {
-    to_predict_program_ = program_.optimizeProgram;
-  } else {
-    to_predict_program_ = program_.originProgram;
-  }
-  const std::vector<std::shared_ptr<framework::BlockDesc>> blocks =
-      to_predict_program_->Blocks();
-  for (int i = 0; i < blocks.size(); ++i) {
-    std::shared_ptr<framework::BlockDesc> block_desc = blocks[i];
-    std::vector<std::shared_ptr<framework::OpDesc>> ops = block_desc->Ops();
-    for (int j = 0; j < ops.size(); ++j) {
-      std::shared_ptr<framework::OpDesc> op = ops[j];
-      auto op_base = framework::OpRegistry<Dtype>::CreateOp(
-          op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(),
-          program_.scope);
-      op_base->InferShape();
-      ops_of_block_[*block_desc.get()].push_back(op_base);
-    }
-  }
-  InitMemory();
-}
-template <typename Dtype, Precision P>
-Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size)
-    : program_(p), batch_size_(batch_size) {
+Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
+                             bool use_optimize)
+    : program_(p), batch_size_(batch_size), use_optimize_(use_optimize) {
   if (use_optimize_) {
     to_predict_program_ = program_.optimizeProgram;
   } else {
@@ -389,7 +371,7 @@ void Executor<Dtype, P>::InitMemory() {
 }
 template <typename Dtype, Precision P>
-void Executor<Dtype, P>::predict(const framework::Tensor &t, int block_id) {
+void Executor<Dtype, P>::Predict(const framework::Tensor &t, int block_id) {
   framework::Variable *g_feed_value = program_.scope->Var("feed");
   framework::Tensor *feed_tensor =
       g_feed_value->GetMutable<framework::LoDTensor>();
@@ -404,11 +386,11 @@ void Executor<Dtype, P>::predict(const framework::Tensor &t, int block_id) {
 }
 template <typename Dtype, Precision P>
-std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::predict(
+std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
     const std::vector<Ptype> &input, const std::vector<int64_t> &dims) {
   framework::Tensor tensor(input, framework::make_ddim(dims));
-  predict(tensor, 0);
+  Predict(tensor, 0);
   framework::Variable *g_feed_value = program_.scope->Var("col");
   auto feed_tensor = g_feed_value->GetMutable<framework::Tensor>();
......
@@ -30,7 +30,8 @@ namespace paddle_mobile {
 template <typename Dtype, Precision P = Precision::FP32>
 class Loader : PaddleMobileObject {
  public:
-  const framework::Program<Dtype, P> Load(const std::string &dirname);
+  const framework::Program<Dtype, P> Load(const std::string &dirname,
+                                          bool optimize = true);
  private:
   void LoadVar(framework::Variable *variable,
@@ -45,13 +46,12 @@ class Executor {
   Executor() = default;
-  Executor(const framework::Program<Dtype> p);
-  Executor(const framework::Program<Dtype> p, int batch_size);
-  std::shared_ptr<framework::Tensor> predict(framework::Tensor &t);
-  std::vector<Ptype> predict(const std::vector<Ptype> &input,
+  Executor(const framework::Program<Dtype> p, int batch_size = 1,
+           bool use_optimize = true);
+  //  std::shared_ptr<framework::Tensor> Predict(framework::Tensor &t);
+  std::vector<Ptype> Predict(const std::vector<Ptype> &input,
                              const std::vector<int64_t> &dims);
  protected:
@@ -61,7 +61,7 @@ class Executor {
   framework::Program<Dtype> program_;
   int batch_size_ = 1;
   std::shared_ptr<framework::ProgramDesc> to_predict_program_;
-  void predict(const framework::Tensor &t, int block_id);
+  void Predict(const framework::Tensor &t, int block_id);
   std::map<framework::BlockDesc,
            std::vector<std::shared_ptr<framework::OperatorBase<Dtype>>>>
       ops_of_block_;
......
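Putting the new signatures together, a load-and-run sketch (the model path and input shape are illustrative; this mirrors the updated tests below):

paddle_mobile::Loader<paddle_mobile::CPU> loader;
// optimize = false skips the fusion pass and keeps the original program.
auto program = loader.Load("../models/googlenet", false);
paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, false);
std::vector<int64_t> dims{1, 3, 224, 224};
std::vector<float> input(1 * 3 * 224 * 224, 1.f);
auto result = executor.Predict(input, dims);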
@@ -21,13 +21,6 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {
-int ConvOutputSize(int input_size, int filter_size, int dilation, int padding,
-                   int stride) {
-  const int dkernel = dilation * (filter_size - 1) + 1;
-  int output_size = (input_size + 2 * padding - dkernel) / stride + 1;
-  return output_size;
-}
 template <typename Dtype, typename T>
 void ConvOp<Dtype, T>::InferShape() const {
   //  std::cout << " begin get dims: " << std::endl;
......
@@ -44,5 +44,12 @@ class ConvOp : public framework::OperatorWithKernel<DeviceType> {
   ConvParam param_;
 };
+inline int ConvOutputSize(int input_size, int filter_size, int dilation,
+                          int padding, int stride) {
+  const int dkernel = dilation * (filter_size - 1) + 1;
+  int output_size = (input_size + 2 * padding - dkernel) / stride + 1;
+  return output_size;
+}
 }  // namespace operators
 }  // namespace paddle_mobile
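A worked example of the formula, using GoogLeNet's first 7x7 stride-2 convolution on a 224 input (values plugged into the function above):

int dkernel = 1 * (7 - 1) + 1;              // dilation 1 -> dkernel = 7
int out = (224 + 2 * 3 - dkernel) / 2 + 1;  // (224 + 6 - 7) / 2 + 1 = 112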
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/depthwise_conv_op.h"
#include <vector>
#include "framework/data_type.h"
#include "framework/op_proto_maker.h"
#include "framework/op_registry.h"
#include "operators/conv_op.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void DepthwiseConvOp<Dtype, T>::InferShape() const {
auto in_dims = param_.Input()->dims();
auto filter_dims = param_.Filter()->dims();
const std::vector<int> &strides = param_.Strides();
std::vector<int> paddings = param_.Paddings();
int groups = param_.Groups();
std::vector<int> dilations = param_.Dilations();
PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
dilations.size() == paddings.size() &&
paddings.size() == strides.size()),
"ConvParam is not suitable");
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
dilations[i], paddings[i],
strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
param_.Output()->Resize(ddim);
}
template class DepthwiseConvOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
USE_OP(depthwise_conv2d);
REGISTER_OPERATOR(depthwise_conv2d, ops::DepthwiseConvOp);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/depthwise_conv_kernel.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class DepthwiseConvOp : public framework::OperatorWithKernel<DeviceType> {
public:
DepthwiseConvOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType>(type, inputs, outputs, attrs,
scope),
param_(inputs, outputs, attrs, *scope) {}
using framework::OperatorWithKernel<DeviceType>::OperatorWithKernel;
void InferShape() const override;
void RunImpl() const {
operators::DepthwiseConvKernel<DeviceType, T> kernel;
kernel.Compute(param_);
this->ClearVariables({"Filter", "Input"});
}
private:
ConvParam param_;
};
} // namespace operators
} // namespace paddle_mobile
@@ -23,18 +23,18 @@ namespace operators {
 class FushionConvAddReluOpMatcher : public framework::FusionOpMatcher {
  public:
   FushionConvAddReluOpMatcher() {
-    node_ = framework::Node("conv2d");
-    node_ > std::make_shared<framework::Node>("elementwise_add") >
-        std::make_shared<framework::Node>("relu");
+    node_ = framework::Node(G_OP_TYPE_CONV);
+    node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
+        std::make_shared<framework::Node>(G_OP_TYPE_RELU);
   }
   void FolderNodes(framework::Node &node) {
     std::vector<std::shared_ptr<framework::OpDesc>> origin_descs =
         node.OpDescs(node_.Depth());
-    node.Folder(node_.Depth(), Type(), {{"elementwise_add", {"Y", "Z"}}});
+    node.Folder(node_.Depth(), Type(),
+                {{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Z"}}});
   }
-  std::string Type() { return "FusionConvAddRelu"; }
+  std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_RELU; }
 };
 class FusionFcOp {
......
@@ -28,17 +28,18 @@ using std::vector;
 class FusionFcMatcher : public framework::FusionOpMatcher {
  public:
   FusionFcMatcher() {
-    node_ = framework::Node("mul");
-    node_ > std::make_shared<framework::Node>("elementwise_add");
+    node_ = framework::Node(G_OP_TYPE_MUL);
+    node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD);
   }
   void FolderNodes(framework::Node &node) {
     vector<std::shared_ptr<framework::OpDesc>> origin_descs =
         node.OpDescs(node_.Depth());
-    node.Folder(node_.Depth(), Type(), {{"elementwise_add", {"Y", "Z"}}});
+    node.Folder(node_.Depth(), Type(),
+                {{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Z"}}});
   }
-  std::string Type() { return "fc"; }
+  std::string Type() { return G_OP_TYPE_FC; }
 };
 template <typename DeviceType, typename T>
......
@@ -17,19 +17,6 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {
-bool IsExpand(const std::vector<int64_t> &filter_dim,
-              const std::vector<int> &strides, const std::vector<int> &paddings,
-              const std::vector<int> &dilations) {
-  bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
-  for (size_t j = 0; j < strides.size(); ++j) {
-    filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);
-    strides_1 = strides_1 && (strides[j] == 1);
-    padding_0 = padding_0 && (paddings[j] == 0);
-    dilation_1 = dilation_1 && (dilations[j] == 1);
-  }
-  return !(filter_1 && strides_1 && padding_0 && dilation_1);
-}
 template <>
 void ConvKernel<CPU, float>::Compute(const ConvParam &param) const {
   LOG(kLOG_DEBUG) << param;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/kernel/depthwise_conv_kernel.h"
#include "operators/kernel/conv_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const {
LOG(kLOG_DEBUG) << param;
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
Tensor *output = param.Output();
output->mutable_data<float>();
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations();
DLOG << " compute end get Attrs " << strides[0];
const int batch_size = static_cast<int>(input->dims()[0]);
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = input->dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
framework::DDim col_shape(framework::make_ddim(col_shape_vec));
framework::DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, data_dim + 1);
bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
Tensor col_matrix;
if (is_expand) {
col.mutable_data<float>(col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
DLOG << " col_shape = " << col_shape;
DLOG << " col_matrix_shape = " << col_matrix_shape;
framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int>(input->dims().size()));
DLOG << " input_shape = " << input_shape;
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
DLOG << " filter.dims() = " << filter.dims();
framework::DDim output_matrix_shape = {
output->dims()[1],
output->numel() / (output->dims()[0] * output->dims()[1])};
// convolution operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(input->dims()[1]) / groups;
int out_step = static_cast<int>(output->dims()[1]) / groups;
math::Vol2ColFunctor<CPU, float> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
DLOG << " in_batch.dims() = " << in_batch.dims();
DLOG << " out_batch.dims() = " << out_batch.dims();
for (int g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
// im2col
im2col(in_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[1], paddings[0],
paddings[1]},
&col);
} else if (data_dim == 3U) {
// vol2col
vol2col(in_slice, dilations, strides, paddings, &col);
}
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
DLOG << " out_slice " << out_slice.dims();
DLOG << " filter_slice " << filter_slice.dims();
DLOG << " col_matrix " << col_matrix.dims();
math::matmul<float>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice,
static_cast<float>(0));
auto filter_ptr = filter_slice.data<float>();
}
}
}
template class DepthwiseConvKernel<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
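A shape walkthrough of the loop above, assuming the configuration used by the new test further below (input 1x32x150x150, 3x3 depthwise filters, stride 1, padding 1, groups = 32):

//  filter dims       = {32, 1, 3, 3}; in_step = out_step = 32 / 32 = 1
//  col_shape         = {1, 3, 3, 150, 150}
//  col_matrix_shape  = {1 * 3 * 3, 150 * 150} = {9, 22500}
//  filter (resized)  = {32, 9}; per-group slice = {1, 9}
//  gemm per group:     {1, 9} x {9, 22500} -> one {1, 22500} row of the output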
@@ -25,6 +25,9 @@ struct ReluFunctor {
   inline T operator()(T in) const { return in > 0 ? in : 0; }
 };
+/*
+ * @b Platform-specific implementation; param is passed in from the op layer.
+ * */
 template <>
 void ReluKernel<CPU, float>::Compute(const ReluParam &param) const {
   const auto *input_x = param.InputX();
......
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <vector>
 #include "framework/operator.h"
 #include "operators/math/im2col.h"
 #include "operators/math/math_function.h"
@@ -23,12 +24,28 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {
-using namespace framework;
+using framework::OpKernelBase;
 template <typename DeviceType, typename T>
-class ConvKernel : public framework::OpKernelBase<DeviceType, ConvParam> {
+class ConvKernel : public OpKernelBase<DeviceType, ConvParam> {
  public:
   void Compute(const ConvParam &param) const;
 };
+inline bool IsExpand(const std::vector<int64_t> &filter_dim,
+                     const std::vector<int> &strides,
+                     const std::vector<int> &paddings,
+                     const std::vector<int> &dilations) {
+  bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
+  for (size_t j = 0; j < strides.size(); ++j) {
+    filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);
+    strides_1 = strides_1 && (strides[j] == 1);
+    padding_0 = padding_0 && (paddings[j] == 0);
+    dilation_1 = dilation_1 && (dilations[j] == 1);
+  }
+  return !(filter_1 && strides_1 && padding_0 && dilation_1);
+}
 }  // namespace operators
 }  // namespace paddle_mobile
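IsExpand in action: a 1x1 filter with stride 1, no padding, and dilation 1 needs no im2col expansion (the input already is the column matrix), while a 3x3 filter does. A small sketch with made-up shapes:

bool skip = IsExpand({64, 3, 1, 1}, {1, 1}, {0, 0}, {1, 1});  // false
bool need = IsExpand({64, 3, 3, 3}, {1, 1}, {1, 1}, {1, 1});  // true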
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "framework/operator.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"
#pragma once
namespace paddle_mobile {
namespace operators {
using framework::OpKernelBase;
template <typename DeviceType, typename T>
class DepthwiseConvKernel : public OpKernelBase<DeviceType, ConvParam> {
public:
void Compute(const ConvParam &param) const;
};
} // namespace operators
} // namespace paddle_mobile
@@ -696,6 +696,9 @@ class ReshapeParam : public OpParam {
   bool inplace_;
 };
+/*
+ * @b The op layer instantiates this param and passes it to the kernel layer.
+ * */
 class ReluParam : public OpParam {
  public:
   ReluParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
@@ -725,7 +728,6 @@ class FushionFcParam : public OpParam {
     y_num_col_dims_ = GetAttr<int>("y_num_col_dims", attrs);
     axis_ = GetAttr<int>("axis", attrs);
   }
-
   const Tensor *InputX() const { return input_x_; }
   const Tensor *InputY() const { return input_y_; }
......
@@ -25,6 +25,11 @@ template class ReluOp<CPU, float>;
 }  // namespace operators
 }  // namespace paddle_mobile
+/*
+ * @b Every op must be registered; the argument of USE_OP and the first
+ *    argument of REGISTER_OPERATOR must both match the op type used in the
+ *    model.
+ * */
 namespace ops = paddle_mobile::operators;
 USE_OP(relu);
 REGISTER_OPERATOR(relu, ops::ReluOp);
@@ -28,6 +28,9 @@ using paddle_mobile::framework::Tensor;
 template <typename DeviceType, typename T>
 class ReluOp : public framework::OperatorWithKernel<DeviceType> {
  public:
+  /*
+   * @b Constructor of the op; it must call the parent constructor and
+   *    instantiate its own parameter struct.
+   * */
   ReluOp(const std::string &type, const VariableNameMap &inputs,
          const VariableNameMap &outputs, const framework::AttributeMap attrs,
          std::shared_ptr<framework::Scope> scope)
@@ -35,6 +38,9 @@ class ReluOp : public framework::OperatorWithKernel<DeviceType> {
             scope),
         param_(inputs, outputs, attrs, *scope) {}
+  /*
+   * @b Runs the op's computation by invoking the corresponding kernel.
+   * */
   void RunImpl() const {
     operators::ReluKernel<DeviceType, T> kernel;
     kernel.Compute(param_);
@@ -44,6 +50,10 @@ class ReluOp : public framework::OperatorWithKernel<DeviceType> {
   void InferShape() const override;
  protected:
+  /*
+   * @b Struct holding the parameters the Relu kernel needs for its
+   *    computation; defined in paddle-mobile/src/operators/op_param.h
+   * */
   ReluParam param_;
 };
......
@@ -99,3 +99,7 @@ target_link_libraries(test-mobilenet paddle-mobile)
 # gen test
 ADD_EXECUTABLE(test-sigmoid operators/test_sigmoid_op.cpp test_include.h)
 target_link_libraries(test-sigmoid paddle-mobile)
+# gen test
+ADD_EXECUTABLE(test-depthwise-conv-op operators/test_depthwise_conv_op.cpp test_helper.h test_include.h executor_for_test.h)
+target_link_libraries(test-depthwise-conv-op paddle-mobile)
@@ -17,9 +17,9 @@ limitations under the License. */
 #include <string>
 #include <vector>
-#include "common/io.h"
 #include "common/log.h"
 #include "framework/op_registry.h"
+#include "io.h"
 #include "operators/conv_op.h"
 #include "operators/elementwise_add_op.h"
 #include "operators/pool_op.h"
@@ -73,10 +73,11 @@ class Executor4Test : public Executor<DeviceType> {
         }
       }
     }
+    this->InitMemory();
   }
   template <typename T = LoDTensor>
-  vector<std::shared_ptr<Tensor>> predict(const vector<Tensor> &ts,
+  vector<std::shared_ptr<Tensor>> Predict(const vector<Tensor> &ts,
                                           const vector<string> &input_names,
                                           const vector<string> &output_names,
                                           const vector<DDim> &ddims) {
@@ -115,7 +116,7 @@ class Executor4Test : public Executor<DeviceType> {
     return output_tensor_sptrs;
   }
-  std::shared_ptr<Tensor> predict(const Tensor &t, string input, string output,
+  std::shared_ptr<Tensor> Predict(const Tensor &t, string input, string output,
                                   const DDim &dDim) {
     auto scope = this->program_.scope;
     Variable *g_feed_value = scope->Var(input);
......
@@ -12,13 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "common/io.h"
+#include "../test_helper.h"
+#include "io.h"
 int main() {
   paddle_mobile::Loader<paddle_mobile::CPU> loader;
   //  ../../../test/models/googlenet
   //  ../../../test/models/mobilenet
-  auto program = loader.Load(std::string("../models/googlenet"));
+  auto program = loader.Load(g_googlenet);
+  program.optimizeProgram->Description("program desc: ");
   return 0;
 }
@@ -12,14 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "common/io.h"
+#include "../test_helper.h"
 #include "framework/program/program-optimize/node.h"
 #include "framework/program/program-optimize/program_optimize.h"
+#include "io.h"
 int main() {
   paddle_mobile::Loader<paddle_mobile::CPU> loader;
   //  "../../../test/models/googlenet"
-  auto program = loader.Load("../models/googlenet");
+  auto program = loader.Load(g_googlenet);
   paddle_mobile::framework::ProgramOptimize optimize;
   //  program.originProgram->Description("origin");
   auto optimize_program = optimize.FushionOptimize(program.originProgram);
......
@@ -21,16 +21,16 @@ int main() {
   //  ../../../test/models/googlenet
   //  ../../../test/models/mobilenet
   auto time1 = time();
-  auto program = loader.Load(std::string("../models/googlenet"));
+  auto program = loader.Load(g_googlenet, false);
   auto time2 = time();
   DLOG << "load cost :" << time_diff(time1, time2) << "ms";
-  paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1);
+  paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, false);
   std::vector<float> input;
   std::vector<int64_t> dims{1, 3, 224, 224};
   GetInput<float>(g_test_image_1x3x224x224, &input, dims);
   auto time3 = time();
-  executor.predict(input, dims);
+  executor.Predict(input, dims);
   auto time4 = time();
   DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
   return 0;
......
@@ -19,10 +19,10 @@ limitations under the License. */
 int main() {
   paddle_mobile::Loader<paddle_mobile::CPU> loader;
   auto time1 = time();
-  auto program = loader.Load(g_mobilenet);
+  auto program = loader.Load(g_mobilenet, false);
   auto time2 = time();
   DLOG << "load cost :" << time_diff(time1, time2) << "ms";
-  paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1);
+  paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, false);
   std::vector<int64_t> dims{1, 3, 224, 224};
   Tensor input_tensor;
@@ -32,7 +32,7 @@ int main() {
   std::vector<float> input(input_tensor.data<float>(),
                            input_tensor.data<float>() + input_tensor.numel());
   auto time3 = time();
-  executor.predict(input, dims);
+  executor.Predict(input, dims);
   auto time4 = time();
   DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
   return 0;
......
@@ -21,10 +21,10 @@ int main() {
   //  ../../../test/models/googlenet
   //  ../../../test/models/mobilenet
   auto time1 = time();
-  auto program = loader.Load(g_yolo);
+  auto program = loader.Load(g_yolo, false);
   auto time2 = time();
   DLOG << "load cost :" << time_diff(time1, time2) << "ms";
-  paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1);
+  paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, false);
   std::vector<int64_t> dims{1, 3, 227, 227};
   Tensor input_tensor;
@@ -34,7 +34,7 @@ int main() {
   std::vector<float> input(input_tensor.data<float>(),
                            input_tensor.data<float>() + input_tensor.numel());
   auto time3 = time();
-  executor.predict(input, dims);
+  executor.Predict(input, dims);
   auto time4 = time();
   DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
   return 0;
......
@@ -128,8 +128,7 @@ int main() {
   DLOG << "----------**********----------";
   DLOG << "begin to run BatchNormOp Test";
   paddle_mobile::Loader<paddle_mobile::CPU> loader;
-  auto program = loader.Load(std::string(
-      "../../test/models/image_classification_resnet.inference.model"));
+  auto program = loader.Load(std::string(g_resnet));
   ///  input x (4,10,2,2)
   paddle_mobile::framework::Tensor inputx1;
......
@@ -116,7 +116,7 @@ int main() {
   DLOG << "----------**********----------";
   DLOG << "begin to run BoxCoderOp Test";
   paddle_mobile::Loader<paddle_mobile::CPU> loader;
-  auto program = loader.Load(std::string("../../test/models/mobilenet+ssd"));
+  auto program = loader.Load(std::string(g_mobilenet_ssd));
   paddle_mobile::framework::Tensor priorbox;
   SetupTensor<float>(&priorbox, {1917, 4}, static_cast<float>(0),
......
@@ -57,7 +57,7 @@ int main() {
   auto out_ddim = paddle_mobile::framework::make_ddim({3, 100, 2, 2});
   out_ddims.push_back(out_ddim);
-  auto output = executor.predict<LoDTensor>(input_tensors, input_names,
+  auto output = executor.Predict<LoDTensor>(input_tensors, input_names,
                                             output_names, out_ddims);
   auto output0_data = output[0]->data<float>();
......
@@ -34,7 +34,7 @@ int main() {
   //                   static_cast<float>(1));
   auto out_ddim = paddle_mobile::framework::make_ddim({1, 64, 112, 112});
-  auto output = executor.predict(input, "data", "conv2d_0.tmp_0", out_ddim);
+  auto output = executor.Predict(input, "data", "conv2d_0.tmp_0", out_ddim);
   auto output_ptr = output->data<float>();
   for (int j = 0; j < output->numel(); ++j) {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../executor_for_test.h"
#include "../test_include.h"
#include "operators/depthwise_conv_op.h"
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
// ../models/image_classification_resnet.inference.model
auto program = loader.Load(g_mobilenet_ssd);
PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
"program file read fail");
Executor4Test<paddle_mobile::CPU, paddle_mobile::operators::DepthwiseConvOp<
paddle_mobile::CPU, float>>
executor(program, "depthwise_conv2d");
paddle_mobile::framework::LoDTensor input;
// GetInput<float>(g_test_image_1x3x224x224, &input, {1, 3, 224, 224});
// use SetupTensor if not has local input image .
SetupTensor<float>(&input, {1, 32, 150, 150}, static_cast<float>(0),
static_cast<float>(1));
auto input_ptr = input.data<float>();
auto out_ddim = paddle_mobile::framework::make_ddim({1, 32, 150, 150});
auto output = executor.Predict(input, "batch_norm_0.tmp_3",
"depthwise_conv2d_0.tmp_0", out_ddim);
auto output_ptr = output->data<float>();
for (int j = 0; j < output->numel(); ++j) {
DLOG << " value of output: " << output_ptr[j];
}
return 0;
}
@@ -50,7 +50,7 @@ int main() {
   auto out_ddim = paddle_mobile::framework::make_ddim({1, 3, 224, 224});
   out_ddims.push_back(out_ddim);
-  auto output = executor.predict<LoDTensor>(input_tensors, input_names,
+  auto output = executor.Predict<LoDTensor>(input_tensors, input_names,
                                             output_names, out_ddims);
   auto output0_data = output[0]->data<float>();
......
@@ -116,7 +116,7 @@ int main() {
   DLOG << "begin to run Fc Test";
   paddle_mobile::Loader<paddle_mobile::CPU> loader;
   //  "../../../test/models/googlenet"
-  auto program = loader.Load("../models/googlenet");
+  auto program = loader.Load(g_googlenet);
   paddle_mobile::framework::ProgramOptimize optimize;
   //  program.originProgram->Description("origin");
   auto optimize_program = optimize.FushionOptimize(program.originProgram);
......
@@ -46,7 +46,7 @@ int main() {
   auto out_ddim = paddle_mobile::framework::make_ddim({3, 4, 2, 2});
   out_ddims.push_back(out_ddim);
-  auto output = executor.predict<LoDTensor>(input_tensors, input_names,
+  auto output = executor.Predict<LoDTensor>(input_tensors, input_names,
                                             output_names, out_ddims);
   auto output0_data = output[0]->data<float>();
......
@@ -50,7 +50,7 @@ int main() {
   auto out_ddim = paddle_mobile::framework::make_ddim({3, 3});
   out_ddims.push_back(out_ddim);
-  auto output = executor.predict<LoDTensor>(input_tensors, input_names,
+  auto output = executor.Predict<LoDTensor>(input_tensors, input_names,
                                             output_names, out_ddims);
   auto output0_data = output[0]->data<float>();
......
@@ -14,11 +14,11 @@ limitations under the License. */
 #include "../executor_for_test.h"
 #include "../test_helper.h"
-#include "common/io.h"
+#include "io.h"
 int main() {
   paddle_mobile::Loader<paddle_mobile::CPU> loader;
-  auto program = loader.Load(std::string("../models/googlenet"));
+  auto program = loader.Load(std::string(g_googlenet));
   if (program.originProgram == nullptr) {
     DLOG << "program read file";
   }
@@ -32,7 +32,7 @@ int main() {
                      static_cast<float>(1));
   auto out_ddim = paddle_mobile::framework::make_ddim({1, 64, 56, 56});
   auto output =
-      executor.predict(input, "conv2d_0.tmp_1", "pool2d_0.tmp_0", out_ddim);
+      executor.Predict(input, "conv2d_0.tmp_1", "pool2d_0.tmp_0", out_ddim);
   float *output_ptr = output->data<float>();
   for (int j = 0; j < output->numel(); ++j) {
......
@@ -127,7 +127,7 @@ int main() {
   DLOG << "----------**********----------";
   DLOG << "begin to run PriorBoxOp Test";
   paddle_mobile::Loader<paddle_mobile::CPU> loader;
-  auto program = loader.Load(std::string("../../test/models/mobilenet+ssd"));
+  auto program = loader.Load(std::string(g_mobilenet_ssd));
   ///  input x (1,3,300,300)
   paddle_mobile::framework::Tensor input_image;
......
@@ -46,7 +46,7 @@ int main() {
   auto out_ddim = paddle_mobile::framework::make_ddim({1, 2, 3, 4});
   out_ddims.push_back(out_ddim);
-  auto output = executor.predict<LoDTensor>(input_tensors, input_names,
+  auto output = executor.Predict<LoDTensor>(input_tensors, input_names,
                                             output_names, out_ddims);
   auto output0_data = output[0]->data<float>();
......
@@ -14,11 +14,11 @@ limitations under the License. */
 #include "../executor_for_test.h"
 #include "../test_helper.h"
-#include "common/io.h"
+#include "io.h"
 int main() {
   paddle_mobile::Loader<paddle_mobile::CPU> loader;
-  auto program = loader.Load(std::string("../../test/models/mobilenet+ssd"));
+  auto program = loader.Load(std::string(g_mobilenet_ssd));
   if (program.originProgram == nullptr) {
     DLOG << "program read file";
   }
@@ -31,7 +31,7 @@ int main() {
   auto input_ptr = input.data<float>();
   auto out_ddim = paddle_mobile::framework::make_ddim({2, 9, 2});
   auto output =
-      executor.predict(input, "transpose_0.tmp_0", "reshape_0.tmp_0", out_ddim);
+      executor.Predict(input, "transpose_0.tmp_0", "reshape_0.tmp_0", out_ddim);
   auto *output_ptr = output->data<float>();
   DLOG << "input : ";
......
@@ -14,7 +14,7 @@ limitations under the License. */
 #include "../../src/operators/kernel/sigmoid_kernel.h"
 #include "../test_helper.h"
-#include "common/io.h"
+#include "io.h"
 int main() {
   paddle_mobile::framework::Tensor input;
......
@@ -14,11 +14,11 @@ limitations under the License. */
 #include "../executor_for_test.h"
 #include "../test_helper.h"
-#include "common/io.h"
+#include "io.h"
 int main() {
   paddle_mobile::Loader<paddle_mobile::CPU> loader;
-  auto program = loader.Load(std::string("../models/mobilenet"));
+  auto program = loader.Load(std::string(g_mobilenet));
   if (program.originProgram == nullptr) {
     DLOG << "program read file";
   }
@@ -30,7 +30,7 @@ int main() {
                      static_cast<float>(1));
   auto out_ddim = paddle_mobile::framework::make_ddim({1, 1000});
   auto output =
-      executor.predict(input, "reshape_0.tmp_0", "softmax_0.tmp_0", out_ddim);
+      executor.Predict(input, "reshape_0.tmp_0", "softmax_0.tmp_0", out_ddim);
   auto *output_ptr = output->data<float>();
   for (int j = 0; j < output->numel(); ++j) {
     DLOG << " value of output: " << output_ptr[j];
......
@@ -14,11 +14,11 @@ limitations under the License. */
 #include "../executor_for_test.h"
 #include "../test_helper.h"
-#include "common/io.h"
+#include "io.h"
 int main() {
   paddle_mobile::Loader<paddle_mobile::CPU> loader;
-  auto program = loader.Load(std::string("../../test/models/mobilenet+ssd"));
+  auto program = loader.Load(std::string(g_mobilenet_ssd));
   if (program.originProgram == nullptr) {
     DLOG << "program read file";
   }
@@ -31,7 +31,7 @@ int main() {
   auto input_ptr = input.data<float>();
   auto out_ddim = paddle_mobile::framework::make_ddim({1, 3, 4, 2});
   auto output =
-      executor.predict(input, "conv2d_22.tmp_1", "transpose_0.tmp_0", out_ddim);
+      executor.Predict(input, "conv2d_22.tmp_1", "transpose_0.tmp_0", out_ddim);
   auto *output_ptr = output->data<float>();
   DLOG << "input : ";
......
@@ -20,7 +20,6 @@ limitations under the License. */
 #include "./test_helper.h"
 #include "common/enforce.h"
-#include "common/io.h"
 #include "common/log.h"
 #include "framework/lod_tensor.h"
 #include "framework/operator.h"
@@ -30,3 +29,4 @@ limitations under the License. */
 #include "framework/scope.h"
 #include "framework/tensor.h"
 #include "framework/variable.h"
+#include "io.h"