Unverified commit 6ad09ae2 authored by: K kk12333 committed by: GitHub

Merge pull request #1 from PaddlePaddle/develop

pull
cmake_minimum_required(VERSION 3.0)
project(paddle-mobile)
add_definitions(-DPADDLE_MOBILE_DEBUG="true")
add_definitions(-DPADDLE_MOBILE_DEBUG)
add_definitions(-DENABLE_EXCEPTION)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
set(CMAKE_BUILD_TYPE RelWithDebInfo)
......
#!/usr/bin/env sh
push_fn () {
MODELS_PATH="../test/models/*"
EXE_FILE="../test/build/*"
EXE_DIR="data/local/tmp/bin"
MODELS_DIR="data/local/tmp/models"
LIB_PATH="../build/release/arm-v7a/build/*"
adb push ${EXE_FILE} ${EXE_DIR}
adb push ${LIB_PATH} ${EXE_DIR}
adb push ${MODELS_PATH} ${MODELS_DIR}
echo "test files sync completed"
}
push_fn
......@@ -14,7 +14,7 @@ limitations under the License. */
#pragma once
#ifdef PADDLE_MOBILE_DEBUG
#ifdef ENABLE_EXCEPTION
#include <stdio.h>
#include <exception>
#include <sstream>
......@@ -25,7 +25,7 @@ limitations under the License. */
namespace paddle_mobile {
#ifdef PADDLE_MOBILE_DEBUG
#ifdef ENABLE_EXCEPTION
struct PaddleMobileException : public std::exception {
const std::string exception_prefix = "paddle mobile C++ Exception: \n";
std::string message;
......@@ -64,7 +64,7 @@ struct PaddleMobileException : public std::exception {
}
#else
#define PADDLE_MOBILE_THROW_EXCEPTION(...)
#define PADDLE_MOBILE_ASSERT(stat, ...)
#define PADDLE_MOBILE_ENFORCE(stat, ...)
#endif
} // namespace paddle_mobile
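// A minimal usage sketch of the macros above (CheckBatchSize is a
// hypothetical call site; the printf-style formatting matches the
// ReadBuffer usage later in this diff). With ENABLE_EXCEPTION defined, a
// failed check throws PaddleMobileException; otherwise both macros
// compile away.
inline void CheckBatchSize(int batch_size) {
  PADDLE_MOBILE_ENFORCE(batch_size > 0, "batch_size must be positive, got %d",
                        batch_size);
  if (batch_size > 512) {
    PADDLE_MOBILE_THROW_EXCEPTION("batch_size %d is larger than supported",
                                  batch_size);
  }
}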
......@@ -14,6 +14,10 @@ limitations under the License. */
#pragma once
#include <string>
#include <unordered_map>
#include <utility>
namespace paddle_mobile {
enum class Precision : int { FP32 = 0 };
......@@ -67,4 +71,49 @@ enum PMStatus {
PMUnImplError = 0x07, /*!< Unimplemented error. */
PMWrongDevice = 0x08  /*!< Incorrect device. */
};
static const std::string G_OP_TYPE_CONV = "conv2d";
static const std::string G_OP_TYPE_BATCHNORM = "batch_norm";
static const std::string G_OP_TYPE_BOX_CODER = "box_coder";
static const std::string G_OP_TYPE_CONCAT = "concat";
static const std::string G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
static const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU =
"fusion_conv_add_relu";
static const std::string G_OP_TYPE_FC = "fc";
static const std::string G_OP_TYPE_LRN = "lrn";
static const std::string G_OP_TYPE_MUL = "mul";
static const std::string G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms";
static const std::string G_OP_TYPE_POOL2D = "pool2d";
static const std::string G_OP_TYPE_PRIOR_BOX = "prior_box";
static const std::string G_OP_TYPE_RELU = "relu";
static const std::string G_OP_TYPE_RESHAPE = "reshape";
static const std::string G_OP_TYPE_SIGMOID = "sigmoid";
static const std::string G_OP_TYPE_SOFTMAX = "softmax";
static const std::string G_OP_TYPE_TRANSPOSE = "transpose";
static const std::string G_OP_TYPE_SPLIT = "split";
static const std::string G_OP_TYPE_FEED = "feed";
static const std::string G_OP_TYPE_FETCH = "fetch";
static const std::string G_OP_TYPE_DEPTHWISE_CONV = "depthwise_conv2d";
static std::unordered_map<
std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
op_input_output_key = {
{G_OP_TYPE_CONV, {{"Input"}, {"Output"}}},
{G_OP_TYPE_RELU, {{"X"}, {"Out"}}},
{G_OP_TYPE_SOFTMAX, {{"X"}, {"Out"}}},
{G_OP_TYPE_MUL, {{"X"}, {"Out"}}},
{G_OP_TYPE_ELEMENTWISE_ADD, {{"X", "Y"}, {"Out"}}},
{G_OP_TYPE_POOL2D, {{"X"}, {"Out"}}},
{G_OP_TYPE_BATCHNORM, {{"X"}, {"Y"}}},
{G_OP_TYPE_LRN, {{"X"}, {"Out"}}},
{G_OP_TYPE_CONCAT, {{"X"}, {"Out"}}},
{G_OP_TYPE_SPLIT, {{"X"}, {"Out"}}},
{G_OP_TYPE_FEED, {{"X"}, {"Out"}}},
{G_OP_TYPE_FETCH, {{"X"}, {"Out"}}},
{G_OP_TYPE_TRANSPOSE, {{"X"}, {"Out"}}},
{G_OP_TYPE_BOX_CODER,
{{"PriorBox", "PriorBoxVar", "TargetBox"}, {"OutputBox"}}},
{G_OP_TYPE_PRIOR_BOX, {{"Image", "Input"}, {"Boxes", "Variances"}}},
{G_OP_TYPE_MULTICLASS_NMS, {{"BBoxes", "Scores"}, {"Out"}}},
{G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}}};
} // namespace paddle_mobile
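// A minimal lookup sketch for the op_input_output_key table above; it
// mirrors the pattern OperatorBase::GetOutKeys uses later in this diff
// (OutKeysOf is a hypothetical helper, not part of the tree).
inline std::vector<std::string> OutKeysOf(const std::string &op_type) {
  auto it = paddle_mobile::op_input_output_key.find(op_type);
  if (it == paddle_mobile::op_input_output_key.end()) {
    return {};
  }
  return it->second.second;  // e.g. {"Out"} for elementwise_add
}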
......@@ -90,14 +90,6 @@ class OpRegistry {
const std::string& type, const VariableNameMap& inputs,
const VariableNameMap& outputs, const AttributeMap attrs,
std::shared_ptr<paddle_mobile::framework::Scope> scope) {
LOG(paddle_mobile::kLOG_DEBUG1) << " type: " << type;
LOG(paddle_mobile::kLOG_DEBUG1) << " input size: " << inputs.size();
LOG(paddle_mobile::kLOG_DEBUG1) << " output size: " << outputs.size();
LOG(paddle_mobile::kLOG_DEBUG1) << " attr size: " << attrs.size();
LOG(paddle_mobile::kLOG_DEBUG1)
<< " OpInfoMap size: " << OpInfoMap<Dtype>::Instance()->map().size();
LOG(paddle_mobile::kLOG_DEBUG1) << " has type: " << type << " "
<< OpInfoMap<Dtype>::Instance()->Has(type);
auto& info = OpInfoMap<Dtype>::Instance()->Get(type);
auto op = info.Creator()(type, inputs, outputs, attrs, scope);
return std::shared_ptr<OperatorBase<Dtype>>(op);
......
......@@ -13,11 +13,21 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "framework/operator.h"
#include "framework/op_info.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace framework {
template <typename Dtype>
vector<string> OperatorBase<Dtype>::GetOutKeys() const {
auto it = op_input_output_key.find(type_);
if (it == op_input_output_key.end()) {
DLOG << type_ << " has no outputs";
return {};
}
return it->second.second;
}
template <typename Dtype>
OperatorBase<Dtype>::OperatorBase(const std::string &type,
const VariableNameMap &inputs,
......@@ -31,9 +41,22 @@ OperatorBase<Dtype>::OperatorBase(const std::string &type,
scope_(scope) {
CheckAllInputOutputSet();
}
template <typename Dtype>
void OperatorBase<Dtype>::CheckAllInputOutputSet() const {}
template <typename Dtype>
void OperatorBase<Dtype>::Run() const {
RunImpl();
#ifdef PADDLE_MOBILE_DEBUG
vector<string> output_keys = GetOutKeys();
for (const auto &key : output_keys) {
Tensor *out_ = GetVarValue<framework::LoDTensor>(key, outputs_, *scope_);
DLOG << type_ << " output- " << key << "=" << *out_;
}
#endif
}
template class OperatorBase<CPU>;
template class OperatorWithKernel<CPU>;
......
......@@ -36,39 +36,60 @@ limitations under the License. */
namespace paddle_mobile {
namespace framework {
static std::unordered_map<
std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
op_input_output_key = {{"conv2d", {{"Input"}, {"Output"}}},
{"relu", {{"X"}, {"Out"}}},
{"softmax", {{"X"}, {"Out"}}},
{"mul", {{"X"}, {"Out"}}},
{"elementwise_add", {{"X", "Y"}, {"Out"}}},
{"pool2d", {{"X"}, {"Out"}}},
{"batch_norm", {{"X"}, {"Y"}}},
{"lrn", {{"X"}, {"Out"}}},
{"concat", {{"X"}, {"Out"}}},
{"feed", {{"X"}, {"Out"}}},
{"fetch", {{"X"}, {"Out"}}}};
using std::string;
using std::vector;
template <typename T>
static T *GetVarValue(const string &key, const VariableNameMap &var_map,
const Scope &scope) {
auto var_vec = var_map.at(key);
if (!var_vec.empty()) {
auto var = scope.FindVar(var_vec[0]);
return var->GetMutable<T>();
} else {
return nullptr;
}
}
template <typename Dtype>
class OperatorBase : PaddleMobileObject {
public:
/*
 * @b Constructor of the op base class; the op receives its inputs,
 * attributes, and pre-allocated output tensors.
 * */
OperatorBase(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const AttributeMap &attrs,
std::shared_ptr<Scope> scope);
virtual ~OperatorBase() {}
virtual void Run() const = 0;
virtual void InferShape() const = 0;
void Run() const;
std::vector<string> GetOutKeys() const;
virtual void RunImpl() const = 0;
/*
 * @b Inputs needed by the op's computation, e.g. the previous layer's
 * output or a convolution filter.
 * */
const VariableNameMap &Inputs() const { return inputs_; }
/*
 * @b Outputs of the op; their memory is allocated ahead of time and the
 * results are written into it.
 * */
const VariableNameMap &Outputs() const { return outputs_; }
/*
 * @b The op type.
 * */
const std::string &Type() const { return type_; }
/*
 * @b Attributes the op's computation needs, e.g. the stride of a conv op.
 * */
const AttributeMap &Attrs() const { return attrs_; }
void ClearVariables(const std::vector<std::string> &var_names) const {
if (this->scope_) {
this->scope_->EraseVars(var_names);
}
}
/*
 * @b Computes the output shape from the input shapes and the attributes.
 * */
virtual void InferShape() const = 0;
protected:
std::shared_ptr<Scope> scope_;
......@@ -81,6 +102,9 @@ class OperatorBase : PaddleMobileObject {
void CheckAllInputOutputSet() const;
};
/*
 * @b Base class for all ops that carry a computation; it inherits from
 * OperatorBase.
 * */
template <typename Dtype>
class OperatorWithKernel : public OperatorBase<Dtype> {
public:
......@@ -88,15 +112,23 @@ class OperatorWithKernel : public OperatorBase<Dtype> {
const VariableNameMap &outputs, const AttributeMap &attrs,
std::shared_ptr<Scope> scope)
: OperatorBase<Dtype>(type, inputs, outputs, attrs, scope) {}
virtual void Run() const = 0;
virtual void RunImpl() const = 0;
virtual void InferShape() const = 0;
};
/*
 * @b Base class of all kernels.
 * */
template <typename Dtype, typename P>
class OpKernelBase : PaddleMobileObject {
public:
  /*
   * @b Every kernel must implement the Compute method.
   * @p para is a struct holding the parameters the kernel's computation
   * needs; all such structs live in paddle-mobile/src/operators/op_param.h.
   * */
virtual void Compute(const P &para) const = 0;
virtual ~OpKernelBase() = default;
};
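/*
 * A minimal kernel sketch following the contract above. IdentityParam is a
 * hypothetical parameter struct standing in for the real ones declared in
 * paddle-mobile/src/operators/op_param.h; real kernels follow the same shape.
 * */
struct IdentityParam {
  Tensor *input_x_;
  Tensor *out_;
  const Tensor *InputX() const { return input_x_; }
  Tensor *Out() const { return out_; }
};

template <typename DeviceType, typename T>
class IdentityKernel : public OpKernelBase<DeviceType, IdentityParam> {
 public:
  void Compute(const IdentityParam &param) const override {
    // Forward the input tensor into the pre-allocated output.
    param.Out()->ShareDataWith(*param.InputX());
  }
};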
......@@ -113,13 +145,13 @@ class FusionOpMatcher : PaddleMobileObject {
virtual std::string Type() = 0;
virtual void FolderNodes(Node &node) {
node.Folder(node_.Depth(), Type(), {});
virtual void FolderNodes(Node *node) {
node->Folder(node_.Depth(), Type(), {});
}
virtual Node &BeginNode() { return node_; }
std::string BeginType() { return node_.BeginType(); }
std::string BeginType() { return node_.Type(); }
protected:
Node node_;
......
......@@ -25,13 +25,7 @@ std::vector<std::shared_ptr<VarDesc>> BlockDesc::Vars() const {
return res;
}
std::vector<std::shared_ptr<OpDesc>> BlockDesc::Ops() const {
std::vector<std::shared_ptr<OpDesc>> res;
for (const auto &op : ops_) {
res.push_back(op);
}
return res;
}
std::vector<std::shared_ptr<OpDesc>> BlockDesc::Ops() const { return ops_; }
BlockDesc::BlockDesc(PaddleMobile__Framework__Proto__BlockDesc *desc)
: index_(desc->idx), parent_index_(desc->idx) {
......
......@@ -26,6 +26,7 @@ class BlockDesc : PaddleMobileObject {
public:
friend class Node;
friend class ProgramOptimize;
BlockDesc() {}
BlockDesc(PaddleMobile__Framework__Proto__BlockDesc *desc);
BlockDesc(const BlockDesc &block_desc)
: index_(block_desc.index_), parent_index_(block_desc.parent_index_) {
......@@ -43,6 +44,8 @@ class BlockDesc : PaddleMobileObject {
const int &ID() const { return index_; }
const bool &MultiThread() const { return multi_thread_; }
const int &Parent() const { return parent_index_; }
bool operator==(const paddle_mobile::framework::BlockDesc &in_block) const {
......@@ -58,6 +61,7 @@ class BlockDesc : PaddleMobileObject {
private:
int index_;
bool multi_thread_;
int parent_index_;
std::vector<std::shared_ptr<OpDesc>> ops_;
std::unordered_map<std::string, std::shared_ptr<VarDesc>> vars_;
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#include <sstream>
#include "framework/operator.h"
#include "framework/program/program-optimize/node.h"
namespace paddle_mobile {
......@@ -44,16 +45,46 @@ bool Node::operator==(const Node &in) {
return true;
}
// std::shared_ptr<Node> Node::MatchTheFirstNode(std::string type){
//
// for (const auto &node : outputs_){
// if (node->type_ == type){
// return node;
// }else{
//
// }
// }
//}
bool Node::CanSplit(std::unordered_set<std::string> complex_compute_set) {
bool split = false;
CanSplit(&split, false, 0, &complex_compute_set, this);
return split;
}
void Node::CanSplit(bool *split, bool spliting, int complex_count,
std::unordered_set<std::string> *complex_compute_set,
Node *pre_node) {
if (spliting) {
if (complex_compute_set->find(this->type_) != complex_compute_set->end()) {
complex_count++;
}
}
if (inputs_.size() > 1 && pre_node != inputs_.back()) {
return;
}
if (inputs_.size() > 1 && pre_node == inputs_.back()) {
if (complex_count > 1) {
*split = true;
return;
}
}
// multi output, to check
  if (outputs_.size() > 1) {
    spliting = true;
    // A fork starts a new split candidate, so restart the complex-op count.
    complex_count = 0;
  } else if (spliting && inputs_.size() > 0) {
    spliting = false;
  }
for (auto &output : outputs_) {
output->CanSplit(split, spliting, complex_count, complex_compute_set, this);
}
}
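// A usage sketch matching the (commented-out) call site in
// ProgramOptimize::FushionOptimize later in this diff. CanSplitOnComplexOps
// is a hypothetical helper: it asks whether the graph rooted at the node
// forks into branches that each contain more than one "complex" op.
static bool CanSplitOnComplexOps(Node *begin_node) {
  return begin_node->CanSplit(
      {G_OP_TYPE_CONV, G_OP_TYPE_BATCHNORM, G_OP_TYPE_DEPTHWISE_CONV});
}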
std::vector<std::shared_ptr<framework::OpDesc>> Node::OpDescs(uint size) {
std::vector<std::shared_ptr<framework::OpDesc>> op_descs;
......@@ -73,44 +104,105 @@ void Node::OpDescs(uint index,
}
void Node::OpDescs(std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
Node *node) {
auto iter = std::find(op_desc->begin(), op_desc->end(), this->op_desc_);
Node *node, bool adding_thread, int thread_num) {
if (outputs_.size() > 1) {
adding_thread = false;
}
bool can_add_split = false;
// Supported only when the current node has multiple outputs and its op_desc_ has exactly one output key
if (outputs_.size() > 1 &&
op_input_output_key[op_desc_->type_].second.size() == 1) {
can_add_split = true;
// Iterate over the current node's output nodes
for (const auto &output : outputs_) {
// An output node that itself has further outputs is not supported
if (output->outputs_.size() > 0) {
can_add_split = false;
break;
}
// The OpDesc associated with the node
std::shared_ptr<framework::OpDesc> &op_desc = output->op_desc_;
// Get this op's input keys and output keys
auto inputs_and_outputs = op_input_output_key[op_desc->type_];
// Check that this op type is registered
// and that its input key and output key counts are both 1
if (op_input_output_key.find(op_desc->type_) !=
op_input_output_key.end() &&
inputs_and_outputs.first.size() == 1 &&
inputs_and_outputs.second.size() == 1) {
auto inputs_of_output = op_desc->Input(inputs_and_outputs.first[0]);
auto outputs_of_output = op_desc->Output(inputs_and_outputs.second[0]);
// Supported as long as no input name is also an output name
for (int i = 0; i < inputs_of_output.size(); ++i) {
std::string input_of_output = inputs_of_output[i];
for (int j = 0; j < outputs_of_output.size(); ++j) {
std::string output_of_output = outputs_of_output[j];
if (input_of_output == output_of_output) {
DLOG << "output的 output 包含 input" << input_of_output;
can_add_split = false;
break;
}
}
}
} else {  // If the model contains an unknown op, adding split is not supported
DLOG << "找不到 这个 op 类型: " << output->op_desc_->type_;
can_add_split = false;
}
}
}
if (inputs_.size() > 1 && node != inputs_.back()) {
return;
} else if (inputs_.size() > 1 && node == inputs_.back()) {
adding_thread = false;
op_desc->push_back(this->op_desc_);
} else {
op_desc->push_back(this->op_desc_);
}
for (auto &output : outputs_) {
output->OpDescs(op_desc, this);
if (adding_thread) {
Attribute attr;
attr.Set<int>(thread_num);
this->op_desc_->attrs_["thread"] = attr;
}
}
std::vector<std::shared_ptr<framework::OpDesc>> Node::OpDescs() {
std::vector<std::shared_ptr<framework::OpDesc>> op_descs;
OpDescs(&op_descs, this);
return op_descs;
}
std::string Node::ToString(std::string blank, const Node *node) const {
std::stringstream ss;
ss << type_ << "-> \n";
if (inputs_.size() > 1 && node != inputs_.back()) {
return ss.str();
} else if (inputs_.size() > 1 && node == inputs_.back()) {
ss << "\n" << blank << type_ << "\n";
if (can_add_split) {
adding_thread = true;
std::shared_ptr<OpDesc> split_op_desc = std::make_shared<OpDesc>();
split_op_desc->type_ = G_OP_TYPE_SPLIT;
auto outputs = this->op_desc_->Output(
op_input_output_key[this->op_desc_->Type()].second[0]);
split_op_desc->inputs_ = {
{op_input_output_key[G_OP_TYPE_SPLIT].first[0], outputs}};
auto &split_outputs =
split_op_desc->outputs_[op_input_output_key[G_OP_TYPE_SPLIT].second[0]];
for (const auto &output : outputs_) {
split_outputs.push_back(outputs[0]);
}
DLOG << "add split";
op_desc->push_back(split_op_desc);
}
for (int i = 0; i < outputs_.size(); ++i) {
ss << blank << outputs_[i]->ToString(blank + " ", this) << "";
auto &output = outputs_[i];
if (can_add_split) {
output->OpDescs(op_desc, this, adding_thread, i);
} else {
output->OpDescs(op_desc, this, adding_thread, thread_num);
}
}
return ss.str();
}
std::string Node::ToString() const { return this->ToString(" ", this); }
std::vector<std::shared_ptr<framework::OpDesc>> Node::OpDescs() {
std::vector<std::shared_ptr<framework::OpDesc>> op_descs;
OpDescs(&op_descs, this, false, 0);
return op_descs;
}
std::shared_ptr<Node> Node::To(int size) {
std::shared_ptr<Node> node = std::make_shared<Node>();
......@@ -118,17 +210,6 @@ std::shared_ptr<Node> Node::To(int size) {
return node;
}
// Node &Node::To(int size) {
// if (size == 1) {
// this->outputs_.clear();
// }
//
// for (int j = 0; j < this->outputs_.size(); ++j) {
// outputs_[j]->To(size - 1);
// }
// return *this;
//}
void Node::To(int index, std::shared_ptr<Node> node) {
node->type_ = this->type_;
if (index != 0) {
......@@ -205,6 +286,24 @@ void Node::Folder(
}
}
std::string Node::ToString(std::string blank, const Node *node) const {
std::stringstream ss;
ss << type_ << "-> \n";
if (inputs_.size() > 1 && node != inputs_.back()) {
return ss.str();
} else if (inputs_.size() > 1 && node == inputs_.back()) {
ss << "\n" << blank << type_ << "\n";
}
for (int i = 0; i < outputs_.size(); ++i) {
ss << blank << outputs_[i]->ToString(blank + " ", this) << "";
}
return ss.str();
}
std::string Node::ToString() const { return this->ToString(" ", this); }
void Node::Description() {
if (op_desc_.get()) {
DLOG << *op_desc_;
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include <map>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>
......@@ -27,6 +28,8 @@ namespace paddle_mobile {
namespace framework {
class Node : PaddleMobileObject {
friend class ProgramOptimize;
public:
Node() {}
explicit Node(const std::string &type) : type_(type) {}
......@@ -34,6 +37,7 @@ class Node : PaddleMobileObject {
: op_desc_(op_desc), type_(op_desc->Type()) {}
Node &operator>(std::shared_ptr<Node> node);
bool operator==(const Node &in);
bool CanSplit(std::unordered_set<std::string> complex_compute_set);
std::string ToString() const;
std::shared_ptr<Node> To(int size);
uint Depth(uint begin = 0);
......@@ -42,13 +46,16 @@ class Node : PaddleMobileObject {
std::map<std::string, std::pair<std::string, std::string>> change_map);
std::vector<std::shared_ptr<framework::OpDesc>> OpDescs(uint size);
std::vector<std::shared_ptr<framework::OpDesc>> OpDescs();
void OpDescs(std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
Node *node);
std::shared_ptr<framework::OpDesc> OpDesc() { return op_desc_; }
std::string BeginType() { return type_; }
std::shared_ptr<framework::OpDesc> OpDescOfNode() { return op_desc_; }
std::string Type() { return type_; }
void Description();
private:
void CanSplit(bool *split, bool spliting, int complex_count,
std::unordered_set<std::string> *complex_compute_set,
Node *pre_node);
void OpDescs(std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
Node *node, bool adding_thread, int thread_num);
void OpDescs(uint size,
std::vector<std::shared_ptr<framework::OpDesc>> *op_desc);
void To(int index, std::shared_ptr<Node>);
......
......@@ -19,11 +19,12 @@ namespace paddle_mobile {
namespace framework {
std::shared_ptr<ProgramDesc> ProgramOptimize::Optimize() { return nullptr; }
std::shared_ptr<ProgramDesc> ProgramOptimize::FushionOptimize(
std::shared_ptr<ProgramDesc> ori_des) {
ProgramDesc *optimize_program = new ProgramDesc(*ori_des);
std::shared_ptr<ProgramDesc> ori_des, bool add_split) {
// ProgramDesc *optimize_program = new ProgramDesc(*ori_des);
std::shared_ptr<ProgramDesc> optimize_program =
std::make_shared<ProgramDesc>(*ori_des);
current_block_ = optimize_program->Blocks().size();
for (int i = 0; i < optimize_program->Blocks().size(); ++i) {
std::unordered_map<std::string, std::shared_ptr<Node>> output_nodes;
......@@ -86,7 +87,7 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FushionOptimize(
// DLOG << " match success " << " fusion node: \n" <<
// matcher->BeginNode() << "\nsub node: \n" << *sub_node;
// DLOG << "match node\n"<< *match_node;
matcher->FolderNodes(*match_node);
matcher->FolderNodes(match_node.get());
// DLOG << " after match node\n"<< *match_node;
// match_node->Description();
......@@ -96,10 +97,170 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FushionOptimize(
}
// DLOG << "node: \n" << *begin_node;
block->ops_ = begin_node->OpDescs();
std::vector<std::shared_ptr<framework::OpDesc>> op_descs;
// bool can_splite = begin_node->CanSplit({G_OP_TYPE_CONV,
// G_OP_TYPE_BATCHNORM, G_OP_TYPE_DEPTHWISE_CONV});
GenerateOps(&op_descs, begin_node.get());
block->ops_ = op_descs;
}
for (int m = 0; m < new_blocks_.size(); ++m) {
std::shared_ptr<BlockDesc> new_block = new_blocks_[m];
new_block->index_ = m + ori_des->blocks_.size();
optimize_program->blocks_.push_back(new_block);
}
std::shared_ptr<ProgramDesc> shared_optimzie(optimize_program);
return shared_optimzie;
return optimize_program;
}
void ProgramOptimize::GenerateOps(
std::vector<std::shared_ptr<framework::OpDesc>> *op_desc, Node *input_node,
Node *current_node) {
if (current_node->inputs_.size() > 1 &&
input_node != current_node->inputs_.back()) {
return;
} else if (current_node->inputs_.size() > 1 &&
input_node == current_node->inputs_.back()) {
op_desc->push_back(current_node->op_desc_);
} else {
op_desc->push_back(current_node->op_desc_);
}
for (int i = 0; i < current_node->outputs_.size(); ++i) {
auto &output = current_node->outputs_[i];
GenerateOps(op_desc, current_node, output.get());
}
}
void ProgramOptimize::GenerateOps(
std::vector<std::shared_ptr<framework::OpDesc>> *op_desc, Node *input_node,
Node *current_node, bool adding_thread, int thread_num,
std::shared_ptr<BlockDesc> new_block) {
if (current_node->outputs_.size() > 1) {
adding_thread = false;
}
bool can_add_split = false;
// Supported only when the current node has multiple outputs and its op_desc_ has exactly one output key
if (current_node->outputs_.size() > 1 &&
op_input_output_key[current_node->op_desc_->type_].second.size() == 1) {
can_add_split = true;
// Iterate over the current node's output nodes
for (const auto &output : current_node->outputs_) {
// An output node that itself has multiple outputs is not supported
if (output->outputs_.size() > 1) {
DLOG << "don't support multi output of output";
can_add_split = false;
break;
}
// The OpDesc associated with the node
std::shared_ptr<framework::OpDesc> &op_desc = output->op_desc_;
// Get this op's input keys and output keys
auto inputs_and_outputs = op_input_output_key[op_desc->type_];
// Check that this op type is registered
// and that its input key and output key counts are both 1
if (op_input_output_key.find(op_desc->type_) !=
op_input_output_key.end() &&
inputs_and_outputs.first.size() == 1 &&
inputs_and_outputs.second.size() == 1) {
auto inputs_of_output = op_desc->Input(inputs_and_outputs.first[0]);
auto outputs_of_output = op_desc->Output(inputs_and_outputs.second[0]);
// Supported as long as no input name is also an output name
for (int i = 0; i < inputs_of_output.size(); ++i) {
std::string input_of_output = inputs_of_output[i];
for (int j = 0; j < outputs_of_output.size(); ++j) {
std::string output_of_output = outputs_of_output[j];
if (input_of_output == output_of_output) {
DLOG << "output的 output 包含 input" << input_of_output;
can_add_split = false;
break;
}
}
}
} else {  // If the model contains an unknown op, adding split is not supported
DLOG << "找不到 这个 op 类型: " << output->op_desc_->type_;
can_add_split = false;
}
}
}
if (current_node->inputs_.size() > 1 &&
input_node != current_node->inputs_.back()) {
return;
} else if (current_node->inputs_.size() > 1 &&
input_node == current_node->inputs_.back()) {
new_block.reset();
adding_thread = false;
op_desc->push_back(current_node->op_desc_);
} else {
if (new_block.get() && adding_thread) {
new_block->ops_.push_back(current_node->op_desc_);
} else {
op_desc->push_back(current_node->op_desc_);
}
}
if (adding_thread) {
Attribute attr;
attr.Set<int>(thread_num);
current_node->op_desc_->attrs_["thread"] = attr;
}
if (can_add_split) {
new_block = std::make_shared<BlockDesc>();
new_block->multi_thread_ = true;
new_block->index_ = current_block_;
new_blocks_.push_back(new_block);
adding_thread = true;
std::shared_ptr<OpDesc> split_op_desc = std::make_shared<OpDesc>();
split_op_desc->type_ = G_OP_TYPE_SPLIT;
auto outputs = current_node->op_desc_->Output(
op_input_output_key[current_node->op_desc_->Type()].second[0]);
split_op_desc->inputs_ = {
{op_input_output_key[G_OP_TYPE_SPLIT].first[0], outputs}};
auto &split_outputs =
split_op_desc->outputs_[op_input_output_key[G_OP_TYPE_SPLIT].second[0]];
for (const auto &output : current_node->outputs_) {
split_outputs.push_back(outputs[0]);
}
Attribute attr;
attr.Set<int>(current_block_);
split_op_desc->attrs_["block_id"] = attr;
op_desc->push_back(split_op_desc);
current_block_++;
}
for (int i = 0; i < current_node->outputs_.size(); ++i) {
auto &output = current_node->outputs_[i];
if (can_add_split) {
GenerateOps(op_desc, current_node, output.get(), adding_thread, i,
new_block);
} else {
GenerateOps(op_desc, current_node, output.get(), adding_thread,
thread_num, new_block);
}
}
}
void ProgramOptimize::GenerateOps(
std::vector<std::shared_ptr<framework::OpDesc>> *op_descs,
Node *begin_node) {
// std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
// Node *input_node, Node *current_node, bool adding_thread, int
// thread_num
if (false) {
this->GenerateOps(op_descs, begin_node, begin_node, false, -1, nullptr);
} else {
this->GenerateOps(op_descs, begin_node, begin_node);
}
}
} // namespace framework
} // namespace paddle_mobile
......@@ -27,14 +27,19 @@ namespace framework {
class ProgramOptimize {
public:
ProgramOptimize() {}
std::shared_ptr<ProgramDesc> Optimize();
std::shared_ptr<ProgramDesc> FushionOptimize(
std::shared_ptr<ProgramDesc> ori_des);
std::shared_ptr<ProgramDesc> ori_des, bool add_split = false);
private:
// std::shared_ptr<ProgramDesc> ori_desc_;
std::vector<std::unordered_map<std::string, std::shared_ptr<Node>>>
outputs_nodes_;
int current_block_;
std::vector<std::shared_ptr<BlockDesc>> new_blocks_;
void GenerateOps(std::vector<std::shared_ptr<framework::OpDesc>> *op_descs,
Node *begin_node);
void GenerateOps(std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
Node *input_node, Node *current_node);
void GenerateOps(std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
Node *input_node, Node *current_node, bool adding_thread,
int thread_num, std::shared_ptr<BlockDesc> new_block);
};
} // namespace framework
} // namespace paddle_mobile
......@@ -32,11 +32,13 @@ void ProgramDesc::Description(std::string header) {
if (header.size()) {
LOG(kLOG_INFO) << header;
}
for (const auto &block : this->blocks_) {
for (int i = 0; i < this->blocks_.size(); ++i) {
auto block = this->blocks_[i];
LOG(kLOG_DEBUG) << "block: " << block->ID();
LOG(kLOG_INFO) << "block ops size: " << block->Ops().size();
for (int j = 0; j < block->Ops().size(); ++j) {
const auto &op = block->Ops()[j];
auto op = block->Ops()[j];
LOG(kLOG_DEBUG1) << "op: " << op->Type();
for (auto &input : op->GetInputs()) {
LOG(kLOG_DEBUG2) << "input parameter: " << input.first;
......@@ -71,6 +73,9 @@ void ProgramDesc::Description(std::string header) {
}
}
}
#endif
}
......
......@@ -18,11 +18,12 @@ limitations under the License. */
#include <cstdint>
#include <cstring>
#include <memory>
#include <type_traits>
#include <typeindex>
#include <vector>
#include "data_layout.h"
#include "ddim.h"
#include "framework/data_layout.h"
#include "framework/ddim.h"
#include "memory/t_malloc.h"
namespace paddle_mobile {
......@@ -62,8 +63,8 @@ struct SizeOfTypeFunctor<HEAD, TAIL...> {
static inline size_t SizeOfType(std::type_index type) {
SizeOfTypeFunctor<int, float, double, int16_t, int64_t, bool, size_t> functor;
size_t size = functor(type);
// PADDLE_ENFORCE(size != 0UL, "Cannot get size of type %s",
// type.name());
PADDLE_MOBILE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name());
return size;
}
......@@ -72,16 +73,27 @@ class LoDTensor;
class Tensor {
public:
Tensor() : offset_(0) {}
template <typename T>
Tensor(std::vector<T> input, DDim ddim) : offset_(0) {
PADDLE_MOBILE_ENFORCE(
input.size() == framework::product(ddim),
"input vector'length should be equal to tensor's length");
auto input_ptr = mutable_data<T>(ddim);
for (int i = 0; i < input.size(); ++i) {
input_ptr[i] = input[i];
}
}
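  /*
   * A usage sketch for the constructor above (the values are illustrative):
   *   std::vector<float> v = {1, 2, 3, 4, 5, 6};
   *   framework::Tensor t(v, framework::make_ddim({2, 3}));
   *   // t.data<float>()[4] == 5.f; a size/ddim mismatch trips the enforce.
   * */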
/*! Return a pointer to mutable memory block. */
template <typename T>
inline T *data() {
check_memory_size();
// PADDLE_ENFORCE(std::is_same<T, void>::value ||
// holder_->type().hash_code() ==
// typeid(T).hash_code(),
// "Tensor holds the wrong type, it holds %s",
// this->holder_->type().name());
PADDLE_MOBILE_ENFORCE(
(std::is_same<T, void>::value ||
holder_->type().hash_code() == typeid(T).hash_code()),
"Tensor holds the wrong type, it holds %s",
this->holder_->type().name());
return reinterpret_cast<T *>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
offset_);
}
......@@ -90,11 +102,11 @@ class Tensor {
template <typename T>
inline const T *data() const {
check_memory_size();
// PADDLE_ENFORCE(std::is_same<T, void>::value ||
// holder_->type().hash_code() ==
// typeid(T).hash_code(),
// "Tensor holds the wrong type, it holds %s",
// this->holder_->type().name());
PADDLE_MOBILE_ENFORCE(
(std::is_same<T, void>::value ||
holder_->type().hash_code() == typeid(T).hash_code()),
"Tensor holds the wrong type, it holds %s",
this->holder_->type().name());
return reinterpret_cast<const T *>(
reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
......@@ -116,17 +128,11 @@ class Tensor {
if (holder_ != nullptr) {
holder_->set_type(type);
}
// PADDLE_ENFORCE_GE(numel(), 0,
// "When calling this method, the Tensor's
// numel must be
// " "equal or larger than zero. " "Please
// check
// Tensor::Resize has been called first.");
PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor's numel must be >= 0.")
int64_t size = numel() * SizeOfType(type);
/* some versions of boost::variant don't have operator!= */
if (holder_ == nullptr || holder_->size() < size + offset_) {
holder_.reset(new PlaceholderImpl(size, type));
offset_ = 0;
}
return reinterpret_cast<void *>(
......@@ -179,16 +185,13 @@ class Tensor {
*/
inline Tensor Slice(int begin_idx, int end_idx) const {
check_memory_size();
// PADDLE_ENFORCE_GE(begin_idx, 0,
// "The start row index must be greater than
// 0.");
// PADDLE_ENFORCE_LE(end_idx, dims_[0], "The end row index is
// out of
// bound."); PADDLE_ENFORCE_LT(
// begin_idx, end_idx,
// "The start row index must be lesser than the end row
// index.");
    PADDLE_MOBILE_ENFORCE(begin_idx >= 0,
                          "The start row index must not be negative.")
    PADDLE_MOBILE_ENFORCE(end_idx <= dims_[0],
                          "The end row index is out of bound.")
    PADDLE_MOBILE_ENFORCE(
        begin_idx < end_idx,
        "The start row index must be less than the end row index.")
if (dims_[0] == 1) {
return *this;
} else {
......@@ -205,10 +208,9 @@ class Tensor {
}
std::type_index type() const {
// PADDLE_ENFORCE_NOT_NULL(
// holder_, "Tensor not initialized yet
// when
// Tensor::type() is called.");
PADDLE_MOBILE_ENFORCE(
holder_ != nullptr,
"Tensor not initialized yet when Tensor::type() is called.")
return holder_->type();
}
......@@ -219,13 +221,10 @@ class Tensor {
inline void check_memory_size() const {
PADDLE_MOBILE_ENFORCE(
holder_, "Tensor holds no memory. Call Tensor::mutable_data first.");
PADDLE_MOBILE_ENFORCE(
numel() * SizeOfType(type()) <= memory_size(),
"Tensor's dims_ is out of bound. CallTensor::mutable_data "
"first to re-allocate memory.\n"
"or maybe the required data-type mismatches the data\
already stored.");
holder_ != nullptr,
"Tensor holds no memory. Call Tensor::mutable_data first.");
PADDLE_MOBILE_ENFORCE(numel() * SizeOfType(type()) <= memory_size(),
"Tensor's dims_ is out of bound. ");
}
inline DataLayout layout() const { return layout_; }
......@@ -256,13 +255,8 @@ class Tensor {
memory::PODDeleter<uint8_t>()),
size_(size),
type_(type) {
// PADDLE_ENFORCE_NOT_NULL(ptr_,
// "Insufficient %s
// memory to allocation.",
// (is_cpu_place(place_)
// ?
// "CPU" :
// "GPU"));
      PADDLE_MOBILE_ENFORCE(ptr_ != nullptr,
                            "Insufficient memory for allocation.");
}
virtual size_t size() const { return size_; }
......@@ -320,6 +314,19 @@ class Tensor {
size_t offset_;
};
#ifdef PADDLE_MOBILE_DEBUG
inline Print &operator<<(Print &printer, const Tensor &tensor) {
printer << " dims: " << tensor.dims() << "\n";
int stride = tensor.numel() / 20;
stride = stride > 0 ? stride : 1;
for (int i = 0; i < tensor.numel(); i += stride) {
printer << tensor.data<float>()[i] << " ";
}
return printer;
}
#endif
inline Tensor ReshapeToMatrix(const Tensor &src, int num_col_dims) {
Tensor res;
res.ShareDataWith(src);
......
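// Assuming the usual Paddle semantics for ReshapeToMatrix (flatten the
// leading num_col_dims dimensions into rows and the remaining ones into
// columns), a sketch of its use:
//   framework::Tensor src;
//   src.mutable_data<float>(framework::make_ddim({2, 3, 4}));
//   framework::Tensor mat = framework::ReshapeToMatrix(src, 1);  // {2, 12}
// The returned matrix shares data with src; no copy is made.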
......@@ -45,8 +45,6 @@ class Variable : public PaddleMobileObject {
bool IsInitialized() const { return holder_ != nullptr; }
const std::string Name() { return name_; }
template <typename T>
T *GetMutable() {
if (!IsType<T>()) {
......@@ -64,8 +62,6 @@ class Variable : public PaddleMobileObject {
std::type_index Type() const { return holder_->Type(); }
void SetName(const string name) { name_ = name; }
private:
struct Placeholder {
Placeholder() = default;
......
......@@ -15,12 +15,13 @@ limitations under the License. */
#include "io.h"
#include <fstream>
#include <vector>
#include "common/log.h"
#include "common/enforce.h"
#include "common/log.h"
#include "framework/framework.pb-c.h"
#include "framework/lod_tensor.h"
#include "framework/operator.h"
#include "framework/program/program-optimize/program_optimize.h"
#include "framework/program/program_desc.h"
#include "framework/program/var_desc.h"
#include "framework/scope.h"
......@@ -45,7 +46,7 @@ static size_t ReadBuffer(const char *file_name, uint8_t **out) {
printf("%s \n", file_name);
FILE *fp;
fp = fopen(file_name, "rb");
PADDLE_MOBILE_ENFORCE(fp != NULL, "open failed !");
PADDLE_MOBILE_ENFORCE(fp != NULL, " %s open failed !", file_name);
fseek(fp, 0, SEEK_END);
size_t size = ftell(fp);
......@@ -53,7 +54,7 @@ static size_t ReadBuffer(const char *file_name, uint8_t **out) {
DLOG << "model size: " << size;
*out = (uint8_t *)malloc(size);
*out = reinterpret_cast<uint8_t *>(malloc(size));
size_t cur_len = 0;
size_t nread;
......@@ -167,7 +168,7 @@ void Loader<Dtype, P>::LoadVar(framework::Variable *variable,
template <typename Dtype, Precision P>
const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
const std::string &dirname) {
const std::string &dirname, bool optimize) {
std::string model_filename = dirname + "/__model__";
PaddleMobile__Framework__Proto__ProgramDesc *c_program;
uint8_t *buf = NULL;
......@@ -177,11 +178,11 @@ const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
c_program = paddle_mobile__framework__proto__program_desc__unpack(
NULL, read_size, buf);
//
PADDLE_MOBILE_ENFORCE(c_program != NULL, "program is null");
//
DLOG << "n_ops: " << (*c_program->blocks)->n_ops;
//
std::shared_ptr<framework::ProgramDesc> originProgramDesc =
std::make_shared<framework::ProgramDesc>(c_program);
......@@ -204,13 +205,12 @@ const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
if (var_desc->Persistable() &&
var_desc->Type() != framework::VARTYPE_TYPE_FEED_MINIBATCH &&
var_desc->Type() != framework::VARTYPE_TYPE_FETCH_LIST) {
// DLOG << "to load var ";
auto dim = var_desc->Tensor_desc().Dims();
auto tensor = var->GetMutable<framework::LoDTensor>();
tensor->Resize(framework::make_ddim(dim));
} else {
auto dim = var_desc->Tensor_desc().Dims();
PADDLE_MOBILE_ENFORCE(dim.size() > 1, "dim size is 0");
PADDLE_MOBILE_ENFORCE(dim.size() > 0, "dim size is 0");
dim[0] = 1;
auto tensor = var->GetMutable<framework::LoDTensor>();
tensor->Resize(framework::make_ddim(dim));
......@@ -221,7 +221,16 @@ const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
}
}
// originProgramDesc->Description("program: ");
if (optimize) {
framework::ProgramOptimize program_optimize;
program.optimizeProgram =
program_optimize.FushionOptimize(originProgramDesc);
}
if (optimize) {
program.optimizeProgram->Description("optimize: ");
} else {
originProgramDesc->Description("program: ");
}
paddle_mobile__framework__proto__program_desc__free_unpacked(c_program, NULL);
return program;
......@@ -232,33 +241,9 @@ template class Loader<CPU, Precision::FP32>;
#pragma mark - executor
template <typename Dtype, Precision P>
Executor<Dtype, P>::Executor(const framework::Program<Dtype> p) : program_(p) {
if (use_optimize_) {
to_predict_program_ = program_.optimizeProgram;
} else {
to_predict_program_ = program_.originProgram;
}
const std::vector<std::shared_ptr<framework::BlockDesc>> blocks =
to_predict_program_->Blocks();
for (int i = 0; i < blocks.size(); ++i) {
std::shared_ptr<framework::BlockDesc> block_desc = blocks[i];
std::vector<std::shared_ptr<framework::OpDesc>> ops = block_desc->Ops();
for (int j = 0; j < ops.size(); ++j) {
std::shared_ptr<framework::OpDesc> op = ops[j];
auto op_base = framework::OpRegistry<Dtype>::CreateOp(
op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(),
program_.scope);
op_base->InferShape();
ops_of_block_[*block_desc.get()].push_back(op_base);
}
}
InitMemory();
}
template <typename Dtype, Precision P>
Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size)
: program_(p), batch_size_(batch_size) {
Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
bool use_optimize)
: program_(p), batch_size_(batch_size), use_optimize_(use_optimize) {
if (use_optimize_) {
to_predict_program_ = program_.optimizeProgram;
} else {
......@@ -273,6 +258,7 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size)
std::vector<std::shared_ptr<framework::OpDesc>> ops = block_desc->Ops();
for (int j = 0; j < ops.size(); ++j) {
std::shared_ptr<framework::OpDesc> op = ops[j];
DLOG << "create op: " << op->Type();
auto op_base = framework::OpRegistry<Dtype>::CreateOp(
op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(),
program_.scope);
......@@ -364,7 +350,7 @@ void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
is.read(static_cast<char *>(memory), memory_size * type_size);
is.close();
};
}
template <typename Dtype, Precision P>
void Executor<Dtype, P>::InitMemory() {
......@@ -380,7 +366,8 @@ void Executor<Dtype, P>::InitMemory() {
program_.model_path + "/" + var_desc->Name());
} else {
if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
auto tensor = var->template GetMutable<framework::Tensor>();
auto tensor = var->template GetMutable<framework::LoDTensor>();
tensor->template mutable_data<Ptype>();
}
}
......@@ -389,65 +376,47 @@ void Executor<Dtype, P>::InitMemory() {
}
template <typename Dtype, Precision P>
std::shared_ptr<framework::Tensor> Executor<Dtype, P>::predict(
framework::Tensor &t) {
// feed
auto scope = program_.scope;
framework::Variable *g_feed_value = scope->Var("pixel");
auto tensor = g_feed_value->GetMutable<framework::Tensor>();
tensor->ShareDataWith(t);
framework::Variable *con_output = scope->Var("conv2d_0.tmp_0");
framework::Tensor *output_tensor =
con_output->GetMutable<framework::Tensor>();
output_tensor->mutable_data<float>({1, 16, 32, 32});
// std::cout << typeid(output_tensor).name() << std::endl;
// std::cout << "output_tensor dims: " << output_tensor->dims() <<
// std::endl;
std::shared_ptr<framework::Tensor> out_tensor =
std::make_shared<framework::LoDTensor>();
out_tensor.reset(output_tensor);
predict(t, 0);
return out_tensor;
}
template <typename Dtype, Precision P>
void Executor<Dtype, P>::predict(const framework::Tensor &t, int block_id) {
std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
const framework::Tensor &t) {
framework::Variable *g_feed_value = program_.scope->Var("feed");
auto feed_tensor = g_feed_value->GetMutable<framework::LoDTensor>();
framework::Tensor *feed_tensor =
g_feed_value->GetMutable<framework::LoDTensor>();
feed_tensor->Resize(t.dims());
feed_tensor->ShareDataWith(t);
std::shared_ptr<framework::BlockDesc> to_predict_block =
to_predict_program_->Block(block_id);
to_predict_program_->Block(0);
for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
auto op = ops_of_block_[*to_predict_block.get()][j];
op->Run();
}
auto ops = ops_of_block_[*to_predict_program_->Block(0)];
auto last_op = ops.rbegin();
auto output_map = (*last_op)->Outputs();
std::vector<std::string> out_keys = (*last_op)->GetOutKeys();
PADDLE_MOBILE_ENFORCE(out_keys.size() > 0, "the last op contains no output");
framework::LoDTensor *output_tensor =
framework::GetVarValue<framework::LoDTensor>(out_keys[0], output_map,
*(program_.scope));
return std::shared_ptr<framework::Tensor>(output_tensor);
}
template <typename Dtype, Precision P>
std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
const framework::Tensor &t, int block_id) {
return Predict(t);
}
template <typename Dtype, Precision P>
std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::predict(
std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
const std::vector<Ptype> &input, const std::vector<int64_t> &dims) {
DLOG << "start predict: ";
framework::Tensor tensor;
auto ddim = framework::make_ddim(dims);
auto input_ptr = tensor.mutable_data<Ptype>(ddim);
for (int i = 0; i < input.size(); ++i) {
input_ptr[i] = input[i];
framework::Tensor tensor(input, framework::make_ddim(dims));
std::shared_ptr<framework::Tensor> output_tensor = Predict(tensor, 0);
Executor<Dtype, P>::Ptype *output_ptr =
output_tensor->data<typename Executor<Dtype, P>::Ptype>();
std::vector<typename Executor<Dtype, P>::Ptype> result_vector;
for (int j = 0; j < output_tensor->numel(); ++j) {
result_vector.push_back(output_ptr[j]);
}
predict(tensor, 0);
framework::Variable *g_feed_value = program_.scope->Var("col");
auto feed_tensor = g_feed_value->GetMutable<framework::Tensor>();
return {};
return result_vector;
}
template class Executor<CPU, Precision::FP32>;
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <memory.h>
#include <map>
#include <string>
#include <vector>
......@@ -30,7 +31,8 @@ namespace paddle_mobile {
template <typename Dtype, Precision P = Precision::FP32>
class Loader : PaddleMobileObject {
public:
const framework::Program<Dtype, P> Load(const std::string &dirname);
const framework::Program<Dtype, P> Load(const std::string &dirname,
bool optimize = true);
private:
void LoadVar(framework::Variable *variable,
......@@ -43,25 +45,25 @@ class Executor {
public:
typedef typename PrecisionTrait<P>::ptype Ptype;
Executor() = default;
Executor(const framework::Program<Dtype> p);
Executor(const framework::Program<Dtype> p, int batch_size);
Executor(const framework::Program<Dtype> p, int batch_size = 1,
bool use_optimize = true);
std::shared_ptr<framework::Tensor> predict(framework::Tensor &t);
std::shared_ptr<framework::Tensor> Predict(const framework::Tensor &t);
std::vector<Ptype> predict(const std::vector<Ptype> &input,
std::vector<Ptype> Predict(const std::vector<Ptype> &input,
const std::vector<int64_t> &dims);
protected:
Executor() = default;
void InitMemory();
void LoadMemory(const framework::VarDesc var_desc,
framework::LoDTensor *tensor, const std::string &file_path);
framework::Program<Dtype> program_;
int batch_size_ = 1;
std::shared_ptr<framework::ProgramDesc> to_predict_program_;
void predict(const framework::Tensor &t, int block_id);
std::shared_ptr<framework::Tensor> Predict(const framework::Tensor &t,
int block_id);
std::map<framework::BlockDesc,
std::vector<std::shared_ptr<framework::OperatorBase<Dtype>>>>
ops_of_block_;
......
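// A minimal end-to-end sketch of the updated io.h API. The model path and
// input shape are illustrative, not taken from this diff.
#include <vector>
#include "io.h"

int main() {
  paddle_mobile::Loader<paddle_mobile::CPU, paddle_mobile::Precision::FP32>
      loader;
  auto program = loader.Load("../models/mobilenet", /*optimize=*/true);

  paddle_mobile::Executor<paddle_mobile::CPU, paddle_mobile::Precision::FP32>
      executor(program, /*batch_size=*/1, /*use_optimize=*/true);

  std::vector<float> input(1 * 3 * 224 * 224, 1.f);
  std::vector<float> output = executor.Predict(input, {1, 3, 224, 224});
  return output.empty() ? 1 : 0;
}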
......@@ -12,19 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/batchnorm_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using namespace framework;
using std::string;
template <typename DeviceType, typename T>
class BatchNormOp : public framework::OperatorWithKernel<DeviceType> {
public:
BatchNormOp(const std::string &type, const VariableNameMap &inputs,
BatchNormOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap attrs,
std::shared_ptr<framework::Scope> scope)
......@@ -32,7 +33,7 @@ class BatchNormOp : public framework::OperatorWithKernel<DeviceType> {
scope),
param_(inputs, outputs, attrs, *scope) {}
void Run() const {
void RunImpl() const {
operators::BatchNormKernel<DeviceType, T> kernel;
kernel.Compute(param_);
}
......
......@@ -36,7 +36,7 @@ class BoxCoderOp : public framework::OperatorWithKernel<DeviceType> {
scope),
param_(inputs, outputs, attrs, *scope) {}
void Run() const {
void RunImpl() const {
operators::BoxCoderKernel<DeviceType, T> kernel;
kernel.Compute(param_);
}
......
......@@ -13,25 +13,25 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/concat_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using namespace framework;
using std::string;
template <typename DeviceType, typename T>
class ConcatOp : public framework::OperatorWithKernel<DeviceType> {
public:
ConcatOp(const std::string &type, const VariableNameMap &inputs,
ConcatOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType>(type, inputs, outputs, attrs,
scope),
param_(inputs, outputs, attrs, *scope) {}
void Run() const {
void RunImpl() const {
operators::ConcatKernel<DeviceType, T> kernel;
kernel.Compute(param_);
}
......
......@@ -21,13 +21,6 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
int ConvOutputSize(int input_size, int filter_size, int dilation, int padding,
int stride) {
const int dkernel = dilation * (filter_size - 1) + 1;
int output_size = (input_size + 2 * padding - dkernel) / stride + 1;
return output_size;
}
template <typename Dtype, typename T>
void ConvOp<Dtype, T>::InferShape() const {
// std::cout << " begin get dims: " << std::endl;
......
......@@ -14,14 +14,13 @@ limitations under the License. */
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/conv_kernel.h"
namespace paddle_mobile {
namespace operators {
using namespace framework;
using std::string;
template <typename DeviceType, typename T>
class ConvOp : public framework::OperatorWithKernel<DeviceType> {
public:
......@@ -35,7 +34,7 @@ class ConvOp : public framework::OperatorWithKernel<DeviceType> {
using framework::OperatorWithKernel<DeviceType>::OperatorWithKernel;
void InferShape() const override;
void Run() const {
void RunImpl() const {
operators::ConvKernel<DeviceType, T> kernel;
kernel.Compute(param_);
this->ClearVariables({"Filter", "Input"});
......@@ -45,5 +44,12 @@ class ConvOp : public framework::OperatorWithKernel<DeviceType> {
ConvParam param_;
};
inline int ConvOutputSize(int input_size, int filter_size, int dilation,
int padding, int stride) {
const int dkernel = dilation * (filter_size - 1) + 1;
int output_size = (input_size + 2 * padding - dkernel) / stride + 1;
return output_size;
}
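// A quick numeric check of the formula above (values are illustrative):
// with input_size = 224, filter_size = 3, dilation = 1, padding = 1 and
// stride = 2, dkernel = 1 * (3 - 1) + 1 = 3 and
// output_size = (224 + 2 * 1 - 3) / 2 + 1 = 112.
//   int out = ConvOutputSize(224, 3, 1, 1, 2);  // == 112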
} // namespace operators
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/depthwise_conv_op.h"
#include <vector>
#include "framework/data_type.h"
#include "framework/op_proto_maker.h"
#include "framework/op_registry.h"
#include "operators/conv_op.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void DepthwiseConvOp<Dtype, T>::InferShape() const {
auto in_dims = param_.Input()->dims();
auto filter_dims = param_.Filter()->dims();
const std::vector<int> &strides = param_.Strides();
std::vector<int> paddings = param_.Paddings();
int groups = param_.Groups();
std::vector<int> dilations = param_.Dilations();
PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
dilations.size() == paddings.size() &&
paddings.size() == strides.size()),
"ConvParam is not suitable");
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
dilations[i], paddings[i],
strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
param_.Output()->Resize(ddim);
}
template class DepthwiseConvOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
USE_OP(depthwise_conv2d);
REGISTER_OPERATOR(depthwise_conv2d, ops::DepthwiseConvOp);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/depthwise_conv_kernel.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class DepthwiseConvOp : public framework::OperatorWithKernel<DeviceType> {
public:
DepthwiseConvOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType>(type, inputs, outputs, attrs,
scope),
param_(inputs, outputs, attrs, *scope) {}
using framework::OperatorWithKernel<DeviceType>::OperatorWithKernel;
void InferShape() const override;
void RunImpl() const {
operators::DepthwiseConvKernel<DeviceType, T> kernel;
kernel.Compute(param_);
this->ClearVariables({"Filter", "Input"});
}
private:
ConvParam param_;
};
} // namespace operators
} // namespace paddle_mobile
......@@ -12,19 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "framework/operator.h"
#include "kernel/elementwise_add_kernel.h"
#include "op_param.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using namespace framework;
using std::string;
template <typename DeviceType, typename T>
class ElementwiseAddOp : public framework::OperatorWithKernel<DeviceType> {
public:
ElementwiseAddOp(const std::string &type, const VariableNameMap &inputs,
ElementwiseAddOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap attrs,
std::shared_ptr<framework::Scope> scope)
......@@ -32,7 +33,7 @@ class ElementwiseAddOp : public framework::OperatorWithKernel<DeviceType> {
scope),
param_(inputs, outputs, attrs, *scope) {}
void Run() const {
void RunImpl() const {
operators::ElementwiseAddKernel<DeviceType, T> kernel;
kernel.Compute(param_);
}
......
......@@ -14,22 +14,23 @@ limitations under the License. */
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using std::string;
template <typename DeviceType, typename T>
class FeedOp : public framework::OperatorBase<DeviceType> {
public:
FeedOp(const std::string &type, const VariableNameMap &inputs,
FeedOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorBase<DeviceType>(type, inputs, outputs, attrs,
scope),
param_(inputs, outputs, attrs, *scope) {}
void Run() const { param_.Out()->ShareDataWith(*param_.InputX()); }
void RunImpl() const { param_.Out()->ShareDataWith(*param_.InputX()); }
void InferShape() const {
auto out_dims = param_.Out()->dims();
......
......@@ -14,27 +14,24 @@ limitations under the License. */
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using std::string;
template <typename DeviceType, typename T>
class FetchOp : public framework::OperatorBase<DeviceType> {
public:
FetchOp(const std::string &type, const VariableNameMap &inputs,
FetchOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorBase<DeviceType>(type, inputs, outputs, attrs,
scope),
param_(inputs, outputs, attrs, *scope) {}
void Run() const {
param_.Out()->ShareDataWith(*param_.InputX());
for (int i = 0; i < param_.Out()->numel(); ++i) {
DLOG << param_.Out()->template data<float>()[i];
}
}
void RunImpl() const { param_.Out()->ShareDataWith(*param_.InputX()); }
void InferShape() const {
auto x_dims = param_.InputX()->dims();
......
......@@ -23,18 +23,18 @@ namespace operators {
class FushionConvAddReluOpMatcher : public framework::FusionOpMatcher {
public:
FushionConvAddReluOpMatcher() {
node_ = framework::Node("conv2d");
node_ > std::make_shared<framework::Node>("elementwise_add") >
std::make_shared<framework::Node>("relu");
node_ = framework::Node(G_OP_TYPE_CONV);
node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
std::make_shared<framework::Node>(G_OP_TYPE_RELU);
}
void FolderNodes(framework::Node &node) {
void FolderNodes(framework::Node *node) {
std::vector<std::shared_ptr<framework::OpDesc>> origin_descs =
node.OpDescs(node_.Depth());
node.Folder(node_.Depth(), Type(), {{"elementwise_add", {"Y", "Z"}}});
node->OpDescs(node_.Depth());
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Z"}}});
}
std::string Type() { return "FusionConvAddRelu"; }
std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_RELU; }
};
class FusionFcOp {
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
......@@ -22,27 +23,29 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
using std::string;
using std::vector;
class FusionFcMatcher : public framework::FusionOpMatcher {
public:
FusionFcMatcher() {
node_ = framework::Node("mul");
node_ > std::make_shared<framework::Node>("elementwise_add");
node_ = framework::Node(G_OP_TYPE_MUL);
node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD);
}
void FolderNodes(framework::Node &node) {
std::vector<std::shared_ptr<framework::OpDesc>> origin_descs =
node.OpDescs(node_.Depth());
node.Folder(node_.Depth(), Type(), {{"elementwise_add", {"Y", "Z"}}});
void FolderNodes(framework::Node *node) {
vector<std::shared_ptr<framework::OpDesc>> origin_descs =
node->OpDescs(node_.Depth());
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Z"}}});
}
std::string Type() { return "fc"; }
std::string Type() { return G_OP_TYPE_FC; }
};
template <typename DeviceType, typename T>
class FushionFcOp : public framework::OperatorWithKernel<DeviceType> {
public:
FushionFcOp(const std::string &type, const VariableNameMap &inputs,
FushionFcOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap attrs,
std::shared_ptr<framework::Scope> scope)
......@@ -50,7 +53,7 @@ class FushionFcOp : public framework::OperatorWithKernel<DeviceType> {
scope),
param_(inputs, outputs, attrs, *scope) {}
void Run() const {
void RunImpl() const {
operators::FushionFcKernel<DeviceType, T> kernel;
kernel.Compute(param_);
}
......
......@@ -17,19 +17,6 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
bool IsExpand(const std::vector<int64_t> &filter_dim,
const std::vector<int> &strides, const std::vector<int> &paddings,
const std::vector<int> &dilations) {
bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
for (size_t j = 0; j < strides.size(); ++j) {
filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);
strides_1 = strides_1 && (strides[j] == 1);
padding_0 = padding_0 && (paddings[j] == 0);
dilation_1 = dilation_1 && (dilations[j] == 1);
}
return !(filter_1 && strides_1 && padding_0 && dilation_1);
}
template <>
void ConvKernel<CPU, float>::Compute(const ConvParam &param) const {
LOG(kLOG_DEBUG) << param;
......@@ -38,19 +25,18 @@ void ConvKernel<CPU, float>::Compute(const ConvParam &param) const {
Tensor filter = *param.Filter();
Tensor *output = param.Output();
output->mutable_data<float>();
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations();
DLOG << " compute end get Attrs " << strides[0];
// DLOG << " compute end get Attrs " << strides[0];
const int batch_size = static_cast<int>(input->dims()[0]);
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = input->dims()[1] / groups;
......@@ -71,8 +57,6 @@ void ConvKernel<CPU, float>::Compute(const ConvParam &param) const {
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
DLOG << " col_shape = " << col_shape;
DLOG << " col_matrix_shape = " << col_matrix_shape;
framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int>(input->dims().size()));
......@@ -80,8 +64,7 @@ void ConvKernel<CPU, float>::Compute(const ConvParam &param) const {
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
DLOG << " filter.deims() = " << filter.dims();
DLOG << " filter.dims() = " << filter.dims();
framework::DDim output_matrix_shape = {
output->dims()[1],
output->numel() / (output->dims()[0] * output->dims()[1])};
......@@ -118,9 +101,6 @@ void ConvKernel<CPU, float>::Compute(const ConvParam &param) const {
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
DLOG << " out_slice " << out_slice.dims();
DLOG << " filter_slice " << filter_slice.dims();
DLOG << " col_matrix " << col_matrix.dims();
math::matmul<float>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice,
static_cast<float>(0));
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/kernel/depthwise_conv_kernel.h"
#include "operators/kernel/conv_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const {
LOG(kLOG_DEBUG) << param;
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
Tensor *output = param.Output();
output->mutable_data<float>();
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations();
// DLOG << " compute end get Attrs " << strides[0];
const int batch_size = static_cast<int>(input->dims()[0]);
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = input->dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
framework::DDim col_shape(framework::make_ddim(col_shape_vec));
framework::DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, data_dim + 1);
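// Worked shape example (hypothetical depthwise conv: input {1, 32, 150, 150},
// filter {32, 1, 3, 3}, groups = 32, "same" output {1, 32, 150, 150}):
//   col_shape_vec    = {32 / 32, 3, 3, 150, 150} = {1, 3, 3, 150, 150}
//   col_matrix_shape = flatten_to_2d at dim data_dim + 1 = 3
//                    -> (1 * 3 * 3, 150 * 150) = (9, 22500)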
bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
Tensor col_matrix;
if (is_expand) {
col.mutable_data<float>(col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
// DLOG << " col_shape = " << col_shape;
// DLOG << " col_matrix_shape = " << col_matrix_shape;
framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int>(input->dims().size()));
// DLOG << " input_shape = " << input_shape;
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
// DLOG << " filter.dims() = " << filter.dims();
framework::DDim output_matrix_shape = {
output->dims()[1],
output->numel() / (output->dims()[0] * output->dims()[1])};
// convolution operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(input->dims()[1]) / groups;
int out_step = static_cast<int>(output->dims()[1]) / groups;
math::Vol2ColFunctor<CPU, float> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
// DLOG << " in_batch.dims() = " << in_batch.dims();
// DLOG << " out_batch.dims() = " << out_batch.dims();
for (int g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
// im2col
im2col(in_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[1], paddings[0],
paddings[1]},
&col);
} else if (data_dim == 3U) {
// vol2col
vol2col(in_slice, dilations, strides, paddings, &col);
}
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
// DLOG << " out_slice " << out_slice.dims();
// DLOG << " filter_slice " << filter_slice.dims();
// DLOG << " col_matrix " << col_matrix.dims();
math::matmul<float>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice,
static_cast<float>(0));
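// i.e. out_slice = 1.0f * (filter_slice x col_matrix) + 0.0f * out_slice,
// assuming math::matmul follows the usual BLAS-style alpha/beta convention.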
auto filter_ptr = filter_slice.data<float>();
}
}
}
template class DepthwiseConvKernel<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......@@ -25,6 +25,9 @@ struct ReluFunctor {
inline T operator()(T in) const { return in > 0 ? in : 0; }
};
/*
 * @b Platform-specific specialization; the param is passed in from the op
 * layer.
 * */
template <>
void ReluKernel<CPU, float>::Compute(const ReluParam &param) const {
const auto *input_x = param.InputX();
......
......@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <vector>
#include "framework/operator.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
......@@ -23,12 +24,28 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
using namespace framework;
using framework::OpKernelBase;
template <typename DeviceType, typename T>
class ConvKernel : public framework::OpKernelBase<DeviceType, ConvParam> {
class ConvKernel : public OpKernelBase<DeviceType, ConvParam> {
public:
void Compute(const ConvParam &param) const;
};
inline bool IsExpand(const std::vector<int64_t> &filter_dim,
const std::vector<int> &strides,
const std::vector<int> &paddings,
const std::vector<int> &dilations) {
bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
for (size_t j = 0; j < strides.size(); ++j) {
filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);
strides_1 = strides_1 && (strides[j] == 1);
padding_0 = padding_0 && (paddings[j] == 0);
dilation_1 = dilation_1 && (dilations[j] == 1);
}
return !(filter_1 && strides_1 && padding_0 && dilation_1);
}
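// Illustrative calls (hypothetical filter dims laid out as {oc, ic, kh, kw}):
//   IsExpand({16, 8, 3, 3}, {1, 1}, {1, 1}, {1, 1});  // true: im2col needed
//   IsExpand({16, 8, 1, 1}, {1, 1}, {0, 0}, {1, 1});  // false: direct GEMM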
} // namespace operators
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "framework/operator.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"
#pragma once
namespace paddle_mobile {
namespace operators {
using framework::OpKernelBase;
template <typename DeviceType, typename T>
class DepthwiseConvKernel : public OpKernelBase<DeviceType, ConvParam> {
public:
void Compute(const ConvParam &param) const;
};
} // namespace operators
} // namespace paddle_mobile
......@@ -11,27 +11,27 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/lrn_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using namespace framework;
using std::string;
template <typename DeviceType, typename T>
class LrnOp : public framework::OperatorWithKernel<DeviceType> {
public:
LrnOp(const std::string &type, const VariableNameMap &inputs,
LrnOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType>(type, inputs, outputs, attrs,
scope),
param_(inputs, outputs, attrs, *scope) {}
void Run() const {
void RunImpl() const {
operators::LrnKernel<DeviceType, T> kernel;
kernel.Compute(param_);
}
......
......@@ -136,9 +136,15 @@ class SoftmaxFuntor<CPU, T> {
public:
void operator()(const framework::Tensor *X, framework::Tensor *Y) {
const DDim dDim = X->dims();
for (int i = 0; i < dDim[0]; ++i) {
framework::Tensor sub_X = X->Slice(i, i + 1);
framework::Tensor sub_Y = Y->Slice(i, i + 1);
#if __ARM_NEON
SoftmaxCacl(X, Y);
SoftmaxCacl(&sub_X, &sub_Y);
#endif
}
}
};
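// With the per-sample loop above, the NEON path now normalizes one batch
// element at a time: each Slice(i, i + 1) is a view over the original
// buffer, so SoftmaxCacl reads sub_X and writes sub_Y in place.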
......
......@@ -11,7 +11,9 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/mul_kernel.h"
#include "operators/op_param.h"
......@@ -19,8 +21,6 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
using namespace framework;
template <typename DeviceType, typename T>
class MulOp : public framework::OperatorWithKernel<DeviceType> {
public:
......@@ -31,7 +31,7 @@ class MulOp : public framework::OperatorWithKernel<DeviceType> {
scope),
param_(inputs, outputs, attrs, *scope) {}
void Run() const {
void RunImpl() const {
operators::MulKernel<DeviceType, T> kernel;
kernel.Compute(param_);
}
......
......@@ -36,7 +36,7 @@ class MultiClassNMSOp : public framework::OperatorWithKernel<DeviceType> {
scope),
param_(inputs, outputs, attrs, *scope) {}
void Run() const {
void RunImpl() const {
operators::MultiClassNMSKernel<DeviceType, T> kernel;
kernel.Compute(param_);
}
......
......@@ -207,7 +207,7 @@ class ConvParam : OpParam {
const Tensor *Input() const { return input_; }
const LoDTensor *Filter() const { return filter_; }
const Tensor *Filter() const { return filter_; }
Tensor *Output() const { return output_; }
......@@ -222,7 +222,7 @@ class ConvParam : OpParam {
private:
Tensor *input_;
Tensor *output_;
LoDTensor *filter_;
Tensor *filter_;
vector<int> strides_;
vector<int> paddings_;
vector<int> dilations_;
......@@ -696,6 +696,9 @@ class ReshapeParam : public OpParam {
bool inplace_;
};
/*
 * @b The op layer instantiates this param and hands it down to the kernel
 * layer.
 * */
class ReluParam : public OpParam {
public:
ReluParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
......@@ -717,15 +720,14 @@ class FushionFcParam : public OpParam {
public:
FushionFcParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope) {
input_x_ = InputXFrom<Tensor>(inputs, scope);
input_y_ = InputYFrom<Tensor>(inputs, scope);
input_z_ = InputZFrom<Tensor>(inputs, scope);
out_ = OutFrom<Tensor>(outputs, scope);
input_x_ = InputXFrom<LoDTensor>(inputs, scope);
input_y_ = InputYFrom<LoDTensor>(inputs, scope);
input_z_ = InputZFrom<LoDTensor>(inputs, scope);
out_ = OutFrom<LoDTensor>(outputs, scope);
x_num_col_dims_ = GetAttr<int>("x_num_col_dims", attrs);
y_num_col_dims_ = GetAttr<int>("y_num_col_dims", attrs);
axis_ = GetAttr<int>("axis", attrs);
}
const Tensor *InputX() const { return input_x_; }
const Tensor *InputY() const { return input_y_; }
......
......@@ -17,25 +17,26 @@ limitations under the License. */
#include <framework/operator.h>
#include <operators/kernel/pool_kernel.h>
#include <operators/op_param.h>
#include <string>
namespace paddle_mobile {
namespace operators {
using namespace framework;
using framework::AttributeMap;
using framework::OperatorWithKernel;
using framework::Scope;
using std::string;
template <typename DeviceType, typename T>
class PoolOp : public framework::OperatorWithKernel<DeviceType> {
class PoolOp : public OperatorWithKernel<DeviceType> {
public:
PoolOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType>(type, inputs, outputs, attrs,
scope),
PoolOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const AttributeMap &attrs,
std::shared_ptr<Scope> scope)
: OperatorWithKernel<DeviceType>(type, inputs, outputs, attrs, scope),
param_(inputs, outputs, attrs, *scope) {}
using framework::OperatorWithKernel<DeviceType>::OperatorWithKernel;
using OperatorWithKernel<DeviceType>::OperatorWithKernel;
void InferShape() const override;
void Run() const {
// InferShape();
void RunImpl() const {
operators::PoolKernel<DeviceType, T> kernel;
kernel.Compute(param_);
this->ClearVariables({"X"});
......
......@@ -36,7 +36,7 @@ class PriorBoxOp : public framework::OperatorWithKernel<DeviceType> {
scope),
param_(inputs, outputs, attrs, *scope) {}
void Run() const {
void RunImpl() const {
operators::PriorBoxKernel<DeviceType, T> kernel;
kernel.Compute(param_);
}
......
......@@ -25,6 +25,11 @@ template class ReluOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
/*
 * @b Every op has to be registered like this: the argument of USE_OP and
 * the first argument of REGISTER_OPERATOR must both match the op type
 * stored in the model.
 * */
namespace ops = paddle_mobile::operators;
USE_OP(relu);
REGISTER_OPERATOR(relu, ops::ReluOp);
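// The same two macros are required for every operator; e.g. the sigmoid op
// would be wired up analogously (sketch, assuming ops::SigmoidOp is built):
//   USE_OP(sigmoid);
//   REGISTER_OPERATOR(sigmoid, ops::SigmoidOp);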
......@@ -28,6 +28,9 @@ using paddle_mobile::framework::Tensor;
template <typename DeviceType, typename T>
class ReluOp : public framework::OperatorWithKernel<DeviceType> {
public:
/*
 * @b Constructor of the op: it must call the parent-class constructor and
 * instantiate its own parameter struct.
 * */
ReluOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap attrs,
std::shared_ptr<framework::Scope> scope)
......@@ -35,7 +38,10 @@ class ReluOp : public framework::OperatorWithKernel<DeviceType> {
scope),
param_(inputs, outputs, attrs, *scope) {}
void Run() const {
/*
 * @b Runs the op's computation by invoking the corresponding kernel.
 * */
void RunImpl() const {
operators::ReluKernel<DeviceType, T> kernel;
kernel.Compute(param_);
}
......@@ -44,6 +50,10 @@ class ReluOp : public framework::OperatorWithKernel<DeviceType> {
void InferShape() const override;
protected:
/*
 * @b Struct holding the parameters the Relu kernel needs at compute time;
 * it is defined in paddle-mobile/src/operators/op_param.h
 * */
ReluParam param_;
};
......
......@@ -35,7 +35,7 @@ class ReshapeOp : public framework::OperatorWithKernel<DeviceType> {
scope),
param_(inputs, outputs, attrs, *scope) {}
void Run() const {
void RunImpl() const {
operators::ReshapeKernel<DeviceType, T> kernel;
kernel.Compute(param_);
}
......
......@@ -36,7 +36,7 @@ class SigmoidOp : public framework::OperatorWithKernel<DeviceType> {
void InferShape() const override;
void Run() const {
void RunImpl() const {
operators::SigmoidKernel<DeviceType, T> kernel;
kernel.Compute(param_);
this->ClearVariables({"X"});
......
......@@ -36,7 +36,7 @@ class SoftmaxOp : public framework::OperatorWithKernel<DeviceType> {
void InferShape() const override;
void Run() const {
void RunImpl() const {
operators::SoftmaxKernel<DeviceType, T> kernel;
kernel.Compute(param_);
this->ClearVariables({"X"});
......
......@@ -36,7 +36,7 @@ class TransposeOp : public framework::OperatorWithKernel<DeviceType> {
scope),
param_(inputs, outputs, attrs, *scope) {}
void Run() const {
void RunImpl() const {
operators::TransposeKernel<DeviceType, T> kernel;
kernel.Compute(param_);
}
......
......@@ -11,11 +11,11 @@ ADD_EXECUTABLE(test-mul-op operators/test_mul_op.cpp test_helper.h test_includ
target_link_libraries(test-mul-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-elementwiseadd-op operators/test_elementwise_add_op.cpp test_helper.h test_include.h)
target_link_libraries(test-elementwiseadd-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-concat-op operators/test_concat_op.cpp test_helper.h test_include.h)
target_link_libraries(test-concat-op paddle-mobile)
# gen test
......@@ -84,10 +84,33 @@ target_link_libraries(test-gemm paddle-mobile)
ADD_EXECUTABLE(test-enforce common/test_enforce.cpp)
target_link_libraries(test-enforce paddle-mobile)
# gen test
ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-yolo paddle-mobile)
# gen test
ADD_EXECUTABLE(test-googlenet net/test_googlenet.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-googlenet paddle-mobile)
# gen test
ADD_EXECUTABLE(test-mobilenet net/test_mobilenet.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-mobilenet paddle-mobile)
# gen test
ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-resnet paddle-mobile)
# gen test
ADD_EXECUTABLE(test-mobilenetssd net/test_mobilenet+ssd.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-mobilenetssd paddle-mobile)
# gen test
ADD_EXECUTABLE(test-squeezenet net/test_squeezenet.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-squeezenet paddle-mobile)
# gen test
ADD_EXECUTABLE(test-sigmoid operators/test_sigmoid_op.cpp test_include.h)
target_link_libraries(test-sigmoid paddle-mobile)
# gen test
ADD_EXECUTABLE(test-depthwise-conv-op operators/test_depthwise_conv_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-depthwise-conv-op paddle-mobile)
......@@ -17,10 +17,11 @@ limitations under the License. */
#include <string>
#include <vector>
#include "./io.h"
#include "common/log.h"
#include "framework/op_registry.h"
#include "io.h"
#include "operators/conv_op.h"
#include "operators/elementwise_add_op.h"
#include "operators/pool_op.h"
#include "operators/relu_op.h"
#include "operators/reshape_op.h"
......@@ -37,6 +38,7 @@ using paddle_mobile::framework::Program;
using paddle_mobile::framework::Tensor;
using paddle_mobile::framework::Variable;
using std::string;
using std::vector;
template <typename DeviceType, typename OpType>
class Executor4Test : public Executor<DeviceType> {
public:
......@@ -71,18 +73,60 @@ class Executor4Test : public Executor<DeviceType> {
}
}
}
this->InitMemory();
}
std::shared_ptr<Tensor> predict(const Tensor &t, string input, string output,
template <typename T = LoDTensor>
vector<std::shared_ptr<Tensor>> Predict(const vector<Tensor> &ts,
const vector<string> &input_names,
const vector<string> &output_names,
const vector<DDim> &ddims) {
auto scope = this->program_.scope;
size_t input_size = input_names.size();
size_t out_size = output_names.size();
vector<Variable *> input_vars(input_size);
vector<T *> input_tensors(input_size);
for (int i = 0; i < input_size; i++) {
input_vars[i] = scope->Var(input_names[i]);
input_tensors[i] = input_vars[i]->GetMutable<T>();
input_tensors[i]->ShareDataWith(ts[i]);
}
vector<Variable *> output_vars(out_size);
vector<T *> output_tensors(out_size);
vector<std::shared_ptr<Tensor>> output_tensor_sptrs(out_size);
for (int i = 0; i < out_size; i++) {
output_vars[i] = scope->Var(output_names[i]);
output_tensors[i] = output_vars[i]->GetMutable<T>();
output_tensors[i]->mutable_data<float>(ddims[i]);
output_tensor_sptrs[i] = std::make_shared<LoDTensor>();
output_tensor_sptrs[i].reset(output_tensors[i]);
}
std::shared_ptr<paddle_mobile::framework::BlockDesc> to_predict_block =
this->to_predict_program_->Block(0);
for (int j = 0; j < this->ops_of_block_[*to_predict_block.get()].size();
++j) {
auto op = this->ops_of_block_[*to_predict_block.get()][j];
op->Run();
}
return output_tensor_sptrs;
}
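// Typical call pattern for the batched overload above (sketch; mirrors the
// op tests further down):
//   auto outs = executor.Predict<LoDTensor>(input_tensors, input_names,
//                                           output_names, out_ddims);
//   auto *out0_data = outs[0]->data<float>();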
std::shared_ptr<Tensor> Predict(const Tensor &t, string input, string output,
const DDim &dDim) {
auto scope = this->program_.scope;
Variable *g_feed_value = scope->Var(input);
auto tensor = g_feed_value->GetMutable<Tensor>();
auto tensor = g_feed_value->GetMutable<LoDTensor>();
tensor->ShareDataWith(t);
Variable *con_output = scope->Var(output);
auto *output_tensor = con_output->GetMutable<Tensor>();
auto *output_tensor = con_output->GetMutable<LoDTensor>();
output_tensor->mutable_data<float>(dDim);
std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
out_tensor.reset(output_tensor);
......
......@@ -12,13 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../test_helper.h"
#include "io.h"
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
// ../../../test/models/googlenet
// ../../../test/models/mobilenet
auto program = loader.Load(std::string("../models/googlenet"));
auto program = loader.Load(g_googlenet);
program.optimizeProgram->Description("program desc: ");
return 0;
}
......@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../test_helper.h"
#include "framework/program/program-optimize/node.h"
#include "framework/program/program-optimize/program_optimize.h"
#include "io.h"
......@@ -19,7 +20,7 @@ limitations under the License. */
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
// "../../../test/models/googlenet"
auto program = loader.Load("../models/googlenet");
auto program = loader.Load(g_googlenet);
paddle_mobile::framework::ProgramOptimize optimize;
// program.originProgram->Description("origin");
auto optimize_program = optimize.FushionOptimize(program.originProgram);
......
......@@ -13,25 +13,23 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <fstream>
#include "../test_helper.h"
#include "../test_include.h"
#include "io.h"
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
// ../../../test/models/googlenet
// ../../../test/models/mobilenet
auto program = loader.Load(std::string("../models/googlenet"));
paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1);
bool optimize = true;
auto time1 = time();
auto program = loader.Load(g_googlenet, optimize);
auto time2 = time();
DLOG << "load cost :" << time_diff(time1, time2) << "ms\n";
paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, optimize);
std::vector<float> input;
std::vector<int64_t> dims{1, 3, 224, 224};
GetInput<float>(g_test_image_1x3x224x224, &input, dims);
// DLOG << " input: " << input;
executor.predict(input, dims);
auto time3 = time();
executor.Predict(input, dims);
auto time4 = time();
DLOG << "predict cost :" << time_diff(time3, time4) << "ms\n";
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fstream>
#include "../test_helper.h"
#include "../test_include.h"
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto time1 = time();
auto program = loader.Load(g_mobilenet_ssd, false);
auto time2 = time();
DLOG << "load cost :" << time_diff(time1, time1) << "ms";
paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, false);
std::vector<int64_t> dims{1, 3, 300, 300};
Tensor input_tensor;
SetupTensor<float>(&input_tensor, {1, 3, 300, 300}, static_cast<float>(0),
static_cast<float>(1));
std::vector<float> input(input_tensor.data<float>(),
input_tensor.data<float>() + input_tensor.numel());
auto time3 = time();
executor.Predict(input, dims);
auto time4 = time();
DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fstream>
#include "../test_helper.h"
#include "../test_include.h"
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto time1 = time();
auto program = loader.Load(g_mobilenet, false);
auto time2 = time();
DLOG << "load cost :" << time_diff(time1, time1) << "ms";
paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 2, false);
std::vector<int64_t> dims{2, 3, 224, 224};
Tensor input_tensor;
SetupTensor<float>(&input_tensor, {2, 3, 224, 224}, static_cast<float>(0),
static_cast<float>(1));
std::vector<float> input(input_tensor.data<float>(),
input_tensor.data<float>() + input_tensor.numel());
auto time3 = time();
auto vec_result = executor.Predict(input, dims);
float sum = 0;
for (const auto item : vec_result) {
sum += item;
}
DLOG << "mobilenet output sum =" << sum;
auto time4 = time();
DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fstream>
#include "../test_helper.h"
#include "../test_include.h"
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto time1 = time();
auto program = loader.Load(g_resnet, false);
auto time2 = time();
DLOG << "load cost :" << time_diff(time1, time1) << "ms";
paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, false);
std::vector<int64_t> dims{1, 3, 32, 32};
Tensor input_tensor;
SetupTensor<float>(&input_tensor, {1, 3, 32, 32}, static_cast<float>(0),
static_cast<float>(1));
std::vector<float> input(input_tensor.data<float>(),
input_tensor.data<float>() + input_tensor.numel());
auto time3 = time();
executor.Predict(input, dims);
auto time4 = time();
DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fstream>
#include "../test_helper.h"
#include "../test_include.h"
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
// ../../../test/models/googlenet
// ../../../test/models/mobilenet
auto time1 = time();
auto program = loader.Load(g_squeezenet, false);
auto time2 = time();
DLOG << "load cost :" << time_diff(time1, time1) << "ms";
paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, false);
std::vector<int64_t> dims{1, 3, 227, 227};
Tensor input_tensor;
SetupTensor<float>(&input_tensor, {1, 3, 227, 227}, static_cast<float>(0),
static_cast<float>(1));
std::vector<float> input(input_tensor.data<float>(),
input_tensor.data<float>() + input_tensor.numel());
auto time3 = time();
executor.Predict(input, dims);
auto time4 = time();
DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fstream>
#include "../test_helper.h"
#include "../test_include.h"
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
// ../../../test/models/googlenet
// ../../../test/models/mobilenet
auto time1 = time();
auto program = loader.Load(g_yolo, false);
auto time2 = time();
DLOG << "load cost :" << time_diff(time1, time1) << "ms";
paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, false);
std::vector<int64_t> dims{1, 3, 227, 227};
Tensor input_tensor;
SetupTensor<float>(&input_tensor, {1, 3, 227, 227}, static_cast<float>(0),
static_cast<float>(1));
std::vector<float> input(input_tensor.data<float>(),
input_tensor.data<float>() + input_tensor.numel());
auto time3 = time();
executor.Predict(input, dims);
auto time4 = time();
DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
return 0;
}
......@@ -68,27 +68,27 @@ class TestBatchNormOp {
// feed
auto scope = program_.scope;
Variable *x1_feed_value = scope->Var("conv2d_0.tmp_0");
auto tensor_x1 = x1_feed_value->GetMutable<Tensor>();
auto tensor_x1 = x1_feed_value->GetMutable<LoDTensor>();
tensor_x1->ShareDataWith(t1);
Variable *mean_feed_value = scope->Var("batch_norm_0.w_1");
auto tensor_mean = mean_feed_value->GetMutable<Tensor>();
auto tensor_mean = mean_feed_value->GetMutable<LoDTensor>();
tensor_mean->ShareDataWith(t2);
Variable *scale_feed_value = scope->Var("batch_norm_0.w_0");
auto tensor_scale = scale_feed_value->GetMutable<Tensor>();
auto tensor_scale = scale_feed_value->GetMutable<LoDTensor>();
tensor_scale->ShareDataWith(t3);
Variable *variance_feed_value = scope->Var("batch_norm_0.w_2");
auto tensor_variance = variance_feed_value->GetMutable<Tensor>();
auto tensor_variance = variance_feed_value->GetMutable<LoDTensor>();
tensor_variance->ShareDataWith(t4);
Variable *bias_feed_value = scope->Var("batch_norm_0.b_0");
auto tensor_bias = bias_feed_value->GetMutable<Tensor>();
auto tensor_bias = bias_feed_value->GetMutable<LoDTensor>();
tensor_bias->ShareDataWith(t5);
Variable *output = scope->Var("batch_norm_0.tmp_2");
auto *output_tensor = output->GetMutable<Tensor>();
auto *output_tensor = output->GetMutable<LoDTensor>();
output_tensor->mutable_data<float>({4, 10, 2, 2});
// DLOG << typeid(output_tensor).name();
// DLOG << "output_tensor dims: " << output_tensor->dims();
......@@ -128,8 +128,7 @@ int main() {
DLOG << "----------**********----------";
DLOG << "begin to run BatchNormOp Test";
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(std::string(
"../../test/models/image_classification_resnet.inference.model"));
auto program = loader.Load(std::string(g_resnet));
/// input x (4,10,2,2)
paddle_mobile::framework::Tensor inputx1;
......
......@@ -62,19 +62,19 @@ class TestBoxCoderOp {
// feed
auto scope = program_.scope;
Variable *prior_box = scope->Var("concat_0.tmp_0");
auto tensor_x1 = prior_box->GetMutable<Tensor>();
auto tensor_x1 = prior_box->GetMutable<LoDTensor>();
tensor_x1->ShareDataWith(t1);
Variable *prior_box_var = scope->Var("concat_1.tmp_0");
auto tensor_x2 = prior_box_var->GetMutable<Tensor>();
auto tensor_x2 = prior_box_var->GetMutable<LoDTensor>();
tensor_x2->ShareDataWith(t2);
Variable *target_box = scope->Var("concat_2.tmp_0");
auto tensor_x3 = target_box->GetMutable<Tensor>();
auto tensor_x3 = target_box->GetMutable<LoDTensor>();
tensor_x3->ShareDataWith(t3);
Variable *boxes_output = scope->Var("box_coder_0.tmp_0");
auto *boxes_output_tensor = boxes_output->GetMutable<Tensor>();
auto *boxes_output_tensor = boxes_output->GetMutable<LoDTensor>();
boxes_output_tensor->mutable_data<float>({1, 1917, 4});
// DLOG << typeid(output_tensor).name();
......@@ -116,7 +116,7 @@ int main() {
DLOG << "----------**********----------";
DLOG << "begin to run BoxCoderOp Test";
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(std::string("../../test/models/mobilenet+ssd"));
auto program = loader.Load(std::string(g_mobilenet_ssd));
paddle_mobile::framework::Tensor priorbox;
SetupTensor<float>(&priorbox, {1917, 4}, static_cast<float>(0),
......
......@@ -12,148 +12,64 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "../executor_for_test.h"
#include "../test_include.h"
#include "operators/concat_op.h"
namespace paddle_mobile {
namespace framework {
template <typename Dtype>
class TestConcatOp {
public:
explicit TestConcatOp(const Program<Dtype> p) : program_(p) {
if (use_optimize_) {
to_predict_program_ = program_.optimizeProgram;
} else {
to_predict_program_ = program_.originProgram;
}
const std::vector<std::shared_ptr<BlockDesc>> blocks =
to_predict_program_->Blocks();
// DLOG << " **block size " << blocks.size();
for (int i = 0; i < blocks.size(); ++i) {
std::shared_ptr<BlockDesc> block_desc = blocks[i];
std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
// DLOG << " ops " << ops.size();
for (int j = 0; j < ops.size(); ++j) {
std::shared_ptr<OpDesc> op = ops[j];
if (op->Type() == "concat" && op->Input("X")[0] == "conv2d_3.tmp_1") {
DLOG << " mul attr size: " << op->GetAttrMap().size();
DLOG << " inputs size: " << op->GetInputs().size();
DLOG << " outputs size: " << op->GetOutputs().size();
DLOG << " Input X is : " << op->Input("X")[0];
DLOG << " Output Out is : " << op->Output("Out")[0];
DLOG << " axis : " << op->GetAttrMap().at("axis").Get<int>();
std::shared_ptr<operators::ConcatOp<Dtype, float>> concat =
std::make_shared<operators::ConcatOp<Dtype, float>>(
op->Type(), op->GetInputs(), op->GetOutputs(),
op->GetAttrMap(), program_.scope);
ops_of_block_[*block_desc.get()].push_back(concat);
}
}
}
}
std::shared_ptr<Tensor> predict_concat(const Tensor &t1, const Tensor &t2,
const Tensor &t3, const Tensor &t4) {
// feed
auto scope = program_.scope;
Variable *x1_feed_value = scope->Var("conv2d_3.tmp_1");
auto tensor_x1 = x1_feed_value->GetMutable<Tensor>();
tensor_x1->ShareDataWith(t1);
Variable *x2_feed_value = scope->Var("conv2d_5.tmp_1");
auto tensor_x2 = x2_feed_value->GetMutable<Tensor>();
tensor_x2->ShareDataWith(t2);
Variable *x3_feed_value = scope->Var("conv2d_7.tmp_1");
auto tensor_x3 = x3_feed_value->GetMutable<Tensor>();
tensor_x3->ShareDataWith(t3);
Variable *x4_feed_value = scope->Var("conv2d_8.tmp_1");
auto tensor_x4 = x4_feed_value->GetMutable<Tensor>();
tensor_x4->ShareDataWith(t4);
Variable *con_output = scope->Var("concat_0.tmp_0");
auto *output_tensor = con_output->GetMutable<Tensor>();
output_tensor->mutable_data<float>({4, 100, 2, 2});
// DLOG << typeid(output_tensor).name();
// DLOG << "output_tensor dims: " << output_tensor->dims();
std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
out_tensor.reset(output_tensor);
predict_concat(t1, t2, t3, t4, 0);
return out_tensor;
}
private:
const framework::Program<Dtype> program_;
std::shared_ptr<ProgramDesc> to_predict_program_;
std::map<framework::BlockDesc,
std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
ops_of_block_;
bool use_optimize_ = false;
void predict_concat(const Tensor &t1, const Tensor &t2, const Tensor &t3,
const Tensor &t4, int block_id) {
std::shared_ptr<BlockDesc> to_predict_block =
to_predict_program_->Block(block_id);
for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
auto op = ops_of_block_[*to_predict_block.get()][j];
DLOG << "op -> run()";
op->Run();
}
}
};
template class TestConcatOp<CPU>;
} // namespace framework
} // namespace paddle_mobile
int main() {
DLOG << "----------**********----------";
DLOG << "begin to run ConcatOp Test";
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(std::string("../../test/models/googlenet"));
/// input x (4,10,2,2)
paddle_mobile::framework::Tensor inputx1;
SetupTensor<float>(&inputx1, {4, 10, 2, 2}, static_cast<float>(0),
static_cast<float>(1));
auto *inputx1_ptr = inputx1.data<float>();
/// input x (4,20,2,2)
paddle_mobile::framework::Tensor inputx2;
SetupTensor<float>(&inputx2, {4, 20, 2, 2}, static_cast<float>(0),
static_cast<float>(1));
auto *inputx2_ptr = inputx2.data<float>();
/// input x (4,30,2,2)
paddle_mobile::framework::Tensor inputx3;
SetupTensor<float>(&inputx3, {4, 30, 2, 2}, static_cast<float>(0),
static_cast<float>(1));
auto *inputx3_ptr = inputx3.data<float>();
/// input x (4,40,2,2)
paddle_mobile::framework::Tensor inputx4;
SetupTensor<float>(&inputx4, {4, 40, 2, 2}, static_cast<float>(0),
static_cast<float>(1));
auto *inputx4_ptr = inputx4.data<float>();
paddle_mobile::framework::TestConcatOp<paddle_mobile::CPU> testConcatOp(
program);
auto output_concat =
testConcatOp.predict_concat(inputx1, inputx2, inputx3, inputx4);
auto *output_concat_ptr = output_concat->data<float>();
auto program = loader.Load(g_googlenet);
PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
"program file read fail");
Executor4Test<paddle_mobile::CPU,
paddle_mobile::operators::ConcatOp<paddle_mobile::CPU, float>>
executor(program, "concat");
// 1. input_tensors;
vector<Tensor> input_tensors;
Tensor input1;
auto input1_data = CreateInput<float>(&input1, {4, 10, 2, 2}, 0, 1);
input_tensors.push_back(input1);
Tensor input2;
auto input2_data = CreateInput<float>(&input2, {4, 20, 2, 2}, 0, 1);
input_tensors.push_back(input2);
Tensor input3;
auto input3_data = CreateInput<float>(&input3, {4, 30, 2, 2}, 0, 1);
input_tensors.push_back(input3);
Tensor input4;
auto input4_data = CreateInput<float>(&input4, {4, 40, 2, 2}, 0, 1);
input_tensors.push_back(input4);
// 2. input_names
vector<string> input_names({
"conv2d_3.tmp_1",
"conv2d_5.tmp_1",
"conv2d_7.tmp_1",
"conv2d_8.tmp_1",
});
// 3. output_names
vector<string> output_names({"concat_0.tmp_0"});
// 4. out_dims;
vector<DDim> out_ddims;
auto out_ddim = paddle_mobile::framework::make_ddim({4, 100, 2, 2});
out_ddims.push_back(out_ddim);
auto output = executor.Predict<LoDTensor>(input_tensors, input_names,
output_names, out_ddims);
auto output0_data = output[0]->data<float>();
// 5. test one example.
int input_n = 1;
int input_c = 2;
int input_h = 0;
int input_w = 1;
int stride0 = inputx3.numel() / inputx3.dims()[0];
int stride1 = inputx3.numel() / inputx3.dims()[0] / inputx3.dims()[1];
int stride2 = inputx3.dims()[3];
int stride0 = input3.numel() / input3.dims()[0];
int stride1 = input3.numel() / input3.dims()[0] / input3.dims()[1];
int stride2 = input3.dims()[3];
/// inputx1 (4,10,2,2),
/// inputx2 (4,20,2,2),
/// inputx3 (4,30,2,2),
......@@ -163,10 +79,10 @@ int main() {
int input_index =
input_n * stride0 + input_c * stride1 + input_h * stride2 + input_w;
int output_index = input_n * 100 * 2 * 2 +
(input_c + inputx1.dims()[1] + inputx2.dims()[1]) * 2 * 2 +
(input_c + input1.dims()[1] + input2.dims()[1]) * 2 * 2 +
input_h * 2 + input_w;
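// Worked arithmetic for the indices above (batch 4, concat along axis 1):
//   stride0 = 30 * 2 * 2 = 120, stride1 = 2 * 2 = 4, stride2 = 2
//   input_index  = 1 * 120 + 2 * 4 + 0 * 2 + 1 = 129
//   output_index = 1 * 400 + (2 + 10 + 20) * 4 + 0 * 2 + 1 = 529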
DLOG << " inputx3[1,2,0,1] = " << inputx3_ptr[input_index];
DLOG << " output[1,12,0,1] = " << output_concat_ptr[output_index];
DLOG << " input3 [1, 2,0,1] = " << input3_data[input_index];
DLOG << " output [1,32,0,1] = " << output0_data[output_index];
return 0;
}
......@@ -34,7 +34,7 @@ int main() {
// static_cast<float>(1));
auto out_ddim = paddle_mobile::framework::make_ddim({1, 64, 112, 112});
auto output = executor.predict(input, "data", "conv2d_0.tmp_0", out_ddim);
auto output = executor.Predict(input, "data", "conv2d_0.tmp_0", out_ddim);
auto output_ptr = output->data<float>();
for (int j = 0; j < output->numel(); ++j) {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../executor_for_test.h"
#include "../test_include.h"
#include "operators/depthwise_conv_op.h"
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
// ../models/image_classification_resnet.inference.model
auto program = loader.Load(g_mobilenet_ssd);
PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
"program file read fail");
Executor4Test<paddle_mobile::CPU, paddle_mobile::operators::DepthwiseConvOp<
paddle_mobile::CPU, float>>
executor(program, "depthwise_conv2d");
paddle_mobile::framework::LoDTensor input;
// GetInput<float>(g_test_image_1x3x224x224, &input, {1, 3, 224, 224});
// use SetupTensor if not has local input image .
SetupTensor<float>(&input, {1, 32, 150, 150}, static_cast<float>(0),
static_cast<float>(1));
auto input_ptr = input.data<float>();
auto out_ddim = paddle_mobile::framework::make_ddim({1, 32, 150, 150});
auto output = executor.Predict(input, "batch_norm_0.tmp_3",
"depthwise_conv2d_0.tmp_0", out_ddim);
auto output_ptr = output->data<float>();
for (int j = 0; j < output->numel(); ++j) {
DLOG << " value of output: " << output_ptr[j];
}
return 0;
}
......@@ -12,133 +12,52 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "../executor_for_test.h"
#include "../test_include.h"
#include "operators/elementwise_add_op.h"
namespace paddle_mobile {
namespace framework {
template <typename Dtype>
class TestElementwiseAddOp {
public:
explicit TestElementwiseAddOp(const Program<Dtype> p) : program_(p) {
if (use_optimize_) {
to_predict_program_ = program_.optimizeProgram;
} else {
to_predict_program_ = program_.originProgram;
}
const std::vector<std::shared_ptr<BlockDesc>> blocks =
to_predict_program_->Blocks();
// DLOG << " **block size " << blocks.size();
for (int i = 0; i < blocks.size(); ++i) {
std::shared_ptr<BlockDesc> block_desc = blocks[i];
std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
// DLOG << " ops " << ops.size();
for (int j = 0; j < ops.size(); ++j) {
std::shared_ptr<OpDesc> op = ops[j];
if (op->Type() == "elementwise_add" &&
op->Input("X")[0] == "batch_norm_2.tmp_2") {
DLOG << " elementwise_add attr size: " << op->GetAttrMap().size();
DLOG << " inputs size: " << op->GetInputs().size();
DLOG << " outputs size: " << op->GetOutputs().size();
DLOG << " Input X is : " << op->Input("X")[0];
DLOG << " Input Y is : " << op->Input("Y")[0];
DLOG << " Output Out is : " << op->Output("Out")[0];
Attribute axis_attr = op->GetAttrMap().at("axis");
int axis = axis_attr.Get<int>();
DLOG << " Attr axis is : " << axis;
std::shared_ptr<operators::ElementwiseAddOp<Dtype, float>> add =
std::make_shared<operators::ElementwiseAddOp<Dtype, float>>(
op->Type(), op->GetInputs(), op->GetOutputs(),
op->GetAttrMap(), program_.scope);
ops_of_block_[*block_desc.get()].push_back(add);
}
}
}
}
std::shared_ptr<Tensor> predict_add(const Tensor &t1, const Tensor &t2) {
// feed
auto scope = program_.scope;
Variable *x_feed_value = scope->Var("batch_norm_2.tmp_2");
auto tensor_x = x_feed_value->GetMutable<Tensor>();
tensor_x->ShareDataWith(t1);
Variable *y_feed_value = scope->Var("batch_norm_0.tmp_3");
auto tensor_y = y_feed_value->GetMutable<Tensor>();
tensor_y->ShareDataWith(t2);
Variable *con_output = scope->Var("elementwise_add_0.tmp_0");
auto *output_tensor = con_output->GetMutable<Tensor>();
output_tensor->mutable_data<float>({1, 3, 224, 224});
// DLOG << typeid(output_tensor).name();
// DLOG << "output_tensor dims: " << output_tensor->dims();
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(g_resnet);
PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
"program file read fail");
std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
out_tensor.reset(output_tensor);
Executor4Test<paddle_mobile::CPU, paddle_mobile::operators::ElementwiseAddOp<
paddle_mobile::CPU, float>>
executor(program, "elementwise_add");
predict_add(t1, t2, 0);
return out_tensor;
}
// 1. input_tensors;
vector<Tensor> input_tensors;
private:
const framework::Program<Dtype> program_;
std::shared_ptr<ProgramDesc> to_predict_program_;
std::map<framework::BlockDesc,
std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
ops_of_block_;
bool use_optimize_ = false;
Tensor input1;
auto input1_data = CreateInput<float>(&input1, {1, 3, 224, 224}, 0, 1);
input_tensors.push_back(input1);
void predict_add(const Tensor &t1, const Tensor &t2, int block_id) {
std::shared_ptr<BlockDesc> to_predict_block =
to_predict_program_->Block(block_id);
for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
auto op = ops_of_block_[*to_predict_block.get()][j];
DLOG << "op -> run()";
op->Run();
}
}
};
Tensor input2;
auto input2_data = CreateInput<float>(&input2, {224}, 0, 1);
input_tensors.push_back(input2);
template class TestElementwiseAddOp<CPU>;
} // namespace framework
} // namespace paddle_mobile
int main() {
DLOG << "----------**********----------";
DLOG << "begin to run ElementAddOp Test";
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program =
loader.Load(std::string("../models/"
"image_classification_resnet.inference.model"));
// 2. input_names
vector<string> input_names({
"batch_norm_2.tmp_2",
"batch_norm_0.tmp_3",
});
/// input x (1,3,224,224)
paddle_mobile::framework::Tensor inputx;
SetupTensor<float>(&inputx, {1, 3, 224, 224}, static_cast<float>(0),
static_cast<float>(1));
auto *inputx_ptr = inputx.data<float>();
/// input y (224,)
paddle_mobile::framework::Tensor inputy;
SetupTensor<float>(&inputy, {224}, static_cast<float>(0),
static_cast<float>(1));
auto *inputy_ptr = inputy.data<float>();
// 3. output_names
vector<string> output_names({"elementwise_add_0.tmp_0"});
paddle_mobile::framework::TestElementwiseAddOp<paddle_mobile::CPU>
testElementwiseAddOp(program);
// 4. out_dims;
vector<DDim> out_ddims;
auto out_ddim = paddle_mobile::framework::make_ddim({1, 3, 224, 224});
out_ddims.push_back(out_ddim);
auto output_add = testElementwiseAddOp.predict_add(inputx, inputy);
auto *output_add_ptr = output_add->data<float>();
// for (int j = 0; j < output_add->numel(); ++j) {
// DLOG << "value of output: " << output_add_ptr[j];
// }
auto output = executor.Predict<LoDTensor>(input_tensors, input_names,
output_names, out_ddims);
auto output0_data = output[0]->data<float>();
/// output (1,3,224,224)
DLOG << "output memory size : " << output_add->memory_size();
DLOG << "output numel : " << output_add->numel();
DLOG << "output memory size : " << output[0]->memory_size();
DLOG << "output numel : " << output[0]->numel();
DLOG << inputx_ptr[226] << " + " << inputy_ptr[2] << " = "
<< output_add_ptr[226];
return 0;
DLOG << input1_data[226] << " + " << input2_data[2] << " = "
<< output0_data[226];
return 0;
}
......@@ -64,24 +64,24 @@ class TestFcOp {
// feed
auto scope = program_.scope;
Variable *x_feed_value = scope->Var("pool2d_13.tmp_0");
auto tensor_x = x_feed_value->GetMutable<Tensor>();
auto tensor_x = x_feed_value->GetMutable<LoDTensor>();
tensor_x->ShareDataWith(t1);
Variable *y_feed_value = scope->Var("loss3_classifier-loc_weights");
auto tensor_y = y_feed_value->GetMutable<Tensor>();
auto tensor_y = y_feed_value->GetMutable<LoDTensor>();
tensor_y->ShareDataWith(t2);
Variable *z_feed_value = scope->Var("loss3_classifier-loc_biases");
auto tensor_z = z_feed_value->GetMutable<Tensor>();
auto tensor_z = z_feed_value->GetMutable<LoDTensor>();
tensor_z->ShareDataWith(t3);
Variable *con_output = scope->Var("loss3_classifier-loc.tmp_1");
auto *output_tensor = con_output->GetMutable<Tensor>();
auto *output_tensor = con_output->GetMutable<LoDTensor>();
output_tensor->mutable_data<float>({3, 10});
// DLOG << typeid(output_tensor).name();
// DLOG << "output_tensor dims: " << output_tensor->dims();
std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
std::shared_ptr<LoDTensor> out_tensor = std::make_shared<LoDTensor>();
out_tensor.reset(output_tensor);
predict(t1, t2, t3, 0);
......@@ -116,7 +116,7 @@ int main() {
DLOG << "begin to run Fc Test";
paddle_mobile::Loader<paddle_mobile::CPU> loader;
// "../../../test/models/googlenet"
auto program = loader.Load("../models/googlenet");
auto program = loader.Load(g_googlenet);
paddle_mobile::framework::ProgramOptimize optimize;
// program.originProgram->Description("origin");
auto optimize_program = optimize.FushionOptimize(program.originProgram);
......@@ -130,17 +130,17 @@ int main() {
}
/// input x (1,3,224,224)
paddle_mobile::framework::Tensor inputx;
paddle_mobile::framework::LoDTensor inputx;
SetupTensor<float>(&inputx, {3, 64, 1, 1}, static_cast<float>(1),
static_cast<float>(1));
auto *inputx_ptr = inputx.data<float>();
/// input y (224,)
paddle_mobile::framework::Tensor inputy;
paddle_mobile::framework::LoDTensor inputy;
SetupTensor<float>(&inputy, {64, 10}, static_cast<float>(1.5),
static_cast<float>(1.5));
auto *inputy_ptr = inputy.data<float>();
paddle_mobile::framework::Tensor inputz;
paddle_mobile::framework::LoDTensor inputz;
SetupTensor<float>(&inputz, {10}, static_cast<float>(0),
static_cast<float>(1));
auto *inputz_ptr = inputz.data<float>();
......
......@@ -12,118 +12,51 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "../executor_for_test.h"
#include "../test_include.h"
#include "operators/lrn_op.h"
namespace paddle_mobile {
namespace framework {
template <typename Dtype>
class TestLrnOp {
public:
explicit TestLrnOp(const Program<Dtype> p) : program_(p) {
if (use_optimize_) {
to_predict_program_ = program_.optimizeProgram;
} else {
to_predict_program_ = program_.originProgram;
}
const std::vector<std::shared_ptr<BlockDesc>> blocks =
to_predict_program_->Blocks();
// DLOG << " **block size " << blocks.size();
for (int i = 0; i < blocks.size(); ++i) {
std::shared_ptr<BlockDesc> block_desc = blocks[i];
std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
// DLOG << " ops " << ops.size();
for (int j = 0; j < ops.size(); ++j) {
std::shared_ptr<OpDesc> op = ops[j];
if (op->Type() == "lrn" && op->Input("X")[0] == "pool2d_0.tmp_0") {
DLOG << " mul attr size: " << op->GetAttrMap().size();
DLOG << " inputs size: " << op->GetInputs().size();
DLOG << " outputs size: " << op->GetOutputs().size();
DLOG << " Input X is : " << op->Input("X")[0];
DLOG << " Output Out is : " << op->Output("Out")[0];
DLOG << " n : " << op->GetAttrMap().at("n").Get<int>();
DLOG << " alpha : " << op->GetAttrMap().at("alpha").Get<float>();
DLOG << " beta : " << op->GetAttrMap().at("beta").Get<float>();
DLOG << " k : " << op->GetAttrMap().at("k").Get<float>();
std::shared_ptr<operators::LrnOp<Dtype, float>> lrn =
std::make_shared<operators::LrnOp<Dtype, float>>(
op->Type(), op->GetInputs(), op->GetOutputs(),
op->GetAttrMap(), program_.scope);
ops_of_block_[*block_desc.get()].push_back(lrn);
}
}
}
}
std::shared_ptr<Tensor> predict_lrn(const Tensor &t1) {
// feed
auto scope = program_.scope;
Variable *x1_feed_value = scope->Var("pool2d_0.tmp_0");
auto tensor_x1 = x1_feed_value->GetMutable<Tensor>();
tensor_x1->ShareDataWith(t1);
Variable *con_output = scope->Var("pool1_norm1.tmp_1");
auto *output_tensor = con_output->GetMutable<Tensor>();
output_tensor->mutable_data<float>({3, 4, 2, 2});
// DLOG << typeid(output_tensor).name();
// DLOG << "output_tensor dims: " << output_tensor->dims();
std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
out_tensor.reset(output_tensor);
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(g_googlenet);
PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
"program file read fail");
predict_lrn(t1, 0);
return out_tensor;
}
Executor4Test<paddle_mobile::CPU,
paddle_mobile::operators::LrnOp<paddle_mobile::CPU, float>>
executor(program, "lrn");
private:
const framework::Program<Dtype> program_;
std::shared_ptr<ProgramDesc> to_predict_program_;
std::map<framework::BlockDesc,
std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
ops_of_block_;
bool use_optimize_ = false;
// 1. input_tensors;
vector<Tensor> input_tensors;
void predict_lrn(const Tensor &t1, int block_id) {
std::shared_ptr<BlockDesc> to_predict_block =
to_predict_program_->Block(block_id);
for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
auto op = ops_of_block_[*to_predict_block.get()][j];
DLOG << "op -> run()";
op->Run();
}
}
};
Tensor input1;
auto input1_data = CreateInput<float>(&input1, {3, 4, 2, 2}, 0, 1);
input_tensors.push_back(input1);
template class TestLrnOp<CPU>;
} // namespace framework
} // namespace paddle_mobile
// 2. input_names
vector<string> input_names({
"pool2d_0.tmp_0",
});
int main() {
DLOG << "----------**********----------";
DLOG << "begin to run LrnOp Test";
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(std::string("../../test/models/googlenet"));
// 3. output_names
vector<string> output_names({"pool1_norm1.tmp_1"});
/// input x (3,4,2,2)
paddle_mobile::framework::Tensor inputx1;
SetupTensor<float>(&inputx1, {3, 4, 2, 2}, static_cast<float>(0),
static_cast<float>(1));
auto *inputx1_ptr = inputx1.data<float>();
// 4. out_dims;
vector<DDim> out_ddims;
auto out_ddim = paddle_mobile::framework::make_ddim({3, 4, 2, 2});
out_ddims.push_back(out_ddim);
paddle_mobile::framework::TestLrnOp<paddle_mobile::CPU> testLrnOp(program);
auto output = executor.Predict<LoDTensor>(input_tensors, input_names,
output_names, out_ddims);
auto output_lrn = testLrnOp.predict_lrn(inputx1);
auto *output_lrn_ptr = output_lrn->data<float>();
auto output0_data = output[0]->data<float>();
DLOG << " LrnOp input: ";
for (int i = 0; i < 3; i++) {
for (int j = 0; j < 4; j++) {
for (int c = 0; c < 2; c++) {
for (int d = 0; d < 2; d++) {
DLOGF("%f ", inputx1_ptr[i * 16 + j * 4 + c * 2 + d]);
DLOGF("%f ", input1_data[i * 16 + j * 4 + c * 2 + d]);
}
DLOGF("\n");
}
......@@ -136,7 +69,7 @@ int main() {
for (int j = 0; j < 4; j++) {
for (int c = 0; c < 2; c++) {
for (int d = 0; d < 2; d++) {
DLOGF("%f ", output_lrn_ptr[i * 16 + j * 4 + c * 2 + d]);
DLOGF("%f ", output0_data[i * 16 + j * 4 + c * 2 + d]);
}
DLOGF("\n");
}
......@@ -144,8 +77,8 @@ int main() {
}
DLOGF("\n");
}
DLOG << inputx1_ptr[0] << " / ((1 + 0.00002 * ( " << inputx1_ptr[0] << "^2 + "
<< inputx1_ptr[4] << "^2 + " << inputx1_ptr[8] << "^2 ))^0.75) = ";
DLOG << output_lrn_ptr[0];
DLOG << input1_data[0] << " / ((1 + 0.00002 * ( " << input1_data[0] << "^2 + "
<< input1_data[4] << "^2 + " << input1_data[8] << "^2 ))^0.75) = ";
DLOG << output0_data[0];
return 0;
}
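For reference, a minimal sketch of the across-channel LRN value spot-checked above, assuming k = 1, alpha = 0.00002, beta = 0.75 and a 5-wide channel window (which matches the printed formula); expected_lrn and its parameters are illustrative, not part of the test:
#include <algorithm>
#include <cmath>
// Reference LRN for one element at channel c of an {N, C, H, W} tensor;
// `stride` is H * W (4 here) and `hw` the spatial offset within a channel.
float expected_lrn(const float *in, int c, int C, int hw, int stride) {
  float sum = 0.0f;
  for (int i = std::max(0, c - 2); i <= std::min(C - 1, c + 2); ++i) {
    float v = in[i * stride + hw];
    sum += v * v;
  }
  return in[c * stride + hw] / std::pow(1.0f + 0.00002f * sum, 0.75f);
}
// e.g. expected_lrn(input1_data, 0, 4, 0, 4) should match output0_data[0].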
......@@ -12,158 +12,81 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "../executor_for_test.h"
#include "../test_include.h"
#include "operators/mul_op.h"
namespace paddle_mobile {
namespace framework {
template <typename Dtype>
class TestMulOp {
public:
explicit TestMulOp(const Program<Dtype> p) : program_(p) {
if (use_optimize_) {
to_predict_program_ = program_.optimizeProgram;
} else {
to_predict_program_ = program_.originProgram;
}
const std::vector<std::shared_ptr<BlockDesc>> blocks =
to_predict_program_->Blocks();
// DLOG << " **block size " << blocks.size();
for (int i = 0; i < blocks.size(); ++i) {
std::shared_ptr<BlockDesc> block_desc = blocks[i];
std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
// DLOG << " ops " << ops.size();
for (int j = 0; j < ops.size(); ++j) {
std::shared_ptr<OpDesc> op = ops[j];
if (op->Type() == "mul" && op->Input("X")[0] == "pool2d_0.tmp_0") {
DLOG << " mul attr size: " << op->GetAttrMap().size();
DLOG << " inputs size: " << op->GetInputs().size();
DLOG << " outputs size: " << op->GetOutputs().size();
DLOG << " Input X is : " << op->Input("X")[0];
DLOG << " Input Y is : " << op->Input("Y")[0];
DLOG << " Output Out is : " << op->Output("Out")[0];
DLOG << "x_num_col_dims : "
<< op->GetAttrMap().at("x_num_col_dims").Get<int>();
DLOG << "y_num_col_dims : "
<< op->GetAttrMap().at("y_num_col_dims").Get<int>();
std::shared_ptr<operators::MulOp<Dtype, float>> mul =
std::make_shared<operators::MulOp<Dtype, float>>(
op->Type(), op->GetInputs(), op->GetOutputs(),
op->GetAttrMap(), program_.scope);
ops_of_block_[*block_desc.get()].push_back(mul);
}
}
}
}
std::shared_ptr<Tensor> predict_mul(const Tensor &t1, const Tensor &t2) {
// feed
auto scope = program_.scope;
Variable *x_feed_value = scope->Var("pool2d_0.tmp_0");
auto tensor_x = x_feed_value->GetMutable<Tensor>();
tensor_x->ShareDataWith(t1);
Variable *y_feed_value = scope->Var("fc_0.w_0");
auto tensor_y = y_feed_value->GetMutable<Tensor>();
tensor_y->ShareDataWith(t2);
Variable *con_output = scope->Var("fc_0.tmp_0");
auto *output_tensor = con_output->GetMutable<Tensor>();
output_tensor->mutable_data<float>({3, 3});
// DLOG << typeid(output_tensor).name();
// DLOG << "output_tensor dims: " << output_tensor->dims();
std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
out_tensor.reset(output_tensor);
predict_mul(t1, t2, 0);
return out_tensor;
}
private:
const framework::Program<Dtype> program_;
std::shared_ptr<ProgramDesc> to_predict_program_;
std::map<framework::BlockDesc,
std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
ops_of_block_;
bool use_optimize_ = false;
void predict_mul(const Tensor &t1, const Tensor &t2, int block_id) {
std::shared_ptr<BlockDesc> to_predict_block =
to_predict_program_->Block(block_id);
for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
auto op = ops_of_block_[*to_predict_block.get()][j];
DLOG << "op -> run()";
op->Run();
}
}
};
template class TestMulOp<CPU>;
} // namespace framework
} // namespace paddle_mobile
int main() {
DLOG << "----------**********----------";
DLOG << "begin to run MulOp Test";
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program =
loader.Load(std::string("../../test/models/"
"image_classification_resnet.inference.model"));
/// input x (3,2,1,1)
paddle_mobile::framework::Tensor inputx;
SetupTensor<float>(&inputx, {3, 2, 1, 1}, static_cast<float>(0),
static_cast<float>(1));
auto *inputx_ptr = inputx.data<float>();
/// input y (2,3)
paddle_mobile::framework::Tensor inputy;
SetupTensor<float>(&inputy, {2, 3}, static_cast<float>(0),
static_cast<float>(1));
auto *inputy_ptr = inputy.data<float>();
paddle_mobile::framework::TestMulOp<paddle_mobile::CPU> testMulOp(program);
auto output_mul = testMulOp.predict_mul(inputx, inputy);
auto *output_mul_ptr = output_mul->data<float>();
auto dimx_1 = inputx.numel() / inputx.dims()[0];
DLOG << " inputx : ";
for (int i = 0; i < inputx.dims()[0]; ++i) {
for (int j = 0; j < dimx_1; ++j) {
DLOGF("%f ", inputx_ptr[i * dimx_1 + j]);
auto program = loader.Load(g_resnet);
PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
"program file read fail");
Executor4Test<paddle_mobile::CPU,
paddle_mobile::operators::MulOp<paddle_mobile::CPU, float>>
executor(program, "mul");
// 1. input_tensors;
vector<Tensor> input_tensors;
Tensor input1;
auto input1_data = CreateInput<float>(&input1, {3, 2, 1, 1}, 0, 1);
input_tensors.push_back(input1);
Tensor input2;
auto input2_data = CreateInput<float>(&input2, {2, 3}, 0, 1);
input_tensors.push_back(input2);
// 2. input_names
vector<string> input_names({
"pool2d_0.tmp_0",
"fc_0.w_0",
});
// 3. output_names
vector<string> output_names({"fc_0.tmp_0"});
// 4. out_dims;
vector<DDim> out_ddims;
auto out_ddim = paddle_mobile::framework::make_ddim({3, 3});
out_ddims.push_back(out_ddim);
auto output = executor.Predict<LoDTensor>(input_tensors, input_names,
output_names, out_ddims);
auto output0_data = output[0]->data<float>();
auto dim_1 = input1.numel() / input1.dims()[0];
DLOG << " input1 : ";
for (int i = 0; i < input1.dims()[0]; ++i) {
for (int j = 0; j < dim_1; ++j) {
DLOGF("%f ", input1_data[i * dim_1 + j]);
}
DLOGF("\n");
}
auto dimy_1 = inputy.numel() / inputy.dims()[0];
DLOG << " inputy : ";
for (int i = 0; i < inputy.dims()[0]; ++i) {
for (int j = 0; j < dimy_1; ++j) {
DLOGF("%f ", inputy_ptr[i * dimx_1 + j]);
auto dim_2 = input2.numel() / input2.dims()[0];
DLOG << " input2 : ";
for (int i = 0; i < input2.dims()[0]; ++i) {
for (int j = 0; j < dim_2; ++j) {
DLOGF("%f ", input2_data[i * dim_2 + j]);
}
DLOGF("\n");
}
auto dim_output_1 = output_mul->numel() / output_mul->dims()[0];
auto dim_output0 = output[0]->numel() / output[0]->dims()[0];
DLOG << " output : ";
for (int i = 0; i < output_mul->dims()[0]; ++i) {
for (int j = 0; j < dim_output_1; ++j) {
DLOGF("%f ", output_mul_ptr[i * dimy_1 + j]);
for (int i = 0; i < output[0]->dims()[0]; ++i) {
for (int j = 0; j < dim_output0; ++j) {
DLOGF("%f ", output0_data[i * dim_2 + j]);
}
DLOGF("\n");
}
/// output (3,3)
DLOG << "output memory size : " << output_mul->memory_size();
DLOG << "output numel : " << output_mul->numel();
DLOG << "output memory size : " << output[0]->memory_size();
DLOG << "output numel : " << output[0]->numel();
DLOG << inputx_ptr[0] << " x " << inputy_ptr[0] << " + " << inputx_ptr[1]
<< " x " << inputy_ptr[0 + 3] << " = " << output_mul_ptr[0];
DLOG << input1_data[0] << " x " << input2_data[0] << " + " << input1_data[1]
<< " x " << input2_data[0 + 3] << " = " << output0_data[0];
return 0;
}
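As a fuller check than the single-element print above, a sketch of the reference product, assuming (per the dumped x_num_col_dims / y_num_col_dims attributes) that the {3, 2, 1, 1} input flattens to a (3, 2) matrix; expected_mul is illustrative, not part of the test:
// Plain row-by-column product for the (3, 2) x (2, 3) case above:
// out[i][j] = sum_k x[i][k] * y[k][j].
void expected_mul(const float *x, const float *y, float *out) {
  for (int i = 0; i < 3; ++i) {
    for (int j = 0; j < 3; ++j) {
      float acc = 0.0f;
      for (int k = 0; k < 2; ++k) {
        acc += x[i * 2 + k] * y[k * 3 + j];
      }
      out[i * 3 + j] = acc;
    }
  }
}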
......@@ -77,15 +77,15 @@ class TestMultiClassNMSOp {
// feed
auto scope = program_.scope;
Variable *x1_feed_value = scope->Var("box_coder_0.tmp_0");
auto tensor_x1 = x1_feed_value->GetMutable<Tensor>();
auto tensor_x1 = x1_feed_value->GetMutable<LoDTensor>();
tensor_x1->ShareDataWith(t1);
Variable *x2_feed_value = scope->Var("transpose_12.tmp_0");
auto tensor_x2 = x2_feed_value->GetMutable<Tensor>();
auto tensor_x2 = x2_feed_value->GetMutable<LoDTensor>();
tensor_x2->ShareDataWith(t2);
Variable *output = scope->Var("detection_output_0.tmp_0");
auto *output_tensor = output->GetMutable<Tensor>();
auto *output_tensor = output->GetMutable<LoDTensor>();
output_tensor->mutable_data<float>({1917, 6});
// DLOG << typeid(output_tensor).name();
......
......@@ -18,7 +18,7 @@ limitations under the License. */
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(std::string("../models/googlenet"));
auto program = loader.Load(std::string(g_googlenet));
if (program.originProgram == nullptr) {
DLOG << "program read file";
}
......@@ -32,7 +32,7 @@ int main() {
static_cast<float>(1));
auto out_ddim = paddle_mobile::framework::make_ddim({1, 64, 56, 56});
auto output =
executor.predict(input, "conv2d_0.tmp_1", "pool2d_0.tmp_0", out_ddim);
executor.Predict(input, "conv2d_0.tmp_1", "pool2d_0.tmp_0", out_ddim);
float *output_ptr = output->data<float>();
for (int j = 0; j < output->numel(); ++j) {
......
......@@ -72,19 +72,19 @@ class TestPriorBoxOp {
// feed
auto scope = program_.scope;
Variable *x1_feed_value = scope->Var("image");
auto tensor_x1 = x1_feed_value->GetMutable<Tensor>();
auto tensor_x1 = x1_feed_value->GetMutable<LoDTensor>();
tensor_x1->ShareDataWith(t1);
Variable *x2_feed_value = scope->Var("batch_norm_26.tmp_3");
auto tensor_x2 = x2_feed_value->GetMutable<Tensor>();
auto tensor_x2 = x2_feed_value->GetMutable<LoDTensor>();
tensor_x2->ShareDataWith(t2);
Variable *boxes_output = scope->Var("prior_box_1.tmp_0");
auto *boxes_output_tensor = boxes_output->GetMutable<Tensor>();
auto *boxes_output_tensor = boxes_output->GetMutable<LoDTensor>();
boxes_output_tensor->mutable_data<float>({10, 10, 6, 4});
Variable *variances_output = scope->Var("prior_box_1.tmp_1");
auto *variances_output_tesnor = variances_output->GetMutable<Tensor>();
auto *variances_output_tesnor = variances_output->GetMutable<LoDTensor>();
variances_output_tesnor->mutable_data<float>({10, 10, 6, 4});
// DLOG << typeid(output_tensor).name();
// DLOG << "output_tensor dims: " << output_tensor->dims();
......@@ -127,7 +127,7 @@ int main() {
DLOG << "----------**********----------";
DLOG << "begin to run PriorBoxOp Test";
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(std::string("../../test/models/mobilenet+ssd"));
auto program = loader.Load(std::string(g_mobilenet_ssd));
/// input x (1,3,300,300)
paddle_mobile::framework::Tensor input_image;
......
......@@ -14,12 +14,11 @@ limitations under the License. */
#include "../executor_for_test.h"
#include "../test_include.h"
#include "operators/relu_op.h"
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
// ../models/image_classification_resnet.inference.model
auto program = loader.Load(g_mobilenet_ssd);
auto program = loader.Load(g_resnet);
PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
"program file read fail");
......@@ -27,17 +26,33 @@ int main() {
paddle_mobile::operators::ReluOp<paddle_mobile::CPU, float>>
executor(program, "relu");
paddle_mobile::framework::Tensor input;
SetupTensor<float>(&input, {1, 2, 3, 4}, static_cast<float>(-1),
static_cast<float>(1));
// 1. input_tensors;
vector<Tensor> input_tensors;
Tensor input1;
auto input1_data = CreateInput<float>(&input1, {1, 2, 3, 4}, -1, 1);
input_tensors.push_back(input1);
// 2. input_names
vector<string> input_names({
"batch_norm_0.tmp_2",
});
// 3. output_names
vector<string> output_names({"batch_norm_0.tmp_3"});
// 4. out_dims;
vector<DDim> out_ddims;
auto out_ddim = paddle_mobile::framework::make_ddim({1, 2, 3, 4});
auto output = executor.predict(input, "batch_norm_0.tmp_2",
"batch_norm_0.tmp_3", out_ddim);
out_ddims.push_back(out_ddim);
auto output = executor.Predict<LoDTensor>(input_tensors, input_names,
output_names, out_ddims);
auto output0_data = output[0]->data<float>();
auto output_ptr = output->data<float>();
for (int j = 0; j < output->numel(); ++j) {
DLOG << " value of output: " << output_ptr[j];
for (int j = 0; j < output[0]->numel(); ++j) {
DLOG << " value of output: " << output0_data[j];
}
return 0;
}
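// Sanity note for the loop above (standard ReLU): each output0_data[j] should
// equal std::max(0.0f, input1_data[j]), so with inputs drawn from [-1, 1]
// roughly half of the printed values are expected to be exactly 0.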
......@@ -14,11 +14,11 @@ limitations under the License. */
#include "../executor_for_test.h"
#include "../test_helper.h"
#include "./io.h"
#include "io.h"
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(std::string("../../test/models/mobilenet+ssd"));
auto program = loader.Load(std::string(g_mobilenet_ssd));
if (program.originProgram == nullptr) {
DLOG << "program read file";
}
......@@ -31,7 +31,7 @@ int main() {
auto input_ptr = input.data<float>();
auto out_ddim = paddle_mobile::framework::make_ddim({2, 9, 2});
auto output =
executor.predict(input, "transpose_0.tmp_0", "reshape_0.tmp_0", out_ddim);
executor.Predict(input, "transpose_0.tmp_0", "reshape_0.tmp_0", out_ddim);
auto *output_ptr = output->data<float>();
DLOG << "input : ";
......
......@@ -14,21 +14,17 @@ limitations under the License. */
#include "../../src/operators/kernel/sigmoid_kernel.h"
#include "../test_helper.h"
#include "./io.h"
#include "io.h"
int main() {
paddle_mobile::framework::Tensor input;
paddle_mobile::framework::Tensor output;
DLOG << 1;
SetupTensor<float>(&input, {1, 4, 60, 60}, static_cast<float>(0),
static_cast<float>(1));
DLOG << 2;
auto out_ddim = paddle_mobile::framework::make_ddim({1, 4, 60, 60});
output.Resize(out_ddim);
DLOG << 3;
paddle_mobile::operators::sigmoid(&input, &output);
DLOG << 4;
auto *output_ptr = output.data<float>();
for (int j = 0; j < output.numel(); ++j) {
DLOG << " value of output: " << output_ptr[j];
......
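For reference, the kernel under test applies the element-wise logistic function, assuming the standard definition:
// out[i] = 1.0f / (1.0f + std::exp(-in[i]))
// so every printed value should lie strictly between 0 and 1 (for inputs in
// [0, 1], between sigmoid(0) = 0.5 and sigmoid(1) ~= 0.731).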
......@@ -14,11 +14,11 @@ limitations under the License. */
#include "../executor_for_test.h"
#include "../test_helper.h"
#include "./io.h"
#include "io.h"
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(std::string("models/mobilenet"));
auto program = loader.Load(std::string(g_mobilenet));
if (program.originProgram == nullptr) {
DLOG << "program read file";
}
......@@ -30,7 +30,7 @@ int main() {
static_cast<float>(1));
auto out_ddim = paddle_mobile::framework::make_ddim({1, 1000});
auto output =
executor.predict(input, "reshape_0.tmp_0", "softmax_0.tmp_0", out_ddim);
executor.Predict(input, "reshape_0.tmp_0", "softmax_0.tmp_0", out_ddim);
auto *output_ptr = output->data<float>();
for (int j = 0; j < output->numel(); ++j) {
DLOG << " value of output: " << output_ptr[j];
......
......@@ -14,11 +14,11 @@ limitations under the License. */
#include "../executor_for_test.h"
#include "../test_helper.h"
#include "./io.h"
#include "io.h"
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(std::string("../../test/models/mobilenet+ssd"));
auto program = loader.Load(std::string(g_mobilenet_ssd));
if (program.originProgram == nullptr) {
DLOG << "program read file";
}
......@@ -31,7 +31,7 @@ int main() {
auto input_ptr = input.data<float>();
auto out_ddim = paddle_mobile::framework::make_ddim({1, 3, 4, 2});
auto output =
executor.predict(input, "conv2d_22.tmp_1", "transpose_0.tmp_0", out_ddim);
executor.Predict(input, "conv2d_22.tmp_1", "transpose_0.tmp_0", out_ddim);
auto *output_ptr = output->data<float>();
DLOG << "input : ";
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <chrono>
#include <fstream>
#include <random>
......@@ -27,8 +28,22 @@ static const std::string g_mobilenet_ssd = "../models/mobilenet+ssd";
static const std::string g_squeezenet = "../models/squeezenet";
static const std::string g_resnet =
"../models/image_classification_resnet.inference.model";
static const std::string g_yolo = "../models/yolo";
static const std::string g_test_image_1x3x224x224 =
"../images/test_image_1x3x224x224_float";
using paddle_mobile::framework::DDim;
using paddle_mobile::framework::Tensor;
using Time = decltype(std::chrono::high_resolution_clock::now());
Time time() { return std::chrono::high_resolution_clock::now(); }
double time_diff(Time t1, Time t2) {
typedef std::chrono::microseconds us;
auto diff = t2 - t1;
us counter = std::chrono::duration_cast<us>(diff);
// count() is in microseconds; divide to report milliseconds.
return counter.count() / 1000.0;
}
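A typical use of the two helpers above, e.g. to time a Predict call (illustrative, not part of the header):
// auto t1 = time();
// executor.Predict<LoDTensor>(input_tensors, input_names, output_names,
//                             out_ddims);
// auto t2 = time();
// DLOG << "predict cost: " << time_diff(t1, t2) << " ms";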
template <typename T>
void SetupTensor(paddle_mobile::framework::Tensor *input,
......@@ -43,6 +58,12 @@ void SetupTensor(paddle_mobile::framework::Tensor *input,
}
}
template <typename T>
T *CreateInput(Tensor *input, DDim dims, T low, T up) {
SetupTensor<T>(input, dims, static_cast<float>(low), static_cast<float>(up));
return input->data<T>();
}
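// CreateInput fills the tensor and hands back its raw buffer in one call, so
// a test can keep the pointer for later element-wise checks, e.g.:
//   Tensor input1;
//   auto *input1_data = CreateInput<float>(&input1, {3, 4, 2, 2}, 0, 1);
// The pointer aliases the tensor's memory and is only valid while the tensor
// keeps that allocation.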
template <typename T>
void GetInput(const std::string &input_name, std::vector<T> *input,
const std::vector<int64_t> &dims) {
......