Commit 9729edac — authored December 15, 2018 by hjchen2
Support feed multi inputs and fetch multi outputs
Parent: f20c9041

Showing 24 changed files with 634 additions and 587 deletions (+634 −587)
src/framework/executor.cpp            +207 −290
src/framework/executor.h               +38  −49
src/framework/loader.cpp               +30  −40
src/framework/loader.h                 +21  −21
src/framework/lod_tensor.h             +27   −3
src/framework/program/program.h         +2   −3
src/framework/scope.h                   +1   −0
src/framework/tensor.h                  +0   −1
src/io/api_paddle_mobile.cc            +22  −22
src/io/api_paddle_mobile.h              +2   −2
src/io/ios_io/PaddleMobileCPU.mm        +4   −3
src/io/jni/paddle_mobile_jni.cpp       +14   −7
src/io/paddle_mobile.cpp              +108  −77
src/io/paddle_mobile.h                 +32  −22
src/io/paddle_test_inference_api.cpp    +9   −7
src/io/paddle_test_inference_api.h      +4   −1
test/CMakeLists.txt                     +4   −1
test/executor_for_test.h               +20  −27
test/net/test_benchmark.cpp             +3   −2
test/net/test_eng.cpp                   +2   −2
test/net/test_googlenet.cpp             +2   −2
test/net/test_nlp.cpp                   +4   −4
test/net/test_ocr.cpp                  +77   −0
tools/pre-commit.hooks/cpplint.hook     +1   −1
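Taken together, this commit replaces the old single-tensor Predict/PredictLod entry points with named inputs and outputs. A minimal caller-side sketch of the new flow, assuming a model directory ./mobilenet and graph variables named "image" and "scores" (all three names are illustrative, not taken from this commit):

    using namespace paddle_mobile;

    framework::Loader<CPU, float> loader;
    auto program = loader.Load("./mobilenet", /*optimize=*/true);
    framework::Executor<CPU, float> executor(program, /*batch_size=*/1,
                                             /*use_optimize=*/true,
                                             /*lod_mode=*/false);

    framework::Tensor image;  // filled by the caller
    // Feed any number of inputs, each addressed by its variable name ...
    std::vector<std::pair<std::string, framework::Tensor>> inputs = {
        {"image", image}};
    if (executor.Predict(inputs) == PMSuccess) {
      // ... and fetch any number of outputs the same way.
      auto scores = executor.GetOutput("scores");
    }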
src/framework/executor.cpp  (+207 −290)

@@ -28,11 +28,6 @@
 #include "framework/tensor.h"
 #include "memory/t_malloc.h"
-#ifdef PADDLE_EXECUTOR_MULTITHREAD
-#include <queue>
-#include "common/threadpool.h"
-#endif
 #ifdef PADDLE_MOBILE_CL
 #include "framework/cl/cl_image.h"
 #endif

@@ -40,66 +35,67 @@
 namespace paddle_mobile {
 namespace framework {

 using framework::Variable;

 #pragma mark - executor

-template <typename Dtype, Precision P>
-Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
-                             const bool use_optimize, const bool loddable)
-    : program_(p),
+template <typename Device, typename T>
+Executor<Device, T>::Executor(const Program<Device> &program, int batch_size,
+                              const bool use_optimize, const bool lod_mode)
+    : program_(program),
       batch_size_(batch_size),
       use_optimize_(use_optimize),
-      loddable_(loddable) {
+      lod_mode_(lod_mode) {
+  DLOG << "executor in lod mode: " << lod_mode_;
   Variable *variable_ptr = program_.scope->Var("batch_size");
   variable_ptr->SetValue<int>(batch_size);
-  to_predict_program_ =
+  program_desc_ =
       use_optimize_ ? program_.optimizeProgram : program_.originProgram;
-  PADDLE_MOBILE_ENFORCE(to_predict_program_ != nullptr,
-                        "to_predict_program_ == NULL!");
-  const std::vector<std::shared_ptr<framework::BlockDesc>> &blocks =
-      to_predict_program_->Blocks();
-  DLOG << "executor in loaddable mode: " << loddable_;
+  PADDLE_MOBILE_ENFORCE(program_desc_ != nullptr,
+                        "program_desc_ should not be nullptr");
+  const auto &blocks = program_desc_->Blocks();
+  ops_of_block_.resize(blocks.size());
   for (int i = 0; i < blocks.size(); ++i) {
-    std::shared_ptr<framework::BlockDesc> block_desc = blocks[i];
-    std::vector<std::shared_ptr<framework::OpDesc>> ops = block_desc->Ops();
+    std::shared_ptr<BlockDesc> block_desc = blocks[i];
+    std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
     for (int j = 0; j < ops.size(); ++j) {
-      std::shared_ptr<framework::OpDesc> op = ops[j];
-      DLOG << "create op: " << op->Type();
-      auto op_base = framework::OpRegistry<Dtype>::CreateOp(
-          op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(),
-          program_.scope);
-      // infer shape to reshape tensor before predict,
-      // but for lod tensor, it will still need to reshape in runtime
-      if (!loddable_) {
-        op_base->InferShape();
+      std::shared_ptr<OpDesc> op_desc = ops[j];
+      DLOG << "create op: " << op_desc->Type();
+      auto op_handler = OpRegistry<Device>::CreateOp(
+          op_desc->Type(), op_desc->GetInputs(), op_desc->GetOutputs(),
+          op_desc->GetAttrMap(), program_.scope);
+      // infer shape to reshape inputs and outputs before predict,
+      // but for lod mode, it still need to infer shape in runtime
+      if (!lod_mode) {
+        op_handler->InferShape();
       }
-      ops_of_block_[*block_desc.get()].push_back(op_base);
+      ops_of_block_[i].push_back(op_handler);
     }
   }
   if (program_.combined) {
     InitCombineMemory();
   } else {
     InitMemory();
   }
-  std::shared_ptr<framework::BlockDesc> to_predict_block =
-      to_predict_program_->Block(0);
-  int i = 0;
-  auto &ops = ops_of_block_[*to_predict_block.get()];
-  for (const auto &op : ops) {
-    DLOG << "Initialize op[" << i++ << "]: " << op->Type();
-    op->Init();
+  int count = 0;
+  for (int block_id = 0; block_id < ops_of_block_.size(); ++block_id) {
+    for (auto &op_handler : ops_of_block_[block_id]) {
+      DLOG << "Initialize op[" << count++ << "]: " << op_handler->Type();
+      op_handler->Init();
+      ops_list_.push_back(op_handler);
+    }
   }
 }

-template <typename Dtype>
-static void LoadMemInternal(void **data, framework::LoDTensor *tensor,
+template <typename Device>
+static void LoadMemInternal(void **data, LoDTensor *tensor,
                             bool quant_uint8 = false) {
   char **data_buf = reinterpret_cast<char **>(data);
   int64_t size = tensor->numel();
-  Dtype *tensor_data = tensor->mutable_data<Dtype>();
+  Device *tensor_data = tensor->mutable_data<Device>();
   if (quant_uint8) {
     // should be moved into operator init function
     float min_value;
...

@@ -114,15 +110,15 @@ static void LoadMemInternal(void **data, framework::LoDTensor *tensor,
     }
     data_buf += size * sizeof(uint8_t);
   } else {
-    memory::Copy(tensor_data, *data_buf, size * sizeof(Dtype));
-    *data_buf += size * sizeof(Dtype);
+    memory::Copy(tensor_data, *data_buf, size * sizeof(Device));
+    *data_buf += size * sizeof(Device);
   }
 }

-template <typename Dtype, Precision P>
-void Executor<Dtype, P>::LoadMemory(
-    void **data, const std::shared_ptr<framework::VarDesc> var_desc,
-    framework::LoDTensor *tensor) {
+template <typename Device, typename T>
+void Executor<Device, T>::LoadMemory(
+    void **data, const std::shared_ptr<VarDesc> var_desc, LoDTensor *tensor) {
   char **data_buf = reinterpret_cast<char **>(data);
   // version
   uint32_t version = *(reinterpret_cast<uint32_t *>(*data_buf));
...

@@ -152,18 +148,18 @@ void Executor<Dtype, P>::LoadMemory(
   // skip tensor desc
   *data_buf += tensor_desc_size;
-  const framework::TensorDesc &tensor_desc = var_desc->Tensor_desc();
-  tensor->Resize(framework::make_ddim(tensor_desc.Dims()));
+  const TensorDesc &tensor_desc = var_desc->Tensor_desc();
+  tensor->Resize(make_ddim(tensor_desc.Dims()));
   // parse tensor from stream
   switch (tensor_desc.DataType()) {
-    case framework::VARTYPE_TYPE_FP32:
+    case VARTYPE_TYPE_FP32:
       LoadMemInternal<float>(reinterpret_cast<void **>(data_buf), tensor,
                              program_.quantification);
       break;
-    case framework::VARTYPE_TYPE_INT8:
+    case VARTYPE_TYPE_INT8:
       LoadMemInternal<int8_t>(reinterpret_cast<void **>(data_buf), tensor);
       break;
-    case framework::VARTYPE_TYPE_INT32:
+    case VARTYPE_TYPE_INT32:
       LoadMemInternal<int>(reinterpret_cast<void **>(data_buf), tensor);
       break;
     default:
...

@@ -171,12 +167,12 @@ void Executor<Dtype, P>::LoadMemory(
   }
 }

-template <typename Dtype, Precision P>
-void Executor<Dtype, P>::InitMemory() {
-  for (const auto &block : to_predict_program_->Blocks()) {
+template <typename Device, typename T>
+void Executor<Device, T>::InitMemory() {
+  for (const auto &block : program_desc_->Blocks()) {
     for (const auto &var_desc : block->Vars()) {
       auto var = program_.scope->Var(var_desc->Name());
-      auto tensor = var->template GetMutable<framework::LoDTensor>();
+      auto tensor = var->template GetMutable<LoDTensor>();
       if (var_desc->Persistable()) {
         if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
           continue;
...

@@ -187,7 +183,7 @@ void Executor<Dtype, P>::InitMemory() {
         LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
         delete[] origin_data;
       } else {
-        if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
+        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
           varInputMemory(var_desc, var, tensor);
         }
       }
...

@@ -195,8 +191,8 @@ void Executor<Dtype, P>::InitMemory() {
   }
 }

-template <typename Dtype, Precision P>
-void Executor<Dtype, P>::InitCombineMemory() {
+template <typename Device, typename T>
+void Executor<Device, T>::InitCombineMemory() {
   char *origin_data = nullptr;
   bool self_alloc = false;
   if (program_.combined_params_buf && program_.combined_params_len) {
...

@@ -208,17 +204,17 @@ void Executor<Dtype, P>::InitCombineMemory() {
   }
   PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "data == nullptr");
   char *data = origin_data;
-  for (const auto &block : to_predict_program_->Blocks()) {
+  for (const auto &block : program_desc_->Blocks()) {
     for (const auto &var_desc : block->Vars()) {
       auto var = program_.scope->Var(var_desc->Name());
-      auto tensor = var->template GetMutable<framework::LoDTensor>();
+      auto tensor = var->template GetMutable<LoDTensor>();
       if (var_desc->Persistable()) {
         if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
           continue;
         }
         LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
       } else {
-        if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
+        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
           varInputMemory(var_desc, var, tensor);
         }
       }
...

@@ -230,168 +226,132 @@ void Executor<Dtype, P>::InitCombineMemory() {
   LOG(kLOG_INFO) << "init combine memory finish";
 }

-template <typename Dtype, Precision P>
-bool Executor<Dtype, P>::varInputMemory(
-    const std::shared_ptr<framework::VarDesc> &var_desc, Variable *var,
-    framework::LoDTensor *tensor) const {
+template <typename Device, typename T>
+bool Executor<Device, T>::varInputMemory(
+    const std::shared_ptr<VarDesc> &var_desc, Variable *var,
+    LoDTensor *tensor) const {
   auto type = var_desc->Tensor_desc().DataType();
   switch (type) {
-    case framework::VARTYPE_TYPE_FP32:
+    case VARTYPE_TYPE_FP32:
       tensor->mutable_data<float>();
       break;
-    case framework::VARTYPE_TYPE_INT8:
+    case VARTYPE_TYPE_INT8:
       tensor->mutable_data<int8_t>();
       break;
-    case framework::VARTYPE_TYPE_INT32:
+    case VARTYPE_TYPE_INT32:
       tensor->mutable_data<int32_t>();
       break;
-    case framework::VARTYPE_TYPE_INT64:
+    case VARTYPE_TYPE_INT64:
       tensor->mutable_data<int64_t>();
       break;
     default:
       break;
   }
-  bool is_mute_match = (type == framework::VARTYPE_TYPE_FP32) ||
-                       (type == framework::VARTYPE_TYPE_INT8) ||
-                       (type == framework::VARTYPE_TYPE_INT32) ||
-                       (type == framework::VARTYPE_TYPE_INT64);
+  bool is_mute_match =
+      (type == VARTYPE_TYPE_FP32) || (type == VARTYPE_TYPE_INT8) ||
+      (type == VARTYPE_TYPE_INT32) || (type == VARTYPE_TYPE_INT64);
   PADDLE_MOBILE_ENFORCE(is_mute_match, "got unhandled data type : %d", type);
   return is_mute_match;
 }

-template <typename Dtype, Precision P>
-std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
-    const framework::Tensor &t) {
-  framework::Variable *g_feed_value = program_.scope->Var("feed");
-  framework::Tensor *feed_tensor =
-      g_feed_value->GetMutable<framework::LoDTensor>();
-  feed_tensor->Resize(t.dims());
-  feed_tensor->ShareDataWith(t);
-  std::shared_ptr<framework::BlockDesc> to_predict_block =
-      to_predict_program_->Block(0);
-  auto &ops = ops_of_block_[*to_predict_block.get()];
-  ... (ran every op of block 0, calling InferShape() first when loddable_,
-       collected per-op timings under PADDLE_MOBILE_PROFILE, printed the
-       profile table with printf, and returned a copy of the last op's
-       first output tensor)
-  return std::make_shared<framework::Tensor>(framework::Tensor(*output_tensor));
-}
-
-template <typename Dtype, Precision P>
-std::shared_ptr<framework::LoDTensor> Executor<Dtype, P>::PredictLod(
-    const framework::LoDTensor &t) {
-  framework::Variable *g_feed_value = program_.scope->Var("feed");
-  framework::LoDTensor *feed_tensor =
-      g_feed_value->GetMutable<framework::LoDTensor>();
-  feed_tensor->Resize(t.dims());
-  feed_tensor->ShareDataWith(t);
-  feed_tensor->set_lod(t.lod());
-  ... (same flow as the removed Predict(const Tensor &), plus set_lod)
-  return std::make_shared<framework::LoDTensor>(
-      framework::LoDTensor(*output_tensor));
-}
+template <typename Device, typename T>
+PMStatus Executor<Device, T>::Predict(
+    const std::vector<std::pair<std::string, Tensor>> &inputs) {
+  for (const auto &input : inputs) {
+    SetInput(input.second, input.first);
+  }
+  return this->Predict();
+}
+
+template <typename Device, typename T>
+PMStatus Executor<Device, T>::Predict(
+    const std::vector<std::pair<std::string, LoDTensor>> &inputs) {
+  for (const auto &input : inputs) {
+    SetInput(input.second, input.first);
+  }
+  return this->Predict();
+}
+
+template <typename Device, typename T>
+std::vector<T> Executor<Device, T>::Predict(const std::vector<T> &input,
+                                            const std::vector<int64_t> &dims) {
+  Tensor feed_tensor(input, make_ddim(dims));
+  SetInput(feed_tensor, "feed");
+  std::vector<T> output;
+  if (this->Predict() == PMSuccess) {
+    const auto output_tensor = GetOutput("fetch");
+    output.resize(output_tensor->numel());
+    memcpy(output.data(), output_tensor->template data<T>(),
+           output.size() * sizeof(T));
+  }
+  return output;
+}
+
+template <typename Device, typename T>
+void Executor<Device, T>::SetInput(const Tensor &input,
+                                   const std::string &var_name) {
+  auto *target_var = program_.scope->FindVar(var_name);
+  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s is not exist",
+                        var_name.c_str());
+  auto *target_tensor = target_var->template GetMutable<LoDTensor>();
+  target_tensor->Resize(input.dims());
+  target_tensor->ShareDataWith(input);
+}
+
+template <typename Device, typename T>
+void Executor<Device, T>::SetInput(const LoDTensor &input,
+                                   const std::string &var_name) {
+  auto *target_var = program_.scope->FindVar(var_name);
+  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s is not exist",
+                        var_name.c_str());
+  auto *target_tensor = target_var->template GetMutable<LoDTensor>();
+  target_tensor->Resize(input.dims());
+  target_tensor->ShareDataWith(input);
+  target_tensor->set_lod(input.lod());
+}
+
+template <typename Device, typename T>
+PMStatus Executor<Device, T>::Predict() {
 #ifdef PADDLE_MOBILE_PROFILE
-  std::vector<ProfInfo> profile(ops.size());
+  std::vector<ProfInfo> profile(ops_list_.size());
+  struct timespec ts;
+  int op_index = 0;
 #endif
-  for (int i = 0; i < ops.size(); i++) {
+  for (auto &block : ops_of_block_) {
+    for (auto &op_handler : block) {
 #ifdef PADDLE_MOBILE_PROFILE
-    struct timespec ts;
-    clock_gettime(CLOCK_MONOTONIC, &ts);
-    profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
+      clock_gettime(CLOCK_MONOTONIC, &ts);
+      profile[op_index].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
 #endif
-    if (loddable_) {
-      ops[i]->InferShape();
-    }
-    ops[i]->Run();
+      if (lod_mode_) {
+        op_handler->InferShape();
+      }
+      op_handler->Run();
 #ifdef PADDLE_MOBILE_PROFILE
-    clock_gettime(CLOCK_MONOTONIC, &ts);
-    profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
+      clock_gettime(CLOCK_MONOTONIC, &ts);
+      profile[op_index].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
+      ++op_index;
 #endif
+    }
   }
-  auto last_op = ops.rbegin();
-  auto output_map = (*last_op)->Outputs();
-  std::vector<std::string> out_keys = (*last_op)->GetOutKeys();
-  PADDLE_MOBILE_ENFORCE(out_keys.size() > 0, "the last op contains no output");
-  framework::LoDTensor *output_tensor =
-      framework::GetVarValue<framework::LoDTensor>(out_keys[0], output_map,
-                                                   *(program_.scope));
 #ifdef PADDLE_MOBILE_PROFILE
   std::unordered_map<std::string, uint64_t> _tp;
   for (int i = 0; i < profile.size(); i++) {
     const auto &pInfo = profile[i];
     uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
-    if (ops[i]->Type() == "conv2d") {
-      auto inputs = ops[i]->Inputs();
-      auto input_keys = ops[i]->GetInputKeys();
-      auto *filter = framework::GetVarValue<framework::LoDTensor>(
-          input_keys[1], inputs, *(program_.scope));
-      int kernel_size = filter->dims()[2];
-      printf("kernel size: %d\n", kernel_size);
-      _tp[ops[i]->Type() + "_" + std::to_string(kernel_size)] += timeCost;
+    if (ops_list_[i]->Type() == "conv2d" ||
+        ops_list_[i]->Type() == "depthwise_conv2d") {
+      auto inputs = ops_list_[i]->Inputs();
+      auto *filter =
+          GetVarValue<LoDTensor>("Filter", inputs, *(program_.scope));
+      int kernel_size = filter->dims()[2];
+      _tp[ops_list_[i]->Type() + "_" + std::to_string(kernel_size)] += timeCost;
     } else {
-      _tp[ops[i]->Type()] += timeCost;
+      _tp[ops_list_[i]->Type()] += timeCost;
     }
   }
-  printf("====================[ profile ]======================\n");
-  using prof_t = std::pair<std::string, uint64_t>;
+  DLOG << "====================[ profile ]======================";
+  typedef std::pair<std::string, uint64_t> prof_t;
   std::vector<prof_t> _tv(_tp.begin(), _tp.end());
   uint64_t _ptotal = 0;
   for (auto const &p : _tv) {
...

@@ -407,57 +367,39 @@
           static_cast<float>(p.second),
           static_cast<float>(p.second) / _ptotal * 100.0);
   }
-  printf("====================[---------]======================\n");
+  DLOG << "====================[---------]======================";
 #endif
-  return std::make_shared<framework::LoDTensor>(
-      framework::LoDTensor(*output_tensor));
+  return PMSuccess;
 }

-template <typename Dtype, Precision P>
-std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
-    const framework::Tensor &t, int block_id) {
-  return Predict(t);
-}
-
-template <typename Dtype, Precision P>
-std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
-    const std::vector<Ptype> &input, const std::vector<int64_t> &dims) {
-  framework::Tensor tensor(input, framework::make_ddim(dims));
-  std::shared_ptr<framework::Tensor> output_tensor = Predict(tensor, 0);
-  if (output_tensor != nullptr) {
-    Executor<Dtype, P>::Ptype *output_ptr =
-        output_tensor->data<typename Executor<Dtype, P>::Ptype>();
-    std::vector<typename Executor<Dtype, P>::Ptype> result_vector;
-    for (int j = 0; j < output_tensor->numel(); ++j) {
-      result_vector.push_back(output_ptr[j]);
-    }
-    return result_vector;
-  } else {
-    DLOG << "return empty vector";
-    return {};
-  }
-}
+template <typename Device, typename T>
+std::shared_ptr<LoDTensor> Executor<Device, T>::GetOutput(
+    const std::string &var_name) {
+  auto *target_var = program_.scope->FindVar(var_name);
+  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s is not exist",
+                        var_name.c_str());
+  auto *output_tensor = target_var->template GetMutable<LoDTensor>();
+  return std::make_shared<LoDTensor>(*output_tensor);
+}

 #ifdef PADDLE_MOBILE_FPGA
-template <typename Dtype, Precision P>
-void Executor<Dtype, P>::InjectVariable(const framework::Tensor &t,
-                                        std::string var_name) {
-  framework::Variable *g_feed_value = program_.scope->Var(var_name);
-  framework::Tensor *feed_tensor =
-      g_feed_value->GetMutable<framework::LoDTensor>();
+template <typename Device, typename T>
+void Executor<Device, T>::InjectVariable(const Tensor &t,
+                                         std::string var_name) {
+  Variable *g_feed_value = program_.scope->Var(var_name);
+  Tensor *feed_tensor = g_feed_value->GetMutable<LoDTensor>();
   feed_tensor->Resize(t.dims());
   feed_tensor->ShareDataWith(t);
 }

-template <typename Dtype, Precision P>
-void Executor<Dtype, P>::FeedData(const framework::Tensor &t) {
+template <typename Device, typename T>
+void Executor<Device, T>::FeedData(const Tensor &t) {
   InjectVariable(t, "feed");
 }

-template <typename Dtype, Precision P>
-std::shared_ptr<framework::Tensor> Executor<Dtype, P>::FetchResult(int id) {
-  std::shared_ptr<framework::BlockDesc> to_predict_block =
-      to_predict_program_->Block(0);
+template <typename Device, typename T>
+std::shared_ptr<Tensor> Executor<Device, T>::FetchResult(int id) {
+  std::shared_ptr<BlockDesc> to_predict_block = program_desc_->Block(0);
   auto &ops = ops_of_block_[*to_predict_block.get()];
   PADDLE_MOBILE_ENFORCE(id < (int)ops.size(), "Index out of range");
...

@@ -465,15 +407,14 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::FetchResult(int id) {
   auto output_map = op->Outputs();
   std::vector<std::string> out_keys = op->GetOutKeys();
   PADDLE_MOBILE_ENFORCE(!out_keys.empty(), "this op contains no output");
-  auto *output_tensor = framework::GetVarValue<framework::LoDTensor>(
-      out_keys[0], output_map, *(program_.scope));
-  return std::make_shared<framework::Tensor>(
-      framework::Tensor(*output_tensor));
+  auto *output_tensor =
+      GetVarValue<LoDTensor>(out_keys[0], output_map, *(program_.scope));
+  return std::make_shared<Tensor>(Tensor(*output_tensor));
 }

-template <typename Dtype, Precision P>
-void Executor<Dtype, P>::Predict_From_To(int start, int end) {
-  std::shared_ptr<framework::BlockDesc> to_predict_block =
-      to_predict_program_->Block(0);
+template <typename Device, typename T>
+void Executor<Device, T>::Predict_From_To(int start, int end) {
+  std::shared_ptr<BlockDesc> to_predict_block = program_desc_->Block(0);
   auto &ops = ops_of_block_[*to_predict_block.get()];
   end = end < 0 ? static_cast<int>(ops.size()) : end;
   PADDLE_MOBILE_ENFORCE(start >= 0 && start < end && end <= ops.size(),
...

@@ -498,25 +439,26 @@ void Executor<Dtype, P>::Predict_From_To(int start, int end) {
   }
 }

-template <typename Dtype, Precision P>
-void Executor<Dtype, P>::Predict_From(int start) {
+template <typename Device, typename T>
+void Executor<Device, T>::Predict_From(int start) {
   Predict_From_To(start);
 }

-template <typename Dtype, Precision P>
-void Executor<Dtype, P>::Predict_To(int end) {
+template <typename Device, typename T>
+void Executor<Device, T>::Predict_To(int end) {
   Predict_From_To(0, end);
 }
 #endif

 #ifdef PADDLE_MOBILE_CL
-template <typename Dtype, Precision P>
-void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
-                                    float *tensorInput, char **data) {}
+template <typename Device, typename T>
+void Executor<Device, T>::LoadMemory(const VarDesc var_desc,
+                                     float *tensorInput, char **data) {}

 template <>
-void Executor<GPU_CL, Precision::FP32>::LoadMemory(
-    const framework::VarDesc var_desc, float *tensorInput, char **data) {
+void Executor<GPU_CL, Precision::FP32>::LoadMemory(const VarDesc var_desc,
+                                                   float *tensorInput,
+                                                   char **data) {
   // 1. version
   uint32_t version = *reinterpret_cast<uint32_t *>(*data);
...

@@ -554,38 +496,13 @@ void Executor<GPU_CL, Precision::FP32>::LoadMemory(
   }
   (*data) += (sizeof(char) * size);

-  const framework::TensorDesc &desc = var_desc.Tensor_desc();
+  const TensorDesc &desc = var_desc.Tensor_desc();
   int memory_size = 1;
   for (auto l : desc.Dims()) {
     memory_size *= l;
   }
   void *memory = nullptr;
-  (a ~25-line commented-out switch over desc.DataType() computing type_size
-   for FP16/FP32/FP64/INT32/INT64/BOOL is deleted here)
   int type_size = 4;
   memory = tensorInput;
   if (program_.quantification) {
...

@@ -616,24 +533,24 @@
 }

 template <>
-void Executor<GPU_CL, Precision::FP32>::InitMemory() {
-  for (const auto &block : to_predict_program_->Blocks()) {
+void Executor<GPU_CL, float>::InitMemory() {
+  for (const auto &block : program_desc_->Blocks()) {
     for (const auto &var_desc : block->Vars()) {
       auto var = program_.scope->Var(var_desc->Name());
       if (var_desc->Persistable()) {
         CLImage *cl_image = nullptr;
         if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
-          var->template GetMutable<framework::LoDTensor>();
+          var->template GetMutable<LoDTensor>();
           continue;
         } else {
-          cl_image = var->template GetMutable<framework::CLImage>();
+          cl_image = var->template GetMutable<CLImage>();
         }
         char *origin_data =
             ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
         char *data = origin_data;
         cl_context context = program_.scope->GetCLScpoe()->Context();
-        const framework::TensorDesc &desc = var_desc->Tensor_desc();
+        const TensorDesc &desc = var_desc->Tensor_desc();
         int numel = 1;
         for (auto l : desc.Dims()) {
           numel *= l;
...

@@ -643,7 +560,7 @@ void Executor<GPU_CL, Precision::FP32>::InitMemory() {
             paddle_mobile::memory::Alloc(sizeof(float) * numel));
         LoadMemory(*var_desc, tensorInput, &data);
-        framework::DDim ddim = framework::make_ddim(desc.Dims());
+        DDim ddim = make_ddim(desc.Dims());
         // has not init
         cl_image->SetTensorData(tensorInput, ddim);
...

@@ -651,15 +568,15 @@ void Executor<GPU_CL, Precision::FP32>::InitMemory() {
         delete origin_data;
         paddle_mobile::memory::Free(tensorInput);
       } else {
-        if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
-          auto cl_image = var->template GetMutable<framework::CLImage>();
+        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
+          auto cl_image = var->template GetMutable<CLImage>();
           cl_context context = program_.scope->GetCLScpoe()->Context();
           cl_command_queue command_queue =
               program_.scope->GetCLScpoe()->CommandQueue();
-          const framework::TensorDesc &desc = var_desc->Tensor_desc();
-          // framework::DDim ddim = framework::make_ddim(desc.Dims());
-          framework::DDim ddim = cl_image->dims();
+          const TensorDesc &desc = var_desc->Tensor_desc();
+          // DDim ddim = make_ddim(desc.Dims());
+          DDim ddim = cl_image->dims();
           DLOG << var_desc->Name();
           cl_image->InitEmptyImage(context, command_queue, ddim);
         }
...

@@ -669,7 +586,7 @@ void Executor<GPU_CL, Precision::FP32>::InitMemory() {
 }

 template <>
-void Executor<GPU_CL, Precision::FP32>::InitCombineMemory() {
+void Executor<GPU_CL, float>::InitCombineMemory() {
   char *origin_data = nullptr;
   bool self_alloc = false;
   if (program_.combined_params_buf && program_.combined_params_len) {
...

@@ -683,22 +600,22 @@ void Executor<GPU_CL, Precision::FP32>::InitCombineMemory() {
   PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "origin_data==nullptr!!!");
   float *data = reinterpret_cast<float *>(origin_data);

-  for (const auto &block : to_predict_program_->Blocks()) {
+  for (const auto &block : program_desc_->Blocks()) {
     for (const auto &var_desc : block->Vars()) {
       auto var = program_.scope->Var(var_desc->Name());
       if (var_desc->Persistable()) {
         CLImage *cl_image = nullptr;
         if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
-          var->template GetMutable<framework::LoDTensor>();
+          var->template GetMutable<LoDTensor>();
           continue;
         } else {
-          cl_image = var->template GetMutable<framework::CLImage>();
+          cl_image = var->template GetMutable<CLImage>();
         }
         cl_context context = program_.scope->GetCLScpoe()->Context();
-        const framework::TensorDesc &desc = var_desc->Tensor_desc();
-        framework::DDim ddim = framework::make_ddim(desc.Dims());
+        const TensorDesc &desc = var_desc->Tensor_desc();
+        DDim ddim = make_ddim(desc.Dims());
         int numel = 1;
         for (int i = 0; i < ddim.size(); i++) {
...

@@ -713,13 +630,13 @@ void Executor<GPU_CL, Precision::FP32>::InitCombineMemory() {
         paddle_mobile::memory::Free(tensorInput);
       } else {
-        auto cl_image = var->template GetMutable<framework::CLImage>();
+        auto cl_image = var->template GetMutable<CLImage>();
         cl_context context = program_.scope->GetCLScpoe()->Context();
         cl_command_queue command_queue =
             program_.scope->GetCLScpoe()->CommandQueue();
-        const framework::TensorDesc &desc = var_desc->Tensor_desc();
-        framework::DDim ddim = cl_image->dims();
-        // framework::DDim ddim = framework::make_ddim(desc.Dims());
+        const TensorDesc &desc = var_desc->Tensor_desc();
+        DDim ddim = cl_image->dims();
+        // DDim ddim = make_ddim(desc.Dims());
         cl_image->InitEmptyImage(context, command_queue, ddim);
       }
     }
...

@@ -732,13 +649,13 @@ void Executor<GPU_CL, Precision::FP32>::InitCombineMemory() {
 #endif

-template class Executor<CPU, Precision::FP32>;
-template class Executor<FPGA, Precision::FP32>;
-template class Executor<GPU_CL, Precision::FP32>;
-template class Executor<GPU_MALI, Precision::FP32>;
+template class Executor<CPU, float>;
+template class Executor<FPGA, float>;
+template class Executor<GPU_CL, float>;
+template class Executor<GPU_MALI, float>;

 }  // namespace framework
 }  // namespace paddle_mobile
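For callers that only have one flattened input, executor.cpp keeps a convenience overload: Predict(const std::vector<T> &input, const std::vector<int64_t> &dims) wraps the data in a Tensor, feeds it to the "feed" variable, runs Predict(), and copies the "fetch" output back into a std::vector<T>. A hedged caller-side sketch, given an executor constructed as in the earlier example (the input values and shape are made up for illustration):

    // Illustrative only: a 1x3x224x224 float input flattened into a vector.
    std::vector<float> input(1 * 3 * 224 * 224, 0.5f);
    std::vector<int64_t> dims = {1, 3, 224, 224};

    // Internally: Tensor feed(input, make_ddim(dims)); SetInput(feed, "feed");
    // then Predict(); then GetOutput("fetch") copied into the returned vector.
    std::vector<float> output = executor.Predict(input, dims);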
src/framework/executor.h  (+38 −49)

@@ -17,6 +17,7 @@
 #include <map>
 #include <memory>
 #include <string>
+#include <utility>
 #include <vector>
 #include "common/types.h"
 #include "common/util.h"
...

@@ -28,41 +29,29 @@
 namespace paddle_mobile {
 namespace framework {

-template <typename Dtype = CPU, Precision P = Precision::FP32>
+template <typename Device, typename T = float>
 class Executor {
  public:
-  typedef typename PrecisionTrait<P>::ptype Ptype;
-
-  // exector constructor
-  // @param program program converted from proto program in PaddlePaddle
-  // @param use_optimize bool whether use operator fusion to speed up or not
-  // @param loddable bool
-  Executor(const framework::Program<Dtype> program, int batch_size = 1,
-           const bool use_optimize = true, const bool loddable = false);
-
-  // predict with tensor input
-  // @param t input tensor to do prediction
-  // @return predicted tensor
-  std::shared_ptr<framework::Tensor> Predict(const framework::Tensor &t);
-
-  // predict with lod tensor input
-  // @param t input lod tensor to do prediction
-  // @return predicted lod tensor
-  std::shared_ptr<framework::LoDTensor> PredictLod(
-      const framework::LoDTensor &t);
-
-  // predict with vector input and dims
-  // @param input vector whose elements will be formed into the input tensor
-  // @param dims the input tensor shape
-  // @return vector which is flatted from predicted tensor
-  std::vector<Ptype> Predict(const std::vector<Ptype> &input,
-                             const std::vector<int64_t> &dims);
+  Executor(const Program<Device> &program, int batch_size = 1,
+           const bool use_optimize = true, const bool lod_mode = false);
+
+  PMStatus Predict(const std::vector<std::pair<std::string, Tensor>> &inputs);
+  PMStatus Predict(
+      const std::vector<std::pair<std::string, LoDTensor>> &inputs);
+
+  std::vector<T> Predict(const std::vector<T> &input,
+                         const std::vector<int64_t> &dims);
+  PMStatus Predict();
+
+  void SetInput(const Tensor &input, const std::string &var_name);
+  void SetInput(const LoDTensor &input, const std::string &var_name);
+  std::shared_ptr<LoDTensor> GetOutput(const std::string &var_name);

 #ifdef PADDLE_MOBILE_FPGA
-  void InjectVariable(const framework::Tensor &t, std::string var_name);
-  void FeedData(const framework::Tensor &t);
-  std::shared_ptr<framework::Tensor> FetchResult(int id = -1);
+  void InjectVariable(const Tensor &t, std::string var_name);
+  void FeedData(const Tensor &t);
+  std::shared_ptr<Tensor> FetchResult(int id = -1);
   void Predict_From_To(int start = 0, int end = -1);
   void Predict_From(int start);
   void Predict_To(int end);
...

@@ -70,26 +59,28 @@ class Executor {
  protected:
   Executor() = default;
-  std::shared_ptr<framework::Tensor> Predict(const framework::Tensor &t,
-                                             int block_id);
-  bool varInputMemory(const std::shared_ptr<framework::VarDesc> &var_desc,
-                      framework::Variable *var,
-                      framework::LoDTensor *tensor) const;
+  bool varInputMemory(const std::shared_ptr<VarDesc> &var_desc, Variable *var,
+                      LoDTensor *tensor) const;
   void InitMemory();
   void InitCombineMemory();
-  void LoadMemory(void **data,
-                  const std::shared_ptr<framework::VarDesc> var_desc,
-                  framework::LoDTensor *tensor);
+  void LoadMemory(void **data, const std::shared_ptr<VarDesc> var_desc,
+                  LoDTensor *tensor);
 #ifdef PADDLE_MOBILE_CL
-  void LoadMemory(const framework::VarDesc var_desc, float *tensorInput,
-                  char **data);
+  void LoadMemory(const VarDesc var_desc, float *tensorInput, char **data);
 #endif

-  framework::Program<Dtype> program_;
-  int batch_size_ = 1;
-  std::shared_ptr<framework::ProgramDesc> to_predict_program_;
-  std::map<framework::BlockDesc,
-           std::vector<std::shared_ptr<framework::OperatorBase<Dtype>>>>
-      ops_of_block_;
+  int batch_size_;
+  bool use_optimize_;
+  bool lod_mode_;
+
+  Program<Device> program_;
+  std::shared_ptr<ProgramDesc> program_desc_;
+
+  typedef std::shared_ptr<OperatorBase<Device>> OperatorBasePtr;
+  std::vector<std::vector<OperatorBasePtr>> ops_of_block_;
+  // operators list
+  std::vector<OperatorBasePtr> ops_list_;
+
 #ifdef PADDLE_MOBILE_PROFILE
   struct ProfInfo {
     int tid = 0;
...

@@ -97,8 +88,6 @@ class Executor {
     uint64_t runEnd = 0UL;
   };
 #endif
-
-  bool use_optimize_ = false;
-  bool loddable_ = false;
 };

 }  // namespace framework
...
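The public template signature changes from a Precision enum parameter to a plain element type, so existing instantiations change spelling. A small before/after sketch of the instantiation only (the program variable is assumed to come from Loader::Load as elsewhere in this commit):

    // Before this commit:
    //   framework::Executor<CPU, Precision::FP32> executor(program);

    // After this commit (T defaults to float):
    framework::Executor<CPU, float> executor(program, /*batch_size=*/1,
                                             /*use_optimize=*/true,
                                             /*lod_mode=*/false);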
src/framework/loader.cpp  (+30 −40)

@@ -23,14 +23,8 @@
 namespace paddle_mobile {
 namespace framework {

-/**
- * muteandresize tensor as originProgramDesc and scope in loadParams
- *
- * @param originProgramDesc
- * @param scope
- */
-template <typename Dtype, Precision P>
-void Loader<Dtype, P>::InitMemoryFromProgram(
+template <typename Device, typename T>
+void Loader<Device, T>::InitMemoryFromProgram(
     const std::shared_ptr<ProgramDesc> &originProgramDesc,
     const std::shared_ptr<Scope> &scope) {
   for (const auto &block : originProgramDesc.get()->Blocks()) {
...

@@ -43,8 +37,6 @@ void Loader<Dtype, P>::InitMemoryFromProgram(
         tensor->Resize(make_ddim(dim));
       } else {
         auto dim = var_desc->Tensor_desc().Dims();
-        // PADDLE_MOBILE_ENFORCE(dim.size() > 0, "dim size is 0");
-        // dim[0] = 1;
         if (dim.size() == 0) {
           auto tensor = var->GetMutable<LoDTensor>();
           framework::DDim dDim = {0};
...

@@ -60,7 +52,7 @@ void Loader<Dtype, P>::InitMemoryFromProgram(
         }
       }
     } else {
-      // TODO(codeWorm): some.
+      // TODO(codeWorm)
     }
   }
 }
...

@@ -68,7 +60,7 @@ void Loader<Dtype, P>::InitMemoryFromProgram(
 #ifdef PADDLE_MOBILE_CL
 template <>
-void Loader<GPU_CL, Precision::FP32>::InitMemoryFromProgram(
+void Loader<GPU_CL, float>::InitMemoryFromProgram(
     const std::shared_ptr<ProgramDesc> &originProgramDesc,
     const std::shared_ptr<Scope> &scope) {
   for (const auto &block : originProgramDesc.get()->Blocks()) {
...

@@ -77,7 +69,6 @@ void Loader<GPU_CL, Precision::FP32>::InitMemoryFromProgram(
       if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
         if (var_desc->Persistable()) {
           auto dim = var_desc->Tensor_desc().Dims();
-          // auto tensor = var->GetMutable<LoDTensor>();
           auto cl_image = var->GetMutable<framework::CLImage>();
           cl_image->Resize(make_ddim(dim));
         } else {
...

@@ -88,14 +79,13 @@ void Loader<GPU_CL, Precision::FP32>::InitMemoryFromProgram(
           cl_image->Resize(make_ddim(dim));
         }
       } else {
-        // TODO(codeWorm): some.
+        // TODO(codeWorm)
       }
     }
   }
 }
 template <>
-const Program<GPU_CL, Precision::FP32>
-Loader<GPU_CL, Precision::FP32>::LoadCombinedMemory(
+const Program<GPU_CL, float> Loader<GPU_CL, float>::LoadCombinedMemory(
     size_t read_size, const uint8_t *buf, size_t combined_params_len,
     uint8_t *combined_params_buf, bool optimize, bool quantification) {
   bool can_add_split = false;
...

@@ -113,7 +103,7 @@ Loader<GPU_CL, Precision::FP32>::LoadCombinedMemory(
   auto originProgramDesc = std::make_shared<ProgramDesc>(c_program);

-  Program<GPU_CL, Precision::FP32> program;
+  Program<GPU_CL, float> program;
   program.combined = true;
   program.originProgram = originProgramDesc;
   program.quantification = quantification;
...

@@ -145,16 +135,16 @@
 /**
  * fusion and print someinfos
- * @tparam Dtype
+ * @tparam Device
  * @tparam P
  * @param optimize
  * @param can_add_split
  * @param program
  * @param originProgramDesc
  */
-template <typename Dtype, Precision P>
+template <typename Device, typename T>
 void FusionAndPrintInfos(
-    bool optimize, bool can_add_split, Program<Dtype, P> *program,
+    bool optimize, bool can_add_split, Program<Device, T> *program,
     const std::shared_ptr<ProgramDesc> &originProgramDesc) {
   if (optimize) {
     ProgramOptimize program_optimize;
...

@@ -193,22 +183,22 @@ static size_t ReadBuffer(const char *file_name, uint8_t **out) {
   return cur_len;
 }

-template <typename Dtype, Precision P>
-const Program<Dtype, P> Loader<Dtype, P>::Load(const std::string &dirname,
-                                               bool optimize,
-                                               bool quantification,
-                                               bool can_add_split) {
+template <typename Device, typename T>
+const Program<Device, T> Loader<Device, T>::Load(const std::string &dirname,
+                                                 bool optimize,
+                                                 bool quantification,
+                                                 bool can_add_split) {
   auto program = this->LoadProgram(dirname + "/__model__", optimize,
                                    quantification, can_add_split);
   program.model_path = dirname;
   return program;
 }

-template <typename Dtype, Precision P>
-const Program<Dtype, P> Loader<Dtype, P>::Load(const std::string &model_path,
-                                               const std::string &para_path,
-                                               bool optimize,
-                                               bool quantification) {
+template <typename Device, typename T>
+const Program<Device, T> Loader<Device, T>::Load(const std::string &model_path,
+                                                 const std::string &para_path,
+                                                 bool optimize,
+                                                 bool quantification) {
   auto program = this->LoadProgram(model_path, optimize, quantification);
   program.para_path = para_path;
...

@@ -217,8 +207,8 @@
   return program;
 }

-template <typename Dtype, Precision P>
-const Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
+template <typename Device, typename T>
+const Program<Device, T> Loader<Device, T>::LoadProgram(
     const std::string &model_path, bool optimize, bool quantification,
     bool can_add_split) {
   std::string model_filename = model_path;
...

@@ -237,7 +227,7 @@
   //
   auto originProgramDesc = std::make_shared<ProgramDesc>(c_program);

-  Program<Dtype, P> program;
+  Program<Device, T> program;
   program.originProgram = originProgramDesc;
   program.quantification = quantification;
   program.combined_params_len = 0;
...

@@ -254,8 +244,8 @@
   return program;
 }

-template <typename Dtype, Precision P>
-const Program<Dtype, P> Loader<Dtype, P>::LoadCombinedMemory(
+template <typename Device, typename T>
+const Program<Device, T> Loader<Device, T>::LoadCombinedMemory(
     size_t read_size, const uint8_t *buf, size_t combined_params_len,
     uint8_t *combined_params_buf, bool optimize, bool quantification) {
   bool can_add_split = false;
...

@@ -273,7 +263,7 @@
   auto originProgramDesc = std::make_shared<ProgramDesc>(c_program);

-  Program<Dtype, P> program;
+  Program<Device, T> program;
   program.combined = true;
   program.originProgram = originProgramDesc;
   program.quantification = quantification;
...

@@ -289,13 +279,13 @@
   return program;
 }

-template class Loader<CPU, Precision::FP32>;
-template class Loader<FPGA, Precision::FP32>;
-template class Loader<GPU_MALI, Precision::FP32>;
-template class Loader<GPU_CL, Precision::FP32>;
+template class Loader<CPU, float>;
+template class Loader<FPGA, float>;
+template class Loader<GPU_MALI, float>;
+template class Loader<GPU_CL, float>;

 }  // namespace framework
 }  // namespace paddle_mobile
src/framework/loader.h  (+21 −21)

@@ -22,39 +22,39 @@
 namespace paddle_mobile {
 namespace framework {

-template <typename Dtype = CPU, Precision P = Precision::FP32>
+template <typename Device = CPU, typename T = float>
 class Loader {
  public:
   /*
    * @b load separate format fluid model
-   * @b load a fluid model stored as separate files  (original comment in Chinese)
+   * @b load a fluid model whose files are stored separately  (original comment in Chinese)
    * */
-  const Program<Dtype, P> Load(const std::string &dirname,
-                               bool optimize = false,
-                               bool quantification = false,
-                               bool can_add_split = false);
+  const Program<Device, T> Load(const std::string &dirname,
+                                bool optimize = false,
+                                bool quantification = false,
+                                bool can_add_split = false);

   /*
-   * @b load combine format fluid mode
-   * @b load a fluid model stored in combined format  (original comment in Chinese)
+   * @b load combine format fluid model
+   * @b load a fluid model stored as a single combined file  (original comment in Chinese)
    * */
-  const Program<Dtype, P> Load(const std::string &model_path,
-                               const std::string &para_path,
-                               bool optimize = false,
-                               bool quantification = false);
+  const Program<Device, T> Load(const std::string &model_path,
+                                const std::string &para_path,
+                                bool optimize = false,
+                                bool quantification = false);

-  const Program<Dtype, P> LoadCombinedMemory(
+  const Program<Device, T> LoadCombinedMemory(
       size_t model_len, const uint8_t *model_buf, size_t combined_params_len,
       uint8_t *combined_params_buf, bool optimize = false,
       bool quantification = false);

  private:
-  const Program<Dtype, P> LoadProgram(const std::string &model_path,
-                                      bool optimize = false,
-                                      bool quantification = false,
-                                      bool can_add_split = false);
+  const Program<Device, T> LoadProgram(const std::string &model_path,
+                                       bool optimize = false,
+                                       bool quantification = false,
+                                       bool can_add_split = false);

   void InitMemoryFromProgram(
       const std::shared_ptr<ProgramDesc> &originProgramDesc,
...
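The two Load overloads above cover the two fluid model layouts. A short sketch of how a caller might pick between them; the paths are hypothetical:

    framework::Loader<CPU, float> loader;

    // Model stored as a directory of separate files (reads dirname + "/__model__").
    auto separate_program = loader.Load("./mobilenet", /*optimize=*/true);

    // Model topology and parameters stored as two combined files.
    auto combined_program = loader.Load("./model", "./params", /*optimize=*/true);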
src/framework/lod_tensor.h  (+27 −3)

@@ -16,12 +16,12 @@
 #include <memory>
 #include <string>
 #include <utility>
 #include <vector>
-#include "tensor.h"
-#include "tensor_util.h"
+#include "framework/tensor.h"
+#include "framework/tensor_util.h"

 namespace paddle_mobile {
 namespace framework {
...

@@ -202,5 +202,29 @@
 void DeserializeFromStream(std::istream &is, LoDTensor *tensor);

+#ifdef PADDLE_MOBILE_DEBUG
+inline Print &operator<<(Print &printer, const LoDTensor &tensor) {
+  printer << " dims: " << tensor.dims() << "\n";
+  int stride = tensor.numel() / 20;
+  stride = stride > 0 ? stride : 1;
+#ifndef PADDLE_MOBILE_FPGA
+  for (int i = 0; i < tensor.numel(); i += stride) {
+    if (tensor.type() == typeid(float)) {
+      printer << tensor.data<float>()[i] << " ";
+    } else if (tensor.type() == typeid(int32_t)) {
+      printer << tensor.data<int32_t>()[i] << " ";
+    } else if (tensor.type() == typeid(int64_t)) {
+      printer << tensor.data<int64_t>()[i] << " ";
+    } else if (tensor.type() == typeid(int8_t)) {
+      printer << static_cast<int>(tensor.data<int8_t>()[i]) << " ";
+    }
+  }
+#endif  // PADDLE_MOBILE_FPGA
+  return printer;
+}
+#endif  // PADDLE_MOBILE_DEBUG
+
 }  // namespace framework
 }  // namespace paddle_mobile
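With the new operator<< overload, a LoDTensor can be streamed directly to the project's debug logger, printing its dims plus roughly twenty evenly strided elements. A sketch, assuming DLOG yields a Print stream as it does elsewhere in this codebase and that debug builds define PADDLE_MOBILE_DEBUG:

    #ifdef PADDLE_MOBILE_DEBUG
    framework::LoDTensor t;
    // ... fill t with prediction results ...
    DLOG << t;  // dims followed by a sparse sample of element values
    #endif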
src/framework/program/program.h  (+2 −3)

@@ -14,16 +14,15 @@
 #pragma once

+#include <string>
 #include "common/types.h"
 #include "framework/program/program_desc.h"
 #include "framework/scope.h"
-#include <string>

 namespace paddle_mobile {
 namespace framework {

-template <typename Dtype, Precision P = Precision::FP32>
+template <typename Device, typename T = float>
 class Program {
  public:
   std::shared_ptr<ProgramDesc> originProgram;
...
src/framework/scope.h  (+1 −0)

@@ -26,6 +26,7 @@
 namespace paddle_mobile {
 namespace framework {

 class Scope {
  public:
   Scope() = default;
...
src/framework/tensor.h  (+0 −1)

@@ -226,7 +226,6 @@ inline Print &operator<<(Print &printer, const Tensor &tensor) {
     }
   }
 #endif
   return printer;
 }
...
src/io/api_paddle_mobile.cc  (+22 −22)

@@ -18,17 +18,17 @@
 namespace paddle_mobile {

-template <typename Dtype, Precision P>
-PaddleMobilePredictor<Dtype, P>::PaddleMobilePredictor(
+template <typename Device, typename T>
+PaddleMobilePredictor<Device, T>::PaddleMobilePredictor(
     const PaddleMobileConfig &config) {
   PADDLE_MOBILE_ENFORCE(Init(config) == true,
                         "paddle mobile predictor init failed!");
   config_ = config;
 }

-template <typename Dtype, Precision P>
-bool PaddleMobilePredictor<Dtype, P>::Init(const PaddleMobileConfig &config) {
-  paddle_mobile_.reset(new PaddleMobile<Dtype, P>());
+template <typename Device, typename T>
+bool PaddleMobilePredictor<Device, T>::Init(const PaddleMobileConfig &config) {
+  paddle_mobile_.reset(new PaddleMobile<Device, T>());
 #ifdef PADDLE_MOBILE_CL
   paddle_mobile_->SetCLPath(config.cl_path);
 #endif
...

@@ -52,8 +52,8 @@
   paddle_mobile_->SetThreadNum(config.thread_num);
   return true;
 }

-template <typename Dtype, Precision P>
-bool PaddleMobilePredictor<Dtype, P>::Run(
+template <typename Device, typename T>
+bool PaddleMobilePredictor<Device, T>::Run(
     const std::vector<PaddleTensor> &inputs,
     std::vector<PaddleTensor> *output_data, int batch_size) {
   if (inputs.empty()) {
...

@@ -78,12 +78,12 @@
   framework::Tensor input_tensor;
   input_tensor.Resize(ddim);
   int input_length = framework::product(ddim);
-  typedef typename PrecisionTrait<P>::ptype PType;
-  auto input_ptr = input_tensor.mutable_data<PType>();
+  auto input_ptr = input_tensor.mutable_data<T>();

-  memcpy(input_ptr, static_cast<PType *>(input.data.data()),
-         input_length * sizeof(PType));
-  auto output_tensor = paddle_mobile_->Predict(input_tensor);
+  memcpy(input_ptr, static_cast<T *>(input.data.data()),
+         input_length * sizeof(T));
+  paddle_mobile_->Predict(input_tensor);
+  auto output_tensor = paddle_mobile_->Fetch();

   if (output_data->empty()) {
     LOG(kLOG_ERROR) << "At least one output should be set with tensors' names.";
...

@@ -99,18 +99,18 @@
     output.shape.push_back(static_cast<int>(d));
   }

-  if (output.data.length() < output_length * sizeof(PType)) {
-    output.data.Resize(output_length * sizeof(PType));
+  if (output.data.length() < output_length * sizeof(T)) {
+    output.data.Resize(output_length * sizeof(T));
   }

-  memcpy(output.data.data(), output_tensor->template data<PType>(),
-         output_length * sizeof(PType));
+  memcpy(output.data.data(), output_tensor->template data<T>(),
+         output_length * sizeof(T));

   return true;
 }

-template <typename Dtype, Precision P>
-PaddleMobilePredictor<Dtype, P>::~PaddleMobilePredictor() {
+template <typename Device, typename T>
+PaddleMobilePredictor<Device, T>::~PaddleMobilePredictor() {
   paddle_mobile_->Clear();
 }
...

@@ -122,13 +122,13 @@ CreatePaddlePredictor<PaddleMobileConfig, PaddleEngineKind::kPaddleMobile>(
   std::unique_ptr<PaddlePredictor> x;
   if (config.precision == PaddleMobileConfig::FP32) {
     if (config.device == PaddleMobileConfig::kCPU) {
-      x.reset(new PaddleMobilePredictor<CPU, Precision::FP32>(config));
+      x.reset(new PaddleMobilePredictor<CPU, float>(config));
     } else if (config.device == PaddleMobileConfig::kFPGA) {
-      x.reset(new PaddleMobilePredictor<FPGA, Precision::FP32>(config));
+      x.reset(new PaddleMobilePredictor<FPGA, float>(config));
     } else if (config.device == PaddleMobileConfig::kGPU_MALI) {
-      x.reset(new PaddleMobilePredictor<GPU_MALI, Precision::FP32>(config));
+      x.reset(new PaddleMobilePredictor<GPU_MALI, float>(config));
     } else if (config.device == PaddleMobileConfig::kGPU_CL) {
-      x.reset(new PaddleMobilePredictor<GPU_CL, Precision::FP32>(config));
+      x.reset(new PaddleMobilePredictor<GPU_CL, float>(config));
     } else {
       LOG(kLOG_ERROR) << "unsupport device type!";
       return nullptr;
...
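The backend dispatch above is driven by the high-level config. A hedged sketch of selecting a backend through this wrapper; only fields visible in this diff are set, and the OpenCL kernel path is an illustrative value:

    PaddleMobileConfig config;
    config.precision = PaddleMobileConfig::FP32;
    config.device = PaddleMobileConfig::kCPU;  // or kFPGA / kGPU_MALI / kGPU_CL
    config.thread_num = 4;
    #ifdef PADDLE_MOBILE_CL
    config.cl_path = "/data/local/tmp/cl_kernel";  // illustrative path
    #endif

    auto predictor =
        CreatePaddlePredictor<PaddleMobileConfig,
                              PaddleEngineKind::kPaddleMobile>(config);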
src/io/api_paddle_mobile.h  (+2 −2)

@@ -29,7 +29,7 @@
 namespace paddle_mobile {

-template <typename Dtype = CPU, Precision P = Precision::FP32>
+template <typename Device = CPU, typename T = float>
 class PaddleMobilePredictor : public PaddlePredictor {
  public:
   PaddleMobilePredictor() = delete;
...

@@ -43,7 +43,7 @@ class PaddleMobilePredictor : public PaddlePredictor {
   ~PaddleMobilePredictor() override;

  private:
-  std::unique_ptr<PaddleMobile<Dtype, P>> paddle_mobile_;
+  std::unique_ptr<PaddleMobile<Device, T>> paddle_mobile_;
   bool Init(const PaddleMobileConfig &config);
   PaddleMobileConfig config_;
...
src/io/ios_io/PaddleMobileCPU.mm  (+4 −3)

@@ -48,7 +48,7 @@
 @interface PaddleMobileCPU () {
-  paddle_mobile::PaddleMobile<paddle_mobile::CPU,
-                              paddle_mobile::Precision::FP32> *pam_;
+  paddle_mobile::PaddleMobile<paddle_mobile::CPU, float> *pam_;
   BOOL loaded_;
 }
 @end
...

@@ -59,7 +59,7 @@
 - (instancetype)init {
   if (self = [super init]) {
-    pam_ = new paddle_mobile::PaddleMobile<paddle_mobile::CPU,
-                                           paddle_mobile::Precision::FP32>();
+    pam_ = new paddle_mobile::PaddleMobile<paddle_mobile::CPU, float>();
   }
   return self;
 }
...

@@ -220,7 +220,8 @@
   memcpy(input_ptr, input, numel * sizeof(float));
-  std::shared_ptr<paddle_mobile::framework::Tensor> output =
-      pam_->Predict(input_tensor);
+  pam_->Predict(input_tensor);
+  std::shared_ptr<paddle_mobile::framework::Tensor> output = pam_->Fetch();
   float *output_pointer = new float[output->numel()];
...
src/io/jni/paddle_mobile_jni.cpp
Browse file @ 9729edac
...
@@ -16,21 +16,23 @@ limitations under the License. */
 #include "paddle_mobile_jni.h"
 #include <cmath>
 #include <string>
 #include <vector>
 #include "common/log.h"
 #include "framework/tensor.h"
 #include "io/paddle_mobile.h"
 #ifdef ENABLE_EXCEPTION
 #include "common/enforce.h"
 #endif

 #ifdef __cplusplus
 extern "C" {
 #endif
 namespace paddle_mobile {
 namespace jni {

 using framework::DDim;
 using framework::Program;
 using framework::Tensor;
...
@@ -200,7 +202,8 @@ JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
   for (int i = 0; i < length; i++) {
     input_ptr[i] = dataPointer[i];
   }
-  auto output = getPaddleMobileInstance()->Predict(input);
+  getPaddleMobileInstance()->Predict(input);
+  auto output = getPaddleMobileInstance()->Fetch();
   count = output->numel();
   result = env->NewFloatArray(count);
   env->SetFloatArrayRegion(result, 0, count, output->data<float>());
...
@@ -233,7 +236,8 @@ JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
   for (int i = 0; i < length; i++) {
     input_ptr[i] = dataPointer[i];
   }
-  auto output = getPaddleMobileInstance()->Predict(input);
+  getPaddleMobileInstance()->Predict(input);
+  auto output = getPaddleMobileInstance()->Fetch();
   count = output->numel();
   result = env->NewFloatArray(count);
   env->SetFloatArrayRegion(result, 0, count, output->data<float>());
...
@@ -328,7 +332,8 @@ JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictYuv(
   for (int i = 0; i < length; i++) {
     input_ptr[i] = matrix[i];
   }
-  auto output = getPaddleMobileInstance()->Predict(input);
+  getPaddleMobileInstance()->Predict(input);
+  auto output = getPaddleMobileInstance()->Fetch();
   count = output->numel();
   result = env->NewFloatArray(count);
   env->SetFloatArrayRegion(result, 0, count, output->data<float>());
...
@@ -363,7 +368,8 @@ JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictYuv(
   for (int i = 0; i < length; i++) {
     input_ptr[i] = matrix[i];
   }
-  auto output = getPaddleMobileInstance()->Predict(input);
+  getPaddleMobileInstance()->Predict(input);
+  auto output = getPaddleMobileInstance()->Fetch();
   count = output->numel();
   result = env->NewFloatArray(count);
   env->SetFloatArrayRegion(result, 0, count, output->data<float>());
...
@@ -399,7 +405,8 @@ Java_com_baidu_paddle_PML_predictLod(JNIEnv *env, jclass thiz, jlongArray buf) {
   auto *pdata = words.mutable_data<int64_t>();
   size_t n = words.numel() * sizeof(int64_t);
   memcpy(pdata, ids.data(), n);
-  auto vec_result = paddle_mobile.PredictLod(words);
+  paddle_mobile.Predict(words);
+  auto vec_result = paddle_mobile.Fetch();
   int count = vec_result->numel();
   jlongArray result = NULL;
   ANDROIDLOGE("predict nlp size %d", count);
...
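Every caller touched by this commit follows the same split: `Predict()` now only runs the network, and the result is retrieved separately through `Fetch()`. A standalone sketch of the new pattern, not taken from the diff (the model directory, input shape and tensor setup are assumptions; only the Predict/Fetch calls mirror the change):

#include <algorithm>
#include <iostream>
#include "io/paddle_mobile.h"

int main() {
  paddle_mobile::PaddleMobile<paddle_mobile::CPU, float> paddle_mobile;
  if (paddle_mobile.Load("./mobilenet", true)) {
    // Fill a feed tensor with zeros; the shape is a placeholder.
    paddle_mobile::framework::Tensor input;
    float *in_ptr = input.mutable_data<float>(
        paddle_mobile::framework::make_ddim({1, 3, 224, 224}));
    std::fill(in_ptr, in_ptr + input.numel(), 0.f);

    paddle_mobile.Predict(input);         // runs the network only
    auto output = paddle_mobile.Fetch();  // output retrieved separately
    std::cout << "output numel: " << output->numel() << std::endl;
  }
  return 0;
}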
src/io/paddle_mobile.cpp
Browse file @ 9729edac
...
@@ -13,81 +13,81 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "io/paddle_mobile.h"
+#include <utility>
+#include "common/common.h"
 #ifdef PADDLE_MOBILE_CL
 #include <CL/cl.h>
 #include "framework/cl/cl_tensor.h"
 #endif
-#include "common/common.h"
 #include "operators/math/gemm.h"

 namespace paddle_mobile {

-template <typename Dtype, Precision P>
-void PaddleMobile<Dtype, P>::SetThreadNum(int num) {
+template <typename Device, typename T>
+void PaddleMobile<Device, T>::SetThreadNum(int num) {
 #ifdef _OPENMP
   omp_set_num_threads(num);
 #endif
 }

-template <typename Dtype, Precision P>
-bool PaddleMobile<Dtype, P>::Load(const std::string &dirname, bool optimize,
-                                  bool quantification, int batch_size,
-                                  bool loddable) {
+template <typename Device, typename T>
+PMStatus PaddleMobile<Device, T>::Load(const std::string &dirname,
+                                       bool optimize, bool quantification,
+                                       int batch_size, bool loddable) {
   if (loader_.get() == nullptr) {
-    loader_ = std::make_shared<framework::Loader<Dtype, P>>();
+    loader_ = std::make_shared<framework::Loader<Device, T>>();
   } else {
     LOG(kLOG_INFO) << "loader inited";
   }
   if (executor_.get() == nullptr) {
-    executor_ = std::make_shared<framework::Executor<Dtype, P>>(
+    executor_ = std::make_shared<framework::Executor<Device, T>>(
         loader_->Load(dirname, optimize, quantification), batch_size, optimize,
         loddable);
   } else {
     LOG(kLOG_INFO) << "executor inited";
   }
-  return true;
+  return PMSuccess;
 }

-template <typename Dtype, Precision P>
-bool PaddleMobile<Dtype, P>::Load(const std::string &model_path,
-                                  const std::string &para_path, bool optimize,
-                                  bool quantification, int batch_size,
-                                  bool loddable) {
+template <typename Device, typename T>
+PMStatus PaddleMobile<Device, T>::Load(const std::string &model_path,
+                                       const std::string &para_path,
+                                       bool optimize, bool quantification,
+                                       int batch_size, bool loddable) {
   if (loader_.get() == nullptr) {
-    loader_ = std::make_shared<framework::Loader<Dtype, P>>();
+    loader_ = std::make_shared<framework::Loader<Device, T>>();
   } else {
     LOG(kLOG_INFO) << "loader inited";
   }
   if (executor_.get() == nullptr) {
-    executor_ = std::make_shared<framework::Executor<Dtype, P>>(
+    executor_ = std::make_shared<framework::Executor<Device, T>>(
         loader_->Load(model_path, para_path, optimize, quantification),
         batch_size, optimize, loddable);
   } else {
     LOG(kLOG_INFO) << "executor inited";
   }
-  return true;
+  return PMSuccess;
 }

-template <typename Dtype, Precision P>
-bool PaddleMobile<Dtype, P>::LoadCombinedMemory(size_t model_len,
-                                                const uint8_t *model_buf,
-                                                size_t combined_params_len,
-                                                uint8_t *combined_params_buf) {
+template <typename Device, typename T>
+bool PaddleMobile<Device, T>::LoadCombinedMemory(size_t model_len,
+                                                 const uint8_t *model_buf,
+                                                 size_t combined_params_len,
+                                                 uint8_t *combined_params_buf) {
   int batch_size = 1;
   bool optimise = true;
   bool quantification = false;
   if (loader_.get() == nullptr) {
-    loader_ = std::make_shared<framework::Loader<Dtype, P>>();
+    loader_ = std::make_shared<framework::Loader<Device, T>>();
   } else {
     LOG(kLOG_INFO) << "loader inited";
   }
   if (executor_.get() == nullptr) {
-    executor_ = std::make_shared<framework::Executor<Dtype, P>>(
+    executor_ = std::make_shared<framework::Executor<Device, T>>(
         loader_->LoadCombinedMemory(model_len, model_buf, combined_params_len,
                                     combined_params_buf, optimise,
                                     quantification),
...
@@ -96,38 +96,76 @@ bool PaddleMobile<Dtype, P>::LoadCombinedMemory(size_t model_len,
     LOG(kLOG_INFO) << "executor inited";
   }
-  return true;
+  return PMSuccess;
 }

+template <typename Device, typename T>
+PMStatus PaddleMobile<Device, T>::Predict(const framework::Tensor &input) {
+  std::vector<std::pair<std::string, framework::Tensor>> inputs;
+  inputs.push_back(std::make_pair("feed", input));
+  return this->Predict(inputs);
+}
+
-template <typename Dtype, Precision P>
-std::shared_ptr<framework::Tensor> PaddleMobile<Dtype, P>::Predict(
-    const framework::Tensor &t) {
-  return executor_->Predict(t);
+template <typename Device, typename T>
+PMStatus PaddleMobile<Device, T>::Predict(const framework::LoDTensor &input) {
+  std::vector<std::pair<std::string, framework::LoDTensor>> inputs;
+  inputs.push_back(std::make_pair("feed", input));
+  return this->Predict(inputs);
 }

+template <typename Device, typename T>
+PMStatus PaddleMobile<Device, T>::Predict(
+    const std::vector<std::pair<std::string, framework::Tensor>> &inputs) {
+  return executor_->Predict(inputs);
+}
+
-template <typename Dtype, Precision P>
-std::shared_ptr<framework::Tensor> PaddleMobile<Dtype, P>::PredictLod(
-    const framework::LoDTensor &t) {
-  return executor_->PredictLod(t);
+template <typename Device, typename T>
+PMStatus PaddleMobile<Device, T>::Predict(
    const std::vector<std::pair<std::string, framework::LoDTensor>> &inputs) {
+  return executor_->Predict(inputs);
 }

-template <typename Dtype, Precision P>
-std::vector<typename PaddleMobile<Dtype, P>::Ptype> PaddleMobile<Dtype, P>::Predict(
-    const std::vector<Ptype> &input, const std::vector<int64_t> &dims) {
+template <typename Device, typename T>
+std::vector<T> PaddleMobile<Device, T>::Predict(
+    const std::vector<T> &input, const std::vector<int64_t> &dims) {
   return executor_->Predict(input, dims);
 }

-template <typename Dtype, Precision P>
-void PaddleMobile<Dtype, P>::Clear() {
+template <typename Device, typename T>
+PMStatus PaddleMobile<Device, T>::Predict() {
+  return executor_->Predict();
+}
+
+template <typename Device, typename T>
+void PaddleMobile<Device, T>::Feed(const framework::Tensor &input,
+                                   const std::string &var_name) {
+  executor_->SetInput(input, var_name);
+}
+
+template <typename Device, typename T>
+void PaddleMobile<Device, T>::Feed(const framework::LoDTensor &input,
+                                   const std::string &var_name) {
+  executor_->SetInput(input, var_name);
+}
+
+typedef std::shared_ptr<framework::LoDTensor> LoDTensorPtr;
+template <typename Device, typename T>
+LoDTensorPtr PaddleMobile<Device, T>::Fetch(const std::string &var_name) {
+  return executor_->GetOutput(var_name);
+}
+
+template <typename Device, typename T>
+void PaddleMobile<Device, T>::Clear() {
   executor_ = nullptr;
   loader_ = nullptr;
 }

-template <typename Dtype, Precision P>
-double PaddleMobile<Dtype, P>::GetPredictTime() {}
+template <typename Device, typename T>
+double PaddleMobile<Device, T>::GetPredictTime() {}

 #ifdef PADDLE_MOBILE_CPU
 template <>
-double PaddleMobile<CPU, Precision::FP32>::GetPredictTime() {
+double PaddleMobile<CPU, float>::GetPredictTime() {
   int m = 32;
   int n = 224 * 224;
   int k = 27;
...
@@ -148,7 +186,8 @@ double PaddleMobile<CPU, Precision::FP32>::GetPredictTime() {
   for (int i = 0; i < k * n; ++i) {
     b[i] = t1 + rand() % t2;  // NOLINT
   }
-  paddle_mobile::operators::math::Gemm gemm;
+  operators::math::Gemm gemm;
   auto time1 = paddle_mobile::time();
   gemm.Sgemm(m, n, k, static_cast<float>(1), a, lda, b, ldb,
              static_cast<float>(0), c, ldc, false,
...
@@ -162,57 +201,51 @@ double PaddleMobile<CPU, Precision::FP32>::GetPredictTime() {
 }
 #endif

-template <typename Dtype, Precision P>
-PaddleMobile<Dtype, P>::~PaddleMobile() {
-  executor_ = nullptr;
-  loader_ = nullptr;
-}
-
 #ifdef PADDLE_MOBILE_FPGA
-template <typename Dtype, Precision P>
-void PaddleMobile<Dtype, P>::InjectVariable(const framework::Tensor &t,
-                                            std::string var_name) {
+template <typename Device, T P>
+void PaddleMobile<Device, P>::InjectVariable(const framework::Tensor &t,
+                                             std::string var_name) {
   executor_->InjectVariable(t, var_name);
 }

-template <typename Dtype, Precision P>
-void PaddleMobile<Dtype, P>::FeedData(const framework::Tensor &t) {
+template <typename Device, T P>
+void PaddleMobile<Device, P>::FeedData(const framework::Tensor &t) {
   executor_->FeedData(t);
 }

-template <typename Dtype, Precision P>
-std::shared_ptr<framework::Tensor> PaddleMobile<Dtype, P>::FetchResult(int id) {
+template <typename Device, T P>
+std::shared_ptr<framework::Tensor> PaddleMobile<Device, P>::FetchResult(int id) {
   return executor_->FetchResult(id);
 }

-template <typename Dtype, Precision P>
-void PaddleMobile<Dtype, P>::Predict_From_To(int start, int end) {
+template <typename Device, T P>
+void PaddleMobile<Device, P>::Predict_From_To(int start, int end) {
   executor_->Predict_From_To(start, end);
 }

-template <typename Dtype, Precision P>
-void PaddleMobile<Dtype, P>::Predict_From(int start) {
+template <typename Device, T P>
+void PaddleMobile<Device, P>::Predict_From(int start) {
   executor_->Predict_From(start);
 }

-template <typename Dtype, Precision P>
-void PaddleMobile<Dtype, P>::Predict_To(int end) {
+template <typename Device, T P>
+void PaddleMobile<Device, P>::Predict_To(int end) {
   executor_->Predict_To(end);
 }
 #endif

 #ifdef PADDLE_MOBILE_CL
 static std::mutex lc;
-template <typename Dtype, Precision P>
-void PaddleMobile<Dtype, P>::SetCLPath(std::string path) {
+template <typename Device, T P>
+void PaddleMobile<Device, P>::SetCLPath(std::string path) {
   std::lock_guard<std::mutex> lock(lc);
   if (framework::CLEngine::Instance()->GetCLPath() == "") {
     framework::CLEngine::Instance()->setClPath(path);
   }
 }
 template <>
-double PaddleMobile<GPU_CL, Precision::FP32>::GetPredictTime() {
+double PaddleMobile<GPU_CL, T::FP32>::GetPredictTime() {
   cl_int status;
   cl_uint nPlatform;
   clGetPlatformIDs(0, NULL, &nPlatform);
...
@@ -410,8 +443,8 @@ double PaddleMobile<GPU_CL, Precision::FP32>::GetPredictTime() {
     return -1;
   }
 }
-template <typename Dtype, Precision P>
-int PaddleMobile<Dtype, P>::readText(
+template <typename Device, T P>
+int PaddleMobile<Device, P>::readText(
     const char *kernelPath,
     char **pcode) {  // read a text file into pcode, return the string length
   FILE *fp;
...
@@ -440,13 +473,11 @@ int PaddleMobile<Dtype, P>::readText(
   fclose(fp);
   return size + 1;
 }
 #endif

-template class PaddleMobile<CPU, Precision::FP32>;
-template class PaddleMobile<FPGA, Precision::FP32>;
-template class PaddleMobile<GPU_MALI, Precision::FP32>;
-template class PaddleMobile<GPU_CL, Precision::FP32>;
+template class PaddleMobile<CPU, float>;
+template class PaddleMobile<FPGA, float>;
+template class PaddleMobile<GPU_MALI, float>;
+template class PaddleMobile<GPU_CL, float>;

 }  // namespace paddle_mobile
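The new `Predict` overloads above are what enable feeding several inputs in one call: each entry pairs a feed variable name with its tensor, and the default output is still fetched under the name "fetch". A hedged sketch of how a caller might use the multi-input overload (the variable names "image" and "prior", the helper function and its setup are made up for illustration; only the API shape follows the diff):

#include <iostream>
#include <string>
#include <utility>
#include <vector>
#include "io/paddle_mobile.h"

// Assumes `image` and `prior` are tensors already filled with data and that
// the model declares feed variables with these (hypothetical) names.
void RunWithTwoInputs(paddle_mobile::PaddleMobile<paddle_mobile::CPU, float> &engine,
                      const paddle_mobile::framework::Tensor &image,
                      const paddle_mobile::framework::Tensor &prior) {
  std::vector<std::pair<std::string, paddle_mobile::framework::Tensor>> inputs;
  inputs.emplace_back("image", image);
  inputs.emplace_back("prior", prior);
  engine.Predict(inputs);     // feeds every named input, then runs the graph
  auto out = engine.Fetch();  // default output, i.e. Fetch("fetch")
  std::cout << "fetched " << out->numel() << " values" << std::endl;
}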
src/io/paddle_mobile.h
Browse file @ 9729edac
...
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <memory>
 #include <string>
 #include <utility>
+#include <vector>

 #ifdef _OPENMP
 #include <omp.h>
...
@@ -32,43 +33,52 @@ limitations under the License. */
 namespace paddle_mobile {

-template <typename Dtype = CPU, Precision P = Precision::FP32>
+template <typename Device, typename T = float>
 class PaddleMobile {
-  typedef typename PrecisionTrait<P>::ptype Ptype;
-
  public:
   PaddleMobile() {
 #ifndef PADDLE_MOBILE_CL
-    bool is_gpu = std::is_same<DeviceType<kGPU_CL>, Dtype>::value;
-    PADDLE_MOBILE_ENFORCE(!is_gpu, "Not Enable GPU in CmakeList but run gpu codes ");
+    bool is_gpu = std::is_same<DeviceType<kGPU_CL>, Device>::value;
+    PADDLE_MOBILE_ENFORCE(!is_gpu, "Please recompile with GPU_CL is on");
 #endif
   }
-  bool Load(const std::string &dirname, bool optimize = false,
-            bool quantification = false, int batch_size = 1,
-            bool loddable = false);
+  ~PaddleMobile() {}
+
+  PMStatus Load(const std::string &dirname, const bool optimize = false,
+                const bool quantification = false, const int batch_size = 1,
+                const bool lod = false);
+  PMStatus Load(const std::string &model_path, const std::string &para_path,
+                const bool optimize = false, const bool quantification = false,
+                const int batch_size = 1, const bool lod = false);
+
+  PMStatus Predict(const framework::Tensor &input);
+  PMStatus Predict(const framework::LoDTensor &input);
-  bool Load(const std::string &model_path, const std::string &para_path,
-            bool optimize = false, bool quantification = false,
-            int batch_size = 1, bool loddable = false);
+  PMStatus Predict(
+      const std::vector<std::pair<std::string, framework::Tensor>> &inputs);
+  PMStatus Predict(
+      const std::vector<std::pair<std::string, framework::LoDTensor>> &inputs);

-  std::shared_ptr<framework::Tensor> Predict(const framework::Tensor &t);
+  std::vector<T> Predict(const std::vector<T> &input,
+                         const std::vector<int64_t> &dims);
+  PMStatus Predict();

-  std::shared_ptr<framework::Tensor> PredictLod(const framework::LoDTensor &t);
+  void Feed(const framework::LoDTensor &input, const std::string &var_name);
+  void Feed(const framework::Tensor &input, const std::string &var_name);

-  std::vector<Ptype> Predict(const std::vector<Ptype> &input,
-                             const std::vector<int64_t> &dims);
+  typedef std::shared_ptr<framework::LoDTensor> LoDTensorPtr;
+  LoDTensorPtr Fetch(const std::string &var_name);
+  LoDTensorPtr Fetch() { return Fetch("fetch"); }

   bool LoadCombinedMemory(size_t model_len, const uint8_t *model_buf,
                           size_t combined_params_len,
                           uint8_t *combined_params_buf);

-  void SetThreadNum(int num);
+  void SetThreadNum(int count);
   void Clear();
   double GetPredictTime();
-  ~PaddleMobile();

 #ifdef PADDLE_MOBILE_FPGA
   void InjectVariable(const framework::Tensor &t, std::string var_name);
   void FeedData(const framework::Tensor &t);
...
@@ -79,15 +89,15 @@ class PaddleMobile {
 #endif

 #ifdef PADDLE_MOBILE_CL
- public:
+ public:  // NOLINT
   void SetCLPath(std::string cl_path);
   int readText(const char *kernelPath,
                char **pcode);  // read a text file into pcode, return the string length
 #endif

  private:
-  std::shared_ptr<framework::Loader<Dtype, P>> loader_;
-  std::shared_ptr<framework::Executor<Dtype, P>> executor_;
+  std::shared_ptr<framework::Loader<Device, T>> loader_;
+  std::shared_ptr<framework::Executor<Device, T>> executor_;
 };

 }  // namespace paddle_mobile
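Beyond the paired-input overloads, the header now also exposes a name-based workflow: `Feed()` stages each input, `Predict()` runs the graph once, and `Fetch(name)` pulls out any output. A minimal sketch of that flow (the variable names "words", "mentions", "crf_decode" and "emission" are placeholders, as is the helper function; a real model defines its own feed/fetch variable names):

#include <iostream>
#include "io/paddle_mobile.h"

void MultiInMultiOut(paddle_mobile::PaddleMobile<paddle_mobile::CPU, float> &engine,
                     const paddle_mobile::framework::LoDTensor &words,
                     const paddle_mobile::framework::LoDTensor &mentions) {
  engine.Feed(words, "words");        // stage each named input
  engine.Feed(mentions, "mentions");
  engine.Predict();                   // run once with everything that was fed

  auto crf_decode = engine.Fetch("crf_decode");  // fetch outputs by name
  auto emission = engine.Fetch("emission");
  std::cout << crf_decode->numel() << " / " << emission->numel() << std::endl;
}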
src/io/paddle_test_inference_api.cpp
Browse file @ 9729edac
...
@@ -14,10 +14,12 @@ limitations under the License. */
 #include "io/paddle_test_inference_api.h"
 #include "io/paddle_mobile.h"
 namespace paddle_mobile {
-template <typename Dtype, Precision P>
-double PaddleTester<Dtype, P>::CaculatePredictTime(std::string *cl_path) {
-  PaddleMobile<Dtype, P> paddle_mobile;
+template <typename Device, typename T>
+double PaddleTester<Device, T>::CaculatePredictTime(std::string *cl_path) {
+  PaddleMobile<Device, T> paddle_mobile;
 #ifdef PADDLE_MOBILE_CL
   if (cl_path) {
     paddle_mobile.SetCLPath(*cl_path);
...
@@ -26,10 +28,10 @@ double PaddleTester<Dtype, P>::CaculatePredictTime(std::string *cl_path) {
 #endif
   return paddle_mobile.GetPredictTime();
 }
-template class PaddleTester<CPU, Precision::FP32>;
-template class PaddleTester<FPGA, Precision::FP32>;
-template class PaddleTester<GPU_MALI, Precision::FP32>;
+template class PaddleTester<CPU, float>;
+template class PaddleTester<FPGA, float>;
+template class PaddleTester<GPU_MALI, float>;

-template class PaddleTester<GPU_CL, Precision::FP32>;
+template class PaddleTester<GPU_CL, float>;

 }  // namespace paddle_mobile
src/io/paddle_test_inference_api.h
Browse file @ 9729edac
...
@@ -20,10 +20,13 @@ limitations under the License. */
 */
 #pragma once
 #include "common/types.h"
 #include "string"
 namespace paddle_mobile {
-template <typename Dtype, Precision P = Precision::FP32>
+template <typename Device, typename T = float>
 class PaddleTester {
  public:
   double CaculatePredictTime(std::string *cl_path = nullptr);
...
test/CMakeLists.txt
Browse file @ 9729edac
...
@@ -375,5 +375,8 @@ if (NOT FOUND_MATCH)
     # gen test
     ADD_EXECUTABLE(test-super net/test_super.cpp test_helper.h test_include.h)
     target_link_libraries(test-super paddle-mobile)

     #add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)
+
+    # gen test
+    ADD_EXECUTABLE(test-ocr net/test_ocr.cpp test_helper.h test_include.h)
+    target_link_libraries(test-ocr paddle-mobile)
 endif ()
test/executor_for_test.h
Browse file @ 9729edac
...
@@ -39,6 +39,7 @@ using paddle_mobile::framework::Tensor;
 using paddle_mobile::framework::Variable;
 using std::string;
+using std::vector;

 template <typename DeviceType, typename OpType>
 class Executor4Test : public Executor<DeviceType> {
  public:
...
@@ -48,20 +49,19 @@ class Executor4Test : public Executor<DeviceType> {
     this->use_optimize_ = use_optimize;
     this->program_ = p;
     if (this->use_optimize_) {
-      this->to_predict_program_ = this->program_.optimizeProgram;
+      this->program_desc_ = this->program_.optimizeProgram;
     } else {
-      this->to_predict_program_ = this->program_.originProgram;
+      this->program_desc_ = this->program_.originProgram;
     }
     if (this->program_.originProgram == nullptr) {
-      LOG(paddle_mobile::LogLevel::kLOG_ERROR) << "to_predict_program_ == nullptr";
+      LOG(paddle_mobile::LogLevel::kLOG_ERROR) << "program_desc_ == nullptr";
     }
     const std::vector<std::shared_ptr<BlockDesc>> blocks =
-        this->to_predict_program_->Blocks();
-    for (std::shared_ptr<BlockDesc> block_desc : blocks) {
-      std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
+        this->program_desc_->Blocks();
+    for (int block_id = 0; block_id < blocks.size(); ++block_id) {
+      std::vector<std::shared_ptr<OpDesc>> ops = blocks[block_id]->Ops();
       for (int i = 0; i < ops.size(); ++i) {
         auto op = ops[i];
         if (op->Type() == op_type) {
...
@@ -73,18 +73,16 @@ class Executor4Test : public Executor<DeviceType> {
               paddle_mobile::framework::OpRegistry<DeviceType>::CreateOp(
                   op->Type(), op->GetInputs(), op->GetOutputs(),
                   op->GetAttrMap(), this->program_.scope);
-          this->ops_of_block_[*block_desc.get()].push_back(op_ptr);
+          this->ops_of_block_[block_id].push_back(op_ptr);
           break;
         }
       }
     }
     this->InitMemory();
-    std::shared_ptr<paddle_mobile::framework::BlockDesc> to_predict_block =
-        this->to_predict_program_->Block(0);
-    auto &ops = this->ops_of_block_[*to_predict_block.get()];
-    for (const auto &op : ops) {
-      op->Init();
+    for (const auto &ops : this->ops_of_block_) {
+      for (const auto &op : ops) {
+        op->Init();
+      }
     }
   }
...
@@ -117,12 +115,10 @@ class Executor4Test : public Executor<DeviceType> {
       output_tensor_sptrs[i].reset(output_tensors[i]);
     }

-    std::shared_ptr<paddle_mobile::framework::BlockDesc> to_predict_block =
-        this->to_predict_program_->Block(0);
-    for (int j = 0; j < this->ops_of_block_[*to_predict_block.get()].size(); ++j) {
-      auto op = this->ops_of_block_[*to_predict_block.get()][j];
-      op->Run();
+    for (auto &ops : this->ops_of_block_) {
+      for (auto &op : ops) {
+        op->Run();
+      }
     }

     return output_tensor_sptrs;
...
@@ -139,14 +135,11 @@ class Executor4Test : public Executor<DeviceType> {
     auto *output_tensor = con_output->GetMutable<LoDTensor>();
     output_tensor->mutable_data<float>(dDim);

-    std::shared_ptr<paddle_mobile::framework::BlockDesc> to_predict_block =
-        this->to_predict_program_->Block(0);
-    for (int j = 0; j < this->ops_of_block_[*to_predict_block.get()].size(); ++j) {
-      auto op = this->ops_of_block_[*to_predict_block.get()][j];
-      op->Run();
+    for (auto &ops : this->ops_of_block_) {
+      for (auto &op : ops) {
+        op->Run();
+      }
     }

     return std::make_shared<paddle_mobile::framework::Tensor>(
         paddle_mobile::framework::Tensor(*output_tensor));
   }
...
test/net/test_benchmark.cpp
Browse file @ 9729edac
...
@@ -52,15 +52,16 @@ int main(int argc, char* argv[]) {
   SetupTensor<float>(&input, in_shape, 0.f, 255.f);
   // warmup
   for (int i = 0; i < 10; ++i) {
-    output = paddle_mobile.Predict(input);
+    paddle_mobile.Predict(input);
   }
   auto time3 = time();
   for (int i = 0; i < 10; ++i) {
-    output = paddle_mobile.Predict(input);
+    paddle_mobile.Predict(input);
   }
   auto time4 = time();
   std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms\n";

   std::ostringstream os("output tensor size: ");
+  output = paddle_mobile.Fetch();
   os << output->numel() << "\n" << output->data<float>()[0];
   for (int i = 1; i < output->numel(); ++i) {
     os << ", " << output->data<float>()[i];
...
test/net/test_eng.cpp
Browse file @ 9729edac
...
@@ -36,11 +36,11 @@ int main() {
                    input_tensor.data<float>() + input_tensor.numel());
   // warm up ten times
   for (int i = 0; i < 1; ++i) {
-    paddle_mobile.PredictLod(input_tensor);
+    paddle_mobile.Predict(input_tensor);
   }
   auto time3 = time();
   for (int i = 0; i < 1; ++i) {
-    paddle_mobile.PredictLod(input_tensor);
+    paddle_mobile.Predict(input_tensor);
   }
   auto time4 = time();
   std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
...
test/net/test_googlenet.cpp
Browse file @ 9729edac
...
@@ -41,12 +41,12 @@ int main(int argc, char* argv[]) {
 #endif
   paddle_mobile.SetThreadNum(thread_num);
   auto time1 = time();
-  if (paddle_mobile.Load(g_googlenet, optimize)) {
-    std::vector<float> output;
+  if (paddle_mobile.Load(g_googlenet, optimize, false, 1, true)) {
     auto time2 = paddle_mobile::time();
     std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms"
               << std::endl;
     std::vector<float> input;
+    std::vector<float> output;
     std::vector<int64_t> dims{1, 3, 224, 224};
     if (feed_shape) {
       sscanf(feed_shape, "%d,%d,%d", &dims[1], &dims[2], &dims[3]);
...
test/net/test_nlp.cpp
Browse file @ 9729edac
...
@@ -48,8 +48,8 @@ int main() {
   DLOG << "words lod 22: " << words.lod();
   auto time3 = time();
   for (int i = 0; i < 1; ++i) {
-    auto vec_result = paddle_mobile.PredictLod(words);
-    DLOG << *vec_result;
+    paddle_mobile.Predict(words);
+    DLOG << *paddle_mobile.Fetch();
   }
   auto time4 = time();
   std::cout << "predict cost :" << time_diff(time3, time4) / 1 << "ms"
...
@@ -84,8 +84,8 @@ int main() {
   DLOG << "words lod 22: " << words.lod();
   auto time3 = time();
   for (int i = 0; i < 1; ++i) {
-    auto vec_result = paddle_mobile.PredictLod(words);
-    DLOG << *vec_result;
+    paddle_mobile.Predict(words);
+    DLOG << *paddle_mobile.Fetch();
   }
   auto time4 = time();
   std::cout << "predict cost :" << time_diff(time3, time4) / 1 << "ms"
...
test/net/test_ocr.cpp
0 → 100644
Browse file @ 9729edac

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <fstream>
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"

void load_images(const char *image_dir, const char *images_list,
                 std::vector<std::string> *image_names,
                 std::vector<std::pair<int, int>> *image_shapes) {
  int height, width;
  std::string filename;
  std::ifstream if_list(images_list, std::ios::in);
  while (!if_list.eof()) {
    if_list >> height >> width >> filename;
    image_shapes->push_back(std::make_pair(height, width));
    image_names->push_back(filename);
  }
}

int main(int argc, char **argv) {
  if (argc < 4) {
    std::cerr << "Usage: ./test_ocr model_dir image_dir images_list." << std::endl;
    return 1;
  }
  char *model_dir = argv[1];
  char *image_dir = argv[2];
  char *images_list = argv[3];

  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
  paddle_mobile.SetThreadNum(8);
  auto isok = paddle_mobile.Load(std::string(model_dir) + "/model",
                                 std::string(model_dir) + "/params", true,
                                 false, 1, true);
  DLOG << "pass init model";

  std::vector<std::string> image_names;
  std::vector<std::pair<int, int>> image_shapes;
  load_images(image_dir, images_list, &image_names, &image_shapes);
  DLOG << "pass load images";

  for (int i = 0; i < image_names.size(); i++) {
    std::string file_name = image_names[i];
    std::vector<float> input;
    std::vector<int64_t> dims{1, 1, 48, 512};
    dims[2] = image_shapes[i].first;
    dims[3] = image_shapes[i].second;
    // load input image
    std::string img_path = std::string(image_dir) + "/" + file_name;
    std::cerr << "img_path: " << img_path << std::endl;
    std::cerr << "shape = [" << dims[0] << ", " << dims[1] << ", " << dims[2]
              << ", " << dims[3] << "]" << std::endl;
    GetInput<float>(img_path, &input, dims);
    // predict
    auto output = paddle_mobile.Predict(input, dims);
    // print result
    std::cerr << file_name << std::endl;
    std::cerr << output[0];
    for (int j = 1; j < output.size(); ++j) {
      std::cerr << " " << output[j];
    }
    std::cerr << std::endl;
  }
  return 0;
}
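For reference, load_images() above reads images_list as whitespace-separated "height width filename" triples, one image per line. A small hypothetical snippet that writes such a list (the file name and image entries are made up):

#include <fstream>

int main() {
  // Each line: <height> <width> <image filename relative to image_dir>
  std::ofstream list("images_list");
  list << 48 << " " << 512 << " " << "img_0001.jpg" << "\n";
  list << 48 << " " << 320 << " " << "img_0002.jpg" << "\n";
  return 0;
}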
tools/pre-commit.hooks/cpplint.hook
Browse file @ 9729edac
...
@@ -5,7 +5,7 @@ TOTAL_ERRORS=0
 # The trick to remove deleted files: https://stackoverflow.com/a/2413151
 for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}' | \
               grep -v ".pb.cpp" | grep -v ".pb.h" | grep -v ".pb-c.h" | grep -v ".pb-c.c" | \
-              grep -v "protobuf-c.h" | grep -v "protobuf-c.c"); do
+              grep -v "protobuf-c.h" | grep -v "protobuf-c.c" | grep -v "paddle_mobile_jni.cpp"); do
   cpplint $file;
   TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?);
 done
...
...