Unverified commit ad3844d6, authored by Houjiang Chen, committed by GitHub

Merge pull request #1495 from hjchen2/backup

Support attention models, refactor sgemm and depthwise conv3x3, and implement v8 versions of winograd and depthwise conv5x5
......@@ -23,7 +23,7 @@ file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c src/*.mm)
file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h)
include_directories(src/)
set(CMAKE_CXX_FLAGS "-O3 -s -DNDEBUG ${CMAKE_CXX_FLAGS}")
set(CMAKE_CXX_FLAGS "-O3 -s -DNDEBUG ${CMAKE_CXX_FLAGS} -Wno-attributes")
if(IS_IOS)
set(CMAKE_CXX_FLAGS "-mfpu=neon -marm -fobjc-abi-version=2 -fobjc-arc \
-std=gnu++11 -stdlib=libc++ -isysroot ${CMAKE_OSX_SYSROOT} ${CMAKE_CXX_FLAGS}")
......
......@@ -31,7 +31,8 @@ namespace paddle_mobile {
#ifdef ANDROID
extern const char *ANDROID_LOG_TAG;
static const char *ANDROID_LOG_TAG =
"paddle_mobile LOG built on " __DATE__ " " __TIME__;
#define ANDROIDLOGI(...) \
__android_log_print(ANDROID_LOG_INFO, ANDROID_LOG_TAG, __VA_ARGS__); \
......
......@@ -37,8 +37,7 @@ template <typename Dtype>
using OpCreator = std::function<framework::OperatorBase<Dtype> *(
const std::string & /*type*/, const VariableNameMap & /*inputs*/,
const VariableNameMap & /*outputs*/,
const framework::AttributeMap & /*attrs*/,
std::shared_ptr<framework::Scope> /*scope*/)>;
const framework::AttributeMap & /*attrs*/, framework::Scope * /*scope*/)>;
using InferVarTypeFN = std::function<void(const framework::OpDesc & /*op_desc*/,
framework::BlockDesc * /*block*/)>;
......
File mode changed from 100755 to 100644
......@@ -205,6 +205,8 @@ extern const char *G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU;
extern const char *G_OP_TYPE_FUSION_DECONV_ADD_BN;
extern const char *G_OP_TYPE_FUSION_DECONV_BN_RELU;
extern const char *G_OP_TYPE_PAD2D;
extern std::unordered_map<
std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
op_input_output_key;
......
......@@ -91,7 +91,14 @@ class Attribute {
break;
}
case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCK: {
attr.Set<int>(attr_desc->block_idx);
break;
}
case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__LONGS: {
vector<int> val(attr_desc->n_longs);
for (int i = 0; i < attr_desc->n_longs; ++i) {
val[i] = attr_desc->longs[i];
}
attr.Set<vector<int>>(val);
break;
}
default:
......@@ -139,6 +146,14 @@ class Attribute {
return vistor(attr.variant_.Get<vector<bool>>());
} else if (attr.variant_.TypeId() == typeid(int64_t).hash_code()) {
return vistor(attr.variant_.Get<int64_t>());
} else if (attr.variant_.TypeId() ==
typeid(framework::BlockDesc *).hash_code()) {
return vistor(attr.variant_.Get<framework::BlockDesc *>());
} else if (attr.variant_.TypeId() ==
typeid(vector<framework::BlockDesc *>).hash_code()) {
return vistor(attr.variant_.Get<vector<framework::BlockDesc *>>());
} else if (attr.variant_.TypeId() == typeid(vector<int64_t>).hash_code()) {
return vistor(attr.variant_.Get<vector<int64_t>>());
} else {
PADDLE_MOBILE_THROW_EXCEPTION("type not support");
}
......@@ -146,7 +161,8 @@ class Attribute {
private:
Variant<int, float, string, vector<int>, vector<float>, vector<string>, bool,
vector<bool>, BlockDesc *, int64_t>
vector<bool>, BlockDesc *, vector<BlockDesc *>, int64_t,
vector<int64_t>>
variant_;
};
......
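A note on the attribute change above: the new LONGS branch stores a repeated int64 attribute as a vector<int>, and the Variant now also carries BlockDesc * and vector<BlockDesc *>. A minimal, hedged usage sketch of reading such an attribute from an OpDesc (the attribute name "target_shape" is illustrative, not taken from this patch):

    // Hypothetical example: a kernel reading a LONGS attribute.
    // GetAttrValue() converts the int64 list to vector<int>, so Get<std::vector<int>>()
    // is the matching accessor.
    std::vector<int> target_shape =
        op_desc->GetAttr("target_shape").Get<std::vector<int>>();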
......@@ -42,6 +42,7 @@ inline DataLayout StringToDataLayout(const std::string &str) {
} else {
PADDLE_MOBILE_THROW_EXCEPTION("Unknown storage order string: %s", s.c_str())
}
return DataLayout::kNCHW;
}
inline std::string DataLayoutToString(const DataLayout &data_layout) {
......
......@@ -82,6 +82,8 @@ struct Dim<0> {
int64_t &operator[](int idx);
int64_t operator[](int idx) const;
int64_t head;
};
namespace {
......@@ -131,6 +133,7 @@ int64_t &indexer(Dim<D> &dim, int idx) {
template <>
int64_t &indexer<0>(Dim<0> &dim, int idx) {
PADDLE_MOBILE_THROW_EXCEPTION("Invalid index")
return dim.head;
}
template <int D>
......@@ -147,6 +150,7 @@ int64_t indexer(const Dim<D> &dim, int idx) {
template <>
int64_t indexer<0>(const Dim<0> &dim, int idx) {
PADDLE_MOBILE_THROW_EXCEPTION("Invalid index")
return dim.head;
}
} // namespace
......
......@@ -57,32 +57,30 @@ Executor<Device, T>::Executor(const Program<Device> &program,
PADDLE_MOBILE_ENFORCE(program_desc_ != nullptr,
"program_desc_ should not be nullptr");
const auto &blocks = program_desc_->Blocks();
ops_of_block_.resize(blocks.size());
for (int i = 0; i < blocks.size(); ++i) {
std::shared_ptr<BlockDesc> block_desc = blocks[i];
std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
for (int j = 0; j < ops.size(); ++j) {
std::shared_ptr<OpDesc> op_desc = ops[j];
DLOG << "create op: " << op_desc->Type();
auto op_handler = OpRegistry<Device>::CreateOp(
op_desc->Type(), op_desc->GetInputs(), op_desc->GetOutputs(),
op_desc->GetAttrMap(), program_.scope);
// infer shape to reshape inputs and outputs before predict,
// but for lod mode, it still needs to infer shapes at runtime
if (!lod_mode) {
op_handler->InferShape();
}
ops_of_block_[i].push_back(op_handler);
std::shared_ptr<BlockDesc> block_desc = blocks[0];
std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
for (int j = 0; j < ops.size(); ++j) {
std::shared_ptr<OpDesc> op_desc = ops[j];
DLOG << "create op: " << op_desc->Type();
auto op_handler = OpRegistry<Device>::CreateOp(
op_desc->Type(), op_desc->GetInputs(), op_desc->GetOutputs(),
op_desc->GetAttrMap(), program_.scope.get());
// infer shape to reshape inputs and outputs before predict,
// but for lod mode, it still needs to infer shapes at runtime
if (!lod_mode) {
op_handler->InferShape();
}
ops_of_block0_.push_back(op_handler);
}
if (program_.combined) {
InitCombineMemory();
} else {
InitMemory();
}
// resize feed and fetch list
InitFeedFetchList();
#ifdef PADDLE_MOBILE_FPGA
program_.scope->EraseVars({"feed", "fetch"});
......@@ -90,13 +88,37 @@ Executor<Device, T>::Executor(const Program<Device> &program,
#endif
int count = 0;
for (int block_id = 0; block_id < ops_of_block_.size(); ++block_id) {
for (auto &op_handler : ops_of_block_[block_id]) {
DLOG << "Initialize op[" << count++ << "]: " << op_handler->Type();
op_handler->Init();
ops_list_.push_back(op_handler);
for (auto &op_handler : ops_of_block0_) {
DLOG << "Initialize op[" << count++ << "]: " << op_handler->Type();
op_handler->Init();
}
}
template <typename Device, typename T>
void Executor<Device, T>::InitFeedFetchList() {
std::unordered_map<std::string, int> feed_indices, fetch_indices;
for (const auto &block : program_desc_->Blocks()) {
for (const auto &op_desc : block->Ops()) {
if (op_desc->Type() == "feed") {
std::string name = op_desc->Output("Out")[0];
feed_indices[name] = op_desc->GetAttr("col").Get<int>();
} else if (op_desc->Type() == "fetch") {
std::string name = op_desc->Input("X")[0];
fetch_indices[name] = op_desc->GetAttr("col").Get<int>();
}
}
}
feed_indices_.swap(feed_indices);
fetch_indices_.swap(fetch_indices);
auto *feed_var = program_.scope->Var("feed");
auto *feed_list = feed_var->template GetMutable<framework::LoDTensorArray>();
feed_list->resize(feed_indices_.size());
auto *fetch_var = program_.scope->Var("fetch");
auto *fetch_list =
fetch_var->template GetMutable<framework::LoDTensorArray>();
fetch_list->resize(fetch_indices_.size());
}
template <typename T>
......@@ -181,20 +203,20 @@ void Executor<Device, T>::InitMemory() {
for (const auto &block : program_desc_->Blocks()) {
for (const auto &var_desc : block->Vars()) {
auto var = program_.scope->Var(var_desc->Name());
auto tensor = var->template GetMutable<LoDTensor>();
if (var_desc->Persistable()) {
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
var->template GetMutable<framework::LoDTensorArray>();
continue;
}
char *origin_data =
ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
char *data = origin_data;
auto tensor = var->template GetMutable<LoDTensor>();
LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
delete[] origin_data;
} else {
if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
varInputMemory(var_desc, var, tensor);
}
DLOG << "init no persistable var: " << var_desc->Name();
varInputMemory(var_desc, var);
}
}
}
......@@ -216,23 +238,18 @@ void Executor<Device, T>::InitCombineMemory() {
for (const auto &block : program_desc_->Blocks()) {
for (const auto &var_desc : block->Vars()) {
auto var = program_.scope->Var(var_desc->Name());
auto tensor = var->template GetMutable<LoDTensor>();
if (var_desc->Persistable()) {
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
var->template GetMutable<framework::LoDTensorArray>();
continue;
}
DLOG << " init combine memory persistable: " << var_desc->Name();
auto tensor = var->template GetMutable<LoDTensor>();
LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
} else {
if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
DLOG << " init combine memory no persistable in lod: "
<< var_desc->Name();
varInputMemory(var_desc, var, tensor);
} else {
DLOG << " init combine memory no persistable: " << var_desc->Name();
}
DLOG << " init combine memory no persistable: " << var_desc->Name();
varInputMemory(var_desc, var);
}
}
}
......@@ -250,6 +267,7 @@ void Executor<Device, T>::InitNoPersistableMemory(const Tensor &input_tensor) {
auto tensor = var->template GetMutable<LoDTensor>();
if (var_desc->Persistable()) {
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
var->template GetMutable<framework::LoDTensorArray>();
continue;
}
} else {
......@@ -260,6 +278,9 @@ void Executor<Device, T>::InitNoPersistableMemory(const Tensor &input_tensor) {
input_tensor.dims()[3]});
tensor->Resize(new_dim);
tensor->template mutable_data<T>();
} else {
PADDLE_MOBILE_THROW_EXCEPTION("Unsupported var type `%d`",
var_desc->Type());
}
}
}
......@@ -272,34 +293,44 @@ void Executor<Device, T>::InitNoPersistableMemory(const Tensor &input_tensor) {
template <typename Device, typename T>
bool Executor<Device, T>::varInputMemory(
const std::shared_ptr<VarDesc> &var_desc, Variable *var,
LoDTensor *tensor) const {
const std::shared_ptr<VarDesc> &var_desc, Variable *var) const {
#ifdef PADDLE_MOBILE_FPGA
framework::LoDTensor *tensor = var->template GetMutable<LoDTensor>();
tensor->init(typeid(float));
return true;
#endif
auto type = var_desc->Tensor_desc().DataType();
switch (type) {
case VARTYPE_TYPE_FP32:
tensor->mutable_data<float>();
break;
case VARTYPE_TYPE_INT8:
tensor->mutable_data<int8_t>();
break;
case VARTYPE_TYPE_INT32:
tensor->mutable_data<int32_t>();
break;
case VARTYPE_TYPE_INT64:
tensor->mutable_data<int64_t>();
break;
default:
break;
auto TypeId = [](const VarType_Type &type) -> std::type_index {
switch (type) {
case VARTYPE_TYPE_BOOL:
return typeid(bool);
case VARTYPE_TYPE_FP32:
return typeid(float);
case VARTYPE_TYPE_INT8:
return typeid(int8_t);
case VARTYPE_TYPE_INT32:
return typeid(int);
case VARTYPE_TYPE_INT64:
return typeid(int64_t);
default:
PADDLE_MOBILE_THROW_EXCEPTION("got unhandled var type `%d`", type);
}
};
auto type = var_desc->Type();
if (type == VARTYPE_TYPE_LOD_TENSOR) {
auto data_type = var_desc->Tensor_desc().DataType();
framework::LoDTensor *tensor = var->template GetMutable<LoDTensor>();
tensor->mutable_data(TypeId(data_type));
} else if (type == VARTYPE_TYPE_STEP_SCOPES) {
std::vector<framework::Scope *> *step_scopes =
var->template GetMutable<std::vector<framework::Scope *>>();
} else if (type == VARTYPE_TYPE_STEP_LOD_TENSOR_ARRAY) {
framework::LoDTensorArray *tensor_array =
var->template GetMutable<framework::LoDTensorArray>();
} else {
PADDLE_MOBILE_THROW_EXCEPTION("got unhandled var type `%d`", type);
}
bool is_mute_match =
(type == VARTYPE_TYPE_FP32) || (type == VARTYPE_TYPE_INT8) ||
(type == VARTYPE_TYPE_INT32) || (type == VARTYPE_TYPE_INT64);
PADDLE_MOBILE_ENFORCE(is_mute_match, "got unhandled data type : %d", type);
return is_mute_match;
return true;
}
template <typename Device, typename T>
......@@ -323,11 +354,19 @@ PMStatus Executor<Device, T>::Predict(
template <typename Device, typename T>
std::vector<T> Executor<Device, T>::Predict(const std::vector<T> &input,
const std::vector<int64_t> &dims) {
PADDLE_MOBILE_ENFORCE(feed_indices_.size() != 0,
"We don't know which tensor should be assigned, since no "
"feed op was found in this model");
PADDLE_MOBILE_ENFORCE(fetch_indices_.size() != 0,
"We don't know which tensor should be fetched, since "
"no fetch op was found in this model");
std::string input_name = feed_indices_.begin()->first;
Tensor feed_tensor(input, make_ddim(dims));
SetInput(feed_tensor, "feed");
SetInput(feed_tensor, input_name);
std::vector<T> output;
if (this->Predict() == PMSuccess) {
const auto output_tensor = GetOutput("fetch");
std::string output_name = fetch_indices_.begin()->first;
const auto output_tensor = GetOutput(output_name);
output.resize(output_tensor->numel());
memcpy(output.data(), output_tensor->template data<T>(),
output.size() * sizeof(T));
......@@ -338,11 +377,13 @@ std::vector<T> Executor<Device, T>::Predict(const std::vector<T> &input,
template <typename Device, typename T>
void Executor<Device, T>::SetInput(const Tensor &input,
const std::string &var_name) {
auto *target_var = program_.scope->FindVar(var_name);
PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s is not exist",
var_name.c_str());
auto *target_tensor = target_var->template GetMutable<LoDTensor>();
int index = 0;
if (feed_indices_.find(var_name) != feed_indices_.end()) {
index = feed_indices_.find(var_name)->second;
}
auto *feed_var = program_.scope->Var("feed");
framework::LoDTensor &target =
feed_var->template GetMutable<framework::LoDTensorArray>()->at(index);
if (config_.load_when_predict) {
if (input_dim_last_ != input.dims()) {
......@@ -351,68 +392,92 @@ void Executor<Device, T>::SetInput(const Tensor &input,
}
}
target_tensor->Resize(input.dims());
target_tensor->ShareDataWith(input);
target.Resize(input.dims());
target.ShareDataWith(input);
}
template <typename Device, typename T>
void Executor<Device, T>::SetInput(const LoDTensor &input,
const std::string &var_name) {
auto *target_var = program_.scope->FindVar(var_name);
PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s is not exist",
var_name.c_str());
auto *target_tensor = target_var->template GetMutable<LoDTensor>();
int index = 0;
if (feed_indices_.find(var_name) != feed_indices_.end()) {
index = feed_indices_.find(var_name)->second;
}
auto *feed_var = program_.scope->Var("feed");
framework::LoDTensor &target =
feed_var->template GetMutable<framework::LoDTensorArray>()->at(index);
if (config_.load_when_predict) {
if (input_dim_last_ != input.dims()) {
InitNoPersistableMemory(*target_tensor);
InitNoPersistableMemory(input);
input_dim_last_ = input.dims();
}
}
target_tensor->Resize(input.dims());
target_tensor->ShareDataWith(input);
target_tensor->set_lod(input.lod());
target.Resize(input.dims());
target.ShareDataWith(input);
target.set_lod(input.lod());
}
template <typename Device, typename T>
std::shared_ptr<LoDTensor> Executor<Device, T>::GetOutput(
const std::string &var_name) {
const auto &iter = fetch_indices_.find(var_name);
if (var_name == "fetch" || iter != fetch_indices_.end()) {
int index = 0;
if (iter != fetch_indices_.end()) {
index = iter->second;
}
auto *fetch_var = program_.scope->Var("fetch");
framework::LoDTensor &target =
fetch_var->template GetMutable<framework::LoDTensorArray>()->at(index);
return std::make_shared<LoDTensor>(target);
} else {
auto *fetch_var = program_.scope->Var(var_name);
framework::LoDTensor *target =
fetch_var->template GetMutable<framework::LoDTensor>();
return std::make_shared<LoDTensor>(*target);
}
}
template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict() {
#ifdef PADDLE_MOBILE_PROFILE
std::vector<ProfInfo> profile(ops_list_.size());
std::vector<ProfInfo> profile(ops_of_block0_.size());
struct timespec ts;
int op_index = 0;
#endif
for (auto &block : ops_of_block_) {
for (auto &op_handler : block) {
for (auto &op_handler : ops_of_block0_) {
#ifdef PADDLE_MOBILE_PROFILE
clock_gettime(CLOCK_MONOTONIC, &ts);
profile[op_index].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
clock_gettime(CLOCK_MONOTONIC, &ts);
profile[op_index].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
if (lod_mode_) {
op_handler->InferShape();
}
op_handler->Run();
if (lod_mode_) {
op_handler->InferShape();
}
op_handler->Run();
#ifdef PADDLE_MOBILE_PROFILE
clock_gettime(CLOCK_MONOTONIC, &ts);
profile[op_index].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
++op_index;
clock_gettime(CLOCK_MONOTONIC, &ts);
profile[op_index].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
++op_index;
#endif
}
}
#ifdef PADDLE_MOBILE_PROFILE
std::unordered_map<std::string, uint64_t> _tp;
for (int i = 0; i < profile.size(); i++) {
const auto &pInfo = profile[i];
uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
if (ops_list_[i]->Type() == "conv2d" ||
ops_list_[i]->Type() == "depthwise_conv2d") {
auto inputs = ops_list_[i]->Inputs();
if (ops_of_block0_[i]->Type() == "conv2d" ||
ops_of_block0_[i]->Type() == "depthwise_conv2d") {
auto inputs = ops_of_block0_[i]->Inputs();
auto *filter =
GetVarValue<LoDTensor>("Filter", inputs, *(program_.scope));
int kernel_size = filter->dims()[2];
_tp[ops_list_[i]->Type() + "_" + std::to_string(kernel_size)] += timeCost;
_tp[ops_of_block0_[i]->Type() + "_" + std::to_string(kernel_size)] +=
timeCost;
} else {
_tp[ops_list_[i]->Type()] += timeCost;
_tp[ops_of_block0_[i]->Type()] += timeCost;
}
}
printf("====================[ profile ]======================\n");
......@@ -437,16 +502,6 @@ PMStatus Executor<Device, T>::Predict() {
return PMSuccess;
}
template <typename Device, typename T>
std::shared_ptr<LoDTensor> Executor<Device, T>::GetOutput(
const std::string &var_name) {
auto *target_var = program_.scope->FindVar(var_name);
PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s is not exist",
var_name.c_str());
auto *output_tensor = target_var->template GetMutable<LoDTensor>();
return std::make_shared<LoDTensor>(*output_tensor);
}
#ifdef PADDLE_MOBILE_FPGA
template <typename Device, typename T>
void Executor<Device, T>::InjectVariable(const Tensor &t,
......@@ -476,20 +531,6 @@ void Executor<Device, T>::FeedData(const std::vector<void *> &v) {
}
}
template <typename Device, typename T>
void Executor<Device, T>::FeedTensorData(const vector<framework::Tensor> &v) {
auto input_size = v.size();
int index = 0;
auto vars = program_.scope->VarContain("feed", &index);
PADDLE_MOBILE_ENFORCE(input_size == vars.size(),
"input data number not correct");
for (int i = 0; i < input_size; i++) {
auto var = program_.scope->Var("feed", i + index);
auto feed_tensor = var->template GetMutable<LoDTensor>();
feed_tensor->ShareDataWith(v[i]);
}
}
template <typename Device, typename T>
void Executor<Device, T>::GetResults(std::vector<void *> *v) {
auto output_size = v->size();
......@@ -524,11 +565,11 @@ framework::Tensor *Executor<Device, T>::GetTensorByName(
const std::string &name) {
auto var = program_.scope->Var(name);
return var->template GetMutable<LoDTensor>();
};
}
template <typename Device, typename T>
std::shared_ptr<Tensor> Executor<Device, T>::FetchResult(int id) {
auto &ops = ops_of_block_[0];
auto &ops = ops_of_block0_;
PADDLE_MOBILE_ENFORCE(id < (int)ops.size(), "Index out of range");
auto op = id < 0 ? ops[ops.size() - 1] : ops[id];
......@@ -542,7 +583,7 @@ std::shared_ptr<Tensor> Executor<Device, T>::FetchResult(int id) {
template <typename Device, typename T>
void Executor<Device, T>::Predict_From_To(int start, int end) {
auto &ops = ops_of_block_[0];
auto &ops = ops_of_block0_;
end = end < 0 ? static_cast<int>(ops.size()) : end;
PADDLE_MOBILE_ENFORCE(start >= 0 && start < end && end <= ops.size(),
"start or end parameter is wrong");
......
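To summarize the executor changes above: feed and fetch now go through the single "feed"/"fetch" LoDTensorArray variables, indexed by each feed/fetch op's "col" attribute, with feed_indices_/fetch_indices_ mapping variable names to slots. A minimal usage sketch, not part of the patch; it assumes an already constructed executor whose model feeds "image" and fetches "prob" (both names are illustrative):

    // Prepare an input tensor and run one prediction through the new path.
    paddle_mobile::framework::LoDTensor input;
    input.Resize(paddle_mobile::framework::make_ddim({1, 3, 224, 224}));
    input.mutable_data<float>();  // ... fill with image data ...
    executor.SetInput(input, "image");           // lands in feed_list[feed_indices_["image"]]
    if (executor.Predict() == paddle_mobile::PMSuccess) {
      auto output = executor.GetOutput("prob");  // read from fetch_list[fetch_indices_["prob"]]
    }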
......@@ -53,7 +53,6 @@ class Executor {
void InjectVariable(const Tensor &t, std::string var_name);
void FeedData(const Tensor &t);
void FeedData(const std::vector<void *> &v);
void FeedTensorData(const std::vector<framework::Tensor> &v);
void GetResults(std::vector<void *> *v);
void GetTensorResults(std::vector<framework::Tensor *> *v);
......@@ -68,8 +67,9 @@ class Executor {
protected:
Executor() = default;
bool varInputMemory(const std::shared_ptr<VarDesc> &var_desc, Variable *var,
LoDTensor *tensor) const;
bool varInputMemory(const std::shared_ptr<VarDesc> &var_desc,
Variable *var) const;
void InitFeedFetchList();
void InitMemory();
void InitCombineMemory();
void InitNoPersistableMemory(const Tensor &input_tensor);
......@@ -85,10 +85,9 @@ class Executor {
PaddleMobileConfigInternal config_;
Program<Device> program_;
std::shared_ptr<ProgramDesc> program_desc_;
typedef std::shared_ptr<OperatorBase<Device>> OperatorBasePtr;
std::vector<std::vector<OperatorBasePtr>> ops_of_block_;
// operators list
std::vector<OperatorBasePtr> ops_list_;
std::vector<std::shared_ptr<OperatorBase<Device>>> ops_of_block0_;
std::unordered_map<std::string, int> feed_indices_;
std::unordered_map<std::string, int> fetch_indices_;
// for super resolution
DDim input_dim_last_;
......
......@@ -13,13 +13,6 @@ void paddle_mobile__framework__proto__version__init(
PADDLE_MOBILE__FRAMEWORK__PROTO__VERSION__INIT;
*message = init_value;
}
size_t paddle_mobile__framework__proto__version__get_packed_size(
const PaddleMobile__Framework__Proto__Version *message) {
assert(message->base.descriptor ==
&paddle_mobile__framework__proto__version__descriptor);
return protobuf_c_message_get_packed_size(
(const ProtobufCMessage *)(message));
}
PaddleMobile__Framework__Proto__Version *
paddle_mobile__framework__proto__version__unpack(ProtobufCAllocator *allocator,
size_t len,
......@@ -54,13 +47,6 @@ void paddle_mobile__framework__proto__op_desc__init(
PADDLE_MOBILE__FRAMEWORK__PROTO__OP_DESC__INIT;
*message = init_value;
}
size_t paddle_mobile__framework__proto__op_desc__get_packed_size(
const PaddleMobile__Framework__Proto__OpDesc *message) {
assert(message->base.descriptor ==
&paddle_mobile__framework__proto__op_desc__descriptor);
return protobuf_c_message_get_packed_size(
(const ProtobufCMessage *)(message));
}
PaddleMobile__Framework__Proto__OpDesc *
paddle_mobile__framework__proto__op_desc__unpack(ProtobufCAllocator *allocator,
size_t len,
......@@ -95,13 +81,6 @@ void paddle_mobile__framework__proto__op_proto__init(
PADDLE_MOBILE__FRAMEWORK__PROTO__OP_PROTO__INIT;
*message = init_value;
}
size_t paddle_mobile__framework__proto__op_proto__get_packed_size(
const PaddleMobile__Framework__Proto__OpProto *message) {
assert(message->base.descriptor ==
&paddle_mobile__framework__proto__op_proto__descriptor);
return protobuf_c_message_get_packed_size(
(const ProtobufCMessage *)(message));
}
PaddleMobile__Framework__Proto__OpProto *
paddle_mobile__framework__proto__op_proto__unpack(ProtobufCAllocator *allocator,
size_t len,
......@@ -162,13 +141,6 @@ void paddle_mobile__framework__proto__var_type__init(
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__INIT;
*message = init_value;
}
size_t paddle_mobile__framework__proto__var_type__get_packed_size(
const PaddleMobile__Framework__Proto__VarType *message) {
assert(message->base.descriptor ==
&paddle_mobile__framework__proto__var_type__descriptor);
return protobuf_c_message_get_packed_size(
(const ProtobufCMessage *)(message));
}
PaddleMobile__Framework__Proto__VarType *
paddle_mobile__framework__proto__var_type__unpack(ProtobufCAllocator *allocator,
size_t len,
......@@ -191,13 +163,6 @@ void paddle_mobile__framework__proto__var_desc__init(
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_DESC__INIT;
*message = init_value;
}
size_t paddle_mobile__framework__proto__var_desc__get_packed_size(
const PaddleMobile__Framework__Proto__VarDesc *message) {
assert(message->base.descriptor ==
&paddle_mobile__framework__proto__var_desc__descriptor);
return protobuf_c_message_get_packed_size(
(const ProtobufCMessage *)(message));
}
PaddleMobile__Framework__Proto__VarDesc *
paddle_mobile__framework__proto__var_desc__unpack(ProtobufCAllocator *allocator,
size_t len,
......@@ -220,13 +185,6 @@ void paddle_mobile__framework__proto__block_desc__init(
PADDLE_MOBILE__FRAMEWORK__PROTO__BLOCK_DESC__INIT;
*message = init_value;
}
size_t paddle_mobile__framework__proto__block_desc__get_packed_size(
const PaddleMobile__Framework__Proto__BlockDesc *message) {
assert(message->base.descriptor ==
&paddle_mobile__framework__proto__block_desc__descriptor);
return protobuf_c_message_get_packed_size(
(const ProtobufCMessage *)(message));
}
PaddleMobile__Framework__Proto__BlockDesc *
paddle_mobile__framework__proto__block_desc__unpack(
ProtobufCAllocator *allocator, size_t len, const uint8_t *data) {
......@@ -248,13 +206,6 @@ void paddle_mobile__framework__proto__program_desc__init(
PADDLE_MOBILE__FRAMEWORK__PROTO__PROGRAM_DESC__INIT;
*message = init_value;
}
size_t paddle_mobile__framework__proto__program_desc__get_packed_size(
const PaddleMobile__Framework__Proto__ProgramDesc *message) {
assert(message->base.descriptor ==
&paddle_mobile__framework__proto__program_desc__descriptor);
return protobuf_c_message_get_packed_size(
(const ProtobufCMessage *)(message));
}
PaddleMobile__Framework__Proto__ProgramDesc *
paddle_mobile__framework__proto__program_desc__unpack(
ProtobufCAllocator *allocator, size_t len, const uint8_t *data) {
......@@ -310,7 +261,7 @@ const ProtobufCMessageDescriptor
NULL /* reserved[123] */
};
static const ProtobufCFieldDescriptor
paddle_mobile__framework__proto__op_desc__attr__field_descriptors[13] = {
paddle_mobile__framework__proto__op_desc__attr__field_descriptors[14] = {
{
"name", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING,
0, /* quantifier_offset */
......@@ -405,6 +356,13 @@ static const ProtobufCFieldDescriptor
NULL, NULL, 0, /* flags */
0, NULL, NULL /* reserved1,reserved2, etc */
},
{
"longs", 15, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_INT64,
offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, n_longs),
offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, longs), NULL,
NULL, 0, /* flags */
0, NULL, NULL /* reserved1,reserved2, etc */
},
};
static const unsigned
paddle_mobile__framework__proto__op_desc__attr__field_indices_by_name[] = {
......@@ -417,6 +375,7 @@ static const unsigned
2, /* field[2] = i */
5, /* field[5] = ints */
11, /* field[11] = l */
13, /* field[13] = longs */
0, /* field[0] = name */
4, /* field[4] = s */
7, /* field[7] = strings */
......@@ -424,7 +383,7 @@ static const unsigned
};
static const ProtobufCIntRange
paddle_mobile__framework__proto__op_desc__attr__number_ranges[2 + 1] = {
{1, 0}, {10, 8}, {0, 13}};
{1, 0}, {10, 8}, {0, 14}};
const ProtobufCMessageDescriptor
paddle_mobile__framework__proto__op_desc__attr__descriptor = {
PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
......@@ -433,7 +392,7 @@ const ProtobufCMessageDescriptor
"PaddleMobile__Framework__Proto__OpDesc__Attr",
"paddle_mobile.framework.proto",
sizeof(PaddleMobile__Framework__Proto__OpDesc__Attr),
13,
14,
paddle_mobile__framework__proto__op_desc__attr__field_descriptors,
paddle_mobile__framework__proto__op_desc__attr__field_indices_by_name,
2,
......@@ -1448,7 +1407,7 @@ const ProtobufCMessageDescriptor
NULL /* reserved[123] */
};
static const ProtobufCEnumValue
paddle_mobile__framework__proto__attr_type__enum_values_by_number[11] = {
paddle_mobile__framework__proto__attr_type__enum_values_by_number[12] = {
{"INT", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INT", 0},
{"FLOAT", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__FLOAT", 1},
{"STRING", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__STRING", 2},
......@@ -1460,15 +1419,16 @@ static const ProtobufCEnumValue
{"BLOCK", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCK", 8},
{"LONG", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__LONG", 9},
{"BLOCKS", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCKS", 10},
{"LONGS", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__LONGS", 11},
};
static const ProtobufCIntRange
paddle_mobile__framework__proto__attr_type__value_ranges[] = {{0, 0},
{0, 11}};
{0, 12}};
static const ProtobufCEnumValueIndex
paddle_mobile__framework__proto__attr_type__enum_values_by_name[11] = {
paddle_mobile__framework__proto__attr_type__enum_values_by_name[12] = {
{"BLOCK", 8}, {"BLOCKS", 10}, {"BOOLEAN", 6}, {"BOOLEANS", 7},
{"FLOAT", 1}, {"FLOATS", 4}, {"INT", 0}, {"INTS", 3},
{"LONG", 9}, {"STRING", 2}, {"STRINGS", 5},
{"LONG", 9}, {"LONGS", 11}, {"STRING", 2}, {"STRINGS", 5},
};
const ProtobufCEnumDescriptor
paddle_mobile__framework__proto__attr_type__descriptor = {
......@@ -1477,9 +1437,9 @@ const ProtobufCEnumDescriptor
"AttrType",
"PaddleMobile__Framework__Proto__AttrType",
"paddle_mobile.framework.proto",
11,
12,
paddle_mobile__framework__proto__attr_type__enum_values_by_number,
11,
12,
paddle_mobile__framework__proto__attr_type__enum_values_by_name,
1,
paddle_mobile__framework__proto__attr_type__value_ranges,
......
......@@ -102,8 +102,9 @@ typedef enum _PaddleMobile__Framework__Proto__AttrType {
PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEANS = 7,
PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCK = 8,
PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__LONG = 9,
PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCKS =
10 PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(
PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCKS = 10,
PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__LONGS =
11 PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(
PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE)
} PaddleMobile__Framework__Proto__AttrType;
......@@ -152,13 +153,15 @@ struct _PaddleMobile__Framework__Proto__OpDesc__Attr {
int64_t l;
size_t n_blocks_idx;
int32_t *blocks_idx;
size_t n_longs;
int64_t *longs;
};
#define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_DESC__ATTR__INIT \
{ \
PROTOBUF_C_MESSAGE_INIT( \
&paddle_mobile__framework__proto__op_desc__attr__descriptor) \
, NULL, PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INT, 0, 0, 0, 0, NULL, \
0, NULL, 0, NULL, 0, NULL, 0, 0, 0, NULL, 0, 0, 0, 0, 0, NULL \
0, NULL, 0, NULL, 0, NULL, 0, 0, 0, NULL, 0, 0, 0, 0, 0, NULL, 0, NULL \
}
struct _PaddleMobile__Framework__Proto__OpDesc__Var {
......@@ -417,8 +420,6 @@ struct _PaddleMobile__Framework__Proto__ProgramDesc {
/* PaddleMobile__Framework__Proto__Version methods */
void paddle_mobile__framework__proto__version__init(
PaddleMobile__Framework__Proto__Version *message);
size_t paddle_mobile__framework__proto__version__get_packed_size(
const PaddleMobile__Framework__Proto__Version *message);
PaddleMobile__Framework__Proto__Version *
paddle_mobile__framework__proto__version__unpack(ProtobufCAllocator *allocator,
size_t len,
......@@ -435,8 +436,6 @@ void paddle_mobile__framework__proto__op_desc__var__init(
/* PaddleMobile__Framework__Proto__OpDesc methods */
void paddle_mobile__framework__proto__op_desc__init(
PaddleMobile__Framework__Proto__OpDesc *message);
size_t paddle_mobile__framework__proto__op_desc__get_packed_size(
const PaddleMobile__Framework__Proto__OpDesc *message);
PaddleMobile__Framework__Proto__OpDesc *
paddle_mobile__framework__proto__op_desc__unpack(ProtobufCAllocator *allocator,
size_t len,
......@@ -453,8 +452,6 @@ void paddle_mobile__framework__proto__op_proto__attr__init(
/* PaddleMobile__Framework__Proto__OpProto methods */
void paddle_mobile__framework__proto__op_proto__init(
PaddleMobile__Framework__Proto__OpProto *message);
size_t paddle_mobile__framework__proto__op_proto__get_packed_size(
const PaddleMobile__Framework__Proto__OpProto *message);
PaddleMobile__Framework__Proto__OpProto *
paddle_mobile__framework__proto__op_proto__unpack(ProtobufCAllocator *allocator,
size_t len,
......@@ -483,8 +480,6 @@ void paddle_mobile__framework__proto__var_type__tuple__init(
/* PaddleMobile__Framework__Proto__VarType methods */
void paddle_mobile__framework__proto__var_type__init(
PaddleMobile__Framework__Proto__VarType *message);
size_t paddle_mobile__framework__proto__var_type__get_packed_size(
const PaddleMobile__Framework__Proto__VarType *message);
PaddleMobile__Framework__Proto__VarType *
paddle_mobile__framework__proto__var_type__unpack(ProtobufCAllocator *allocator,
size_t len,
......@@ -495,8 +490,6 @@ void paddle_mobile__framework__proto__var_type__free_unpacked(
/* PaddleMobile__Framework__Proto__VarDesc methods */
void paddle_mobile__framework__proto__var_desc__init(
PaddleMobile__Framework__Proto__VarDesc *message);
size_t paddle_mobile__framework__proto__var_desc__get_packed_size(
const PaddleMobile__Framework__Proto__VarDesc *message);
PaddleMobile__Framework__Proto__VarDesc *
paddle_mobile__framework__proto__var_desc__unpack(ProtobufCAllocator *allocator,
size_t len,
......@@ -507,8 +500,6 @@ void paddle_mobile__framework__proto__var_desc__free_unpacked(
/* PaddleMobile__Framework__Proto__BlockDesc methods */
void paddle_mobile__framework__proto__block_desc__init(
PaddleMobile__Framework__Proto__BlockDesc *message);
size_t paddle_mobile__framework__proto__block_desc__get_packed_size(
const PaddleMobile__Framework__Proto__BlockDesc *message);
PaddleMobile__Framework__Proto__BlockDesc *
paddle_mobile__framework__proto__block_desc__unpack(
ProtobufCAllocator *allocator, size_t len, const uint8_t *data);
......@@ -518,8 +509,6 @@ void paddle_mobile__framework__proto__block_desc__free_unpacked(
/* PaddleMobile__Framework__Proto__ProgramDesc methods */
void paddle_mobile__framework__proto__program_desc__init(
PaddleMobile__Framework__Proto__ProgramDesc *message);
size_t paddle_mobile__framework__proto__program_desc__get_packed_size(
const PaddleMobile__Framework__Proto__ProgramDesc *message);
PaddleMobile__Framework__Proto__ProgramDesc *
paddle_mobile__framework__proto__program_desc__unpack(
ProtobufCAllocator *allocator, size_t len, const uint8_t *data);
......
......@@ -35,6 +35,7 @@ enum AttrType {
BLOCK = 8;
LONG = 9;
BLOCKS = 10;
LONGS = 11;
}
// OpDesc describes an instance of a C++ framework::OperatorBase
......@@ -55,6 +56,7 @@ message OpDesc {
optional int32 block_idx = 12;
optional int64 l = 13;
repeated int32 blocks_idx = 14;
repeated int64 longs = 15;
};
message Var {
......
......@@ -125,10 +125,6 @@ LOAD_OP1(prior_box, CPU);
LOAD_OP2(fusion_conv_add_relu, CPU, FPGA);
LOAD_FUSION_MATCHER(fusion_conv_add_relu);
#endif
#ifdef FUSION_CONVADDADDPRELU_OP
LOAD_OP2(fusion_conv_add_add_prelu, CPU, FPGA);
LOAD_FUSION_MATCHER(fusion_conv_add_add_prelu);
#endif
#ifdef FUSION_CONVADD_OP
LOAD_OP2(fusion_conv_add, CPU, MALI_GPU);
LOAD_FUSION_MATCHER(fusion_conv_add);
......@@ -178,10 +174,6 @@ LOAD_FUSION_MATCHER(fusion_conv_add_bn);
#ifdef DROPOUT_OP
LOAD_OP2(dropout, CPU, FPGA);
#endif
#ifdef FUSION_CONVADDPRELU_OP
LOAD_OP2(fusion_conv_add_prelu, CPU, FPGA);
LOAD_FUSION_MATCHER(fusion_conv_add_prelu);
#endif
#ifdef FUSION_DWCONVBNRELU_OP
LOAD_OP1(fusion_dwconv_bn_relu, CPU);
LOAD_FUSION_MATCHER(fusion_dwconv_bn_relu);
......@@ -324,3 +316,15 @@ LOAD_OP1(psroi_pool, CPU);
#ifdef ROI_PERSPECTIVE_OP
LOAD_OP1(roi_perspective_transform, CPU);
#endif
#ifdef BEAM_SEARCH_OP
LOAD_OP1(beam_search, CPU);
#endif
#ifdef BEAM_SEARCH_DECODE_OP
LOAD_OP1(beam_search_decode, CPU);
#endif
#ifdef PAD2D_OP
LOAD_OP1(pad2d, CPU);
#endif
#ifdef ONE_HOT_OP
LOAD_OP1(one_hot, CPU);
#endif
......@@ -221,6 +221,8 @@ inline Print &operator<<(Print &printer, const LoDTensor &tensor) {
printer << static_cast<int>(tensor.data<int8_t>()[i]) << " ";
} else if (tensor.type() == typeid(int32_t)) {
printer << tensor.data<int32_t>()[i] << " ";
} else if (tensor.type() == typeid(bool)) {
printer << tensor.data<bool>()[i] << " ";
}
}
#endif // PADDLE_MOBILE_FPGA
......
......@@ -58,8 +58,7 @@ struct OpInfoFiller {
void operator()(const std::string& op_type, OpInfo<Dtype>* info) const {
info->creator_ = [](const std::string& type, const VariableNameMap& inputs,
const VariableNameMap& outputs,
const AttributeMap& attrs,
std::shared_ptr<Scope> scope) {
const AttributeMap& attrs, framework::Scope* scope) {
return new T(type, inputs, outputs, attrs, scope);
};
}
......@@ -91,7 +90,7 @@ class OpRegistry {
static std::shared_ptr<OperatorBase<Dtype>> CreateOp(
const std::string& type, const VariableNameMap& inputs,
const VariableNameMap& outputs, const AttributeMap attrs,
std::shared_ptr<paddle_mobile::framework::Scope> scope) {
paddle_mobile::framework::Scope* scope) {
auto& info = OpInfoMap<Dtype>::Instance()->Get(type);
auto op = info.Creator()(type, inputs, outputs, attrs, scope);
return std::shared_ptr<OperatorBase<Dtype>>(op);
......
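With the signature change above, operator creators receive a raw Scope * instead of a shared_ptr, matching how the executor now passes program_.scope.get(). A hedged creation sketch (the op type "relu" and the empty name maps are illustrative; in practice inputs/outputs must name variables that exist in the scope):

    // Hypothetical example of the new CreateOp call; the scope stays owned by the Program.
    paddle_mobile::VariableNameMap inputs, outputs;
    paddle_mobile::framework::AttributeMap attrs;
    auto op = paddle_mobile::framework::OpRegistry<paddle_mobile::CPU>::CreateOp(
        "relu", inputs, outputs, attrs, program.scope.get());
    op->InferShape();
    op->Init();
    op->Run();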
......@@ -43,7 +43,7 @@ OperatorBase<Dtype>::OperatorBase(const std::string &type,
const VariableNameMap &inputs,
const VariableNameMap &outputs,
const AttributeMap &attrs,
std::shared_ptr<Scope> scope)
framework::Scope *scope)
: type_(type),
inputs_(inputs),
outputs_(outputs),
......@@ -67,30 +67,22 @@ void OperatorBase<Dtype>::Run() {
for (const auto key : input_keys) {
auto var_vec_in = inputs_.at(key);
for (int i = 0; i < var_vec_in.size(); ++i) {
auto vari = this->scope_->FindVar(var_vec_in[i]);
if (vari->IsInitialized()) {
const Tensor *tensor = vari->template Get<framework::LoDTensor>();
if (tensor) {
DLOG << type_ << " input- " << key << "=" << *tensor;
#ifdef PADDLE_MOBILE_FPGA
DLOG << var_vec_in[i];
#endif
}
auto var = this->scope_->FindVar(var_vec_in[i]);
if (var->IsInitialized() &&
var->template IsType<framework::LoDTensor>()) {
const Tensor *tensor = var->template Get<framework::LoDTensor>();
if (tensor) DLOG << type_ << " input- " << key << "=" << *tensor;
}
}
}
for (const auto key : GetOutKeys()) {
auto var_vec_out = outputs_.at(key);
for (int i = 0; i < var_vec_out.size(); ++i) {
auto vari = scope_->FindVar(var_vec_out[i]);
if (vari->IsInitialized()) {
const Tensor *tensor = vari->template Get<framework::LoDTensor>();
if (tensor) {
DLOG << type_ << " output- " << key << "=" << *tensor;
#ifdef PADDLE_MOBILE_FPGA
DLOG << var_vec_out[i];
#endif
}
auto var = scope_->FindVar(var_vec_out[i]);
if (var->IsInitialized() &&
var->template IsType<framework::LoDTensor>()) {
const Tensor *tensor = var->template Get<framework::LoDTensor>();
if (tensor) DLOG << type_ << " output- " << key << "=" << *tensor;
}
}
}
......
......@@ -15,7 +15,6 @@ limitations under the License. */
#pragma once
#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>
......@@ -58,7 +57,7 @@ class OperatorBase {
public:
OperatorBase(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const AttributeMap &attrs,
std::shared_ptr<Scope> scope);
framework::Scope *scope);
virtual ~OperatorBase() {}
virtual void Init() = 0;
......@@ -81,11 +80,10 @@ class OperatorBase {
}
#ifdef PADDLE_MOBILE_FPGA
void InsertTensors();
void ChangeNameMap(string key, std::vector<string> value);
#endif
protected:
std::shared_ptr<Scope> scope_;
framework::Scope *scope_;
std::string type_;
VariableNameMap inputs_;
VariableNameMap outputs_;
......@@ -98,35 +96,15 @@ class OperatorBase {
template <typename Dtype, typename ParamType, typename KernelType>
class OperatorWithKernel : public OperatorBase<Dtype> {
public:
#ifndef PADDLE_MOBILE_FPGA1
OperatorWithKernel(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const AttributeMap &attrs,
std::shared_ptr<Scope> scope)
framework::Scope *scope)
: OperatorBase<Dtype>(type, inputs, outputs, attrs, scope),
param_(inputs, outputs, attrs, scope.get()) {
param_(inputs, outputs, attrs, scope) {
#ifdef PADDLE_MOBILE_CL
kernel_.InitCLHelper(scope->GetCLScpoe());
#endif
}
#else
OperatorWithKernel(const std::string &type, const VariableNameMap inputs,
const VariableNameMap &outputs, const AttributeMap &attrs,
std::shared_ptr<Scope> scope)
: OperatorBase<Dtype>(type, inputs, outputs, attrs, scope) {
static int feed_num = 0;
static int fetch_num = 0;
if (type == "feed") {
auto new_name = string("feed") + std::to_string(feed_num++);
auto var = scope->Var(new_name);
(const_cast<VariableNameMap &>(inputs)).at("X") = {string(new_name)};
} else if (type == "fetch") {
auto new_name = string("fetch") + std::to_string(fetch_num++);
auto var = scope->Var(new_name);
(const_cast<VariableNameMap &>(outputs)).at("Out") = {string(new_name)};
}
param_ = ParamType(inputs, outputs, attrs, *scope);
}
#endif
virtual void RunImpl() { this->kernel_.Compute(this->param_); }
virtual void InferShape() const = 0;
......@@ -198,21 +176,20 @@ class FusionOpMatcher {
std::shared_ptr<OpDesc> new_opdesc_;
};
#define DECLARE_OPERATOR(OpName, OpParam, OpKernel) \
template <typename DeviceType, typename T> \
class OpName##Op : public framework::OperatorWithKernel< \
DeviceType, OpParam<DeviceType>, \
operators::OpKernel<DeviceType, T>> { \
public: \
OpName##Op(const std::string &type, const VariableNameMap &inputs, \
const VariableNameMap &outputs, \
const framework::AttributeMap &attrs, \
std::shared_ptr<framework::Scope> scope) \
: framework::OperatorWithKernel<DeviceType, OpParam<DeviceType>, \
operators::OpKernel<DeviceType, T>>( \
type, inputs, outputs, attrs, scope) {} \
\
void InferShape() const override; \
#define DECLARE_OPERATOR(OpName, OpParam, OpKernel) \
template <typename DeviceType, typename T> \
class OpName##Op : public framework::OperatorWithKernel< \
DeviceType, OpParam<DeviceType>, \
operators::OpKernel<DeviceType, T>> { \
public: \
OpName##Op(const std::string &type, const VariableNameMap &inputs, \
const VariableNameMap &outputs, \
const framework::AttributeMap &attrs, framework::Scope *scope) \
: framework::OperatorWithKernel<DeviceType, OpParam<DeviceType>, \
operators::OpKernel<DeviceType, T>>( \
type, inputs, outputs, attrs, scope) {} \
\
void InferShape() const override; \
};
#define DECLARE_KERNEL(OpName, OpParam) \
......@@ -228,7 +205,7 @@ class FusionOpMatcher {
cls(const std::string &type, const ::paddle_mobile::VariableNameMap &inputs, \
const ::paddle_mobile::VariableNameMap &outputs, \
const ::paddle_mobile::framework::AttributeMap &attrs, \
std::shared_ptr<::paddle_mobile::framework::Scope> scope) \
::paddle_mobile::framework::Scope *scope) \
: parent_cls<Dtype, T>(type, inputs, outputs, attrs, scope) {}
} // namespace framework
......
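The updated DECLARE_OPERATOR macro is used exactly as before; only the generated constructor signature changes to accept a raw framework::Scope *. A hedged illustration (ReluOp/ReluParam/ReluKernel stand in for any registered operator):

    // This single line declares a ReluOp whose constructor now takes
    // framework::Scope * instead of std::shared_ptr<framework::Scope>.
    DECLARE_OPERATOR(Relu, ReluParam, ReluKernel);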
......@@ -42,9 +42,15 @@ OpDesc::OpDesc(PaddleMobile__Framework__Proto__OpDesc *desc) {
PaddleMobile__Framework__Proto__OpDesc__Attr *attr = desc->attrs[k];
std::string attr_name(attr->name);
attrs_[attr_name] = Attribute::GetAttrValue(attr);
proto_attrs_.push_back(*attr);
}
}
const std::vector<PaddleMobile__Framework__Proto__OpDesc__Attr>
&OpDesc::GetProtoAttr() const {
return proto_attrs_;
}
const std::vector<std::string> &OpDesc::Input(const std::string &name) const {
return inputs_.find(name)->second;
}
......@@ -58,6 +64,15 @@ Attribute OpDesc::GetAttr(const std::string &name) const {
return it->second;
}
void OpDesc::SetBlockAttr(const std::string &name, BlockDesc *block) {
this->attrs_[name].Set<BlockDesc *>(block);
}
void OpDesc::SetBlocksAttr(const std::string &name,
std::vector<BlockDesc *> blocks) {
this->attrs_[name].Set<std::vector<BlockDesc *>>(blocks);
}
std::unordered_map<std::string, Attribute> &OpDesc::GetAttrMap() {
return attrs_;
}
......
......@@ -29,11 +29,13 @@ class OpDesc {
friend class ProgramOptimize;
friend class FusionOpMatcher;
friend class Node;
explicit OpDesc(PaddleMobile__Framework__Proto__OpDesc *op_desc);
OpDesc(const OpDesc &op_desc) : type_(op_desc.type_) {
this->inputs_ = op_desc.inputs_;
this->outputs_ = op_desc.outputs_;
this->attrs_ = op_desc.attrs_;
this->proto_attrs_ = op_desc.proto_attrs_;
}
OpDesc() {}
......@@ -41,6 +43,12 @@ class OpDesc {
const std::vector<std::string> &Output(const std::string &name) const;
Attribute GetAttr(const std::string &name) const;
const std::vector<PaddleMobile__Framework__Proto__OpDesc__Attr>
&GetProtoAttr() const;
void SetBlockAttr(const std::string &name, BlockDesc *block);
void SetBlocksAttr(const std::string &name, std::vector<BlockDesc *> block);
VariableNameMap &GetInputs() { return inputs_; }
VariableNameMap &GetOutputs() { return outputs_; }
......@@ -60,6 +68,7 @@ class OpDesc {
VariableNameMap inputs_;
VariableNameMap outputs_;
AttributeMap attrs_;
std::vector<PaddleMobile__Framework__Proto__OpDesc__Attr> proto_attrs_;
};
Print &operator<<(Print &printer, const OpDesc &op_desc);
......
......@@ -15,8 +15,8 @@ limitations under the License. */
#include <string>
#include <vector>
#include "framework/program/program_desc.h"
#include "framework/program/tensor_desc.h"
#include "program_desc.h"
namespace paddle_mobile {
namespace framework {
......@@ -25,6 +25,25 @@ ProgramDesc::ProgramDesc(PaddleMobile__Framework__Proto__ProgramDesc *desc) {
for (int i = 0; i < desc->n_blocks; ++i) {
blocks_.emplace_back(std::make_shared<BlockDesc>(desc->blocks[i]));
}
for (auto &block : blocks_) {
for (auto op : block->Ops()) {
for (const auto &attr : op->GetProtoAttr()) {
if (attr.type == PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCK) {
size_t blk_idx = attr.block_idx;
op->SetBlockAttr(attr.name, this->MutableBlock(blk_idx));
} else if (attr.type ==
PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCKS) {
size_t n_blocks_idx = attr.n_blocks_idx;
int32_t *blks_idx = attr.blocks_idx;
std::vector<BlockDesc *> block_descs;
for (size_t i = 0; i < n_blocks_idx; ++i) {
block_descs.push_back(this->MutableBlock(blks_idx[i]));
}
op->SetBlocksAttr(attr.name, block_descs);
}
}
}
}
}
void ProgramDesc::Description(std::string header) {
......@@ -60,9 +79,8 @@ void ProgramDesc::Description(std::string header) {
}
for (const auto &var_desc : block->Vars()) {
LOG(kLOG_DEBUG1) << "var name: " << var_desc->Name();
if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
LOG(kLOG_DEBUG1) << "var name: " << var_desc->Name();
const TensorDesc &tensor_desc = var_desc->Tensor_desc();
LOG(kLOG_DEBUG2) << "in var tensor desc dims size: "
......
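After this wiring, ops that reference sub-blocks (while/conditional-style control flow) can read a resolved BlockDesc * directly from their attributes instead of re-deriving it from an integer index. A hedged reader sketch (the attribute names "sub_block" and "sub_blocks" are illustrative):

    // Hypothetical consumer of the attributes populated by SetBlockAttr / SetBlocksAttr above.
    paddle_mobile::framework::BlockDesc *sub_block =
        op_desc->GetAttr("sub_block").Get<paddle_mobile::framework::BlockDesc *>();
    std::vector<paddle_mobile::framework::BlockDesc *> branches =
        op_desc->GetAttr("sub_blocks").Get<std::vector<paddle_mobile::framework::BlockDesc *>>();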
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "common/types.h"
......@@ -31,6 +32,14 @@ class ProgramDesc {
std::shared_ptr<BlockDesc> Block(size_t idx);
BlockDesc *MutableBlock(size_t idx) {
if (idx == -1) {
return nullptr;
} else {
return blocks_[idx].get();
}
}
const std::vector<std::shared_ptr<BlockDesc>> &Blocks() { return blocks_; }
ProgramDesc(const ProgramDesc &program_desc) {
for (auto &block : program_desc.blocks_) {
......
......@@ -32,15 +32,7 @@ class Scope {
Scope() = default;
~Scope() {
for (auto &var : vars_) {
delete var.second;
}
vars_.clear();
for (auto kid : kids_) {
delete kid;
}
kids_.clear();
DropKids();
#ifdef PADDLE_MOBILE_CL
delete cl_scope_;
#endif
......
......@@ -209,8 +209,9 @@ class Tensor : public TensorBase {
}
inline void set_type(std::type_index type) { holder_->set_type(type); }
inline void *get_data() {
return (void *)(((PlaceholderImpl *)(holder_.get()))->ptr_.get());
} // NOLINT
return (
void *)(((PlaceholderImpl *)(holder_.get()))->ptr_.get()); // NOLINT
}
inline void *init(std::type_index type) {
if (holder_ != nullptr) {
......
......@@ -14,13 +14,26 @@ limitations under the License. */
#pragma once
#include <vector>
#include "framework/tensor.h"
#include "memory/t_malloc.h"
#include "tensor.h"
namespace paddle_mobile {
namespace framework {
void TensorCopy(const Tensor &src, Tensor *dst);
void TensorCopy(const Tensor& src, Tensor* dst);
template <typename T>
void TensorFromVector(const std::vector<T>& src, Tensor* dst);
template <typename T>
void TensorFromVector(const std::vector<T>& src, Tensor* dst) {
auto src_ptr = static_cast<const void*>(src.data());
dst->Resize({static_cast<int64_t>(src.size())});
auto dst_ptr = static_cast<void*>(dst->mutable_data<T>());
auto size = src.size() * sizeof(T);
memory::Copy(dst_ptr, src_ptr, size);
}
} // namespace framework
} // namespace paddle_mobile
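A short usage sketch of the new TensorFromVector helper, not part of the patch (the include path is assumed to be framework/tensor_util.h, i.e. the file shown above):

    #include <vector>
    #include "framework/tensor_util.h"  // assumed path of the header above

    void FillFromHost() {
      std::vector<float> host = {1.f, 2.f, 3.f, 4.f};
      paddle_mobile::framework::Tensor t;
      // Resizes t to {4} and copies the host buffer via memory::Copy.
      paddle_mobile::framework::TensorFromVector(host, &t);
    }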
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "io/api_paddle_mobile.h"
#include <string>
#include <vector>
#include "common/enforce.h"
#include "framework/tensor.h"
......@@ -145,7 +146,7 @@ void PaddleMobilePredictor<Device, T>::FeedPaddleTensors(
tensors[i].init(typeid(float));
ConvertPaddleTensors(inputs[i], &tensors[i]);
}
paddle_mobile_->FeedTensorData(tensors);
// paddle_mobile_->FeedTensorData(tensors);
}
template <typename Device, typename T>
......@@ -169,7 +170,7 @@ void PaddleMobilePredictor<Device, T>::GetPaddleTensor(const std::string &name,
PaddleTensor *output) {
framework::Tensor *t = paddle_mobile_->GetTensorByName(name);
ConvertTensors(*t, output);
};
}
template <typename Device, typename T>
void PaddleMobilePredictor<Device, T>::Predict_From_To(int start, int end) {
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "common/types.h"
#include "io/paddle_inference_api.h"
......
......@@ -39,8 +39,6 @@ using framework::Tensor;
using paddle_mobile::CPU;
using std::string;
const char *ANDROID_LOG_TAG =
"paddle_mobile LOG built on " __DATE__ " " __TIME__;
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
static std::mutex shared_mutex;
......
......@@ -152,14 +152,14 @@ PMStatus PaddleMobile<Device, T>::Predict() {
}
template <typename Device, typename T>
void PaddleMobile<Device, T>::Feed(const framework::Tensor &input,
const std::string &var_name) {
void PaddleMobile<Device, T>::Feed(const std::string &var_name,
const framework::Tensor &input) {
executor_->SetInput(input, var_name);
}
template <typename Device, typename T>
void PaddleMobile<Device, T>::Feed(const framework::LoDTensor &input,
const std::string &var_name) {
void PaddleMobile<Device, T>::Feed(const std::string &var_name,
const framework::LoDTensor &input) {
executor_->SetInput(input, var_name);
}
......@@ -227,16 +227,11 @@ template <typename Device, typename T>
void PaddleMobile<Device, T>::FeedData(const framework::Tensor &t) {
executor_->FeedData(t);
}
template <typename Device, typename T>
void PaddleMobile<Device, T>::FeedData(const std::vector<void *> &v) {
executor_->FeedData(v);
};
template <typename Device, typename T>
void PaddleMobile<Device, T>::FeedTensorData(
const std::vector<framework::Tensor> &v) {
executor_->FeedTensorData(v);
};
}
template <typename Device, typename T>
void PaddleMobile<Device, T>::GetResults(std::vector<void *> *v) {
......@@ -253,7 +248,7 @@ template <typename Device, typename T>
framework::Tensor *PaddleMobile<Device, T>::GetTensorByName(
const std::string &name) {
return executor_->GetTensorByName(name);
};
}
template <typename Device, typename T>
std::shared_ptr<framework::Tensor> PaddleMobile<Device, T>::FetchResult(
......
......@@ -33,7 +33,7 @@ namespace paddle_mobile {
template <typename Device, typename T = float>
class PaddleMobile {
public:
PaddleMobile(PaddleMobileConfigInternal config) : config_(config) {
explicit PaddleMobile(PaddleMobileConfigInternal config) : config_(config) {
#ifndef PADDLE_MOBILE_CL
bool is_gpu = std::is_same<DeviceType<kGPU_CL>, Device>::value;
PADDLE_MOBILE_ENFORCE(!is_gpu, "Please recompile with GPU_CL is on");
......@@ -69,8 +69,8 @@ class PaddleMobile {
const std::vector<int64_t> &dims);
PMStatus Predict();
void Feed(const framework::LoDTensor &input, const std::string &var_name);
void Feed(const framework::Tensor &input, const std::string &var_name);
void Feed(const std::string &var_name, const framework::LoDTensor &input);
void Feed(const std::string &var_name, const framework::Tensor &input);
typedef std::shared_ptr<framework::LoDTensor> LoDTensorPtr;
LoDTensorPtr Fetch(const std::string &var_name);
......@@ -91,7 +91,6 @@ class PaddleMobile {
void InjectVariable(const framework::Tensor &t, std::string var_name);
void FeedData(const framework::Tensor &t);
void FeedData(const std::vector<void *> &v);
void FeedTensorData(const std::vector<framework::Tensor> &v);
void GetResults(std::vector<void *> *v);
void GetTensorResults(std::vector<framework::Tensor *> *v);
......
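Since the Feed overloads now take the variable name first, call sites read like the following hedged sketch (model loading is elided; "data" and "prediction" are illustrative variable names):

    paddle_mobile::PaddleMobileConfigInternal config;
    paddle_mobile::PaddleMobile<paddle_mobile::CPU> engine(config);
    // engine.Load(...) elided.
    paddle_mobile::framework::LoDTensor input;
    engine.Feed("data", input);                    // previously engine.Feed(input, "data")
    engine.Predict();
    auto prediction = engine.Fetch("prediction");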
......@@ -17,11 +17,12 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
#define DEFINE_ACTIVATION_INFERSHAPE(OpName) \
template <typename Dtype, typename T> \
void OpName##Op<Dtype, T>::InferShape() const { \
const auto &input_dims = this->param_.InputX()->dims(); \
this->param_.Out()->Resize(input_dims); \
#define DEFINE_ACTIVATION_INFERSHAPE(OpName) \
template <typename Dtype, typename T> \
void OpName##Op<Dtype, T>::InferShape() const { \
const auto &input_dims = this->param_.InputX()->dims(); \
this->param_.Out()->Resize(input_dims); \
this->param_.Out()->set_lod(this->param_.InputX()->lod()); \
}
#ifdef RELU_OP
......
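For a concrete picture, DEFINE_ACTIVATION_INFERSHAPE(Relu) now expands to roughly the following, so activation outputs inherit the input LoD (ReluOp is used here only as the #ifdef RELU_OP case above suggests):

    template <typename Dtype, typename T>
    void ReluOp<Dtype, T>::InferShape() const {
      const auto &input_dims = this->param_.InputX()->dims();
      this->param_.Out()->Resize(input_dims);
      this->param_.Out()->set_lod(this->param_.InputX()->lod());
    }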
......@@ -32,8 +32,7 @@ class BatchNormOp
public:
BatchNormOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
const framework::AttributeMap &attrs, framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, BatchNormParam<DeviceType>,
BatchNormKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -11,27 +11,26 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADD_OP
#include "operators/kernel/conv_add_kernel.h"
#include "../central-arm-func/conv_add_arm_func.h"
#ifdef BEAM_SEARCH_DECODE_OP
namespace paddle_mobile {
namespace operators {
#pragma once
template <>
bool ConvAddKernel<CPU, float>::Init(FusionConvAddParam<CPU> *param) {
return true;
}
#include "operators/beam_search_decode_op.h"
template <>
void ConvAddKernel<CPU, float>::Compute(const FusionConvAddParam<CPU> &param) {
ConvAddCompute<float>(param);
}
namespace paddle_mobile {
namespace operators {
template class ConvAddKernel<CPU, float>;
template <typename Dtype, typename T>
void BeamSearchDecodeOp<Dtype, T>::InferShape() const {}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(beam_search_decode, ops::BeamSearchDecodeOp);
#endif
#endif // BEAM_SEARCH_DECODE_OP
......@@ -12,27 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDPRELU_OP
#ifdef BEAM_SEARCH_DECODE_OP
#include "operators/kernel/conv_add_prelu_kernel.h"
#include "operators/kernel/central-arm-func/conv_add_prelu_arm_func.h"
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/beam_search_decode_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool ConvAddPReluKernel<CPU, float>::Init(FusionConvAddPReluParam<CPU> *param) {
return true;
}
template <>
void ConvAddPReluKernel<CPU, float>::Compute(
const FusionConvAddPReluParam<CPU> &param) {
ConvAddPReluCompute<float>(param);
}
template class ConvAddPReluKernel<CPU, float>;
DECLARE_OPERATOR(BeamSearchDecode, BeamSearchDecodeParam,
BeamSearchDecodeKernel);
} // namespace operators
} // namespace paddle_mobile
#endif
#endif // BEAM_SEARCH_DECODE_OP
......@@ -12,27 +12,25 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDRELU_OP
#ifdef BEAM_SEARCH_OP
#include "operators/kernel/conv_add_relu_kernel.h"
#include "operators/kernel/central-arm-func/conv_add_relu_arm_func.h"
#pragma once
#include "operators/beam_search_op.h"
namespace paddle_mobile {
namespace operators {
template <>
bool ConvAddReluKernel<CPU, float>::Init(FusionConvAddReluParam<CPU> *param) {
return true;
}
template <>
void ConvAddReluKernel<CPU, float>::Compute(
const FusionConvAddReluParam<CPU> &param) {
ConvAddReluCompute<float, float>(param);
}
template class ConvAddReluKernel<CPU, float>;
template <typename Dtype, typename T>
void BeamSearchOp<Dtype, T>::InferShape() const {}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(beam_search, ops::BeamSearchOp);
#endif
#endif // BEAM_SEARCH_OP
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef BEAM_SEARCH_OP
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/beam_search_kernel.h"
namespace paddle_mobile {
namespace operators {
DECLARE_OPERATOR(BeamSearch, BeamSearchParam, BeamSearchKernel);
} // namespace operators
} // namespace paddle_mobile
#endif // BEAM_SEARCH_OP
......@@ -34,8 +34,7 @@ class BilinearOp : public framework::OperatorWithKernel<
public:
BilinearOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
const framework::AttributeMap &attrs, framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, BilinearInterpParam<DeviceType>,
operators::BilinearInterpKernel<DeviceType, T>>(
......
......@@ -34,8 +34,7 @@ class BoxCoderOp : public framework::OperatorWithKernel<
public:
BoxCoderOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
const framework::AttributeMap &attrs, framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, BoxCoderParam<DeviceType>,
operators::BoxCoderKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -31,7 +31,7 @@ class CastOp : public framework::OperatorWithKernel<
public:
CastOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, CastParam<DeviceType>,
operators::CastKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -30,7 +30,7 @@ class ConcatOp : public framework::OperatorWithKernel<
public:
ConcatOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, ConcatParam<DeviceType>,
operators::ConcatKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -18,7 +18,7 @@ limitations under the License. */
#include <vector>
#include "framework/op_proto_maker.h"
#include "framework/op_registry.h"
#include "operators/math/conv_func.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
namespace paddle_mobile {
namespace operators {
......@@ -39,9 +39,9 @@ void ConvOp<Dtype, T>::InferShape() const {
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
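  // ConvOutputSize is assumed here to follow the standard convolution
  // arithmetic:
  //   out = (in + 2 * padding - (dilation * (filter - 1) + 1)) / stride + 1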
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
dilations[i], paddings[i],
strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
......
......@@ -30,7 +30,7 @@ class ConvOp : public framework::OperatorWithKernel<
public:
ConvOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, ConvParam<DeviceType>,
operators::ConvKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -31,8 +31,7 @@ class ConvOpTranspose : public framework::OperatorWithKernel<
public:
ConvOpTranspose(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
const framework::AttributeMap &attrs, framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, ConvTransposeParam<DeviceType>,
operators::ConvTransposeKernel<DeviceType, T>>(
......
......@@ -33,7 +33,7 @@ class CrfOp : public framework::OperatorWithKernel<
public:
CrfOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, CrfParam<DeviceType>,
operators::CrfKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -19,7 +19,7 @@ limitations under the License. */
#include "framework/op_proto_maker.h"
#include "framework/op_registry.h"
#include "operators/conv_op.h"
#include "operators/math/conv_func.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
namespace paddle_mobile {
namespace operators {
......@@ -40,9 +40,9 @@ void DepthwiseConvOp<Dtype, T>::InferShape() const {
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
dilations[i], paddings[i],
strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
......
......@@ -30,8 +30,7 @@ class DepthwiseConvOp : public framework::OperatorWithKernel<
public:
DepthwiseConvOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
const framework::AttributeMap &attrs, framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, ConvParam<DeviceType>,
operators::ConvKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -32,8 +32,7 @@ class DequantizeOp
public:
DequantizeOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
const framework::AttributeMap &attrs, framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, DequantizeParam<DeviceType>,
DequantizeKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -34,7 +34,7 @@ class DropoutOp : public framework::OperatorWithKernel<
public:
DropoutOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, DropoutParam<DeviceType>,
operators::DropoutKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -23,6 +23,7 @@ template <typename Dtype, typename T>
void ElementwiseAddOp<Dtype, T>::InferShape() const {
auto x_dim = this->param_.InputX()->dims();
this->param_.Out()->Resize(x_dim);
this->param_.Out()->set_lod(this->param_.InputX()->lod());
}
} // namespace operators
......
......@@ -32,7 +32,7 @@ class ElementwiseAddOp : public framework::OperatorWithKernel<
ElementwiseAddOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, ElementwiseAddParam<DeviceType>,
operators::ElementwiseAddKernel<DeviceType, T>>(
......
......@@ -32,7 +32,7 @@ class ElementwiseMulOp : public framework::OperatorWithKernel<
ElementwiseMulOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, ElementwiseMulParam<DeviceType>,
operators::ElementwiseMulKernel<DeviceType, T>>(
......
......@@ -32,7 +32,7 @@ class ElementwiseSubOp : public framework::OperatorWithKernel<
ElementwiseSubOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, ElementwiseSubParam<DeviceType>,
operators::ElementwiseSubKernel<DeviceType, T>>(
......
......@@ -21,7 +21,8 @@ template <typename DeviceType, typename T>
void FeedOp<DeviceType, T>::InferShape() const {
auto out_dims = this->param_.Out()->dims();
out_dims[0] = this->param_.BatchSize();
auto input_dims = this->param_.InputX()->dims();
int col = this->param_.Col();
auto input_dims = this->param_.InputX()->at(col).dims();
if (input_dims.size() == 4) {
this->param_.Out()->Resize(input_dims);
} else {
......
......@@ -31,7 +31,7 @@ class FeedOp
public:
FeedOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, FeedParam<DeviceType>,
FeedKernel<DeviceType, T>>(
......
......@@ -18,8 +18,9 @@ namespace operators {
template <typename DeviceType, typename T>
void FetchOp<DeviceType, T>::InferShape() const {
int col = this->param_.Col();
auto x_dims = this->param_.InputX()->dims();
this->param_.Out()->Resize(x_dims);
this->param_.Out()->at(col).Resize(x_dims);
}
} // namespace operators
......
......@@ -30,7 +30,7 @@ class FetchOp
public:
FetchOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, FetchParam<DeviceType>,
FetchKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -31,11 +31,10 @@ class FillConstantOp : public framework::OperatorBase<DeviceType> {
public:
FillConstantOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap attrs,
std::shared_ptr<framework::Scope> scope)
const framework::AttributeMap attrs, framework::Scope *scope)
: framework::OperatorBase<DeviceType>(type, inputs, outputs, attrs,
scope),
param_(inputs, outputs, attrs, scope.get()) {}
param_(inputs, outputs, attrs, scope) {}
void RunImpl() {
auto data_type =
static_cast<_PaddleMobile__Framework__Proto__VarType__Type>(
......
......@@ -49,8 +49,7 @@ class FlattenOp : public framework::OperatorWithKernel<
public:
FlattenOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
const framework::AttributeMap &attrs, framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, FlattenParam<DeviceType>,
operators::FlattenKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDADDPRELU_OP
#include "operators/fusion_conv_add_add_prelu_op.h"
#include "operators/math/conv_func.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionConvAddAddPReluOp<Dtype, T>::InferShape() const {
auto in_dims = this->param_.Input()->dims();
auto filter_dims = this->param_.Filter()->dims();
const std::vector<int> &strides = this->param_.Strides();
std::vector<int> paddings = this->param_.Paddings();
int groups = this->param_.Groups();
std::vector<int> dilations = this->param_.Dilations();
PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
dilations.size() == paddings.size() &&
paddings.size() == strides.size()),
"ConvParam is not suitable");
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
this->param_.Output()->Resize(ddim);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_conv_add_add_prelu,
ops::FusionConvAddAddPReluOpMatcher);
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_add_add_prelu, ops::FusionConvAddAddPReluOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_conv_add_add_prelu, ops::FusionConvAddAddPReluOp);
#endif
#endif // FUSION_CONVADDADDPRELU_OP
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDADDPRELU_OP
#pragma once
#include <string>
#include <utility>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/conv_add_add_prelu_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
class FusionConvAddAddPReluOpMatcher : public framework::FusionOpMatcher {
public:
FusionConvAddAddPReluOpMatcher() {
node_ = framework::Node(G_OP_TYPE_CONV);
node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
std::make_shared<framework::Node>(G_OP_TYPE_PRELU);
}
void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_ELEMENTWISE_ADD,
{{"Y", "Y"}, {"Out", "addOut"}, {"X", "addX"}}},
{G_OP_TYPE_PRELU, {{"Alpha", "Alpha"}}}},
removed_nodes);
}
std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU; }
std::vector<std::pair<int, std::string>> NeedCheck() {
DLOG << " conv add add prelu check add X ";
return {{2, "Y"}, {2, "X"}};
}
};
template <typename DeviceType, typename T>
class FusionConvAddAddPReluOp
: public framework::OperatorWithKernel<
DeviceType, FusionConvAddAddPReluParam<DeviceType>,
operators::ConvAddAddPReluKernel<DeviceType, T>> {
public:
FusionConvAddAddPReluOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionConvAddAddPReluParam<DeviceType>,
operators::ConvAddAddPReluKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
void InferShape() const override;
protected:
};
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -15,7 +15,7 @@ limitations under the License. */
#ifdef FUSION_CONVADDBN_OP
#include "operators/fusion_conv_add_bn_op.h"
#include "operators/math/conv_func.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
namespace paddle_mobile {
namespace operators {
......@@ -36,9 +36,9 @@ void FusionConvAddBNOp<Dtype, T>::InferShape() const {
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
dilations[i], paddings[i],
strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
......
......@@ -20,8 +20,8 @@ limitations under the License. */
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "op_param.h"
#include "operators/kernel/conv_add_bn_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
......@@ -59,7 +59,7 @@ class FusionConvAddBNOp : public framework::OperatorWithKernel<
FusionConvAddBNOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, FusionConvAddBNParam<DeviceType>,
operators::ConvAddBNKernel<DeviceType, T>>(type, inputs, outputs,
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#ifdef FUSION_CONVADDBNRELU_OP
#include "operators/fusion_conv_add_bn_relu_op.h"
#include "operators/math/conv_func.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
namespace paddle_mobile {
namespace operators {
......@@ -36,9 +36,9 @@ void FusionConvAddBNReluOp<Dtype, T>::InferShape() const {
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
dilations[i], paddings[i],
strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
......
......@@ -61,7 +61,7 @@ class FusionConvAddBNReluOp
FusionConvAddBNReluOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, FusionConvAddBNReluParam<DeviceType>,
operators::ConvAddBNReluKernel<DeviceType, T>>(
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#ifdef FUSION_CONVADD_OP
#include "operators/fusion_conv_add_op.h"
#include "operators/math/conv_func.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
namespace paddle_mobile {
namespace operators {
......@@ -36,9 +36,9 @@ void FusionConvAddOp<Dtype, T>::InferShape() const {
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
dilations[i], paddings[i],
strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
......
......@@ -50,8 +50,7 @@ class FusionConvAddOp : public framework::OperatorWithKernel<
public:
FusionConvAddOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
const framework::AttributeMap &attrs, framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType,
FusionConvAddParam<DeviceType>,
operators::ConvAddKernel<DeviceType, T>>(
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDPRELU_OP
#include "operators/fusion_conv_add_prelu_op.h"
#include "operators/math/conv_func.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionConvAddPReluOp<Dtype, T>::InferShape() const {
auto in_dims = this->param_.Input()->dims();
auto filter_dims = this->param_.Filter()->dims();
const std::vector<int> &strides = this->param_.Strides();
std::vector<int> paddings = this->param_.Paddings();
int groups = this->param_.Groups();
std::vector<int> dilations = this->param_.Dilations();
PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
dilations.size() == paddings.size() &&
paddings.size() == strides.size()),
"ConvParam is not suitable");
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
this->param_.Output()->Resize(ddim);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_conv_add_prelu,
ops::FusionConvAddPReluOpMatcher);
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_add_prelu, ops::FusionConvAddPReluOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_conv_add_prelu, ops::FusionConvAddPReluOp);
#endif
#endif
......@@ -15,7 +15,7 @@ limitations under the License. */
#ifdef FUSION_CONVADDRELU_OP
#include "operators/fusion_conv_add_relu_op.h"
#include "operators/math/conv_func.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
namespace paddle_mobile {
namespace operators {
......@@ -36,9 +36,9 @@ void FusionConvAddReluOp<Dtype, T>::InferShape() const {
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
dilations[i], paddings[i],
strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
this->param_.Output()->Resize(ddim);
......
......@@ -51,7 +51,7 @@ class FusionConvAddReluOp : public framework::OperatorWithKernel<
FusionConvAddReluOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, FusionConvAddReluParam<DeviceType>,
operators::ConvAddReluKernel<DeviceType, T>>(type, inputs, outputs,
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#ifdef FUSION_CONVBNADDRELU_OP
#include "operators/fusion_conv_bn_add_relu_op.h"
#include "operators/math/conv_func.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
namespace paddle_mobile {
namespace operators {
......@@ -36,9 +36,9 @@ void FusionConvBNAddReluOp<Dtype, T>::InferShape() const {
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
dilations[i], paddings[i],
strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
......
......@@ -67,7 +67,7 @@ class FusionConvBNAddReluOp
FusionConvBNAddReluOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, FusionConvBNAddReluParam<DeviceType>,
operators::ConvBNAddReluKernel<DeviceType, T>>(
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#ifdef FUSION_CONVBN_OP
#include "operators/fusion_conv_bn_op.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
namespace paddle_mobile {
namespace operators {
......@@ -35,9 +36,9 @@ void FusionConvBNOp<Dtype, T>::InferShape() const {
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
dilations[i], paddings[i],
strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
......
......@@ -56,8 +56,7 @@ class FusionConvBNOp : public framework::OperatorWithKernel<
public:
FusionConvBNOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
const framework::AttributeMap &attrs, framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, FusionConvBNParam<DeviceType>,
operators::ConvBNKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#ifdef FUSION_CONVBNRELU_OP
#include "operators/fusion_conv_bn_relu_op.h"
#include "operators/math/conv_func.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
namespace paddle_mobile {
namespace operators {
......@@ -36,9 +36,9 @@ void FusionConvBNReluOp<Dtype, T>::InferShape() const {
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
dilations[i], paddings[i],
strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
......
......@@ -58,7 +58,7 @@ class FusionConvBNReluOp : public framework::OperatorWithKernel<
FusionConvBNReluOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, FusionConvBNReluParam<DeviceType>,
operators::ConvBNReluKernel<DeviceType, T>>(type, inputs, outputs,
......
......@@ -57,7 +57,7 @@ class FusionDeconvAddBNOp : public framework::OperatorWithKernel<
FusionDeconvAddBNOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, FusionDeconvAddBNParam<DeviceType>,
operators::DeconvAddBNKernel<DeviceType, T>>(type, inputs, outputs,
......
......@@ -59,7 +59,7 @@ class FusionDeconvAddBNReluOp
FusionDeconvAddBNReluOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, FusionDeconvAddBNReluParam<DeviceType>,
operators::DeconvAddBNReluKernel<DeviceType, T>>(
......
......@@ -49,7 +49,7 @@ class FusionDeconvAddOp : public framework::OperatorWithKernel<
FusionDeconvAddOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, FusionDeconvAddParam<DeviceType>,
operators::DeconvAddKernel<DeviceType, T>>(type, inputs, outputs,
......
......@@ -51,7 +51,7 @@ class FusionDeconvAddReluOp
FusionDeconvAddReluOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, FusionDeconvAddReluParam<DeviceType>,
operators::DeconvAddReluKernel<DeviceType, T>>(
......
......@@ -56,7 +56,7 @@ class FusionDeconvBNReluOp
FusionDeconvBNReluOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, FusionDeconvBNReluParam<DeviceType>,
operators::DeconvBNReluKernel<DeviceType, T>>(type, inputs, outputs,
......
......@@ -48,7 +48,7 @@ class FusionDeconvReluOp : public framework::OperatorWithKernel<
FusionDeconvReluOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, FusionDeconvReluParam<DeviceType>,
operators::DeconvReluKernel<DeviceType, T>>(type, inputs, outputs,
......
......@@ -60,7 +60,7 @@ class FusionDequantAddBNOp
FusionDequantAddBNOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, FusionDequantAddBNParam<DeviceType>,
operators::FusionDequantAddBNKernel<DeviceType, T>>(
......
......@@ -62,7 +62,7 @@ class FusionDequantAddBNReluOp
const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, FusionDequantAddBNParam<DeviceType>,
operators::FusionDequantAddBNReluKernel<DeviceType, T>>(
......
......@@ -62,7 +62,7 @@ class FusionDequantAddBNReluQuantOp
const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, FusionDequantAddBNReluQuantParam<DeviceType>,
operators::FusionDequantAddBNReluQuantKernel<DeviceType, T>>(
......@@ -109,7 +109,7 @@ class FusionDequantAddBNQuantOp
const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, FusionDequantAddBNQuantParam<DeviceType>,
operators::FusionDequantAddBNQuantKernel<DeviceType, T>>(
......
......@@ -58,7 +58,7 @@ class FusionDequantBNOp : public framework::OperatorWithKernel<
FusionDequantBNOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, FusionDequantBNParam<DeviceType>,
operators::FusionDequantBNKernel<DeviceType, T>>(
......@@ -87,7 +87,7 @@ class FusionDequantBNReluOp
FusionDequantBNReluOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, FusionDequantBNParam<DeviceType>,
operators::FusionDequantBNReluKernel<DeviceType, T>>(
......
......@@ -59,7 +59,7 @@ class FusionDequantBNReluOp
FusionDequantBNReluOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, FusionDequantBNReluParam<DeviceType>,
operators::FusionDequantBNReluKernel<DeviceType, T>>(
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#ifdef FUSION_DWCONVBNRELU_OP
#include "operators/fusion_dwconv_bn_relu_op.h"
#include "operators/math/conv_func.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
namespace paddle_mobile {
namespace operators {
......@@ -36,9 +36,9 @@ void FusionDWConvBNReluOp<Dtype, T>::InferShape() const {
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
dilations[i], paddings[i],
strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
......
......@@ -59,7 +59,7 @@ class FusionDWConvBNReluOp
FusionDWConvBNReluOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, FusionDWConvBNReluParam<DeviceType>,
operators::DWConvBNReluKernel<DeviceType, T>>(type, inputs, outputs,
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/elementwise_add_relu_kernel.h"
......@@ -50,7 +51,7 @@ class FusionElementwiseAddReluOp
FusionElementwiseAddReluOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, ElementwiseAddReluParam<DeviceType>,
operators::ElementwiseAddReluKernel<DeviceType, T>>(
......
......@@ -50,8 +50,7 @@ class FusionFcOp : public framework::OperatorWithKernel<
public:
FusionFcOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
const framework::AttributeMap &attrs, framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, FusionFcParam<DeviceType>,
operators::FusionFcKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -49,8 +49,7 @@ class FusionFcReluOp : public framework::OperatorWithKernel<
public:
FusionFcReluOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
const framework::AttributeMap &attrs, framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, FusionFcReluParam<DeviceType>,
operators::FusionFcReluKernel<DeviceType, T>>(type, inputs, outputs,
......
......@@ -33,7 +33,7 @@ class GruOp : public framework::OperatorWithKernel<
public:
GruOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, GruParam<DeviceType>,
operators::GruKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/gru_unit_kernel.h"
#include "operators/op_param.h"
......@@ -30,10 +31,10 @@ class GruUnitOp : public framework::OperatorWithKernel<
public:
GruUnitOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const AttributeMap &attrs,
std::shared_ptr<Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, GruUnitParam<DeviceType>,
operators::GruUnitKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope){};
type, inputs, outputs, attrs, scope) {}
void InferShape() const override;
};
......
......@@ -31,8 +31,7 @@ class Im2SequenceOp : public framework::OperatorWithKernel<
public:
Im2SequenceOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
const framework::AttributeMap &attrs, framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, Im2SequenceParam<DeviceType>,
operators::Im2SequenceKernel<DeviceType, T>>(type, inputs, outputs,
......
......@@ -32,8 +32,7 @@ class IncrementOp
public:
IncrementOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
const framework::AttributeMap &attrs, framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, IncrementParam<DeviceType>,
IncrementKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -31,8 +31,7 @@ class IsEmptyOp
public:
IsEmptyOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
const framework::AttributeMap &attrs, framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, IsEmptyParam<DeviceType>,
IsEmptyKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef BEAM_SEARCH_DECODE_OP
#include "operators/kernel/beam_search_decode_kernel.h"
#include "framework/data_type.h"
namespace paddle_mobile {
namespace operators {
using LoDTensor = framework::LoDTensor;
using LoDTensorArray = framework::LoDTensorArray;
// All the LoDs have 2 levels.
// The first is the source level, the second is the sentence level.
// The source level describes how many prefixes (branches) each source
// sentence (beam) has; the sentence level describes how these candidates
// belong to the prefixes.
const size_t kSourceLevel = 0;
const size_t kSentenceLevel = 1;
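// Illustrative example (hypothetical numbers): with 2 source sentences, where
// the first keeps 3 candidate prefixes of lengths {4, 4, 3} and the second
// keeps 2 of lengths {5, 5}, the two-level LoD built below would be
//   lod[kSourceLevel]   = {0, 3, 5}
//   lod[kSentenceLevel] = {0, 4, 8, 11, 16, 21}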
template <typename T>
struct Sentence {
std::vector<int64_t> word_ids;
std::vector<T> scores;
};
template <typename T>
using SentenceVector = std::vector<Sentence<T>>;
template <typename T>
struct BeamSearchDecoder {
BeamSearchDecoder(size_t beam_size, int end_id)
: beam_size_(beam_size), end_id_(end_id) {}
/**
* Convert the result sentence_vector for each source sentence into two
* LoDTensors: one holds the word ids of all candidate sentences, the other
* holds their scores.
* Param:
* sentence_vector_list: sentence_vector for each source sentence.
* id_tensor: result LoDTensor for sentences of id.
* score_tensor: result LoDTensor for sentences of score.
* reverse: whether the ids in sentence_vector_list are stored in reverse order.
* sort_by_score: whether to sort hypotheses of each sentence by scores.
*/
void ConvertSentenceVectorToLodTensor(
std::vector<SentenceVector<T>> sentence_vector_list, LoDTensor* id_tensor,
LoDTensor* score_tensor, bool reverse = true,
bool sort_by_score = true) const;
/**
* Gather the hypotheses for each source sentence by backtracking through the
* LoDTensorArray step_ids, whose LoDs record the path in the tree.
*/
void Backtrace(const LoDTensorArray& step_ids,
const LoDTensorArray& step_scores, LoDTensor* id_tensor,
LoDTensor* score_tensor) const;
size_t beam_size_;
int end_id_;
};
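// Hypothetical usage sketch (mirrors BeamSearchDecodeKernel::Compute below):
//   BeamSearchDecoder<float> decoder(/*beam_size=*/4, /*end_id=*/0);
//   decoder.Backtrace(step_ids, step_scores, &sentence_ids, &sentence_scores);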
template <typename T>
void BeamSearchDecoder<T>::ConvertSentenceVectorToLodTensor(
std::vector<SentenceVector<T>> sentence_vector_list, LoDTensor* id_tensor,
LoDTensor* score_tensor, bool reverse, bool sort_by_score) const {
size_t src_num = sentence_vector_list.size();
PADDLE_MOBILE_ENFORCE(src_num > 0, "src_num should be larger than 0");
std::vector<size_t> source_level_lod = {0};
std::vector<size_t> sentence_level_lod = {0};
std::vector<int64_t> id_data;
std::vector<T> score_data;
for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {
if (sort_by_score) {
sort(sentence_vector_list[src_idx].begin(),
sentence_vector_list[src_idx].end(),
[reverse](const Sentence<T>& a, const Sentence<T>& b) {
if (reverse)
return a.scores.front() > b.scores.front();
else
return a.scores.back() > b.scores.back();
});
}
for (Sentence<T>& sentence : sentence_vector_list[src_idx]) {
if (reverse) {
id_data.insert(id_data.end(), sentence.word_ids.rbegin(),
sentence.word_ids.rend());
score_data.insert(score_data.end(), sentence.scores.rbegin(),
sentence.scores.rend());
} else {
id_data.insert(id_data.end(), sentence.word_ids.begin(),
sentence.word_ids.end());
score_data.insert(score_data.end(), sentence.scores.begin(),
sentence.scores.end());
}
sentence_level_lod.push_back(sentence_level_lod.back() +
sentence.word_ids.size());
}
source_level_lod.push_back(source_level_lod.back() +
sentence_vector_list[src_idx].size());
}
framework::LoD lod;
lod.push_back(source_level_lod);
lod.push_back(sentence_level_lod);
id_tensor->set_lod(lod);
id_tensor->Resize({static_cast<int64_t>(id_data.size())});
id_tensor->mutable_data<int64_t>();
framework::TensorFromVector<int64_t>(id_data, id_tensor);
score_tensor->set_lod(lod);
score_tensor->Resize({static_cast<int64_t>(score_data.size())});
score_tensor->mutable_data<T>();
framework::TensorFromVector<T>(score_data, score_tensor);
}
template <typename T>
void BeamSearchDecoder<T>::Backtrace(const LoDTensorArray& step_ids,
const LoDTensorArray& step_scores,
LoDTensor* id_tensor,
LoDTensor* score_tensor) const {
PADDLE_MOBILE_ENFORCE(!step_ids.empty(), "step num should be larger than 0");
PADDLE_MOBILE_ENFORCE(step_ids.size() == step_scores.size(),
"step_ids and step_scores should be the same");
const size_t step_num = step_ids.size();
const size_t src_num = step_ids.at(0).lod().at(kSourceLevel).size() - 1;
std::vector<SentenceVector<T>> sentence_vector_list(
src_num, SentenceVector<T>(beam_size_));
std::vector<std::vector<size_t>> prefix_idx_vector_list(src_num);
for (int step_id = step_num - 1; step_id >= 0; --step_id) {
auto& cur_ids = step_ids.at(step_id);
auto& cur_scores = step_scores.at(step_id);
for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {
// for each source sentence
auto& sentence_vector = sentence_vector_list.at(src_idx);
auto& prefix_idx_vector = prefix_idx_vector_list.at(src_idx);
size_t src_prefix_start = cur_ids.lod().at(kSourceLevel)[src_idx];
size_t src_prefix_end = cur_ids.lod().at(kSourceLevel)[src_idx + 1];
if (prefix_idx_vector.empty()) {  // finished and pruned at this step,
// or this is the last time step
for (size_t prefix_idx = src_prefix_start; prefix_idx < src_prefix_end;
++prefix_idx) {
size_t candidate_start = cur_ids.lod().at(kSentenceLevel)[prefix_idx];
size_t candidate_end =
cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1];
for (size_t candidate_idx = candidate_start;
candidate_idx < candidate_end; ++candidate_idx) {
prefix_idx_vector.push_back(prefix_idx);
size_t idx = prefix_idx_vector.size() - 1;
auto cur_id = cur_ids.data<int64_t>()[candidate_idx];
auto cur_score = cur_scores.data<T>()[candidate_idx];
sentence_vector.at(idx).word_ids.push_back(cur_id);
sentence_vector.at(idx).scores.push_back(cur_score);
}
}
} else { // use prefix_idx_vector to backtrace
size_t src_candidate_start =
cur_ids.lod().at(kSentenceLevel)[src_prefix_start];
size_t prefix_idx = src_prefix_start;
size_t candidate_num =
cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] -
cur_ids.lod().at(kSentenceLevel)[prefix_idx];
for (size_t idx = 0; idx < prefix_idx_vector.size(); ++idx) {
auto candidate_idx = prefix_idx_vector.at(idx);
auto cur_id = cur_ids.data<int64_t>()[candidate_idx];
auto cur_score = cur_scores.data<T>()[candidate_idx];
if (cur_id != end_id_ || sentence_vector.at(idx).word_ids.empty()) {
// to skip redundant end tokens
sentence_vector.at(idx).word_ids.push_back(cur_id);
sentence_vector.at(idx).scores.push_back(cur_score);
}
while (src_candidate_start + candidate_num <=
candidate_idx) { // search the corresponding prefix
prefix_idx++;
candidate_num += cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] -
cur_ids.lod().at(kSentenceLevel)[prefix_idx];
}
prefix_idx_vector.at(idx) = prefix_idx;
}
}
}
}
ConvertSentenceVectorToLodTensor(sentence_vector_list, id_tensor,
score_tensor, true, true);
}
struct BeamSearchDecodeFunctor {
BeamSearchDecodeFunctor(const LoDTensorArray& step_ids,
const LoDTensorArray& step_scores,
LoDTensor* id_tensor, LoDTensor* score_tensor,
size_t beam_size, int end_id)
: beam_size_(beam_size),
end_id_(end_id),
step_ids_(step_ids),
step_scores_(step_scores),
id_tensor_(id_tensor),
score_tensor_(score_tensor) {}
template <typename T>
void apply() const;
size_t beam_size_;
int end_id_;
const LoDTensorArray& step_ids_;
const LoDTensorArray& step_scores_;
LoDTensor* id_tensor_;
LoDTensor* score_tensor_;
};
template <typename T>
void BeamSearchDecodeFunctor::apply() const {
BeamSearchDecoder<T> beam_search_decoder(beam_size_, end_id_);
beam_search_decoder.Backtrace(step_ids_, step_scores_, id_tensor_,
score_tensor_);
}
template <>
void BeamSearchDecodeFunctor::apply<bool>() const {
PADDLE_MOBILE_THROW_EXCEPTION("beam search decode op does not support bool.");
}
template <>
bool BeamSearchDecodeKernel<CPU, float>::Init(
BeamSearchDecodeParam<CPU>* param) {
return true;
}
template <>
void BeamSearchDecodeKernel<CPU, float>::Compute(
const BeamSearchDecodeParam<CPU>& param) {
const LoDTensorArray* ids = param.ids_;
const LoDTensorArray* scores = param.scores_;
const size_t step_num = ids->size();
PADDLE_MOBILE_ENFORCE(step_num > 0,
"beam search steps should be larger than 0");
for (size_t i = 0; i < step_num; ++i) {
PADDLE_MOBILE_ENFORCE(ids->at(i).lod().size() == 2,
"Level of LodTensor should be 2");
}
const size_t source_num = ids->at(0).lod().at(0).size() - 1;
PADDLE_MOBILE_ENFORCE(source_num > 0, "source num should be larger than 0");
LoDTensor* sentence_ids = param.sentence_ids_;
LoDTensor* sentence_scores = param.sentence_scores_;
framework::VisitDataType(
framework::ToDataType(scores->at(0).type()),
BeamSearchDecodeFunctor(*ids, *scores, sentence_ids, sentence_scores,
param.beam_size_, param.end_id_));
}
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef BEAM_SEARCH_OP
#include "operators/kernel/beam_search_kernel.h"
#include <numeric>
namespace paddle_mobile {
namespace operators {
template <typename Device, typename T>
class BeamSearchFunctor {
public:
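  // One beam search step: select the top `beam_size` candidates per source
  // sentence, prune beams whose branches have all emitted `end_id`, then write
  // the selected ids/scores with a two-level LoD plus the parent index of each
  // surviving candidate.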
void operator()(const framework::LoDTensor *pre_ids,
const framework::LoDTensor *pre_scores,
const framework::LoDTensor *ids,
const framework::LoDTensor *scores,
framework::LoDTensor *selected_ids,
framework::LoDTensor *selected_scores,
framework::Tensor *parent_idx, size_t level, size_t beam_size,
int end_id, bool is_accumulated) {
auto abs_lod = framework::ToAbsOffset(scores->lod());
auto &high_level = abs_lod[level];
auto items = SelectTopBeamSizeItems(pre_ids, pre_scores, ids, scores, level,
beam_size, end_id, is_accumulated);
auto selected_items = ToMap(items, high_level.back());
PruneEndBeams(pre_ids, abs_lod, &selected_items, level, end_id);
// calculate the output tensor's height
size_t num_instances = std::accumulate(
std::begin(selected_items), std::end(selected_items), 0,
[](size_t a, std::vector<Item> &b) { return a + b.size(); });
// the output tensor shape should be [num_instances, 1]
auto dims = framework::make_ddim(
std::vector<int64_t>({static_cast<int>(num_instances), 1}));
selected_ids->Resize(dims);
selected_scores->Resize(dims);
parent_idx->Resize({static_cast<int64_t>(num_instances)});
auto *selected_ids_data = selected_ids->mutable_data<int64_t>();
auto *selected_scores_data = selected_scores->mutable_data<float>();
auto *parent_idx_data = parent_idx->mutable_data<int>();
// fill in data
std::vector<size_t> low_level;
size_t low_offset = 0;
for (auto &items : selected_items) {
low_level.push_back(low_offset);
for (auto &item : items) {
parent_idx_data[low_offset] = static_cast<int>(low_level.size() - 1);
selected_ids_data[low_offset] = item.id;
selected_scores_data[low_offset] = item.score;
low_offset++;
}
}
low_level.push_back(low_offset);
// fill lod
framework::LoD lod(2);
lod[0].assign(high_level.begin(), high_level.end());
lod[1].assign(low_level.begin(), low_level.end());
selected_ids->set_lod(lod);
selected_scores->set_lod(lod);
}
/*
* The basic items help to sort.
*/
struct Item {
Item() {}
Item(size_t offset, size_t id, float score)
: offset(offset), id(id), score(score) {}
// offset in the higher lod level.
size_t offset;
// prefix id in the lower lod level.
// size_t prefix;
// the candidate id
size_t id;
// the corresponding score
float score;
inline bool operator<(const Item &in) const {
return (score < in.score) ||
((score == in.score) && (offset < in.offset));
}
inline void operator=(const Item &in) {
offset = in.offset;
id = in.id;
score = in.score;
}
};
protected:
/*
* Prune the source sentences whose branches have all finished; this step is
* optional. Pruning must happen one step later than finishing (hence pre_ids
* is needed here), since the end tokens must be written out first.
*/
void PruneEndBeams(const framework::LoDTensor *pre_ids,
const framework::LoD &abs_lod,
std::vector<std::vector<Item>> *items, size_t lod_level,
int end_id) {
auto *pre_ids_data = pre_ids->data<int64_t>();
auto &high_level = abs_lod[lod_level];
for (size_t src_idx = 0; src_idx < high_level.size() - 1; ++src_idx) {
size_t src_prefix_start = high_level[src_idx];
size_t src_prefix_end = high_level[src_idx + 1];
bool finish_flag = true;
for (size_t offset = src_prefix_start; offset < src_prefix_end;
offset++) {
for (auto &item : items->at(offset)) {
if (item.id != static_cast<size_t>(end_id) ||
pre_ids_data[offset] != end_id) {
finish_flag = false;
break;
}
}
if (!finish_flag) break;
}
if (finish_flag) {  // all branches of the beam (source sentence) have ended,
// so prune this beam
for (size_t offset = src_prefix_start; offset < src_prefix_end;
offset++)
items->at(offset).clear();
}
}
}
/*
* Transform the items into a map whose key is the offset and whose value is
* the list of items at that offset.
* NOTE: low performance.
*/
std::vector<std::vector<Item>> ToMap(
const std::vector<std::vector<Item>> &items, size_t element_num) {
std::vector<std::vector<Item>> result;
result.resize(element_num);
for (auto &entries : items) {
for (const auto &item : entries) {
result[item.offset].push_back(item);
}
}
return result;
}
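  // Insert `item` into `top_beam`, which is kept sorted in descending score
  // order; when the beam is already full, the smallest element is dropped.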
void Insert(std::vector<Item> *top_beam_ptr, const Item &item,
size_t beam_size) {
std::vector<Item> &top_beam = *top_beam_ptr;
size_t num_beams = top_beam.size();
if (num_beams < beam_size) {
top_beam.resize(num_beams + 1);
num_beams++;
} else {
if (item < top_beam[beam_size - 1]) {
return;
}
}
for (int k = static_cast<int>(num_beams) - 2; k >= 0; --k) {
if (top_beam[k] < item) {
top_beam[k + 1] = top_beam[k];
} else {
top_beam[k + 1] = item;
return;
}
}
top_beam[0] = item;
}
/*
* For each source, select top beam_size records.
*/
std::vector<std::vector<Item>> SelectTopBeamSizeItems(
const framework::LoDTensor *pre_ids,
const framework::LoDTensor *pre_scores, const framework::LoDTensor *ids,
const framework::LoDTensor *scores, size_t lod_level, size_t beam_size,
int end_id, bool is_accumulated) {
std::vector<std::vector<Item>> result;
// find the current candidates
auto abs_lod = framework::ToAbsOffset(scores->lod());
auto *pre_ids_data = pre_ids->data<int64_t>();
auto *pre_scores_data = pre_scores->data<float>();
auto *ids_data = ids ? ids->data<int64_t>() : nullptr;
auto *scores_data = scores->data<float>();
size_t num_seqs = scores->NumElements(lod_level);
size_t seq_width = 1;
for (int i = 1; i < scores->dims().size(); i++) {
seq_width *= scores->dims()[i];
}
for (size_t seq_id = 0; seq_id < num_seqs; ++seq_id) {
size_t seq_offset_start = abs_lod[lod_level][seq_id];
size_t seq_offset_end = abs_lod[lod_level][seq_id + 1];
std::vector<Item> top_beam;
top_beam.reserve(beam_size);
for (size_t offset = seq_offset_start; offset < seq_offset_end;
++offset) {
auto pre_id = pre_ids_data[offset];
auto pre_score = pre_scores_data[offset];
if (pre_id == end_id) {
// Allocate all probability mass to end_id for finished branches;
// the other candidate ids can be ignored.
Item item(offset, end_id, pre_score);
Insert(&top_beam, item, beam_size);
} else {
size_t index = offset * seq_width;
for (size_t d = 0; d < seq_width; d++, index++) {
int64_t id = ids_data ? ids_data[index] : static_cast<int64_t>(d);
float score = is_accumulated
? scores_data[index]
: pre_score + std::log(scores_data[index]);
Item item(offset, id, score);
Insert(&top_beam, item, beam_size);
}
}
}
result.emplace_back(top_beam);
}
return result;
}
};
template <>
bool BeamSearchKernel<CPU, float>::Init(BeamSearchParam<CPU> *param) {
return true;
}
template <>
void BeamSearchKernel<CPU, float>::Compute(const BeamSearchParam<CPU> &param) {
BeamSearchFunctor<CPU, float> alg;
alg(param.pre_ids_, param.pre_scores_, param.ids_, param.scores_,
param.selected_ids_, param.selected_scores_, param.parent_idx_,
param.level_, param.beam_size_, param.end_id_, param.is_accumulated_);
}
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -16,7 +16,9 @@ limitations under the License. */
#include "operators/kernel/conv_add_bn_relu_kernel.h"
#include <cmath>
#include "operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h"
#include "operators/kernel/arm/convolution/conv_common.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
#include "operators/math/channel_wise.h"
namespace paddle_mobile {
namespace operators {
......@@ -43,9 +45,9 @@ bool ConvAddBNReluKernel<CPU, float>::Init(
}
// Tensor *new_scale = new Tensor();
// Tensor *new_bias = new Tensor();
auto *new_scale = param->CreateNewScale<framework::LoDTensor>();
auto *new_bias = param->CreateNewBiase<framework::LoDTensor>();
Tensor *new_scale = param->CreateNewScale<Tensor>();
Tensor *new_bias = param->CreateNewBiase<Tensor>();
auto new_scale_ptr = new_scale->mutable_data<float>({C});
auto new_bias_ptr = new_bias->mutable_data<float>({C});
for (int i = 0; i < C; i++) {
......@@ -54,14 +56,36 @@ bool ConvAddBNReluKernel<CPU, float>::Init(
}
param->SetNewScale(new_scale);
param->SetNewBias(new_bias);
InitBaseConvKernel(param);
return true;
}
template <>
void ConvAddBNReluKernel<CPU, float>::Compute(
const FusionConvAddBNReluParam<CPU> &param) {
ConvAddBNReluCompute<float>(param);
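  // Dispatch to the convolution implementation selected in Init() via
  // InitBaseConvKernel(), then fuse the batch-norm scale/bias and ReLU below.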
switch (param.ExecMode()) {
case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1_FLOAT:
case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2_FLOAT:
DepthwiseConv3x3<float, float>(param);
break;
case ConvParam<CPU>::EXEC_DEPTHWISE5x5_FLOAT:
DepthwiseConv5x5<float, float>(param);
break;
case ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT:
WinogradConv3x3<8, 3>(param);
break;
case ConvParam<CPU>::EXEC_GEMM_FLOAT:
GemmConv<float, float>(param);
break;
default:
PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
param.ExecMode());
}
math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
param.NewBias(), param.Output());
}
template class ConvAddBNReluKernel<CPU, float>;
} // namespace operators
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADD_OP
#include "operators/kernel/conv_add_kernel.h"
#include "operators/kernel/arm/convolution/conv_common.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
#include "operators/math/channel_wise.h"
namespace paddle_mobile {
namespace operators {
template <>
bool ConvAddKernel<CPU, float>::Init(FusionConvAddParam<CPU> *param) {
InitBaseConvKernel(param);
return true;
}
template <>
void ConvAddKernel<CPU, float>::Compute(const FusionConvAddParam<CPU> &param) {
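  // Dispatch to the convolution implementation selected by InitBaseConvKernel()
  // in Init(), then add the bias channel-wise with no activation.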
switch (param.ExecMode()) {
    case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1_FLOAT:
      math::DepthwiseConv3x3S1<float, float>(*param.Input(), *param.Filter(),
                                             param.Paddings(), param.Output());
      break;
case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2_FLOAT:
math::DepthwiseConv3x3S2<float, float>(*param.Input(), *param.Filter(),
param.Paddings(), param.Output());
break;
case ConvParam<CPU>::EXEC_DEPTHWISE5x5_FLOAT:
DepthwiseConv5x5<float, float>(param);
break;
case ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT:
WinogradConv3x3<8, 3>(param);
break;
case ConvParam<CPU>::EXEC_GEMM_FLOAT:
GemmConv<float, float>(param);
break;
default:
PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
param.ExecMode());
}
math::AddChannelWise<IDENTITY>(param.Output(), param.Bias(), param.Output());
}
template class ConvAddKernel<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
(74 more file diffs are collapsed and not shown.)