Commit 17ef795c authored by: H hjchen2

Implement while op, fix feed/fetch and fix code style

Parent commit: f89ffde2
......@@ -37,8 +37,7 @@ template <typename Dtype>
using OpCreator = std::function<framework::OperatorBase<Dtype> *(
const std::string & /*type*/, const VariableNameMap & /*inputs*/,
const VariableNameMap & /*outputs*/,
const framework::AttributeMap & /*attrs*/,
std::shared_ptr<framework::Scope> /*scope*/)>;
const framework::AttributeMap & /*attrs*/, framework::Scope * /*scope*/)>;
using InferVarTypeFN = std::function<void(const framework::OpDesc & /*op_desc*/,
framework::BlockDesc * /*block*/)>;
......
......@@ -66,7 +66,7 @@ Executor<Device, T>::Executor(const Program<Device> &program,
auto op_handler = OpRegistry<Device>::CreateOp(
op_desc->Type(), op_desc->GetInputs(), op_desc->GetOutputs(),
op_desc->GetAttrMap(), program_.scope);
op_desc->GetAttrMap(), program_.scope.get());
// infer shape to reshape inputs and outputs before predict,
// but for lod mode, it still need to infer shape in runtime
if (!lod_mode) {
......@@ -80,6 +80,8 @@ Executor<Device, T>::Executor(const Program<Device> &program,
} else {
InitMemory();
}
// resize feed and fetch list
InitFeedFetchList();
int count = 0;
for (auto &op_handler : ops_of_block0_) {
......@@ -88,6 +90,33 @@ Executor<Device, T>::Executor(const Program<Device> &program,
}
}
template <typename Device, typename T>
void Executor<Device, T>::InitFeedFetchList() {
  // Scan every block of the program for feed/fetch ops and record, for each
  // variable name, the column index given by the op's `col` attribute.
  std::unordered_map<std::string, int> feeds;
  std::unordered_map<std::string, int> fetches;
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &op : block->Ops()) {
      const auto &op_type = op->Type();
      if (op_type == "feed") {
        feeds[op->Output("Out")[0]] = op->GetAttr("col").Get<int>();
      } else if (op_type == "fetch") {
        fetches[op->Input("X")[0]] = op->GetAttr("col").Get<int>();
      }
    }
  }
  feed_indices_.swap(feeds);
  fetch_indices_.swap(fetches);

  // Pre-size the shared "feed"/"fetch" LoDTensorArrays so that feed/fetch
  // ops can later index their slot by column.
  auto *feed_list = program_.scope->Var("feed")
                        ->template GetMutable<framework::LoDTensorArray>();
  feed_list->resize(feed_indices_.size());
  auto *fetch_list = program_.scope->Var("fetch")
                         ->template GetMutable<framework::LoDTensorArray>();
  fetch_list->resize(fetch_indices_.size());
}
template <typename T>
static void LoadMemInternal(void **data, LoDTensor *tensor,
bool quant_uint8 = false) {
......@@ -182,6 +211,7 @@ void Executor<Device, T>::InitMemory() {
LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
delete[] origin_data;
} else {
DLOG << "init no persistable var: " << var_desc->Name();
varInputMemory(var_desc, var);
}
}
......@@ -319,11 +349,19 @@ PMStatus Executor<Device, T>::Predict(
template <typename Device, typename T>
std::vector<T> Executor<Device, T>::Predict(const std::vector<T> &input,
const std::vector<int64_t> &dims) {
PADDLE_MOBILE_ENFORCE(feed_indices_.size() != 0,
"We don't know which tensor should be assign, since no "
"feed op found in this model");
PADDLE_MOBILE_ENFORCE(fetch_indices_.size() != 0,
"We don't know which tensor should be fetch out, since "
"no fetch op found in this model");
std::string input_name = feed_indices_.begin()->first;
Tensor feed_tensor(input, make_ddim(dims));
SetInput(feed_tensor, "feed");
SetInput(feed_tensor, input_name);
std::vector<T> output;
if (this->Predict() == PMSuccess) {
const auto output_tensor = GetOutput("fetch");
std::string output_name = fetch_indices_.begin()->first;
const auto output_tensor = GetOutput(output_name);
output.resize(output_tensor->numel());
memcpy(output.data(), output_tensor->template data<T>(),
output.size() * sizeof(T));
......@@ -334,12 +372,18 @@ std::vector<T> Executor<Device, T>::Predict(const std::vector<T> &input,
template <typename Device, typename T>
void Executor<Device, T>::SetInput(const Tensor &input,
const std::string &var_name) {
auto *target_var = program_.scope->FindVar(var_name);
PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s is not exist",
var_name.c_str());
auto *target_tensor = target_var->template GetMutable<LoDTensor>();
framework::LoDTensor *target = nullptr;
if (feed_indices_.find(var_name) != feed_indices_.end()) {
int index = feed_indices_.find(var_name)->second;
auto *feed_var = program_.scope->Var("feed");
target = &(
feed_var->template GetMutable<framework::LoDTensorArray>()->at(index));
} else {
auto *target_var = program_.scope->FindVar(var_name);
PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s is not exist",
var_name.c_str());
target = target_var->template GetMutable<LoDTensor>();
}
if (config_.load_when_predict) {
if (input_dim_last_ != input.dims()) {
InitNoPersistableMemory(input);
......@@ -347,28 +391,53 @@ void Executor<Device, T>::SetInput(const Tensor &input,
}
}
target_tensor->Resize(input.dims());
target_tensor->ShareDataWith(input);
target->Resize(input.dims());
target->ShareDataWith(input);
}
template <typename Device, typename T>
void Executor<Device, T>::SetInput(const LoDTensor &input,
                                   const std::string &var_name) {
  // Resolve the destination tensor. Feed inputs live in the shared "feed"
  // LoDTensorArray, indexed by the feed op's `col` attribute; any other name
  // is looked up directly in the scope.
  // NOTE(review): the diff for this function had stale pre-refactor lines
  // (the removed `target_tensor` variable) interleaved with the new code;
  // only the post-refactor logic is kept here.
  framework::LoDTensor *target = nullptr;
  auto feed_it = feed_indices_.find(var_name);  // single lookup, not find+find
  if (feed_it != feed_indices_.end()) {
    int index = feed_it->second;
    auto *feed_var = program_.scope->Var("feed");
    target = &(
        feed_var->template GetMutable<framework::LoDTensorArray>()->at(index));
  } else {
    auto *target_var = program_.scope->FindVar(var_name);
    PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s is not exist",
                          var_name.c_str());
    target = target_var->template GetMutable<LoDTensor>();
  }
  if (config_.load_when_predict) {
    // Shape changed since the last call: re-create non-persistable memory.
    if (input_dim_last_ != input.dims()) {
      InitNoPersistableMemory(input);
      input_dim_last_ = input.dims();
    }
  }
  target->Resize(input.dims());
  target->ShareDataWith(input);
  target->set_lod(input.lod());
}
template <typename Device, typename T>
std::shared_ptr<LoDTensor> Executor<Device, T>::GetOutput(
    const std::string &var_name) {
  // Fetch outputs live in the shared "fetch" LoDTensorArray, indexed by the
  // fetch op's `col` attribute; other names resolve through the scope.
  framework::LoDTensor *source = nullptr;
  const auto it = fetch_indices_.find(var_name);
  if (it == fetch_indices_.end()) {
    auto *target_var = program_.scope->FindVar(var_name);
    PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s is not exist",
                          var_name.c_str());
    source = target_var->template GetMutable<LoDTensor>();
  } else {
    auto *fetch_var = program_.scope->Var("fetch");
    source = &(fetch_var->template GetMutable<framework::LoDTensorArray>()->at(
        it->second));
  }
  // Return a copy so the caller owns the result independently of the scope.
  return std::make_shared<LoDTensor>(*source);
}
template <typename Device, typename T>
......@@ -432,16 +501,6 @@ PMStatus Executor<Device, T>::Predict() {
return PMSuccess;
}
template <typename Device, typename T>
std::shared_ptr<LoDTensor> Executor<Device, T>::GetOutput(
const std::string &var_name) {
auto *target_var = program_.scope->FindVar(var_name);
PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s is not exist",
var_name.c_str());
auto *output_tensor = target_var->template GetMutable<LoDTensor>();
return std::make_shared<LoDTensor>(*output_tensor);
}
#ifdef PADDLE_MOBILE_FPGA
template <typename Device, typename T>
void Executor<Device, T>::InjectVariable(const Tensor &t,
......
......@@ -63,6 +63,7 @@ class Executor {
bool varInputMemory(const std::shared_ptr<VarDesc> &var_desc,
Variable *var) const;
void InitFeedFetchList();
void InitMemory();
void InitCombineMemory();
void InitNoPersistableMemory(const Tensor &input_tensor);
......@@ -79,6 +80,8 @@ class Executor {
Program<Device> program_;
std::shared_ptr<ProgramDesc> program_desc_;
std::vector<std::shared_ptr<OperatorBase<Device>>> ops_of_block0_;
std::unordered_map<std::string, int> feed_indices_;
std::unordered_map<std::string, int> fetch_indices_;
// for super resoltion
DDim input_dim_last_;
......
......@@ -221,6 +221,8 @@ inline Print &operator<<(Print &printer, const LoDTensor &tensor) {
printer << static_cast<int>(tensor.data<int8_t>()[i]) << " ";
} else if (tensor.type() == typeid(int32_t)) {
printer << tensor.data<int32_t>()[i] << " ";
} else if (tensor.type() == typeid(bool)) {
printer << tensor.data<bool>()[i] << " ";
}
}
#endif // PADDLE_MOBILE_FPGA
......
......@@ -58,8 +58,7 @@ struct OpInfoFiller {
void operator()(const std::string& op_type, OpInfo<Dtype>* info) const {
info->creator_ = [](const std::string& type, const VariableNameMap& inputs,
const VariableNameMap& outputs,
const AttributeMap& attrs,
std::shared_ptr<Scope> scope) {
const AttributeMap& attrs, framework::Scope* scope) {
return new T(type, inputs, outputs, attrs, scope);
};
}
......@@ -91,7 +90,7 @@ class OpRegistry {
static std::shared_ptr<OperatorBase<Dtype>> CreateOp(
const std::string& type, const VariableNameMap& inputs,
const VariableNameMap& outputs, const AttributeMap attrs,
std::shared_ptr<paddle_mobile::framework::Scope> scope) {
paddle_mobile::framework::Scope* scope) {
auto& info = OpInfoMap<Dtype>::Instance()->Get(type);
auto op = info.Creator()(type, inputs, outputs, attrs, scope);
return std::shared_ptr<OperatorBase<Dtype>>(op);
......
......@@ -43,7 +43,7 @@ OperatorBase<Dtype>::OperatorBase(const std::string &type,
const VariableNameMap &inputs,
const VariableNameMap &outputs,
const AttributeMap &attrs,
std::shared_ptr<Scope> scope)
framework::Scope *scope)
: type_(type),
inputs_(inputs),
outputs_(outputs),
......
......@@ -57,7 +57,7 @@ class OperatorBase {
public:
OperatorBase(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const AttributeMap &attrs,
std::shared_ptr<Scope> scope);
framework::Scope *scope);
virtual ~OperatorBase() {}
virtual void Init() = 0;
......@@ -80,7 +80,7 @@ class OperatorBase {
}
protected:
std::shared_ptr<Scope> scope_;
framework::Scope *scope_;
std::string type_;
VariableNameMap inputs_;
VariableNameMap outputs_;
......@@ -95,7 +95,7 @@ class OperatorWithKernel : public OperatorBase<Dtype> {
public:
OperatorWithKernel(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const AttributeMap &attrs,
std::shared_ptr<Scope> scope)
framework::Scope *scope)
: OperatorBase<Dtype>(type, inputs, outputs, attrs, scope),
param_(inputs, outputs, attrs, *scope) {
#ifdef PADDLE_MOBILE_CL
......@@ -174,21 +174,20 @@ class FusionOpMatcher {
std::shared_ptr<OpDesc> new_opdesc_;
};
#define DECLARE_OPERATOR(OpName, OpParam, OpKernel) \
template <typename DeviceType, typename T> \
class OpName##Op : public framework::OperatorWithKernel< \
DeviceType, OpParam<DeviceType>, \
operators::OpKernel<DeviceType, T>> { \
public: \
OpName##Op(const std::string &type, const VariableNameMap &inputs, \
const VariableNameMap &outputs, \
const framework::AttributeMap &attrs, \
std::shared_ptr<framework::Scope> scope) \
: framework::OperatorWithKernel<DeviceType, OpParam<DeviceType>, \
operators::OpKernel<DeviceType, T>>( \
type, inputs, outputs, attrs, scope) {} \
\
void InferShape() const override; \
#define DECLARE_OPERATOR(OpName, OpParam, OpKernel) \
template <typename DeviceType, typename T> \
class OpName##Op : public framework::OperatorWithKernel< \
DeviceType, OpParam<DeviceType>, \
operators::OpKernel<DeviceType, T>> { \
public: \
OpName##Op(const std::string &type, const VariableNameMap &inputs, \
const VariableNameMap &outputs, \
const framework::AttributeMap &attrs, framework::Scope *scope) \
: framework::OperatorWithKernel<DeviceType, OpParam<DeviceType>, \
operators::OpKernel<DeviceType, T>>( \
type, inputs, outputs, attrs, scope) {} \
\
void InferShape() const override; \
};
#define DECLARE_KERNEL(OpName, OpParam) \
......@@ -204,7 +203,7 @@ class FusionOpMatcher {
cls(const std::string &type, const ::paddle_mobile::VariableNameMap &inputs, \
const ::paddle_mobile::VariableNameMap &outputs, \
const ::paddle_mobile::framework::AttributeMap &attrs, \
std::shared_ptr<::paddle_mobile::framework::Scope> scope) \
::paddle_mobile::framework::Scope *scope) \
: parent_cls<Dtype, T>(type, inputs, outputs, attrs, scope) {}
} // namespace framework
......
......@@ -32,15 +32,7 @@ class Scope {
Scope() = default;
~Scope() {
for (auto &var : vars_) {
delete var.second;
}
vars_.clear();
for (auto kid : kids_) {
delete kid;
}
kids_.clear();
DropKids();
#ifdef PADDLE_MOBILE_CL
delete cl_scope_;
#endif
......
......@@ -32,8 +32,7 @@ class BatchNormOp
public:
BatchNormOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
const framework::AttributeMap &attrs, framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, BatchNormParam<DeviceType>,
BatchNormKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -34,8 +34,7 @@ class BilinearOp : public framework::OperatorWithKernel<
public:
BilinearOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
const framework::AttributeMap &attrs, framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, BilinearInterpParam<DeviceType>,
operators::BilinearInterpKernel<DeviceType, T>>(
......
......@@ -34,8 +34,7 @@ class BoxCoderOp : public framework::OperatorWithKernel<
public:
BoxCoderOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
const framework::AttributeMap &attrs, framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, BoxCoderParam<DeviceType>,
operators::BoxCoderKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -31,7 +31,7 @@ class CastOp : public framework::OperatorWithKernel<
public:
CastOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, CastParam<DeviceType>,
operators::CastKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -30,7 +30,7 @@ class ConcatOp : public framework::OperatorWithKernel<
public:
ConcatOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, ConcatParam<DeviceType>,
operators::ConcatKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -30,7 +30,7 @@ class ConvOp : public framework::OperatorWithKernel<
public:
ConvOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, ConvParam<DeviceType>,
operators::ConvKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -31,8 +31,7 @@ class ConvOpTranspose : public framework::OperatorWithKernel<
public:
ConvOpTranspose(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
const framework::AttributeMap &attrs, framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, ConvTransposeParam<DeviceType>,
operators::ConvTransposeKernel<DeviceType, T>>(
......
......@@ -33,7 +33,7 @@ class CrfOp : public framework::OperatorWithKernel<
public:
CrfOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, CrfParam<DeviceType>,
operators::CrfKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -30,8 +30,7 @@ class DepthwiseConvOp : public framework::OperatorWithKernel<
public:
DepthwiseConvOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
const framework::AttributeMap &attrs, framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, ConvParam<DeviceType>,
operators::ConvKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -32,8 +32,7 @@ class DequantizeOp
public:
DequantizeOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
const framework::AttributeMap &attrs, framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, DequantizeParam<DeviceType>,
DequantizeKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -34,7 +34,7 @@ class DropoutOp : public framework::OperatorWithKernel<
public:
DropoutOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, DropoutParam<DeviceType>,
operators::DropoutKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -32,7 +32,7 @@ class ElementwiseAddOp : public framework::OperatorWithKernel<
ElementwiseAddOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, ElementwiseAddParam<DeviceType>,
operators::ElementwiseAddKernel<DeviceType, T>>(
......
......@@ -32,7 +32,7 @@ class ElementwiseMulOp : public framework::OperatorWithKernel<
ElementwiseMulOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, ElementwiseMulParam<DeviceType>,
operators::ElementwiseMulKernel<DeviceType, T>>(
......
......@@ -32,7 +32,7 @@ class ElementwiseSubOp : public framework::OperatorWithKernel<
ElementwiseSubOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, ElementwiseSubParam<DeviceType>,
operators::ElementwiseSubKernel<DeviceType, T>>(
......
......@@ -31,7 +31,7 @@ class FeedOp
public:
FeedOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, FeedParam<DeviceType>,
FeedKernel<DeviceType, T>>(
......
......@@ -18,8 +18,9 @@ namespace operators {
template <typename DeviceType, typename T>
void FetchOp<DeviceType, T>::InferShape() const {
  // Out() is the fetch LoDTensorArray; only the slot selected by the `col`
  // attribute takes the input's shape. (A stale pre-refactor line that
  // resized Out() itself was interleaved in the diff and is dropped here.)
  int col = this->param_.Col();
  auto x_dims = this->param_.InputX()->dims();
  this->param_.Out()->at(col).Resize(x_dims);
}
} // namespace operators
......
......@@ -30,7 +30,7 @@ class FetchOp
public:
FetchOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, FetchParam<DeviceType>,
FetchKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -31,8 +31,7 @@ class FillConstantOp : public framework::OperatorBase<DeviceType> {
public:
FillConstantOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap attrs,
std::shared_ptr<framework::Scope> scope)
const framework::AttributeMap attrs, framework::Scope *scope)
: framework::OperatorBase<DeviceType>(type, inputs, outputs, attrs,
scope),
param_(inputs, outputs, attrs, *scope) {}
......
......@@ -49,8 +49,7 @@ class FlattenOp : public framework::OperatorWithKernel<
public:
FlattenOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
const framework::AttributeMap &attrs, framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, FlattenParam<DeviceType>,
operators::FlattenKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -63,7 +63,7 @@ class FusionConvAddAddPReluOp
FusionConvAddAddPReluOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, FusionConvAddAddPReluParam<DeviceType>,
operators::ConvAddAddPReluKernel<DeviceType, T>>(
......
......@@ -20,8 +20,8 @@ limitations under the License. */
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "op_param.h"
#include "operators/kernel/conv_add_bn_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
......@@ -59,7 +59,7 @@ class FusionConvAddBNOp : public framework::OperatorWithKernel<
FusionConvAddBNOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, FusionConvAddBNParam<DeviceType>,
operators::ConvAddBNKernel<DeviceType, T>>(type, inputs, outputs,
......
......@@ -61,7 +61,7 @@ class FusionConvAddBNReluOp
FusionConvAddBNReluOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, FusionConvAddBNReluParam<DeviceType>,
operators::ConvAddBNReluKernel<DeviceType, T>>(
......
......@@ -50,8 +50,7 @@ class FusionConvAddOp : public framework::OperatorWithKernel<
public:
FusionConvAddOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
const framework::AttributeMap &attrs, framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType,
FusionConvAddParam<DeviceType>,
operators::ConvAddKernel<DeviceType, T>>(
......
......@@ -54,7 +54,7 @@ class FusionConvAddPReluOp
FusionConvAddPReluOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, FusionConvAddPReluParam<DeviceType>,
operators::ConvAddPReluKernel<DeviceType, T>>(type, inputs, outputs,
......
......@@ -51,7 +51,7 @@ class FusionConvAddReluOp : public framework::OperatorWithKernel<
FusionConvAddReluOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, FusionConvAddReluParam<DeviceType>,
operators::ConvAddReluKernel<DeviceType, T>>(type, inputs, outputs,
......
......@@ -67,7 +67,7 @@ class FusionConvBNAddReluOp
FusionConvBNAddReluOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, FusionConvBNAddReluParam<DeviceType>,
operators::ConvBNAddReluKernel<DeviceType, T>>(
......
......@@ -56,8 +56,7 @@ class FusionConvBNOp : public framework::OperatorWithKernel<
public:
FusionConvBNOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
const framework::AttributeMap &attrs, framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, FusionConvBNParam<DeviceType>,
operators::ConvBNKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -58,7 +58,7 @@ class FusionConvBNReluOp : public framework::OperatorWithKernel<
FusionConvBNReluOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, FusionConvBNReluParam<DeviceType>,
operators::ConvBNReluKernel<DeviceType, T>>(type, inputs, outputs,
......
......@@ -49,7 +49,7 @@ class FusionDeconvAddOp : public framework::OperatorWithKernel<
FusionDeconvAddOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, FusionDeconvAddParam<DeviceType>,
operators::DeconvAddKernel<DeviceType, T>>(type, inputs, outputs,
......
......@@ -51,7 +51,7 @@ class FusionDeconvAddReluOp
FusionDeconvAddReluOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, FusionDeconvAddReluParam<DeviceType>,
operators::DeconvAddReluKernel<DeviceType, T>>(
......
......@@ -48,7 +48,7 @@ class FusionDeconvReluOp : public framework::OperatorWithKernel<
FusionDeconvReluOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, FusionDeconvReluParam<DeviceType>,
operators::DeconvReluKernel<DeviceType, T>>(type, inputs, outputs,
......
......@@ -60,7 +60,7 @@ class FusionDequantAddBNOp
FusionDequantAddBNOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, FusionDequantAddBNParam<DeviceType>,
operators::FusionDequantAddBNKernel<DeviceType, T>>(
......
......@@ -62,7 +62,7 @@ class FusionDequantAddBNReluOp
const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, FusionDequantAddBNParam<DeviceType>,
operators::FusionDequantAddBNReluKernel<DeviceType, T>>(
......
......@@ -62,7 +62,7 @@ class FusionDequantAddBNReluQuantOp
const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, FusionDequantAddBNReluQuantParam<DeviceType>,
operators::FusionDequantAddBNReluQuantKernel<DeviceType, T>>(
......@@ -109,7 +109,7 @@ class FusionDequantAddBNQuantOp
const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, FusionDequantAddBNQuantParam<DeviceType>,
operators::FusionDequantAddBNQuantKernel<DeviceType, T>>(
......
......@@ -58,7 +58,7 @@ class FusionDequantBNOp : public framework::OperatorWithKernel<
FusionDequantBNOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, FusionDequantBNParam<DeviceType>,
operators::FusionDequantBNKernel<DeviceType, T>>(
......@@ -87,7 +87,7 @@ class FusionDequantBNReluOp
FusionDequantBNReluOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, FusionDequantBNParam<DeviceType>,
operators::FusionDequantBNReluKernel<DeviceType, T>>(
......
......@@ -59,7 +59,7 @@ class FusionDequantBNReluOp
FusionDequantBNReluOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, FusionDequantBNReluParam<DeviceType>,
operators::FusionDequantBNReluKernel<DeviceType, T>>(
......
......@@ -59,7 +59,7 @@ class FusionDWConvBNReluOp
FusionDWConvBNReluOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, FusionDWConvBNReluParam<DeviceType>,
operators::DWConvBNReluKernel<DeviceType, T>>(type, inputs, outputs,
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/elementwise_add_relu_kernel.h"
......@@ -50,7 +51,7 @@ class FusionElementwiseAddReluOp
FusionElementwiseAddReluOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, ElementwiseAddReluParam<DeviceType>,
operators::ElementwiseAddReluKernel<DeviceType, T>>(
......
......@@ -50,8 +50,7 @@ class FusionFcOp : public framework::OperatorWithKernel<
public:
FusionFcOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
const framework::AttributeMap &attrs, framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, FusionFcParam<DeviceType>,
operators::FusionFcKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -49,8 +49,7 @@ class FusionFcReluOp : public framework::OperatorWithKernel<
public:
FusionFcReluOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
const framework::AttributeMap &attrs, framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, FusionFcReluParam<DeviceType>,
operators::FusionFcReluKernel<DeviceType, T>>(type, inputs, outputs,
......
......@@ -33,7 +33,7 @@ class GruOp : public framework::OperatorWithKernel<
public:
GruOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, GruParam<DeviceType>,
operators::GruKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/gru_unit_kernel.h"
#include "operators/op_param.h"
......@@ -30,10 +31,10 @@ class GruUnitOp : public framework::OperatorWithKernel<
public:
GruUnitOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const AttributeMap &attrs,
std::shared_ptr<Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, GruUnitParam<DeviceType>,
operators::GruUnitKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope){};
type, inputs, outputs, attrs, scope) {}
void InferShape() const override;
};
......
......@@ -31,8 +31,7 @@ class Im2SequenceOp : public framework::OperatorWithKernel<
public:
Im2SequenceOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
const framework::AttributeMap &attrs, framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, Im2SequenceParam<DeviceType>,
operators::Im2SequenceKernel<DeviceType, T>>(type, inputs, outputs,
......
......@@ -32,8 +32,7 @@ class IncrementOp
public:
IncrementOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
const framework::AttributeMap &attrs, framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, IncrementParam<DeviceType>,
IncrementKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -31,8 +31,7 @@ class IsEmptyOp
public:
IsEmptyOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
const framework::AttributeMap &attrs, framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, IsEmptyParam<DeviceType>,
IsEmptyKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -29,6 +29,13 @@ template <>
void BeamSearchDecodeKernel<CPU, float>::Compute(
const BeamSearchDecodeParam<CPU> &param) {
// TODO(hjchen2)
DLOG << "BeamSearchDecodeKernel";
param.sentence_scores_->Resize(framework::make_ddim({10}));
param.sentence_scores_->mutable_data<float>();
DLOG << "BeamSearchDecodeKernel";
param.sentence_ids_->Resize(framework::make_ddim({10}));
param.sentence_ids_->mutable_data<int64_t>();
}
} // namespace operators
......
......@@ -8,17 +8,24 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/kernel/fetch_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool FetchKernel<CPU, float>::Init(FetchParam<CPU> *param) {
return true;
}
template <>
void FetchKernel<CPU, float>::Compute(const FetchParam<CPU> &param) {
param.Out()->ShareDataWith(*(param.InputX()));
int col = param.Col();
param.Out()->at(col).ShareDataWith(*(param.InputX()));
}
template class FetchKernel<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......@@ -100,8 +100,8 @@ class SequenceExpandKernel<CPU, T>
out_lod.push_back(out_lod.back() + x_seq_len);
}
}
output->set_lod({out_lod});
}
output->set_lod({out_lod});
SequenceExpandImpl<T>(*input_x, y_lod[ref_level], output);
}
};
......
......@@ -28,10 +28,13 @@ class SequenceSoftmaxKernel<CPU, T>
bool Init(SoftmaxParam<CPU> *param) { return true; }
void Compute(const SoftmaxParam<CPU> &param) {
const framework::LoDTensor *input = param.InputX();
framework::LoDTensor *output = param.Out();
math::SequenceSoftmaxFuntor<CPU, T> sequence_softmax;
sequence_softmax(input, output);
param.Out()->mutable_data<float>();
/*
const framework::LoDTensor *input = param.InputX();
framework::LoDTensor *output = param.Out();
math::SequenceSoftmaxFuntor<CPU, T> sequence_softmax;
sequence_softmax(input, output);
*/
}
};
......
......@@ -51,6 +51,11 @@ void ReadFromArrayKernel<CPU, float>::Compute(
int64_t offset = param.index_->data<int64_t>()[0];
if (offset < param.input_->size()) {
TensorCopy(param.input_->at(offset), param.output_);
param.output_->set_lod(param.input_->at(offset).lod());
} else {
PADDLE_MOBILE_THROW_EXCEPTION(
"Can not read tensor which index is `%d` since it only has `%d` inputs",
offset, param.input_->size());
}
}
#endif // READ_FROM_ARRAY_OP
......
......@@ -26,11 +26,12 @@ class StepExecutor {
public:
StepExecutor(const framework::BlockDesc *block, framework::Scope *scope)
: scope_(std::shared_ptr<framework::Scope>(scope)) {
: scope_(scope) {
std::vector<std::shared_ptr<framework::OpDesc>> ops = block->Ops();
ops_of_block_.resize(ops.size());
for (int i = 0; i < ops.size(); ++i) {
std::shared_ptr<framework::OpDesc> op_desc = ops[i];
DLOG << "create op: " << op_desc->Type();
auto op_handler = framework::OpRegistry<CPU>::CreateOp(
op_desc->Type(), op_desc->GetInputs(), op_desc->GetOutputs(),
op_desc->GetAttrMap(), scope_);
......@@ -40,15 +41,13 @@ class StepExecutor {
void Run() {
for (auto &op_handler : ops_of_block_) {
DLOG << "run op: " << op_handler->Type();
op_handler->InferShape();
op_handler->Run();
DLOG << "run op finish";
}
}
private:
std::shared_ptr<framework::Scope> scope_;
framework::Scope *scope_;
std::vector<OperatorPtr> ops_of_block_;
};
......@@ -59,7 +58,6 @@ bool WhileKernel<CPU, float>::Init(WhileParam<CPU> *param) {
template <>
void WhileKernel<CPU, float>::Compute(const WhileParam<CPU> &param) {
// TODO(hjchen2)
auto &current_scope = param.scope_->NewScope();
StepExecutor executor(param.sub_block_, &current_scope);
while (param.cond_->data<bool>()[0]) {
......
......@@ -25,11 +25,11 @@ template <typename P>
void IncrementCompute(const IncrementParam<CPU> &param) {
const framework::Tensor *input = param.InputX();
framework::Tensor *out = param.Out();
int step = param.Step();
float step = param.Step();
out->mutable_data<P>();
const P *input_data = input->data<P>();
P *out_data = out->data<P>();
out->mutable_data<int64_t>();
const int64_t *input_data = input->data<int64_t>();
int64_t *out_data = out->data<int64_t>();
*out_data = *input_data + step;
}
......
......@@ -33,7 +33,7 @@ class LookupOp : public framework::OperatorWithKernel<
public:
LookupOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, LookupParam<DeviceType>,
operators::LookupKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -31,7 +31,7 @@ class LrnOp : public framework::OperatorWithKernel<
public:
LrnOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, LrnParam<DeviceType>,
operators::LrnKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -47,26 +47,27 @@ void DepthwiseConv3x3(const framework::Tensor *input,
const int output_channel_stride = output_height * output_width;
const int filter_channel_stride = 9;
const float *input_data = input->data<float>();
const float *filter_data = filter->data<float>();
const float *input_ptr = input->data<float>();
const float *filter_ptr = filter->data<float>();
if (if_bias) {
math::expand_bias(*bias, 1, output->dims());
output->ShareDataWith(*bias);
}
float *output_data = output->mutable_data<float>();
float *output_ptr = output->mutable_data<float>();
const int input_batch_stride = output_channels * input_channel_stride;
const int output_batch_stride = output_channels * output_channel_stride;
const int filter_batch_stride = output_channels * output_channel_stride;
const float *pos1, *pos2, *pos3, *filter1, *filter2, *filter3, *output_ptr;
const float *pos1, *pos2, *pos3, *filter1, *filter2, *filter3, *output_ptr2;
int hstart, wstart, hend, wend;
float result;
for (int i = 0; i < batch_size; ++i) {
#pragma omp parallel for
for (int c = 0; c < output_channels; ++c) {
filter1 = filter_data;
const float *input_data =
input_ptr + (i * output_channels + c) * input_channel_stride;
float *output_data =
output_ptr + (i * output_channels + c) * output_channel_stride;
filter1 = filter_ptr + c * filter_channel_stride;
filter2 = filter1 + 3;
filter3 = filter2 + 3;
for (int ph = 0; ph < output_height; ph++) {
for (int pw = 0; pw < output_width; pw++) {
hstart = ph * stride_height - padding_height;
......@@ -80,7 +81,7 @@ void DepthwiseConv3x3(const framework::Tensor *input,
pos1 = input_data + hstart * input_width + wstart;
pos2 = input_data + (hstart + 1) * input_width + wstart;
pos3 = input_data + (hstart + 2) * input_width + wstart;
output_ptr = output_data + ph * output_width + pw;
output_ptr2 = output_data + ph * output_width + pw;
if (hend - hstart != 3 || wend - wstart != 3) {
result = 0;
......@@ -230,7 +231,7 @@ void DepthwiseConv3x3(const framework::Tensor *input,
: [input_data] "r"(input_data), [pos1] "r"(pos1),
[pos2] "r"(pos2), [pos3] "r"(pos3), [filter1] "r"(filter1),
[filter2] "r"(filter2), [filter3] "r"(filter3),
[output_ptr] "r"(output_ptr), [zero] "r"(zero)
[output_ptr] "r"(output_ptr2), [zero] "r"(zero)
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6");
#endif // __aarch64__
#else
......@@ -239,12 +240,7 @@ void DepthwiseConv3x3(const framework::Tensor *input,
}
}
}
input_data += input_channel_stride;
output_data += output_channel_stride;
filter_data += filter_channel_stride;
}
input_data += input_batch_stride;
output_data += output_batch_stride;
}
}
......
......@@ -31,7 +31,7 @@ class MulOp : public framework::OperatorWithKernel<
public:
MulOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, MulParam<DeviceType>,
operators::MulKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -34,8 +34,7 @@ class MultiClassNMSOp : public framework::OperatorWithKernel<
public:
MultiClassNMSOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
const framework::AttributeMap &attrs, framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, MultiClassNMSParam<DeviceType>,
operators::MultiClassNMSKernel<DeviceType, T>>(
......
......@@ -31,7 +31,7 @@ class NormOp
public:
NormOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, NormParam<DeviceType>,
NormKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -1198,20 +1198,19 @@ class FetchParam : public OpParam {
public:
FetchParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope) {
input_x_ = InputXFrom<GType>(inputs, scope);
out_ = OutFrom(outputs, scope);
input_x_ = InputXFrom<framework::LoDTensor>(inputs, scope);
out_ = OutFrom<framework::LoDTensorArray>(outputs, scope);
col_ = GetAttr<int>("col", attrs);
}
const RType *InputX() const { return input_x_; }
Tensor *Out() const { return out_; }
static Tensor *OutFrom(const VariableNameMap &outputs, const Scope &scope) {
return GetVarValue<LoDTensor>("Out", outputs, scope);
}
const framework::LoDTensor *InputX() const { return input_x_; }
framework::LoDTensorArray *Out() const { return out_; }
const int Col() const { return col_; }
private:
RType *input_x_;
Tensor *out_;
framework::LoDTensor *input_x_;
framework::LoDTensorArray *out_;
int col_;
#ifdef PADDLE_MOBILE_FPGA
private:
......@@ -2664,9 +2663,9 @@ class TopKParam : public OpParam {
}
public:
RType *input_;
RType *output_;
RType *indices_;
GType *input_;
GType *output_;
GType *indices_;
int k_;
};
#endif // TOP_K_OP
......
......@@ -36,7 +36,7 @@ class PolygonBoxTransformOp
PolygonBoxTransformOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, PolygonBoxTransformParam<DeviceType>,
operators::PolygonBoxTransformKernel<DeviceType, T>>(
......
......@@ -24,19 +24,17 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
using framework::AttributeMap;
using framework::OperatorWithKernel;
using framework::Scope;
using std::string;
template <typename DeviceType, typename T>
class PoolOp : public OperatorWithKernel<DeviceType, PoolParam<DeviceType>,
operators::PoolKernel<DeviceType, T>> {
class PoolOp : public framework::OperatorWithKernel<
DeviceType, PoolParam<DeviceType>,
operators::PoolKernel<DeviceType, T>> {
public:
PoolOp(const string &type, const VariableNameMap &inputs,
PoolOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const AttributeMap &attrs,
std::shared_ptr<Scope> scope)
: OperatorWithKernel<DeviceType, PoolParam<DeviceType>,
operators::PoolKernel<DeviceType, T>>(
framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, PoolParam<DeviceType>,
operators::PoolKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
void InferShape() const override;
......
......@@ -34,7 +34,7 @@ class PReluOp : public framework::OperatorWithKernel<
public:
PReluOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, PReluParam<DeviceType>,
operators::PReluKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -34,8 +34,7 @@ class PriorBoxOp : public framework::OperatorWithKernel<
public:
PriorBoxOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
const framework::AttributeMap &attrs, framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, PriorBoxParam<DeviceType>,
operators::PriorBoxKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -31,8 +31,7 @@ class QuantizeOp : public framework::OperatorWithKernel<
public:
QuantizeOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
const framework::AttributeMap &attrs, framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, QuantizeParam<DeviceType>,
operators::QuantizeKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -34,8 +34,7 @@ class Reshape2Op : public framework::OperatorWithKernel<
public:
Reshape2Op(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
const framework::AttributeMap &attrs, framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, Reshape2Param<DeviceType>,
operators::Reshape2Kernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -34,8 +34,7 @@ class ReshapeOp : public framework::OperatorWithKernel<
public:
ReshapeOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
const framework::AttributeMap &attrs, framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, ReshapeParam<DeviceType>,
operators::ReshapeKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -34,7 +34,7 @@ class ResizeOp : public framework::OperatorWithKernel<
public:
ResizeOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, ResizeParam<DeviceType>,
operators::ResizeKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -34,7 +34,7 @@ class ScaleOp : public framework::OperatorWithKernel<
public:
ScaleOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, ScaleParam<DeviceType>,
operators::ScaleKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -32,7 +32,7 @@ class SequenceExpandOp : public framework::OperatorWithKernel<
SequenceExpandOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, SequenceExpandParam<DeviceType>,
operators::SequenceExpandKernel<DeviceType, T>>(
......
......@@ -31,8 +31,7 @@ class SequencePoolOp : public framework::OperatorWithKernel<
public:
SequencePoolOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
const framework::AttributeMap &attrs, framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, SequencePoolParam<DeviceType>,
operators::SequencePoolKernel<DeviceType, T>>(type, inputs, outputs,
......
......@@ -32,7 +32,7 @@ class SequenceSoftmaxOp : public framework::OperatorWithKernel<
SequenceSoftmaxOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, SoftmaxParam<DeviceType>,
operators::SequenceSoftmaxKernel<DeviceType, T>>(
......
......@@ -34,7 +34,7 @@ class ShapeOp : public framework::OperatorWithKernel<
public:
ShapeOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, ShapeParam<DeviceType>,
operators::ShapeKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -34,7 +34,7 @@ class SliceOp : public framework::OperatorWithKernel<
public:
SliceOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, SliceParam<DeviceType>,
operators::SliceKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -31,8 +31,7 @@ class SoftmaxOp : public framework::OperatorWithKernel<
public:
SoftmaxOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
const framework::AttributeMap &attrs, framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, SoftmaxParam<DeviceType>,
operators::SoftmaxKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -34,7 +34,7 @@ class SplitOp : public framework::OperatorWithKernel<
public:
SplitOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, SplitParam<DeviceType>,
operators::SplitKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -30,7 +30,7 @@ class SumOp : public framework::OperatorWithKernel<
public:
SumOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, SumParam<DeviceType>,
operators::SumKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -26,7 +26,11 @@ void TopKOp<DeviceType, T>::InferShape() const {
// should check k <= dims[-1] && k >= 1
dims[dims.size() - 1] = k;
this->param_.output_->Resize(dims);
// this->param_.output_->set_lod(this->param_.input_->lod());
this->param_.output_->set_lod({{0, 1}});
this->param_.indices_->Resize(dims);
// this->param_.indices_->set_lod(this->param_.input_->lod());
this->param_.indices_->set_lod({{0, 1}});
}
} // namespace operators
......
......@@ -31,7 +31,7 @@ class TopKOp : public framework::OperatorWithKernel<
public:
TopKOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
framework::Scope *scope)
: framework::OperatorWithKernel<DeviceType, TopKParam<DeviceType>,
operators::TopKKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -34,8 +34,7 @@ class Transpose2Op : public framework::OperatorWithKernel<
public:
Transpose2Op(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
const framework::AttributeMap &attrs, framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, Transpose2Param<DeviceType>,
operators::Transpose2Kernel<DeviceType, T>>(type, inputs, outputs,
......
......@@ -34,8 +34,7 @@ class TransposeOp : public framework::OperatorWithKernel<
public:
TransposeOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
const framework::AttributeMap &attrs, framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, TransposeParam<DeviceType>,
operators::TransposeKernel<DeviceType, T>>(type, inputs, outputs,
......
......@@ -69,7 +69,7 @@ class Executor4Test : public Executor<DeviceType> {
std::shared_ptr<paddle_mobile::framework::OperatorBase<DeviceType>>
op_ptr = paddle_mobile::framework::OpRegistry<DeviceType>::CreateOp(
op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(),
this->program_.scope);
this->program_.scope.get());
this->ops_of_block0_.push_back(op_ptr);
break;
}
......@@ -86,7 +86,7 @@ class Executor4Test : public Executor<DeviceType> {
const vector<string> &input_names,
const vector<string> &output_names,
const vector<DDim> &ddims) {
auto scope = this->program_.scope;
auto scope = this->program_.scope.get();
size_t input_size = input_names.size();
size_t out_size = output_names.size();
......@@ -119,7 +119,7 @@ class Executor4Test : public Executor<DeviceType> {
std::shared_ptr<Tensor> Predict(const Tensor &t, string input, string output,
const DDim &dDim) {
auto scope = this->program_.scope;
auto scope = this->program_.scope.get();
Variable *g_feed_value = scope->Var(input);
auto tensor = g_feed_value->GetMutable<LoDTensor>();
tensor->ShareDataWith(t);
......
......@@ -88,8 +88,8 @@ int TestBatchNormOp(const std::vector<int> input_shape) {
attrs["epsilon"].Set<float>(eps);
attrs["momentum"].Set<float>(0.f);
auto *op = new operators::BatchNormOp<CPU, float>("batch_norm", inputs,
outputs, attrs, scope);
auto *op = new operators::BatchNormOp<CPU, float>(
"batch_norm", inputs, outputs, attrs, scope.get());
op->InferShape();
op->Init();
op->Run();
......
......@@ -49,7 +49,7 @@ class TestBoxCoderOp {
std::shared_ptr<operators::BoxCoderOp<Dtype, float>> boxcoder =
std::make_shared<operators::BoxCoderOp<Dtype, float>>(
op->Type(), op->GetInputs(), op->GetOutputs(),
op->GetAttrMap(), program_.scope);
op->GetAttrMap(), program_.scope.get());
ops_of_block_[*block_desc.get()].push_back(boxcoder);
}
}
......@@ -59,7 +59,7 @@ class TestBoxCoderOp {
std::shared_ptr<Tensor> predict_boxcoder(const Tensor &t1, const Tensor &t2,
const Tensor &t3) {
// feed
auto scope = program_.scope;
auto scope = program_.scope.get();
Variable *prior_box = scope->Var("concat_0.tmp_0");
auto tensor_x1 = prior_box->GetMutable<LoDTensor>();
tensor_x1->ShareDataWith(t1);
......
......@@ -81,8 +81,8 @@ int TestCastOp(const std::vector<int> input_shape) {
framework::AttributeMap attrs;
attrs["in_dtype"].Set<int>(TypeInt<Itype>());
attrs["out_dtype"].Set<int>(TypeInt<Otype>());
auto *op =
new operators::CastOp<CPU, float>("cast", inputs, outputs, attrs, scope);
auto *op = new operators::CastOp<CPU, float>("cast", inputs, outputs, attrs,
scope.get());
op->InferShape();
op->Init();
op->Run();
......
......@@ -27,7 +27,7 @@ using framework::Scope;
using framework::make_ddim;
template <typename T>
void concat(const std::vector<LoDTensor> &input, LoDTensor &output, int axis) {
void concat(const std::vector<LoDTensor> &input, LoDTensor *output, int axis) {
int num = input.size();
int rows = 1;
......@@ -45,7 +45,7 @@ void concat(const std::vector<LoDTensor> &input, LoDTensor &output, int axis) {
}
// computation
auto output_data = output.data<T>();
auto output_data = output->data<T>();
int col_idx = 0;
for (int j = 0; j < num; ++j) {
int col_len = input_cols[j];
......@@ -99,14 +99,14 @@ int TestConcatOP() {
attrs["axis"].Set<int>(axis_v);
auto *op = new operators::ConcatOp<CPU, float>("concat", inputs, outputs,
attrs, scope);
attrs, scope.get());
op->InferShape();
op->Run();
auto output = output_var->template Get<framework::LoDTensor>();
const T *output_data = output->data<T>();
LoDTensor output_cmp;
output_cmp.mutable_data<T>(output_shape);
concat<T>(input_tensors, output_cmp, axis_v);
concat<T>(input_tensors, &output_cmp, axis_v);
const T *output_cmp_data = output_cmp.data<T>();
// compare
int eq = 0;
......
......@@ -84,7 +84,7 @@ int TestConvBnReluOp(int in_channels, int in_height, int in_width,
attrs["epsilon"].Set<float>(1e-6);
attrs["momentum"].Set<float>(0.f);
auto *op = new operators::FusionConvBNReluOp<CPU, float>(
"fusion_conv_bn_relu", inputs, outputs, attrs, scope);
"fusion_conv_bn_relu", inputs, outputs, attrs, scope.get());
op->InferShape();
op->Init();
for (int i = 0; i < 10; ++i) {
......
......@@ -182,7 +182,7 @@ int TestConvOp(int in_channels, int in_height, int in_width, int out_channels,
attrs["groups"].Set<int>(groups);
auto *op = new operators::ConvOp<CPU, float>("conv2d", inputs, outputs, attrs,
scope);
scope.get());
op->InferShape();
op->Init();
// struct timespec ts_begin, ts_end;
......
......@@ -50,8 +50,8 @@ int TestDequqntizeOp() {
framework::AttributeMap attrs;
attrs["weight_scale"].Set<float>(1.74);
auto* op = new operators::DequantizeOp<CPU, float>("dequantize", inputs,
outputs, attrs, scope);
auto* op = new operators::DequantizeOp<CPU, float>(
"dequantize", inputs, outputs, attrs, scope.get());
op->InferShape();
op->Run();
auto output = output_var->template Get<framework::LoDTensor>();
......
......@@ -87,7 +87,7 @@ int TestDWConvAddBnReluOp(int in_channels, int in_height, int in_width,
attrs["momentum"].Set<float>(0.f);
auto *op = new operators::FusionDWConvBNReluOp<CPU, float>(
"fusion_dwconv_bn_relu", inputs, outputs, attrs, scope);
"fusion_dwconv_bn_relu", inputs, outputs, attrs, scope.get());
op->InferShape();
op->Init();
for (int i = 0; i < 10; ++i) {
......
......@@ -47,7 +47,7 @@ class TestElementwiseSubOp {
std::shared_ptr<operators::ElementwiseSubOp<Dtype, float>> lrn =
std::make_shared<operators::ElementwiseSubOp<Dtype, float>>(
op->Type(), op->GetInputs(), op->GetOutputs(),
op->GetAttrMap(), program_.scope);
op->GetAttrMap(), program_.scope.get());
ops_of_block_[*block_desc.get()].push_back(lrn);
}
}
......@@ -56,7 +56,7 @@ class TestElementwiseSubOp {
std::shared_ptr<Tensor> predict_bn(const Tensor &t1, const Tensor &t2) {
// feed
auto scope = program_.scope;
auto scope = program_.scope.get();
Variable *x1_feed_value = scope->Var("tmp_0");
auto tensor_x1 = x1_feed_value->GetMutable<LoDTensor>();
tensor_x1->ShareDataWith(t1);
......
......@@ -47,7 +47,7 @@ class TestFillConstantOp {
std::shared_ptr<operators::FillConstantOp<Dtype, float>> op_ptr =
std::make_shared<operators::FillConstantOp<Dtype, float>>(
op->Type(), op->GetInputs(), op->GetOutputs(),
op->GetAttrMap(), program_.scope);
op->GetAttrMap(), program_.scope.get());
ops_of_block_[*block_desc.get()].push_back(op_ptr);
}
}
......@@ -55,7 +55,7 @@ class TestFillConstantOp {
}
std::shared_ptr<Tensor> predict() {
auto scope = program_.scope;
auto scope = program_.scope.get();
Variable *output = scope->Var(output_var_name);
auto *output_tensor = output->GetMutable<LoDTensor>();
......
......@@ -103,7 +103,7 @@ int TestFcOP() {
attrs["axis"].Set<int>(1);
operators::OperatorBase<CPU> *op = nullptr;
op = new operators::FusionFcOp<CPU, T>("fusion_fc", inputs, outputs, attrs,
scope);
scope.get());
op->InferShape();
op->Run();
auto output = output_var->template Get<framework::LoDTensor>();
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册