Commit 08cff17a authored by eclipsycn, committed by GitHub

Merge pull request #310 from Eclipsess/develop

fix #309 
@@ -219,7 +219,8 @@ class Tensor {
inline void check_memory_size() const {
PADDLE_MOBILE_ENFORCE(
holder_, "Tensor holds no memory. Call Tensor::mutable_data first.");
holder_ != nullptr,
"Tensor holds no memory. Call Tensor::mutable_data first.");
PADDLE_MOBILE_ENFORCE(
numel() * SizeOfType(type()) <= memory_size(),
"Tensor's dims_ is out of bound. CallTensor::mutable_data "
......
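The first hunk only makes the null check explicit: PADDLE_MOBILE_ENFORCE aborts when its condition is false, so spelling out `holder_ != nullptr` documents the intent without changing behavior. A minimal standalone sketch of the pattern (ENFORCE_SKETCH and its body are hypothetical, not paddle-mobile's actual macro):

#include <cstdio>
#include <cstdlib>
#include <memory>

// Hypothetical stand-in for PADDLE_MOBILE_ENFORCE: abort with a message
// when the condition does not hold.
#define ENFORCE_SKETCH(cond, msg)                        \
  do {                                                   \
    if (!(cond)) {                                       \
      std::fprintf(stderr, "Enforce failed: %s\n", msg); \
      std::abort();                                      \
    }                                                    \
  } while (0)

int main() {
  auto holder_ = std::make_shared<int>(0);  // non-empty, so the check passes
  ENFORCE_SKETCH(holder_ != nullptr,
                 "Tensor holds no memory. Call Tensor::mutable_data first.");
  return 0;
}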
@@ -45,8 +45,6 @@ class Variable : public PaddleMobileObject {
bool IsInitialized() const { return holder_ != nullptr; }
const std::string Name() { return name_; }
template <typename T>
T *GetMutable() {
if (!IsType<T>()) {
@@ -64,8 +62,6 @@ class Variable : public PaddleMobileObject {
std::type_index Type() const { return holder_->Type(); }
void SetName(const string name) { name_ = name; }
private:
struct Placeholder {
Placeholder() = default;
......
@@ -221,7 +221,7 @@ const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
}
}
// originProgramDesc->Description("program: ");
originProgramDesc->Description("program: ");
paddle_mobile__framework__proto__program_desc__free_unpacked(c_program, NULL);
return program;
@@ -381,7 +381,6 @@ void Executor<Dtype, P>::InitMemory() {
} else {
if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
auto tensor = var->template GetMutable<framework::LoDTensor>();
tensor->template mutable_data<Ptype>();
}
}
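The added allocation has to be written `tensor->template mutable_data<Ptype>()` because `tensor` comes from a dependent expression inside a class template, so the compiler needs `template` to parse the `<` as a template argument list. A minimal sketch of the dependent-name rule (Holder and Alloc are illustrative names, not framework types):

struct Holder {
  template <typename T>
  T *mutable_data() { return nullptr; }
};

template <typename HolderT, typename Ptype>
Ptype *Alloc(HolderT *holder) {
  // HolderT is a dependent type here, so the `template` keyword is
  // mandatory; without it `<` would parse as less-than.
  return holder->template mutable_data<Ptype>();
}

int main() {
  Holder h;
  return Alloc<Holder, float>(&h) == nullptr ? 0 : 1;
}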
@@ -392,7 +391,8 @@ void Executor<Dtype, P>::InitMemory() {
template <typename Dtype, Precision P>
void Executor<Dtype, P>::predict(const framework::Tensor &t, int block_id) {
framework::Variable *g_feed_value = program_.scope->Var("feed");
auto feed_tensor = g_feed_value->GetMutable<framework::LoDTensor>();
framework::Tensor *feed_tensor =
g_feed_value->GetMutable<framework::LoDTensor>();
feed_tensor->Resize(t.dims());
feed_tensor->ShareDataWith(t);
std::shared_ptr<framework::BlockDesc> to_predict_block =
@@ -408,7 +408,7 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::predict(
const std::vector<Ptype> &input, const std::vector<int64_t> &dims) {
DLOG << "start predict: ";
framework::Tensor tensor;
framework::LoDTensor tensor;
auto ddim = framework::make_ddim(dims);
auto input_ptr = tensor.mutable_data<Ptype>(ddim);
......
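Many hunks in this commit swap Tensor for LoDTensor at GetMutable call sites. The two stay assignment-compatible because, in paddle-mobile as in Paddle, LoDTensor derives from Tensor, so a LoDTensor* upcasts to Tensor* (sketched below with members elided):

// Sketch of the inheritance relation the commit relies on.
struct Tensor { /* holder_, dims_, ... */ };
struct LoDTensor : Tensor { /* LoD offsets on top of Tensor */ };

int main() {
  LoDTensor lod;
  Tensor *feed_tensor = &lod;  // the upcast used in Executor::predict above
  (void)feed_tensor;
  return 0;
}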
@@ -38,7 +38,6 @@ void ConvKernel<CPU, float>::Compute(const ConvParam &param) const {
Tensor filter = *param.Filter();
Tensor *output = param.Output();
output->mutable_data<float>();
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
@@ -78,6 +77,7 @@ void ConvKernel<CPU, float>::Compute(const ConvParam &param) const {
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
DLOG << " filter.dims() = " << filter.dims();
framework::DDim output_matrix_shape = {
output->dims()[1],
output->numel() / (output->dims()[0] * output->dims()[1])};
......
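The conv kernel hunk reshapes the filter to {out_channels, numel / out_channels} and the output to {channels, numel / (batch * channels)} so the convolution can run as a matrix multiply. A worked shape check with hypothetical dimensions:

#include <cassert>

int main() {
  // Hypothetical 16x8x3x3 filter flattens to 16 x (8*3*3) = 16 x 72,
  // matching filter_matrix_shape above.
  int filter_numel = 16 * 8 * 3 * 3;
  assert(filter_numel / 16 == 72);
  // Hypothetical 1x16x32x32 output flattens to 16 x (32*32) = 16 x 1024
  // per batch item, matching output_matrix_shape above.
  int output_numel = 1 * 16 * 32 * 32;
  assert(output_numel / (1 * 16) == 1024);
  return 0;
}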
@@ -207,7 +207,7 @@ class ConvParam : OpParam {
const Tensor *Input() const { return input_; }
const LoDTensor *Filter() const { return filter_; }
const Tensor *Filter() const { return filter_; }
Tensor *Output() const { return output_; }
@@ -222,7 +222,7 @@ class ConvParam : OpParam {
private:
Tensor *input_;
Tensor *output_;
LoDTensor *filter_;
Tensor *filter_;
vector<int> strides_;
vector<int> paddings_;
vector<int> dilations_;
@@ -717,10 +717,10 @@ class FushionFcParam : public OpParam {
public:
FushionFcParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope) {
input_x_ = InputXFrom<Tensor>(inputs, scope);
input_y_ = InputYFrom<Tensor>(inputs, scope);
input_z_ = InputZFrom<Tensor>(inputs, scope);
out_ = OutFrom<Tensor>(outputs, scope);
input_x_ = InputXFrom<LoDTensor>(inputs, scope);
input_y_ = InputYFrom<LoDTensor>(inputs, scope);
input_z_ = InputZFrom<LoDTensor>(inputs, scope);
out_ = OutFrom<LoDTensor>(outputs, scope);
x_num_col_dims_ = GetAttr<int>("x_num_col_dims", attrs);
y_num_col_dims_ = GetAttr<int>("y_num_col_dims", attrs);
axis_ = GetAttr<int>("axis", attrs);
......
@@ -11,11 +11,11 @@ ADD_EXECUTABLE(test-mul-op operators/test_mul_op.cpp test_helper.h test_includ
target_link_libraries(test-mul-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-elementwiseadd-op operators/test_elementwise_add_op.cpp test_helper.h test_include.h)
ADD_EXECUTABLE(test-elementwiseadd-op operators/test_elementwise_add_op.cpp test_helper.h test_include.h)
target_link_libraries(test-elementwiseadd-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-concat-op operators/test_concat_op.cpp test_helper.h test_include.h)
ADD_EXECUTABLE(test-concat-op operators/test_concat_op.cpp test_helper.h test_include.h)
target_link_libraries(test-concat-op paddle-mobile)
# gen test
......
@@ -21,6 +21,7 @@ limitations under the License. */
#include "common/log.h"
#include "framework/op_registry.h"
#include "operators/conv_op.h"
#include "operators/elementwise_add_op.h"
#include "operators/pool_op.h"
#include "operators/relu_op.h"
#include "operators/reshape_op.h"
@@ -37,6 +38,7 @@ using paddle_mobile::framework::Program;
using paddle_mobile::framework::Tensor;
using paddle_mobile::framework::Variable;
using std::string;
using std::vector;
template <typename DeviceType, typename OpType>
class Executor4Test : public Executor<DeviceType> {
public:
@@ -73,18 +75,34 @@ class Executor4Test : public Executor<DeviceType> {
}
}
std::shared_ptr<Tensor> predict(const Tensor &t, string input, string output,
const DDim &dDim) {
template <typename T = LoDTensor>
vector<std::shared_ptr<Tensor>> predict(const vector<Tensor> &ts,
const vector<string> &input_names,
const vector<string> &output_names,
const vector<DDim> &ddims) {
auto scope = this->program_.scope;
Variable *g_feed_value = scope->Var(input);
auto tensor = g_feed_value->GetMutable<LoDTensor>();
tensor->ShareDataWith(t);
size_t input_size = input_names.size();
size_t out_size = output_names.size();
Variable *con_output = scope->Var(output);
auto *output_tensor = con_output->GetMutable<LoDTensor>();
output_tensor->mutable_data<float>(dDim);
std::shared_ptr<LoDTensor> out_tensor = std::make_shared<LoDTensor>();
out_tensor.reset(output_tensor);
vector<Variable *> input_vars(input_size);
vector<LoDTensor *> input_tensors(input_size);
for (int i = 0; i < input_size; i++) {
input_vars[i] = scope->Var(input_names[i]);
input_tensors[i] = input_vars[i]->GetMutable<T>();
input_tensors[i]->ShareDataWith(ts[i]);
}
vector<Variable *> output_vars(out_size);
vector<LoDTensor *> output_tensors(out_size);
vector<std::shared_ptr<Tensor>> output_tensor_sptrs(out_size);
for (int i = 0; i < out_size; i++) {
output_vars[i] = scope->Var(output_names[i]);
output_tensors[i] = output_vars[i]->GetMutable<T>();
output_tensors[i]->mutable_data<float>(ddims[i]);
output_tensor_sptrs[i] = std::make_shared<LoDTensor>();
output_tensor_sptrs[i].reset(output_tensors[i]);
}
std::shared_ptr<paddle_mobile::framework::BlockDesc> to_predict_block =
this->to_predict_program_->Block(0);
@@ -94,6 +112,6 @@ class Executor4Test : public Executor<DeviceType> {
op->Run();
}
return out_tensor;
return output_tensor_sptrs;
}
};
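A note on the shared_ptr pattern the rewritten predict keeps from the old version: each output_tensor_sptrs[i] is make_shared'd and then immediately reset to a raw pointer that the Scope owns, which hands the shared_ptr ownership of memory it did not allocate. A non-owning alias is the usual alternative; a minimal sketch, assuming the Scope outlives the returned pointers:

#include <memory>

struct LoDTensor { /* elided */ };

// Non-owning shared_ptr: the no-op deleter leaves destruction to the
// Scope that actually owns the tensor.
std::shared_ptr<LoDTensor> NonOwning(LoDTensor *scope_owned) {
  return std::shared_ptr<LoDTensor>(scope_owned, [](LoDTensor *) {});
}

int main() {
  LoDTensor t;
  auto alias = NonOwning(&t);  // never deletes t
  return alias ? 0 : 1;
}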
@@ -68,27 +68,27 @@ class TestBatchNormOp {
// feed
auto scope = program_.scope;
Variable *x1_feed_value = scope->Var("conv2d_0.tmp_0");
auto tensor_x1 = x1_feed_value->GetMutable<Tensor>();
auto tensor_x1 = x1_feed_value->GetMutable<LoDTensor>();
tensor_x1->ShareDataWith(t1);
Variable *mean_feed_value = scope->Var("batch_norm_0.w_1");
auto tensor_mean = mean_feed_value->GetMutable<Tensor>();
auto tensor_mean = mean_feed_value->GetMutable<LoDTensor>();
tensor_mean->ShareDataWith(t2);
Variable *scale_feed_value = scope->Var("batch_norm_0.w_0");
auto tensor_scale = scale_feed_value->GetMutable<Tensor>();
auto tensor_scale = scale_feed_value->GetMutable<LoDTensor>();
tensor_scale->ShareDataWith(t3);
Variable *variance_feed_value = scope->Var("batch_norm_0.w_2");
auto tensor_variance = variance_feed_value->GetMutable<Tensor>();
auto tensor_variance = variance_feed_value->GetMutable<LoDTensor>();
tensor_variance->ShareDataWith(t4);
Variable *bias_feed_value = scope->Var("batch_norm_0.b_0");
auto tensor_bias = bias_feed_value->GetMutable<Tensor>();
auto tensor_bias = bias_feed_value->GetMutable<LoDTensor>();
tensor_bias->ShareDataWith(t5);
Variable *output = scope->Var("batch_norm_0.tmp_2");
auto *output_tensor = output->GetMutable<Tensor>();
auto *output_tensor = output->GetMutable<LoDTensor>();
output_tensor->mutable_data<float>({4, 10, 2, 2});
// DLOG << typeid(output_tensor).name();
// DLOG << "output_tensor dims: " << output_tensor->dims();
......
@@ -62,19 +62,19 @@ class TestBoxCoderOp {
// feed
auto scope = program_.scope;
Variable *prior_box = scope->Var("concat_0.tmp_0");
auto tensor_x1 = prior_box->GetMutable<Tensor>();
auto tensor_x1 = prior_box->GetMutable<LoDTensor>();
tensor_x1->ShareDataWith(t1);
Variable *prior_box_var = scope->Var("concat_1.tmp_0");
auto tensor_x2 = prior_box_var->GetMutable<Tensor>();
auto tensor_x2 = prior_box_var->GetMutable<LoDTensor>();
tensor_x2->ShareDataWith(t2);
Variable *target_box = scope->Var("concat_2.tmp_0");
auto tensor_x3 = target_box->GetMutable<Tensor>();
auto tensor_x3 = target_box->GetMutable<LoDTensor>();
tensor_x3->ShareDataWith(t3);
Variable *boxes_output = scope->Var("box_coder_0.tmp_0");
auto *boxes_output_tensor = boxes_output->GetMutable<Tensor>();
auto *boxes_output_tensor = boxes_output->GetMutable<LoDTensor>();
boxes_output_tensor->mutable_data<float>({1, 1917, 4});
// DLOG << typeid(output_tensor).name();
......
@@ -12,148 +12,64 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "../executor_for_test.h"
#include "../test_include.h"
#include "operators/concat_op.h"
namespace paddle_mobile {
namespace framework {
template <typename Dtype>
class TestConcatOp {
public:
explicit TestConcatOp(const Program<Dtype> p) : program_(p) {
if (use_optimize_) {
to_predict_program_ = program_.optimizeProgram;
} else {
to_predict_program_ = program_.originProgram;
}
const std::vector<std::shared_ptr<BlockDesc>> blocks =
to_predict_program_->Blocks();
// DLOG << " **block size " << blocks.size();
for (int i = 0; i < blocks.size(); ++i) {
std::shared_ptr<BlockDesc> block_desc = blocks[i];
std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
// DLOG << " ops " << ops.size();
for (int j = 0; j < ops.size(); ++j) {
std::shared_ptr<OpDesc> op = ops[j];
if (op->Type() == "concat" && op->Input("X")[0] == "conv2d_3.tmp_1") {
DLOG << " mul attr size: " << op->GetAttrMap().size();
DLOG << " inputs size: " << op->GetInputs().size();
DLOG << " outputs size: " << op->GetOutputs().size();
DLOG << " Input X is : " << op->Input("X")[0];
DLOG << " Output Out is : " << op->Output("Out")[0];
DLOG << " axis : " << op->GetAttrMap().at("axis").Get<int>();
std::shared_ptr<operators::ConcatOp<Dtype, float>> concat =
std::make_shared<operators::ConcatOp<Dtype, float>>(
op->Type(), op->GetInputs(), op->GetOutputs(),
op->GetAttrMap(), program_.scope);
ops_of_block_[*block_desc.get()].push_back(concat);
}
}
}
}
std::shared_ptr<Tensor> predict_concat(const Tensor &t1, const Tensor &t2,
const Tensor &t3, const Tensor &t4) {
// feed
auto scope = program_.scope;
Variable *x1_feed_value = scope->Var("conv2d_3.tmp_1");
auto tensor_x1 = x1_feed_value->GetMutable<Tensor>();
tensor_x1->ShareDataWith(t1);
Variable *x2_feed_value = scope->Var("conv2d_5.tmp_1");
auto tensor_x2 = x2_feed_value->GetMutable<Tensor>();
tensor_x2->ShareDataWith(t2);
Variable *x3_feed_value = scope->Var("conv2d_7.tmp_1");
auto tensor_x3 = x3_feed_value->GetMutable<Tensor>();
tensor_x3->ShareDataWith(t3);
Variable *x4_feed_value = scope->Var("conv2d_8.tmp_1");
auto tensor_x4 = x4_feed_value->GetMutable<Tensor>();
tensor_x4->ShareDataWith(t4);
Variable *con_output = scope->Var("concat_0.tmp_0");
auto *output_tensor = con_output->GetMutable<Tensor>();
output_tensor->mutable_data<float>({4, 100, 2, 2});
// DLOG << typeid(output_tensor).name();
// DLOG << "output_tensor dims: " << output_tensor->dims();
std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
out_tensor.reset(output_tensor);
predict_concat(t1, t2, t3, t4, 0);
return out_tensor;
}
private:
const framework::Program<Dtype> program_;
std::shared_ptr<ProgramDesc> to_predict_program_;
std::map<framework::BlockDesc,
std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
ops_of_block_;
bool use_optimize_ = false;
void predict_concat(const Tensor &t1, const Tensor &t2, const Tensor &t3,
const Tensor &t4, int block_id) {
std::shared_ptr<BlockDesc> to_predict_block =
to_predict_program_->Block(block_id);
for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
auto op = ops_of_block_[*to_predict_block.get()][j];
DLOG << "op -> run()";
op->Run();
}
}
};
template class TestConcatOp<CPU>;
} // namespace framework
} // namespace paddle_mobile
int main() {
DLOG << "----------**********----------";
DLOG << "begin to run ConcatOp Test";
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(std::string("../../test/models/googlenet"));
/// input x (4,10,2,2)
paddle_mobile::framework::Tensor inputx1;
SetupTensor<float>(&inputx1, {4, 10, 2, 2}, static_cast<float>(0),
static_cast<float>(1));
auto *inputx1_ptr = inputx1.data<float>();
/// input x (4,20,2,2)
paddle_mobile::framework::Tensor inputx2;
SetupTensor<float>(&inputx2, {4, 20, 2, 2}, static_cast<float>(0),
static_cast<float>(1));
auto *inputx2_ptr = inputx2.data<float>();
/// input x (4,30,2,2)
paddle_mobile::framework::Tensor inputx3;
SetupTensor<float>(&inputx3, {4, 30, 2, 2}, static_cast<float>(0),
static_cast<float>(1));
auto *inputx3_ptr = inputx3.data<float>();
/// input x (4,40,2,2)
paddle_mobile::framework::Tensor inputx4;
SetupTensor<float>(&inputx4, {4, 40, 2, 2}, static_cast<float>(0),
static_cast<float>(1));
auto *inputx4_ptr = inputx4.data<float>();
paddle_mobile::framework::TestConcatOp<paddle_mobile::CPU> testConcatOp(
program);
auto output_concat =
testConcatOp.predict_concat(inputx1, inputx2, inputx3, inputx4);
auto *output_concat_ptr = output_concat->data<float>();
auto program = loader.Load(g_googlenet);
PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
"program file read fail");
Executor4Test<paddle_mobile::CPU,
paddle_mobile::operators::ConcatOp<paddle_mobile::CPU, float>>
executor(program, "concat");
// 1. input_tensors;
vector<Tensor> input_tensors;
Tensor input1;
auto input1_data = CreateInput<float>(&input1, {4, 10, 2, 2}, 0, 1);
input_tensors.push_back(input1);
Tensor input2;
auto input2_data = CreateInput<float>(&input2, {4, 20, 2, 2}, 0, 1);
input_tensors.push_back(input2);
Tensor input3;
auto input3_data = CreateInput<float>(&input3, {4, 30, 2, 2}, 0, 1);
input_tensors.push_back(input3);
Tensor input4;
auto input4_data = CreateInput<float>(&input4, {4, 40, 2, 2}, 0, 1);
input_tensors.push_back(input4);
// 2. input_names
vector<string> input_names({
"conv2d_3.tmp_1",
"conv2d_5.tmp_1",
"conv2d_7.tmp_1",
"conv2d_8.tmp_1",
});
// 3. output_names
vector<string> output_names({"concat_0.tmp_0"});
// 4. out_dims;
vector<DDim> out_ddims;
auto out_ddim = paddle_mobile::framework::make_ddim({4, 100, 2, 2});
out_ddims.push_back(out_ddim);
auto output = executor.predict<LoDTensor>(input_tensors, input_names,
output_names, out_ddims);
auto output0_data = output[0]->data<float>();
// 5. test one example.
int input_n = 1;
int input_c = 2;
int input_h = 0;
int input_w = 1;
int stride0 = inputx3.numel() / inputx3.dims()[0];
int stride1 = inputx3.numel() / inputx3.dims()[0] / inputx3.dims()[1];
int stride2 = inputx3.dims()[3];
int stride0 = input3.numel() / input3.dims()[0];
int stride1 = input3.numel() / input3.dims()[0] / input3.dims()[1];
int stride2 = input3.dims()[3];
/// inputx1 (4,10,2,2),
/// inputx2 (4,20,2,2),
/// inputx3 (4,30,2,2),
@@ -163,10 +79,10 @@ int main() {
int input_index =
input_n * stride0 + input_c * stride1 + input_h * stride2 + input_w;
int output_index = input_n * 100 * 2 * 2 +
(input_c + inputx1.dims()[1] + inputx2.dims()[1]) * 2 * 2 +
(input_c + input1.dims()[1] + input2.dims()[1]) * 2 * 2 +
input_h * 2 + input_w;
DLOG << " inputx3[1,2,0,1] = " << inputx3_ptr[input_index];
DLOG << " output[1,12,0,1] = " << output_concat_ptr[output_index];
DLOG << " input3 [1, 2,0,1] = " << input3_data[input_index];
DLOG << " output [1,32,0,1] = " << output0_data[output_index];
return 0;
}
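The expected-value check at the end of this test walks NCHW strides by hand; a standalone sketch of the same arithmetic (shapes taken from the test itself):

#include <cassert>

int main() {
  // Element [n=1, c=2, h=0, w=1] of input3, shape (4, 30, 2, 2):
  int input_index = 1 * (30 * 2 * 2) + 2 * (2 * 2) + 0 * 2 + 1;  // = 129
  // After concat along axis 1, the first 10 + 20 channels come from
  // input1 and input2, so the same element lands at channel 2 + 30 = 32
  // of the (4, 100, 2, 2) output:
  int output_index = 1 * (100 * 2 * 2) + (2 + 10 + 20) * (2 * 2) + 0 * 2 + 1;
  assert(input_index == 129);
  assert(output_index == 529);
  return 0;
}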
@@ -12,133 +12,52 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "../executor_for_test.h"
#include "../test_include.h"
#include "operators/elementwise_add_op.h"
namespace paddle_mobile {
namespace framework {
template <typename Dtype>
class TestElementwiseAddOp {
public:
explicit TestElementwiseAddOp(const Program<Dtype> p) : program_(p) {
if (use_optimize_) {
to_predict_program_ = program_.optimizeProgram;
} else {
to_predict_program_ = program_.originProgram;
}
const std::vector<std::shared_ptr<BlockDesc>> blocks =
to_predict_program_->Blocks();
// DLOG << " **block size " << blocks.size();
for (int i = 0; i < blocks.size(); ++i) {
std::shared_ptr<BlockDesc> block_desc = blocks[i];
std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
// DLOG << " ops " << ops.size();
for (int j = 0; j < ops.size(); ++j) {
std::shared_ptr<OpDesc> op = ops[j];
if (op->Type() == "elementwise_add" &&
op->Input("X")[0] == "batch_norm_2.tmp_2") {
DLOG << " elementwise_add attr size: " << op->GetAttrMap().size();
DLOG << " inputs size: " << op->GetInputs().size();
DLOG << " outputs size: " << op->GetOutputs().size();
DLOG << " Input X is : " << op->Input("X")[0];
DLOG << " Input Y is : " << op->Input("Y")[0];
DLOG << " Output Out is : " << op->Output("Out")[0];
Attribute axis_attr = op->GetAttrMap().at("axis");
int axis = axis_attr.Get<int>();
DLOG << " Attr axis is : " << axis;
std::shared_ptr<operators::ElementwiseAddOp<Dtype, float>> add =
std::make_shared<operators::ElementwiseAddOp<Dtype, float>>(
op->Type(), op->GetInputs(), op->GetOutputs(),
op->GetAttrMap(), program_.scope);
ops_of_block_[*block_desc.get()].push_back(add);
}
}
}
}
std::shared_ptr<Tensor> predict_add(const Tensor &t1, const Tensor &t2) {
// feed
auto scope = program_.scope;
Variable *x_feed_value = scope->Var("batch_norm_2.tmp_2");
auto tensor_x = x_feed_value->GetMutable<Tensor>();
tensor_x->ShareDataWith(t1);
Variable *y_feed_value = scope->Var("batch_norm_0.tmp_3");
auto tensor_y = y_feed_value->GetMutable<Tensor>();
tensor_y->ShareDataWith(t2);
Variable *con_output = scope->Var("elementwise_add_0.tmp_0");
auto *output_tensor = con_output->GetMutable<Tensor>();
output_tensor->mutable_data<float>({1, 3, 224, 224});
// DLOG << typeid(output_tensor).name();
// DLOG << "output_tensor dims: " << output_tensor->dims();
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(g_resnet);
PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
"program file read fail");
std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
out_tensor.reset(output_tensor);
Executor4Test<paddle_mobile::CPU, paddle_mobile::operators::ElementwiseAddOp<
paddle_mobile::CPU, float>>
executor(program, "elementwise_add");
predict_add(t1, t2, 0);
return out_tensor;
}
// 1. input_tensors;
vector<Tensor> input_tensors;
private:
const framework::Program<Dtype> program_;
std::shared_ptr<ProgramDesc> to_predict_program_;
std::map<framework::BlockDesc,
std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
ops_of_block_;
bool use_optimize_ = false;
Tensor input1;
auto input1_data = CreateInput<float>(&input1, {1, 3, 224, 224}, 0, 1);
input_tensors.push_back(input1);
void predict_add(const Tensor &t1, const Tensor &t2, int block_id) {
std::shared_ptr<BlockDesc> to_predict_block =
to_predict_program_->Block(block_id);
for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
auto op = ops_of_block_[*to_predict_block.get()][j];
DLOG << "op -> run()";
op->Run();
}
}
};
Tensor input2;
auto input2_data = CreateInput<float>(&input2, {224}, 0, 1);
input_tensors.push_back(input2);
template class TestElementwiseAddOp<CPU>;
} // namespace framework
} // namespace paddle_mobile
int main() {
DLOG << "----------**********----------";
DLOG << "begin to run ElementAddOp Test";
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program =
loader.Load(std::string("../models/"
"image_classification_resnet.inference.model"));
// 2. input_names
vector<string> input_names({
"batch_norm_2.tmp_2",
"batch_norm_0.tmp_3",
});
/// input x (1,3,224,224)
paddle_mobile::framework::Tensor inputx;
SetupTensor<float>(&inputx, {1, 3, 224, 224}, static_cast<float>(0),
static_cast<float>(1));
auto *inputx_ptr = inputx.data<float>();
/// input y (224,)
paddle_mobile::framework::Tensor inputy;
SetupTensor<float>(&inputy, {224}, static_cast<float>(0),
static_cast<float>(1));
auto *inputy_ptr = inputy.data<float>();
// 3. output_names
vector<string> output_names({"elementwise_add_0.tmp_0"});
paddle_mobile::framework::TestElementwiseAddOp<paddle_mobile::CPU>
testElementwiseAddOp(program);
// 4. out_dims;
vector<DDim> out_ddims;
auto out_ddim = paddle_mobile::framework::make_ddim({1, 3, 224, 224});
out_ddims.push_back(out_ddim);
auto output_add = testElementwiseAddOp.predict_add(inputx, inputy);
auto *output_add_ptr = output_add->data<float>();
// for (int j = 0; j < output_add->numel(); ++j) {
// DLOG << "value of output: " << output_add_ptr[j];
// }
auto output = executor.predict<LoDTensor>(input_tensors, input_names,
output_names, out_ddims);
auto output0_data = output[0]->data<float>();
/// output (1,3,224,224)
DLOG << "output memory size : " << output_add->memory_size();
DLOG << "output numel : " << output_add->numel();
DLOG << "output memory size : " << output[0]->memory_size();
DLOG << "output numel : " << output[0]->numel();
DLOG << inputx_ptr[226] << " + " << inputy_ptr[2] << " = "
<< output_add_ptr[226];
return 0;
DLOG << input1_data[226] << " + " << input2_data[2] << " = "
<< output0_data[226];
}
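The final check pairs x's flat index 226 with y[2]; decoding the flat index shows why, assuming y broadcasts along the innermost axis (which is what the checked indices imply). A standalone sketch of the NCHW decode, shapes from the test:

#include <cassert>

int main() {
  // Flat index 226 in a (1, 3, 224, 224) NCHW tensor decodes to
  // n = 0, c = 0, h = 1, w = 2, so broadcasting the length-224 y along
  // the innermost axis pairs x[226] with y[2] -- the identity the
  // test's final DLOG prints.
  int idx = 226, W = 224, H = 224;
  int w = idx % W;        // 2
  int h = (idx / W) % H;  // 1
  assert(w == 2 && h == 1);
  return 0;
}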
@@ -64,24 +64,24 @@ class TestFcOp {
// feed
auto scope = program_.scope;
Variable *x_feed_value = scope->Var("pool2d_13.tmp_0");
auto tensor_x = x_feed_value->GetMutable<Tensor>();
auto tensor_x = x_feed_value->GetMutable<LoDTensor>();
tensor_x->ShareDataWith(t1);
Variable *y_feed_value = scope->Var("loss3_classifier-loc_weights");
auto tensor_y = y_feed_value->GetMutable<Tensor>();
auto tensor_y = y_feed_value->GetMutable<LoDTensor>();
tensor_y->ShareDataWith(t2);
Variable *z_feed_value = scope->Var("loss3_classifier-loc_biases");
auto tensor_z = z_feed_value->GetMutable<Tensor>();
auto tensor_z = z_feed_value->GetMutable<LoDTensor>();
tensor_z->ShareDataWith(t3);
Variable *con_output = scope->Var("loss3_classifier-loc.tmp_1");
auto *output_tensor = con_output->GetMutable<Tensor>();
auto *output_tensor = con_output->GetMutable<LoDTensor>();
output_tensor->mutable_data<float>({3, 10});
// DLOG << typeid(output_tensor).name();
// DLOG << "output_tensor dims: " << output_tensor->dims();
std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
std::shared_ptr<LoDTensor> out_tensor = std::make_shared<LoDTensor>();
out_tensor.reset(output_tensor);
predict(t1, t2, t3, 0);
@@ -130,17 +130,17 @@ int main() {
}
/// input x (1,3,224,224)
paddle_mobile::framework::Tensor inputx;
paddle_mobile::framework::LoDTensor inputx;
SetupTensor<float>(&inputx, {3, 64, 1, 1}, static_cast<float>(1),
static_cast<float>(1));
auto *inputx_ptr = inputx.data<float>();
/// input y (224,)
paddle_mobile::framework::Tensor inputy;
paddle_mobile::framework::LoDTensor inputy;
SetupTensor<float>(&inputy, {64, 10}, static_cast<float>(1.5),
static_cast<float>(1.5));
auto *inputy_ptr = inputy.data<float>();
paddle_mobile::framework::Tensor inputz;
paddle_mobile::framework::LoDTensor inputz;
SetupTensor<float>(&inputz, {10}, static_cast<float>(0),
static_cast<float>(1));
auto *inputz_ptr = inputz.data<float>();
......
@@ -12,118 +12,51 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "../executor_for_test.h"
#include "../test_include.h"
#include "operators/lrn_op.h"
namespace paddle_mobile {
namespace framework {
template <typename Dtype>
class TestLrnOp {
public:
explicit TestLrnOp(const Program<Dtype> p) : program_(p) {
if (use_optimize_) {
to_predict_program_ = program_.optimizeProgram;
} else {
to_predict_program_ = program_.originProgram;
}
const std::vector<std::shared_ptr<BlockDesc>> blocks =
to_predict_program_->Blocks();
// DLOG << " **block size " << blocks.size();
for (int i = 0; i < blocks.size(); ++i) {
std::shared_ptr<BlockDesc> block_desc = blocks[i];
std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
// DLOG << " ops " << ops.size();
for (int j = 0; j < ops.size(); ++j) {
std::shared_ptr<OpDesc> op = ops[j];
if (op->Type() == "lrn" && op->Input("X")[0] == "pool2d_0.tmp_0") {
DLOG << " mul attr size: " << op->GetAttrMap().size();
DLOG << " inputs size: " << op->GetInputs().size();
DLOG << " outputs size: " << op->GetOutputs().size();
DLOG << " Input X is : " << op->Input("X")[0];
DLOG << " Output Out is : " << op->Output("Out")[0];
DLOG << " n : " << op->GetAttrMap().at("n").Get<int>();
DLOG << " alpha : " << op->GetAttrMap().at("alpha").Get<float>();
DLOG << " beta : " << op->GetAttrMap().at("beta").Get<float>();
DLOG << " k : " << op->GetAttrMap().at("k").Get<float>();
std::shared_ptr<operators::LrnOp<Dtype, float>> lrn =
std::make_shared<operators::LrnOp<Dtype, float>>(
op->Type(), op->GetInputs(), op->GetOutputs(),
op->GetAttrMap(), program_.scope);
ops_of_block_[*block_desc.get()].push_back(lrn);
}
}
}
}
std::shared_ptr<Tensor> predict_lrn(const Tensor &t1) {
// feed
auto scope = program_.scope;
Variable *x1_feed_value = scope->Var("pool2d_0.tmp_0");
auto tensor_x1 = x1_feed_value->GetMutable<Tensor>();
tensor_x1->ShareDataWith(t1);
Variable *con_output = scope->Var("pool1_norm1.tmp_1");
auto *output_tensor = con_output->GetMutable<Tensor>();
output_tensor->mutable_data<float>({3, 4, 2, 2});
// DLOG << typeid(output_tensor).name();
// DLOG << "output_tensor dims: " << output_tensor->dims();
std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
out_tensor.reset(output_tensor);
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(g_googlenet);
PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
"program file read fail");
predict_lrn(t1, 0);
return out_tensor;
}
Executor4Test<paddle_mobile::CPU,
paddle_mobile::operators::LrnOp<paddle_mobile::CPU, float>>
executor(program, "lrn");
private:
const framework::Program<Dtype> program_;
std::shared_ptr<ProgramDesc> to_predict_program_;
std::map<framework::BlockDesc,
std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
ops_of_block_;
bool use_optimize_ = false;
// 1. input_tensors;
vector<Tensor> input_tensors;
void predict_lrn(const Tensor &t1, int block_id) {
std::shared_ptr<BlockDesc> to_predict_block =
to_predict_program_->Block(block_id);
for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
auto op = ops_of_block_[*to_predict_block.get()][j];
DLOG << "op -> run()";
op->Run();
}
}
};
Tensor input1;
auto input1_data = CreateInput<float>(&input1, {3, 4, 2, 2}, 0, 1);
input_tensors.push_back(input1);
template class TestLrnOp<CPU>;
} // namespace framework
} // namespace paddle_mobile
// 2. input_names
vector<string> input_names({
"pool2d_0.tmp_0",
});
int main() {
DLOG << "----------**********----------";
DLOG << "begin to run LrnOp Test";
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(std::string("../../test/models/googlenet"));
// 3. output_names
vector<string> output_names({"pool1_norm1.tmp_1"});
/// input x (3,4,2,2)
paddle_mobile::framework::Tensor inputx1;
SetupTensor<float>(&inputx1, {3, 4, 2, 2}, static_cast<float>(0),
static_cast<float>(1));
auto *inputx1_ptr = inputx1.data<float>();
// 4. out_dims;
vector<DDim> out_ddims;
auto out_ddim = paddle_mobile::framework::make_ddim({3, 4, 2, 2});
out_ddims.push_back(out_ddim);
paddle_mobile::framework::TestLrnOp<paddle_mobile::CPU> testLrnOp(program);
auto output = executor.predict<LoDTensor>(input_tensors, input_names,
output_names, out_ddims);
auto output_lrn = testLrnOp.predict_lrn(inputx1);
auto *output_lrn_ptr = output_lrn->data<float>();
auto output0_data = output[0]->data<float>();
DLOG << " LrnOp input: ";
for (int i = 0; i < 3; i++) {
for (int j = 0; j < 4; j++) {
for (int c = 0; c < 2; c++) {
for (int d = 0; d < 2; d++) {
DLOGF("%f ", inputx1_ptr[i * 16 + j * 4 + c * 2 + d]);
DLOGF("%f ", input1_data[i * 16 + j * 4 + c * 2 + d]);
}
DLOGF("\n");
}
@@ -136,7 +69,7 @@ int main() {
for (int j = 0; j < 4; j++) {
for (int c = 0; c < 2; c++) {
for (int d = 0; d < 2; d++) {
DLOGF("%f ", output_lrn_ptr[i * 16 + j * 4 + c * 2 + d]);
DLOGF("%f ", output0_data[i * 16 + j * 4 + c * 2 + d]);
}
DLOGF("\n");
}
@@ -144,8 +77,8 @@ int main() {
}
DLOGF("\n");
}
DLOG << inputx1_ptr[0] << " / ((1 + 0.00002 * ( " << inputx1_ptr[0] << "^2 + "
<< inputx1_ptr[4] << "^2 + " << inputx1_ptr[8] << "^2 ))^0.75) = ";
DLOG << output_lrn_ptr[0];
DLOG << input1_data[0] << " / ((1 + 0.00002 * ( " << input1_data[0] << "^2 + "
<< input1_data[4] << "^2 + " << input1_data[8] << "^2 ))^0.75) = ";
DLOG << output0_data[0];
return 0;
}
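The closing hand-check instantiates the cross-channel LRN formula at c = 0 with k = 1, alpha = 2e-5, beta = 0.75 (the constants printed in the DLOG). In LaTeX, with C channels and window size n:

b_{i,c,h,w} = \frac{a_{i,c,h,w}}{\left(k + \alpha \sum_{c'=\max(0,\,c-n/2)}^{\min(C-1,\,c+n/2)} a_{i,c',h,w}^{2}\right)^{\beta}}

For the (3, 4, 2, 2) input the channel stride is 4, so a_{0,0,0,0}, a_{0,1,0,0}, a_{0,2,0,0} are input1_data[0], [4], [8], matching the three squared terms in the log.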
@@ -12,158 +12,81 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "../executor_for_test.h"
#include "../test_include.h"
#include "operators/mul_op.h"
namespace paddle_mobile {
namespace framework {
template <typename Dtype>
class TestMulOp {
public:
explicit TestMulOp(const Program<Dtype> p) : program_(p) {
if (use_optimize_) {
to_predict_program_ = program_.optimizeProgram;
} else {
to_predict_program_ = program_.originProgram;
}
const std::vector<std::shared_ptr<BlockDesc>> blocks =
to_predict_program_->Blocks();
// DLOG << " **block size " << blocks.size();
for (int i = 0; i < blocks.size(); ++i) {
std::shared_ptr<BlockDesc> block_desc = blocks[i];
std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
// DLOG << " ops " << ops.size();
for (int j = 0; j < ops.size(); ++j) {
std::shared_ptr<OpDesc> op = ops[j];
if (op->Type() == "mul" && op->Input("X")[0] == "pool2d_0.tmp_0") {
DLOG << " mul attr size: " << op->GetAttrMap().size();
DLOG << " inputs size: " << op->GetInputs().size();
DLOG << " outputs size: " << op->GetOutputs().size();
DLOG << " Input X is : " << op->Input("X")[0];
DLOG << " Input Y is : " << op->Input("Y")[0];
DLOG << " Output Out is : " << op->Output("Out")[0];
DLOG << "x_num_col_dims : "
<< op->GetAttrMap().at("x_num_col_dims").Get<int>();
DLOG << "y_num_col_dims : "
<< op->GetAttrMap().at("y_num_col_dims").Get<int>();
std::shared_ptr<operators::MulOp<Dtype, float>> mul =
std::make_shared<operators::MulOp<Dtype, float>>(
op->Type(), op->GetInputs(), op->GetOutputs(),
op->GetAttrMap(), program_.scope);
ops_of_block_[*block_desc.get()].push_back(mul);
}
}
}
}
std::shared_ptr<Tensor> predict_mul(const Tensor &t1, const Tensor &t2) {
// feed
auto scope = program_.scope;
Variable *x_feed_value = scope->Var("pool2d_0.tmp_0");
auto tensor_x = x_feed_value->GetMutable<Tensor>();
tensor_x->ShareDataWith(t1);
Variable *y_feed_value = scope->Var("fc_0.w_0");
auto tensor_y = y_feed_value->GetMutable<Tensor>();
tensor_y->ShareDataWith(t2);
Variable *con_output = scope->Var("fc_0.tmp_0");
auto *output_tensor = con_output->GetMutable<Tensor>();
output_tensor->mutable_data<float>({3, 3});
// DLOG << typeid(output_tensor).name();
// DLOG << "output_tensor dims: " << output_tensor->dims();
std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
out_tensor.reset(output_tensor);
predict_mul(t1, t2, 0);
return out_tensor;
}
private:
const framework::Program<Dtype> program_;
std::shared_ptr<ProgramDesc> to_predict_program_;
std::map<framework::BlockDesc,
std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
ops_of_block_;
bool use_optimize_ = false;
void predict_mul(const Tensor &t1, const Tensor &t2, int block_id) {
std::shared_ptr<BlockDesc> to_predict_block =
to_predict_program_->Block(block_id);
for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
auto op = ops_of_block_[*to_predict_block.get()][j];
DLOG << "op -> run()";
op->Run();
}
}
};
template class TestMulOp<CPU>;
} // namespace framework
} // namespace paddle_mobile
int main() {
DLOG << "----------**********----------";
DLOG << "begin to run MulOp Test";
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program =
loader.Load(std::string("../../test/models/"
"image_classification_resnet.inference.model"));
/// input x (3,2,1,1)
paddle_mobile::framework::Tensor inputx;
SetupTensor<float>(&inputx, {3, 2, 1, 1}, static_cast<float>(0),
static_cast<float>(1));
auto *inputx_ptr = inputx.data<float>();
/// input y (2,3)
paddle_mobile::framework::Tensor inputy;
SetupTensor<float>(&inputy, {2, 3}, static_cast<float>(0),
static_cast<float>(1));
auto *inputy_ptr = inputy.data<float>();
paddle_mobile::framework::TestMulOp<paddle_mobile::CPU> testMulOp(program);
auto output_mul = testMulOp.predict_mul(inputx, inputy);
auto *output_mul_ptr = output_mul->data<float>();
auto dimx_1 = inputx.numel() / inputx.dims()[0];
DLOG << " inputx : ";
for (int i = 0; i < inputx.dims()[0]; ++i) {
for (int j = 0; j < dimx_1; ++j) {
DLOGF("%f ", inputx_ptr[i * dimx_1 + j]);
auto program = loader.Load(g_resnet);
PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
"program file read fail");
Executor4Test<paddle_mobile::CPU,
paddle_mobile::operators::MulOp<paddle_mobile::CPU, float>>
executor(program, "mul");
// 1. input_tensors;
vector<Tensor> input_tensors;
Tensor input1;
auto input1_data = CreateInput<float>(&input1, {3, 2, 1, 1}, 0, 1);
input_tensors.push_back(input1);
Tensor input2;
auto input2_data = CreateInput<float>(&input2, {2, 3}, 0, 1);
input_tensors.push_back(input2);
// 2. input_names
vector<string> input_names({
"pool2d_0.tmp_0",
"fc_0.w_0",
});
// 3. output_names
vector<string> output_names({"fc_0.tmp_0"});
// 4. out_dims;
vector<DDim> out_ddims;
auto out_ddim = paddle_mobile::framework::make_ddim({3, 3});
out_ddims.push_back(out_ddim);
auto output = executor.predict<LoDTensor>(input_tensors, input_names,
output_names, out_ddims);
auto output0_data = output[0]->data<float>();
auto dim_1 = input1.numel() / input1.dims()[0];
DLOG << " input1 : ";
for (int i = 0; i < input1.dims()[0]; ++i) {
for (int j = 0; j < dim_1; ++j) {
DLOGF("%f ", input1_data[i * dim_1 + j]);
}
DLOGF("\n");
}
auto dimy_1 = inputy.numel() / inputy.dims()[0];
DLOG << " inputy : ";
for (int i = 0; i < inputy.dims()[0]; ++i) {
for (int j = 0; j < dimy_1; ++j) {
DLOGF("%f ", inputy_ptr[i * dimx_1 + j]);
auto dim_2 = input2.numel() / input2.dims()[0];
DLOG << " input2 : ";
for (int i = 0; i < input2.dims()[0]; ++i) {
for (int j = 0; j < dim_2; ++j) {
DLOGF("%f ", input2_data[i * dim_2 + j]);
}
DLOGF("\n");
}
auto dim_output_1 = output_mul->numel() / output_mul->dims()[0];
auto dim_output0 = output[0]->numel() / output[0]->dims()[0];
DLOG << " output : ";
for (int i = 0; i < output_mul->dims()[0]; ++i) {
for (int j = 0; j < dim_output_1; ++j) {
DLOGF("%f ", output_mul_ptr[i * dimy_1 + j]);
for (int i = 0; i < output[0]->dims()[0]; ++i) {
for (int j = 0; j < dim_output0; ++j) {
DLOGF("%f ", output0_data[i * dim_2 + j]);
}
DLOGF("\n");
}
/// output (3,3)
DLOG << "output memory size : " << output_mul->memory_size();
DLOG << "output numel : " << output_mul->numel();
DLOG << "output memory size : " << output[0]->memory_size();
DLOG << "output numel : " << output[0]->numel();
DLOG << inputx_ptr[0] << " x " << inputy_ptr[0] << " + " << inputx_ptr[1]
<< " x " << inputy_ptr[0 + 3] << " = " << output_mul_ptr[0];
DLOG << input1_data[0] << " x " << input2_data[0] << " + " << input1_data[1]
<< " x " << input2_data[0 + 3] << " = " << output0_data[0];
return 0;
}
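The last DLOG checks out[0][0] of the (3,2) x (2,3) product, where x is input1 flattened from (3,2,1,1). A standalone sketch of the same identity with placeholder values:

#include <cstdio>

int main() {
  // Placeholder values; the test fills x and y with uniform random data.
  float x[3][2] = {{1, 2}, {3, 4}, {5, 6}};
  float y[2][3] = {{7, 8, 9}, {10, 11, 12}};
  // out[0][0] = x[0][0]*y[0][0] + x[0][1]*y[1][0], which is what the test
  // prints: input1_data[0] x input2_data[0] + input1_data[1] x
  // input2_data[3] (y[1][0] sits at flat offset 0 + 3).
  float out00 = x[0][0] * y[0][0] + x[0][1] * y[1][0];
  std::printf("%.1f\n", out00);  // 1*7 + 2*10 = 27
  return 0;
}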
@@ -77,15 +77,15 @@ class TestMultiClassNMSOp {
// feed
auto scope = program_.scope;
Variable *x1_feed_value = scope->Var("box_coder_0.tmp_0");
auto tensor_x1 = x1_feed_value->GetMutable<Tensor>();
auto tensor_x1 = x1_feed_value->GetMutable<LoDTensor>();
tensor_x1->ShareDataWith(t1);
Variable *x2_feed_value = scope->Var("transpose_12.tmp_0");
auto tensor_x2 = x2_feed_value->GetMutable<Tensor>();
auto tensor_x2 = x2_feed_value->GetMutable<LoDTensor>();
tensor_x2->ShareDataWith(t2);
Variable *output = scope->Var("detection_output_0.tmp_0");
auto *output_tensor = output->GetMutable<Tensor>();
auto *output_tensor = output->GetMutable<LoDTensor>();
output_tensor->mutable_data<float>({1917, 6});
// DLOG << typeid(output_tensor).name();
......
@@ -72,19 +72,19 @@ class TestPriorBoxOp {
// feed
auto scope = program_.scope;
Variable *x1_feed_value = scope->Var("image");
auto tensor_x1 = x1_feed_value->GetMutable<Tensor>();
auto tensor_x1 = x1_feed_value->GetMutable<LoDTensor>();
tensor_x1->ShareDataWith(t1);
Variable *x2_feed_value = scope->Var("batch_norm_26.tmp_3");
auto tensor_x2 = x2_feed_value->GetMutable<Tensor>();
auto tensor_x2 = x2_feed_value->GetMutable<LoDTensor>();
tensor_x2->ShareDataWith(t2);
Variable *boxes_output = scope->Var("prior_box_1.tmp_0");
auto *boxes_output_tensor = boxes_output->GetMutable<Tensor>();
auto *boxes_output_tensor = boxes_output->GetMutable<LoDTensor>();
boxes_output_tensor->mutable_data<float>({10, 10, 6, 4});
Variable *variances_output = scope->Var("prior_box_1.tmp_1");
auto *variances_output_tensor = variances_output->GetMutable<Tensor>();
auto *variances_output_tensor = variances_output->GetMutable<LoDTensor>();
variances_output_tensor->mutable_data<float>({10, 10, 6, 4});
// DLOG << typeid(output_tensor).name();
// DLOG << "output_tensor dims: " << output_tensor->dims();
......
@@ -14,12 +14,11 @@ limitations under the License. */
#include "../executor_for_test.h"
#include "../test_include.h"
#include "operators/relu_op.h"
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
// ../models/image_classification_resnet.inference.model
auto program = loader.Load(g_mobilenet_ssd);
auto program = loader.Load(g_resnet);
PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
"program file read fail");
@@ -27,17 +26,33 @@ int main() {
paddle_mobile::operators::ReluOp<paddle_mobile::CPU, float>>
executor(program, "relu");
paddle_mobile::framework::Tensor input;
SetupTensor<float>(&input, {1, 2, 3, 4}, static_cast<float>(-1),
static_cast<float>(1));
// 1. input_tensors;
vector<Tensor> input_tensors;
Tensor input1;
auto input1_data = CreateInput<float>(&input1, {1, 2, 3, 4}, -1, 1);
input_tensors.push_back(input1);
// 2. input_names
vector<string> input_names({
"batch_norm_0.tmp_2",
});
// 3. output_names
vector<string> output_names({"batch_norm_0.tmp_3"});
// 4. out_dims;
vector<DDim> out_ddims;
auto out_ddim = paddle_mobile::framework::make_ddim({1, 2, 3, 4});
auto output = executor.predict(input, "batch_norm_0.tmp_2",
"batch_norm_0.tmp_3", out_ddim);
out_ddims.push_back(out_ddim);
auto output = executor.predict<LoDTensor>(input_tensors, input_names,
output_names, out_ddims);
auto output0_data = output[0]->data<float>();
auto output_ptr = output->data<float>();
for (int j = 0; j < output->numel(); ++j) {
DLOG << " value of output: " << output_ptr[j];
for (int j = 0; j < output[0]->numel(); ++j) {
DLOG << " value of output: " << output0_data[j];
}
return 0;
}
@@ -29,7 +29,8 @@ static const std::string g_resnet =
"../models/image_classification_resnet.inference.model";
static const std::string g_test_image_1x3x224x224 =
"../images/test_image_1x3x224x224_float";
using paddle_mobile::framework::DDim;
using paddle_mobile::framework::Tensor;
template <typename T>
void SetupTensor(paddle_mobile::framework::Tensor *input,
paddle_mobile::framework::DDim dims, T lower, T upper) {
@@ -43,6 +44,12 @@ void SetupTensor(paddle_mobile::framework::Tensor *input,
}
}
template <typename T>
T *CreateInput(Tensor *input, DDim dims, T low, T up) {
SetupTensor<T>(input, dims, static_cast<T>(low), static_cast<T>(up));
return input->data<T>();
}
template <typename T>
void GetInput(const std::string &input_name, std::vector<T> *input,
const std::vector<int64_t> &dims) {
......
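The new CreateInput helper just wraps SetupTensor and returns the tensor's data pointer, which is what lets the rewritten tests keep a handle on their random inputs. A framework-free sketch of the same pattern (CreateInputSketch is hypothetical; the real helper operates on paddle_mobile::framework::Tensor):

#include <cstddef>
#include <random>
#include <vector>

// Fill a buffer with uniform random values in [low, up) and return the
// raw data pointer, mirroring SetupTensor + CreateInput above.
template <typename T>
T *CreateInputSketch(std::vector<T> *buf, std::size_t n, T low, T up) {
  static std::mt19937 gen(std::random_device{}());
  std::uniform_real_distribution<T> dist(low, up);
  buf->resize(n);
  for (auto &v : *buf) v = dist(gen);
  return buf->data();
}

int main() {
  std::vector<float> input;
  float *input_data = CreateInputSketch(&input, 1 * 3 * 224 * 224, 0.f, 1.f);
  return input_data != nullptr ? 0 : 1;
}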