diff --git a/src/operators/elementwise_mul_op.h b/src/operators/elementwise_mul_op.h
index 04454dc5a5cb9c1c167f6d496827483a58dbfaf1..991b03a486d65c720b88b80a1aece417b9919d3d 100644
--- a/src/operators/elementwise_mul_op.h
+++ b/src/operators/elementwise_mul_op.h
@@ -48,13 +48,4 @@ class ElementwiseMulOp : public framework::OperatorWithKernel<
 }  // namespace operators
 }  // namespace paddle_mobile
 
-#ifdef PADDLE_MOBILE_CPU
-USE_OP_CPU(elementwise_mul);
-#endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-USE_OP_MALI_GPU(elementwise_mul);
-#endif
-#ifdef PADDLE_MOBILE_FPGA
-#endif
-
 #endif
diff --git a/src/operators/sum_op.h b/src/operators/sum_op.h
index 4ae960d084b3bbb1952251e9c07f9fca0beab1f3..aad8e8322b60d0e931215c9d48d97862f9b14107 100644
--- a/src/operators/sum_op.h
+++ b/src/operators/sum_op.h
@@ -46,14 +46,4 @@ class SumOp : public framework::OperatorWithKernel<
 }  // namespace operators
 }  // namespace paddle_mobile
 
-#ifdef PADDLE_MOBILE_CPU
-USE_OP_CPU(sum);
-#endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-USE_OP_MALI_GPU(sum);
-#endif
-#ifdef PADDLE_MOBILE_FPGA
-USE_OP_FPGA(sum);
-#endif
-
 #endif
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index a95748b78c6b3d758cbc8381ac8f6815a6b2c2b6..ad3b2e5fe95abbec76380af3addca7b769ba3e34 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -212,6 +212,10 @@ if (NOT FOUND_MATCH)
     ADD_EXECUTABLE(test-fc-op operators/test_fusion_fc_op.cpp test_helper.h test_include.h)
     target_link_libraries(test-fc-op paddle-mobile)
 
+    # test sum op
+    ADD_EXECUTABLE(test-sum-op operators/test_sum_op.cpp test_helper.h test_include.h)
+    target_link_libraries(test-sum-op paddle-mobile)
+
     # test quantize op
     ADD_EXECUTABLE(test-quantize-op operators/test_quantize_op.cpp test_helper.h test_include.h)
     target_link_libraries(test-quantize-op paddle-mobile)
diff --git a/test/operators/test_sum_op.cpp b/test/operators/test_sum_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e51d1cff5e99c5d9c444db046e78eee6a03f9243
--- /dev/null
+++ b/test/operators/test_sum_op.cpp
@@ -0,0 +1,133 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "../test_helper.h"
+#include "../test_include.h"
+#include "operators/sum_op.h"
+
+namespace paddle_mobile {
+namespace framework {
+
+// Test harness: builds every "sum" op whose first X input is "fc_2.tmp_0"
+// and runs it in isolation on tensors fed directly into the program scope.
+template <typename Dtype>
+class TestSumOp {
+ public:
+  // Collects the matching sum ops of the origin program (or of the optimized
+  // program when use_optimize_ is set), keyed by the block they belong to.
+  explicit TestSumOp(const Program<Dtype> &p) : program_(p) {
+    if (use_optimize_) {
+      to_predict_program_ = program_.optimizeProgram;
+    } else {
+      to_predict_program_ = program_.originProgram;
+    }
+
+    for (const auto &block_desc : to_predict_program_->Blocks()) {
+      for (const auto &op : block_desc->Ops()) {
+        // Skip everything but the sum op under test; the emptiness check
+        // avoids indexing into an input list with no entries.
+        if (op->Type() != "sum" || op->Input("X").empty() ||
+            op->Input("X")[0] != "fc_2.tmp_0") {
+          continue;
+        }
+        DLOG << " sum attr size: " << op->GetAttrMap().size();
+        DLOG << " inputs size: " << op->GetInputs().size();
+        DLOG << " outputs size: " << op->GetOutputs().size();
+
+        auto sum_op = std::make_shared<operators::SumOp<Dtype, float>>(
+            op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(),
+            program_.scope);
+        ops_of_block_[*block_desc].push_back(sum_op);
+      }
+    }
+  }
+
+  // Feeds t1/t2 as "fc_2.tmp_0"/"fc_2.tmp_1", runs the collected ops of
+  // block 0 and returns the "fc_2.tmp_2" tensor. The result is a non-owning
+  // handle: it must not outlive the program scope. The name predict_bn is
+  // kept unchanged for callers.
+  std::shared_ptr<Tensor> predict_bn(const Tensor &t1, const Tensor &t2) {
+    // feed the two inputs
+    auto scope = program_.scope;
+    Variable *x1_var = scope->Var("fc_2.tmp_0");
+    x1_var->GetMutable<LoDTensor>()->ShareDataWith(t1);
+    Variable *x2_var = scope->Var("fc_2.tmp_1");
+    x2_var->GetMutable<LoDTensor>()->ShareDataWith(t2);
+
+    Variable *out_var = scope->Var("fc_2.tmp_2");
+    auto *output_tensor = out_var->GetMutable<LoDTensor>();
+    output_tensor->mutable_data<float>({2, 96});
+
+    predict_bn(t1, t2, 0);
+
+    // The tensor comes from a scope variable, which presumably owns it, so
+    // the returned shared_ptr uses a no-op deleter to avoid a double free.
+    return std::shared_ptr<Tensor>(output_tensor, [](Tensor *) {});
+  }
+
+ private:
+  const framework::Program<Dtype> program_;
+  std::shared_ptr<ProgramDesc> to_predict_program_;
+  std::map<framework::BlockDesc,
+           std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
+      ops_of_block_;
+  bool use_optimize_ = false;
+
+  // Runs, in insertion order, the ops collected for block `block_id`. t1/t2
+  // are not read here; the public overload already fed them to the scope.
+  void predict_bn(const Tensor &t1, const Tensor &t2, int block_id) {
+    std::shared_ptr<BlockDesc> to_predict_block =
+        to_predict_program_->Block(block_id);
+    for (const auto &op : ops_of_block_[*to_predict_block]) {
+      DLOG << "op -> run()";
+      op->Run();
+    }
+  }
+};
+
+template class TestSumOp<CPU>;
+}  // namespace framework
+}  // namespace paddle_mobile
+
+int main() {
+  using paddle_mobile::CPU;
+  using paddle_mobile::framework::Tensor;
+  DLOG << "----------**********----------";
+  DLOG << "begin to run Sum Test";
+
+  // load the program from the g_eng model directory
+  const std::string model_dir(g_eng);
+  paddle_mobile::Loader<CPU> loader;
+  auto program = loader.Load(model_dir + "/model", model_dir + "/params");
+
+  // two {2, 96} float inputs, each set up by SetupTensor with arguments 0, 1
+  Tensor lhs;
+  SetupTensor<float>(&lhs, {2, 96}, 0.0f, 1.0f);
+  Tensor rhs;
+  SetupTensor<float>(&rhs, {2, 96}, 0.0f, 1.0f);
+  const float *lhs_data = lhs.data<float>();
+  const float *rhs_data = rhs.data<float>();
+
+  paddle_mobile::framework::TestSumOp<CPU> testSumOp(program);
+  auto output_sum = testSumOp.predict_bn(lhs, rhs);
+  const float *sum_data = output_sum->data<float>();
+
+  // log element 44 of both inputs and of the output
+  DLOG << "input1 44: " << lhs_data[44];
+  DLOG << "input2 44: " << rhs_data[44];
+  DLOG << "out 44 :" << sum_data[44];
+  return 0;
+}
diff --git a/test/test_helper.h b/test/test_helper.h
index ecbc251a815e343f75b1247ffc430e9c52d6abfd..03ee27d71d58eb5c727172a8112aeedfde244d0f 100644
--- a/test/test_helper.h
+++ b/test/test_helper.h
@@ -27,6 +27,7 @@ limitations under the License. */
 static const char *g_ocr = "../models/ocr";
 static const char *g_mobilenet_ssd = "../models/mobilenet+ssd";
 static const char *g_genet_combine = "../models/enet";
+static const char *g_eng = "../models/eng_20conv_1_9_fc";
 static const char *g_mobilenet_ssd_gesture = "../models/mobilenet+ssd_gesture";
 static const char *g_mobilenet_combined = "../models/mobilenet_combine";
 static const char *g_googlenetv1_combined = "../models/googlenetv1_combine";
@@ -51,6 +52,7 @@ static const char *g_test_image_1x3x224x224_banana =
 static const char *g_test_image_desktop_1_3_416_416_nchw_float =
     "../images/in_put_1_3_416_416_2";
 static const char *g_hand = "../images/hand_image";
+static const char *g_moto = "../images/moto_300x300_float";
 static const char *g_imgfssd_ar = "../images/test_image_ssd_ar";
 static const char *g_imgfssd_ar1 = "../images/003_0001.txt";
 static const char *g_img = "../images/img.bin";
diff --git a/tools/op.cmake b/tools/op.cmake
index 898f66a634d70a5def7c7ce328a7a291d9b55c70..5d5567a524ae69bfb4668ff6078621eb4cb5920d 100644
--- a/tools/op.cmake
+++ b/tools/op.cmake
@@ -220,6 +220,8 @@ if(NOT FOUND_MATCH)
   set(SPLIT_OP ON)
   set(FLATTEN_OP ON)
   set(SHAPE_OP ON)
+  set(ELEMENTWISEMUL_OP ON)
+  set(SUM_OP ON)
 endif()
 
   # option(BATCHNORM_OP "" ON)
@@ -388,3 +390,11 @@ endif()
 if (SHAPE_OP)
   add_definitions(-DSHAPE_OP)
 endif()
+
+# Pass the enabled op switches to the compiler as -D definitions.
+if (ELEMENTWISEMUL_OP)
+  add_definitions(-DELEMENTWISEMUL_OP)
+endif()
+if (SUM_OP)
+  add_definitions(-DSUM_OP)
+endif()