diff --git a/src/common/enforce.h b/src/common/enforce.h
index 092afe0b794a91fd2fa5bc82095943f45872a333..1bacfb88d328c85de9b284249c8d9d58e7fc8e5e 100644
--- a/src/common/enforce.h
+++ b/src/common/enforce.h
@@ -63,7 +63,12 @@ struct PaddleMobileException : public std::exception {
 #else
 
 #define PADDLE_MOBILE_THROW_EXCEPTION(...)
-#define PADDLE_MOBILE_ENFORCE(stat, ...)
+#define PADDLE_MOBILE_ENFORCE(stat, ...) \
+  {                                      \
+    if (stat) {                          \
+    } else {                             \
+    }                                    \
+  }
 
 #endif
diff --git a/src/operators/gru_op.cpp b/src/operators/gru_op.cpp
index cb1b0b335b2f2eebcb6863c3f1ff8cff2c2fb74c..db0936d00c1dcb0e90e8664660f92ed004c258b7 100644
--- a/src/operators/gru_op.cpp
+++ b/src/operators/gru_op.cpp
@@ -15,7 +15,6 @@ limitations under the License. */
 #ifdef GRU_OP
 
 #include "operators/gru_op.h"
-#include 
 #include 
 #include "common/enforce.h"
 
diff --git a/src/operators/kernel/arm/cast_kernel.cpp b/src/operators/kernel/arm/cast_kernel.cpp
index 8c4028b0d5b1a88c9962c029afb0b958ad9ee34e..ec7e133d2b8c1f88601ff6db2ad15b9e6d8040f7 100644
--- a/src/operators/kernel/arm/cast_kernel.cpp
+++ b/src/operators/kernel/arm/cast_kernel.cpp
@@ -15,7 +15,6 @@ limitations under the License. */
 #ifdef CAST_OP
 
 #include 
-#include 
 #include 
 #include "framework/data_type.h"
 #include "operators/kernel/kernels.h"
diff --git a/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h b/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h
index b6288380a04c71b3d6467f7f6648db046ae9acc9..19561d6b84e82b96463abc553042aa56ae38036d 100644
--- a/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h
+++ b/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h
@@ -26,18 +26,12 @@ namespace paddle_mobile {
 namespace operators {
 
 template <typename T>
-struct AddFunctor {
-  inline T operator()(T a, T b) const { return a + b; }
-};
-
-template <typename P>
-void ElementwiseAddCompute(const ElementwiseAddParam<CPU> &param) {
-  const Tensor *input_x = param.InputX();
-  const Tensor *input_y = param.InputY();
-  Tensor *Out = param.Out();
-  Out->mutable_data<float>();
+inline void ElementwiseAddCompute(const ElementwiseAddParam<CPU> &param) {
+  const framework::Tensor *input_x = param.InputX();
+  const framework::Tensor *input_y = param.InputY();
+  framework::Tensor *Out = param.Out();
   int axis = param.Axis();
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+
   const auto &x_dims = input_x->dims();
   const auto &y_dims = input_y->dims();
   /// axis = -1 represent the last dimensions.
@@ -57,18 +51,20 @@ void ElementwiseAddCompute(const ElementwiseAddParam<CPU> &param) {
   const float *bias_data = input_y->data<float>();
   const float *input_data = input_x->data<float>();
   float *output_data = Out->mutable_data<float>();
+
+  #pragma omp parallel for collapse(2)
   for (int i = 0; i < batch; ++i) {
-    #pragma omp parallel for
     for (int j = 0; j < channels; ++j) {
       size_t offset = (i * channels + j) * elementwise_num;
       const float *input = input_data + offset;
-      const float *bias = bias_data + j;
+      const float bias = bias_data[j];
       float *output = output_data + offset;
-
+      int remain = elementwise_num;
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
       int loop = elementwise_num >> 0x4;
-      int remain = elementwise_num & 0xF;
+      remain = elementwise_num & 0xF;
       for (int k = 0; k < loop; ++k) {
-        float32x4_t rb = vdupq_n_f32(*bias);
+        float32x4_t rb = vdupq_n_f32(bias);
         float32x4_t r0 = vld1q_f32(input);
         float32x4_t r1 = vld1q_f32(input + 4);
         float32x4_t r2 = vld1q_f32(input + 8);
@@ -84,15 +80,12 @@ void ElementwiseAddCompute(const ElementwiseAddParam<CPU> &param) {
         input += 16;
         output += 16;
       }
+#endif
       for (int k = 0; k < remain; ++k) {
-        output[k] = input[k] + *bias;
+        output[k] = input[k] + bias;
       }
     }
   }
-#else
-  ElementwiseComputeEx<AddFunctor<float>, float>(input_x, input_y, axis,
-                                                 AddFunctor<float>(), Out);
-#endif
 }
 
 template class ElementwiseAddKernel<CPU, float>;
diff --git a/test/net/test_googlenet.cpp b/test/net/test_googlenet.cpp
index 2b87d0bb4e1eda086b37bb12a05797fe0f659506..24e74ffeaca6c0b27f04109721ffdab61d7fbaee 100644
--- a/test/net/test_googlenet.cpp
+++ b/test/net/test_googlenet.cpp
@@ -19,7 +19,7 @@ limitations under the License. */
 int main(int argc, char* argv[]) {
   if (argc < 2) {
     std::cout << "Usage: ./test_benchmark feed_shape [thread_num] [use_fuse]\n"
-              << "feed_shape: input tensor shape, such as 1,3,224,224.\n"
+              << "feed_shape: input tensor shape, such as 3,224,224.\n"
               << "thread_num: optional int, threads count, default is 1.\n"
               << "use_fuse: optional bool, default is 0.\n";
     return 1;
@@ -52,7 +52,7 @@ int main(int argc, char* argv[]) {
     sscanf(feed_shape, "%d,%d,%d", &dims[1], &dims[2], &dims[3]);
   }
   std::cout << "feed shape: [" << dims[0] << ", " << dims[1] << ", "
-            << dims[2] << ", " << dims[3] << "]\n";
+            << dims[2] << ", " << dims[3] << "]" << std::endl;
   GetInput<float>(g_test_image_1x3x224x224, &input, dims);
   // warmup
   for (int i = 0; i < 10; ++i) {
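For reference, below is a minimal standalone sketch (not part of the patch) of the per-channel bias-broadcast add that the rewritten ElementwiseAddCompute performs: a NEON fast path over 16-float blocks, a scalar tail that doubles as the non-NEON fallback, and the loop pair collapsed for OpenMP. The function name BiasBroadcastAdd and the shapes in main() are illustrative, not taken from the repository; compile with -fopenmp to enable the parallel loop.

// Standalone sketch, assuming made-up shapes; batch/channels/elementwise_num
// mirror the variable names in the diff above.
#include <cstddef>
#include <cstdio>
#include <vector>
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif

// Adds one bias value per channel across elementwise_num contiguous elements,
// i.e. out[n][c][i] = x[n][c][i] + y[c], the broadcast case the kernel handles.
void BiasBroadcastAdd(const float *input, const float *bias, float *output,
                      int batch, int channels, int elementwise_num) {
  #pragma omp parallel for collapse(2)
  for (int i = 0; i < batch; ++i) {
    for (int j = 0; j < channels; ++j) {
      size_t offset = (size_t)(i * channels + j) * elementwise_num;
      const float *in = input + offset;
      float *out = output + offset;
      const float b = bias[j];
      int remain = elementwise_num;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
      int loop = elementwise_num >> 4;  // 16 floats per iteration
      remain = elementwise_num & 0xF;
      float32x4_t rb = vdupq_n_f32(b);  // broadcast the bias into all 4 lanes
      for (int k = 0; k < loop; ++k) {
        vst1q_f32(out, vaddq_f32(vld1q_f32(in), rb));
        vst1q_f32(out + 4, vaddq_f32(vld1q_f32(in + 4), rb));
        vst1q_f32(out + 8, vaddq_f32(vld1q_f32(in + 8), rb));
        vst1q_f32(out + 12, vaddq_f32(vld1q_f32(in + 12), rb));
        in += 16;
        out += 16;
      }
#endif
      // Scalar tail; on non-NEON builds this handles every element.
      for (int k = 0; k < remain; ++k) {
        out[k] = in[k] + b;
      }
    }
  }
}

int main() {
  const int batch = 1, channels = 2, elementwise_num = 5;
  std::vector<float> x(batch * channels * elementwise_num, 1.0f);
  std::vector<float> y = {10.0f, 20.0f};  // one bias per channel
  std::vector<float> out(x.size());
  BiasBroadcastAdd(x.data(), y.data(), out.data(), batch, channels,
                   elementwise_num);
  printf("%.1f %.1f\n", out[0], out[out.size() - 1]);  // expect 11.0 21.0
}

Hoisting the bias into a scalar (bias_data[j] instead of a pointer) and guarding only the vector body with the NEON macro, as the patch does, lets the same scalar tail serve both the remainder and the portable fallback, which is why the #else branch with ElementwiseComputeEx could be dropped.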