diff --git a/mace/core/proto_utils.cc b/mace/core/proto_utils.cc
index 78a529597401807ed4821c2087a2c02c398b6006..7d9c437ed5a50b2b26230daeee799503fda9af42 100644
--- a/mace/core/proto_utils.cc
+++ b/mace/core/proto_utils.cc
@@ -314,6 +314,8 @@ const Argument& GetArgument(const OperatorDef& def, const string& name) {
   }
   MACE_CHECK(false, "Argument named ", name, "does not exist in operator ",
              ProtoDebugString(def));
+  // should not reach here, just make compiler happy
+  return std::move(Argument());
 }
 
 bool GetFlagArgument(const OperatorDef& def,
diff --git a/mace/kernels/BUILD b/mace/kernels/BUILD
index ea70ff22add88aec5c0ce964fb53111c6f8fc2a8..37d8936a33d4886204539331e15d40009a4ccc0c 100644
--- a/mace/kernels/BUILD
+++ b/mace/kernels/BUILD
@@ -17,8 +17,8 @@ cc_library(
     deps = [
         "//mace/core:core",
     ],
-    copts = ['-std=c++11'],
-    linkopts = ["-fopenmp"] + if_android(["-lm"]),
+    copts = ['-std=c++11', "-fopenmp",],
+    linkopts = if_android(["-lm"]),
 )
 
 cc_test(
diff --git a/mace/kernels/neon/conv_2d_neon_1x1.cc b/mace/kernels/neon/conv_2d_neon_1x1.cc
index 922b32651b044dd8701b8ddd70e8589ef1cba05e..c1010bcd2145fe633a3f02789ec0235cf0574203 100644
--- a/mace/kernels/neon/conv_2d_neon_1x1.cc
+++ b/mace/kernels/neon/conv_2d_neon_1x1.cc
@@ -35,10 +35,10 @@ void Conv2dNeonK1x1S1(const float *input,  // NCHW
   const index_t loop_remaining = total_pixels & 7;
 
   // benchmark omp collapsed(2)
+#pragma omp parallel for collapse(2)
   for (index_t n = 0; n < batch; ++n) {
-    const float *filter_ptr = filter;
-#pragma omp parallel for
     for (index_t c = 0; c < channels; ++c) {
+      const float *filter_ptr = filter;
       // TODO Will GCC opt these out?
       float *channel_output_start =
           output + n * channels * height * width + c * height * width;
diff --git a/mace/kernels/neon/conv_2d_neon_3x3.cc b/mace/kernels/neon/conv_2d_neon_3x3.cc
index ac5636a87a78f387e821bd46cb9c42692227b5d0..93ff3c91d9ce511b75de45ee13bc38a3fb953282 100644
--- a/mace/kernels/neon/conv_2d_neon_3x3.cc
+++ b/mace/kernels/neon/conv_2d_neon_3x3.cc
@@ -8,37 +8,6 @@
 namespace mace {
 namespace kernels {
 
-#define KERNEL_HEAD_CODE                                                                    \
-  int output_batch    = output_shape[0];                                                    \
-  int output_channels = output_shape[1];                                                    \
-  int output_height   = output_shape[2];                                                    \
-  int output_width    = output_shape[3];                                                    \
-  int input_batch    = input_shape[0];                                                      \
-  int input_channels = input_shape[1];                                                      \
-  int input_height   = input_shape[2];                                                      \
-  int input_width    = input_shape[3];                                                      \
-  int multiplier     = filter_shape == nullptr ? 0 : (filter_shape[0] / input_channels);    \
-  int filter_in_channels = filter_shape == nullptr ? input_channels : filter_shape[1];      \
-  for (int b = 0; b < output_batch; ++b) {                                                  \
-    float *output_ptr_base = output + b * output_channels * output_height * output_width;   \
-    for (int oc = 0; oc < output_channels; ++oc) {                                          \
-        const float *filter_ptr = filter + oc * filter_in_channels * kFilterSize;           \
-        const float *input_ptr = input + b * input_channels * input_height * input_width;   \
-        if (filter_shape != nullptr) {                                                      \
-          input_ptr += (oc / multiplier) * input_height * input_width;                      \
-        }                                                                                   \
-        float *output_ptr = output_ptr_base + oc * output_height * output_width;            \
-        std::fill(output_ptr, output_ptr + output_height * output_width, bias ? bias[oc] : 0); \
-        for (int ic = 0; ic < filter_in_channels; ++ic) {                                   \
-          float32x4_t n_filter_v[3] = {vld1q_f32(filter_ptr), vld1q_f32(filter_ptr+3), vld1q_f32(filter_ptr+6)};
-
-#define KERNEL_TAIL_CODE                         \
-        filter_ptr += kFilterSize;               \
-        input_ptr += input_height * input_width; \
-      }                                          \
-    }                                            \
-  }
-
 static const int kRegisterSize = 4;
 static const int kFilterSize = 9;
 
@@ -52,7 +21,29 @@ void Conv2dNeonK3x3S1(const float *input, // NCHW
 
   int height_count = (output_shape[2] >> 1) << 1;
 
-  KERNEL_HEAD_CODE
+  int output_batch    = output_shape[0];
+  int output_channels = output_shape[1];
+  int output_height   = output_shape[2];
+  int output_width    = output_shape[3];
+  int input_batch    = input_shape[0];
+  int input_channels = input_shape[1];
+  int input_height   = input_shape[2];
+  int input_width    = input_shape[3];
+  int multiplier     = filter_shape == nullptr ? 0 : (filter_shape[0] / input_channels);
+  int filter_in_channels = filter_shape == nullptr ? input_channels : filter_shape[1];
+#pragma omp parallel for collapse(2)
+  for (int b = 0; b < output_batch; ++b) {
+    for (int oc = 0; oc < output_channels; ++oc) {
+      float *output_ptr_base = output + b * output_channels * output_height * output_width;
+      const float *filter_ptr = filter + oc * filter_in_channels * kFilterSize;
+      const float *input_ptr = input + b * input_channels * input_height * input_width;
+      if (filter_shape != nullptr) {
+        input_ptr += (oc / multiplier) * input_height * input_width;
+      }
+      float *output_ptr = output_ptr_base + oc * output_height * output_width;
+      std::fill(output_ptr, output_ptr + output_height * output_width, bias ? bias[oc] : 0);
+      for (int ic = 0; ic < filter_in_channels; ++ic) {
+        float32x4_t n_filter_v[3] = {vld1q_f32(filter_ptr), vld1q_f32(filter_ptr+3), vld1q_f32(filter_ptr+6)};
 
         const float *row_ptr_v[kRegisterSize] = {
             input_ptr, input_ptr + input_width,
@@ -212,7 +203,11 @@ void Conv2dNeonK3x3S1(const float *input, // NCHW
           }
         }
 
-  KERNEL_TAIL_CODE
+        filter_ptr += kFilterSize;
+        input_ptr += input_height * input_width;
+      }
+    }
+  }
 }
 
 void Conv2dNeonK3x3S2(const float *input, // NCHW
@@ -224,7 +219,30 @@ void Conv2dNeonK3x3S2(const float *input, // NCHW
                       const index_t *output_shape) {
   int tail_step = 2 * (input_shape[3] - output_shape[3]);
 
-  KERNEL_HEAD_CODE
+  int output_batch    = output_shape[0];
+  int output_channels = output_shape[1];
+  int output_height   = output_shape[2];
+  int output_width    = output_shape[3];
+  int input_batch    = input_shape[0];
+  int input_channels = input_shape[1];
+  int input_height   = input_shape[2];
+  int input_width    = input_shape[3];
+  int multiplier     = filter_shape == nullptr ? 0 : (filter_shape[0] / input_channels);
+  int filter_in_channels = filter_shape == nullptr ? input_channels : filter_shape[1];
+
+#pragma omp parallel for collapse(2)
+  for (int b = 0; b < output_batch; ++b) {
+    for (int oc = 0; oc < output_channels; ++oc) {
+      float *output_ptr_base = output + b * output_channels * output_height * output_width;
+      const float *filter_ptr = filter + oc * filter_in_channels * kFilterSize;
+      const float *input_ptr = input + b * input_channels * input_height * input_width;
+      if (filter_shape != nullptr) {
+        input_ptr += (oc / multiplier) * input_height * input_width;
+      }
+      float *output_ptr = output_ptr_base + oc * output_height * output_width;
+      std::fill(output_ptr, output_ptr + output_height * output_width, bias ? bias[oc] : 0);
+      for (int ic = 0; ic < filter_in_channels; ++ic) {
+        float32x4_t n_filter_v[3] = {vld1q_f32(filter_ptr), vld1q_f32(filter_ptr+3), vld1q_f32(filter_ptr+6)};
 
         const float *row_ptr_v[3] = {
             input_ptr, input_ptr + input_width, input_ptr + 2 * input_width
@@ -291,10 +309,11 @@ void Conv2dNeonK3x3S2(const float *input, // NCHW
           }
         }
 
-  KERNEL_TAIL_CODE
+        filter_ptr += kFilterSize;
+        input_ptr += input_height * input_width;
+      }
+    }
+  }
 }
-#undef KERNEL_HEAD_CODE
-#undef KERNEL_TAIL_CODE
-
 } //  namespace kernels
 } //  namespace mace
diff --git a/mace/ops/BUILD b/mace/ops/BUILD
index 02d29ce03d94ae66ede3660d01fa32a19871051a..30376fa01c1ccb9aeb175cd483e5599511f72855 100644
--- a/mace/ops/BUILD
+++ b/mace/ops/BUILD
@@ -34,7 +34,7 @@ cc_library(
         ["*.h"],
         exclude = ["ops_test_util.h"],
     ),
-    copts = ["-std=c++11"],
+    copts = ["-std=c++11", "-fopenmp",],
     deps = [
         "//mace/core",
         "//mace/kernels",
@@ -50,7 +50,7 @@ cc_test(
         ["*_test.cc"],
     ),
     copts = ["-std=c++11"],
-    linkopts = if_android(["-ldl"]),
+    linkopts = ["-fopenmp",] + if_android(["-ldl"]),
     linkstatic = 1,
     deps = [
         ":ops",
@@ -64,7 +64,7 @@ cc_test(
     testonly = 1,
     srcs = glob(["*_benchmark.cc"]),
     copts = ["-std=c++11"],
-    linkopts = if_android(["-ldl"]),
+    linkopts = ["-fopenmp",] + if_android(["-ldl"]),
     linkstatic = 1,
     deps = [
         ":ops",