diff --git a/mace/BUILD b/mace/BUILD
index 98a9ab0510c4a1bf9a659402e071aec1f6963102..bebe0e9db98a475a98888e7f2689f326e21d9f3f 100644
--- a/mace/BUILD
+++ b/mace/BUILD
@@ -51,3 +51,11 @@ config_setting(
     },
     visibility = ["//visibility:public"],
 )
+
+config_setting(
+    name = "openmp_enabled",
+    define_values = {
+        "openmp": "true",
+    },
+    visibility = ["//visibility:public"],
+)
diff --git a/mace/core/BUILD b/mace/core/BUILD
index 81b731d7a5510635a568317b1f473453121da269..afaed252b2fd837c8c409cfbbbc31ddda7a494c7 100644
--- a/mace/core/BUILD
+++ b/mace/core/BUILD
@@ -42,9 +42,8 @@ cc_library(
         "runtime/opencl/*.h",
         "runtime/hexagon/*.h",
     ]),
-    linkopts = if_android([
+    linkopts = ["-ldl",] + if_android([
         "-pie",
-        "-ldl",
         "-lm",
     ]),
     deps = [
diff --git a/mace/core/operator.h b/mace/core/operator.h
index d673ca81b3a992a7ecaba38744d2c79fe9cf80cb..185bbc76664f48d9b622f7fd0bae6c5b7f8760aa 100644
--- a/mace/core/operator.h
+++ b/mace/core/operator.h
@@ -94,7 +94,6 @@ class Operator : public OperatorBase {
 
     for (const string &output_str : operator_def.output()) {
       if (ws->HasTensor(output_str)) {
-        Tensor *found_tensor = ws->GetTensor(output_str);
         outputs_.push_back(ws->GetTensor(output_str));
       } else {
         outputs_.push_back(MACE_CHECK_NOTNULL(ws->CreateTensor(
diff --git a/mace/core/runtime/hexagon/hexagon_control_wrapper.cc b/mace/core/runtime/hexagon/hexagon_control_wrapper.cc
index b7d2bfdab81f681b84c789d4653737e3b4a0a2e5..eab3a902d9ba3b8fbaa8d4e43ffa0cd72d2415ab 100644
--- a/mace/core/runtime/hexagon/hexagon_control_wrapper.cc
+++ b/mace/core/runtime/hexagon/hexagon_control_wrapper.cc
@@ -158,7 +158,6 @@ bool HexagonControlWrapper::TeardownGraph() {
 
 void HexagonControlWrapper::PrintLog() {
   char *buf;
-  unsigned char *p;
   if ((buf = new char[PRINT_BUFSIZE]) == NULL) return;
   hexagon_nn_getlog(nn_id_, reinterpret_cast<unsigned char*>(buf), PRINT_BUFSIZE);
   LOG(INFO) << string(buf);
@@ -168,7 +167,6 @@ void HexagonControlWrapper::PrintLog() {
 void HexagonControlWrapper::PrintGraph() {
   LOG(INFO) << "Print Graph";
   char *buf;
-  unsigned char *p;
   if ((buf = new char[PRINT_BUFSIZE]) == NULL) return;
   hexagon_nn_snpprint(nn_id_, reinterpret_cast<unsigned char*>(buf), PRINT_BUFSIZE);
   LOG(INFO) << string(buf);
diff --git a/mace/examples/BUILD b/mace/examples/BUILD
index 233b59f172ecee67cb7ba943f1692a8f84786292..ff47e1d98fcca18a96cee0c9a69bd5a7101554db 100644
--- a/mace/examples/BUILD
+++ b/mace/examples/BUILD
@@ -1,12 +1,12 @@
 # Examples
-load("//mace:mace.bzl", "if_android", "if_neon_enabled")
+load("//mace:mace.bzl", "if_android", "if_neon_enabled", "if_openmp_enabled")
 
 cc_binary(
     name = "helloworld",
     srcs = [
         "helloworld.cc",
     ],
-    linkopts = if_neon_enabled(["-fopenmp"]),
+    linkopts = if_openmp_enabled(["-fopenmp"]),
     deps = [
         "//mace/core",
         "//mace/ops",
@@ -17,7 +17,7 @@ cc_test(
     name = "benchmark_example",
     testonly = 1,
     srcs = ["benchmark_example.cc"],
-    linkopts = if_neon_enabled(["-fopenmp"]),
+    linkopts = if_openmp_enabled(["-fopenmp"]),
     linkstatic = 1,
     deps = [
         "//mace/core",
diff --git a/mace/kernels/BUILD b/mace/kernels/BUILD
index a4646d90819205079387682ef181d556feb1172a..ba1b601f267bcdfd3fa342ad4fa8bcc2d4f02d69 100644
--- a/mace/kernels/BUILD
+++ b/mace/kernels/BUILD
@@ -7,7 +7,7 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
-load("//mace:mace.bzl", "if_android", "if_neon_enabled")
+load("//mace:mace.bzl", "if_android", "if_neon_enabled", "if_openmp_enabled")
 
 cc_library(
     name = "kernels",
@@ -23,7 +23,7 @@ cc_library(
     ]) + if_neon_enabled(glob([
         "neon/*.h",
     ])),
-    copts = if_neon_enabled(["-fopenmp"]),
+    copts = if_openmp_enabled(["-fopenmp"]),
     linkopts = if_android(["-lm"]),
     deps = [
         "//mace/core",
diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h
index bd8fc7e98b91d3a9668b76b430943570df20be12..6f16bf6fc7aaa0c1c4b7400fe62458b5bd643574 100644
--- a/mace/kernels/batch_norm.h
+++ b/mace/kernels/batch_norm.h
@@ -86,19 +86,18 @@ struct BatchNormFunctor : BatchNormFunctorBase {
       }
     }
 
-    index_t pos = 0;
 
-#pragma omp parallel for
+#pragma omp parallel for collapse(4)
     for (index_t n = 0; n < batch; ++n) {
       for (index_t h = 0; h < height; ++h) {
         for (index_t w = 0; w < width; ++w) {
           for (index_t c = 0; c < channels; ++c) {
+            index_t pos = (((n * height) + h) * width + w) * channels + c;
             if (folded_constant_) {
               output_ptr[pos] = scale_ptr[c] * input_ptr[pos] + offset_ptr[c];
             } else {
               output_ptr[pos] = new_scale[c] * input_ptr[pos] + new_offset[c];
             }
-            ++pos;
           }
         }
       }
diff --git a/mace/kernels/bias_add.h b/mace/kernels/bias_add.h
index 5b87026debf2f760e3d82c09251b99ca1426aa3b..1e7f6dc85e694b75f159fc033ea4abc4d34f7806 100644
--- a/mace/kernels/bias_add.h
+++ b/mace/kernels/bias_add.h
@@ -33,14 +33,13 @@ struct BiasAddFunctor {
     T *output_ptr = output->mutable_data<T>();
 
 
-    index_t pos = 0;
-#pragma omp parallel for
+#pragma omp parallel for collapse(4)
     for (index_t n = 0; n < batch; ++n) {
       for (index_t h = 0; h < height; ++h) {
         for (index_t w = 0; w < width; ++w) {
           for (index_t c = 0; c < channels; ++c) {
+            index_t pos = (((n * height) + h) * width + w) * channels + c;
             output_ptr[pos] = input_ptr[pos] + bias_ptr[c];
-            ++pos;
           }
         }
       }
diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h
index 01c55434c85f1c5303cd555c3703861a7bf84e65..cc331a17c223eb252f11cc3555b79990ee8894ee 100644
--- a/mace/kernels/conv_2d.h
+++ b/mace/kernels/conv_2d.h
@@ -94,8 +94,6 @@ struct Conv2dFunctor : Conv2dFunctorBase {
     index_t padded_h_stop = input_height + paddings[0] - paddings[0] / 2;
     index_t padded_w_stop = input_width + paddings[1] - paddings[1] / 2;
 
-    index_t kernel_size = input_channels * kernel_h * kernel_w;
-
     Tensor::MappingGuard input_mapper(input);
     Tensor::MappingGuard filter_mapper(filter);
     Tensor::MappingGuard bias_mapper(bias);
@@ -105,13 +103,15 @@ struct Conv2dFunctor : Conv2dFunctorBase {
     auto bias_data = bias == nullptr ? nullptr : bias->data<T>();
     auto output_data = output->mutable_data<T>();
 
+#pragma omp parallel for collapse(4)
     for (int n = 0; n < batch; ++n) {
       for (int h = 0; h < height; ++h) {
         for (int w = 0; w < width; ++w) {
           for (int c = 0; c < channels; ++c) {
+            const int out_idx = ((n * height + h) * width + w) * channels + c;
             T bias_channel = 0.0f;
             if (bias) bias_channel = bias_data[c];
-            *output_data = bias_channel;
+            output_data[out_idx] = bias_channel;
             T sum = 0.0f;
             const T *filter_ptr = filter_data + c;
             for (int kh = 0; kh < kernel_h; ++kh) {
@@ -125,8 +125,6 @@ struct Conv2dFunctor : Conv2dFunctorBase {
                                    inw >= padded_w_start && inw < padded_w_stop,
                                "Out of range read from input: ", inh, ", ",
                                inw);
-                    // else padding with 0:
-                    // sum += 0;
                   } else {
                     index_t input_offset =
                         n * input_height * input_width * input_channels +
@@ -138,13 +136,11 @@ struct Conv2dFunctor : Conv2dFunctorBase {
                 }
               }
             }
-            *output_data += sum;
-            output_data++;
+            output_data[out_idx] += sum;
           }
         }
       }
     }
-    output_data = output->mutable_data<T>();
     DoActivation(output_data, output_data, output->NumElements(), activation_,
                  relux_max_limit_, prelu_alpha_);
   }
diff --git a/mace/kernels/depthwise_conv2d.h b/mace/kernels/depthwise_conv2d.h
index caff18938e3ff4a4fefd65964cb36fce7fe457d7..395797240c5dd589438f9fe5dc7c6de047d2290d 100644
--- a/mace/kernels/depthwise_conv2d.h
+++ b/mace/kernels/depthwise_conv2d.h
@@ -105,8 +105,6 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase {
     index_t padded_h_stop = input_height + paddings[0] - paddings[0] / 2;
     index_t padded_w_stop = input_width + paddings[1] - paddings[1] / 2;
 
-    const index_t kernel_size = kernel_h * kernel_w;
-
     Tensor::MappingGuard input_mapper(input);
     Tensor::MappingGuard filter_mapper(filter);
     Tensor::MappingGuard bias_mapper(bias);
@@ -116,7 +114,7 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase {
     const T *bias_ptr = bias == nullptr ? nullptr : bias->data<T>();
     T *output_ptr = output->mutable_data<T>();
 
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(4)
     for (int n = 0; n < batch; ++n) {
       for (int h = 0; h < height; ++h) {
         for (int w = 0; w < width; ++w) {
diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl
index 8d0b4d1a4cebca514ef70a8ebdfc3fdd20e7f525..9403c905b4bfb467322de9d11d83fa0bdef0ae3a 100644
--- a/mace/kernels/opencl/cl/conv_2d_3x3.cl
+++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl
@@ -62,17 +62,17 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
   for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
     const int in_idx = mul24(in_ch_blk, in_width);
     int filter_x_part0 = in_ch_blk << 2;
+    int in_hb_idx = height_idx;
     for (short hb_idx = 0; hb_idx < 3; ++hb_idx) {
-      // TODO (heliangliang) optimize out these muls
-      int in_hb_value = height_idx + mul24(hb_idx, dilation_h);
-      in_hb_value = select(in_hb_value + batch_idx,
-                           -1,
-                           (in_hb_value < 0 || in_hb_value >= in_height));
+      int in_hb_value = select(in_hb_idx + batch_idx,
+                               -1,
+                               (in_hb_idx < 0 || in_hb_idx >= in_height));
       int filter_x_part1 = 0;
+      int in_width_idx = 0;
       for (short width_idx = 0; width_idx < 3; ++width_idx) {
         int in_width_value;
 #define READ_INPUT(i)                                                                \
-        in_width_value = in_width##i + mul24(width_idx, dilation_w);                 \
+        in_width_value = in_width##i + in_width_idx;                                 \
         in_width_value = select(in_idx + in_width_value,                             \
                                 -1,                                                  \
                                 (in_width_value < 0 || in_width_value >= in_width)); \
@@ -120,8 +120,10 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
         out4 = mad(in4.w, weights3, out4);
 
         filter_x_part1 += rounded_in_ch;
+        in_width_idx += dilation_w;
       }
       filter_x_part0 += rounded_in_ch_x_3;
+      in_hb_idx += dilation_h;
     }
   }
 
diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc
index 451a19d068121bd55c7af4366d76a8afedfdecb6..cc9cfee9f86e8d35eedc39c66aed20cb9987fed5 100644
--- a/mace/kernels/opencl/helper.cc
+++ b/mace/kernels/opencl/helper.cc
@@ -191,6 +191,10 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
         {1, kwg_size / 32, 32, 1},
         {1, kwg_size / 64, 64, 1},
         {1, kwg_size / 128, 128, 1},
+        {4, kwg_size / 16, 4, 1},
+        {4, kwg_size / 28, 7, 1},
+        {4, kwg_size / 32, 8, 1},
+        {4, kwg_size / 56, 14, 1},
         {3, 15, 9, 1},
         {7, 15, 9, 1},
         {9, 7, 15, 1},
diff --git a/mace/kernels/pooling.h b/mace/kernels/pooling.h
index 99d2363f8b003a3802c67b7177d286fd19ca1643..d9a28c547244304051aff0097440dc8797dd7cb0 100644
--- a/mace/kernels/pooling.h
+++ b/mace/kernels/pooling.h
@@ -76,7 +76,6 @@ struct PoolingFunctor : PoolingFunctorBase {
     index_t height = output_shape[1];
     index_t width = output_shape[2];
     index_t channels = output_shape[3];
-    index_t out_image_size = height * width;
 
     index_t input_height = input_shape[1];
     index_t input_width = input_shape[2];
@@ -97,11 +96,12 @@ struct PoolingFunctor : PoolingFunctorBase {
     int padded_w_start = 0 - paddings[1] / 2;
 
     if (pooling_type_ == MAX) {
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(4)
       for (int b = 0; b < batch; ++b) {
         for (int h = 0; h < height; ++h) {
           for (int w = 0; w < width; ++w) {
             for (int c = 0; c < channels; ++c) {
+              index_t out_offset = (((b * height) + h) * width + w) * channels + c;
               index_t in_offset = b * in_image_size * input_channels + c;
               T res = std::numeric_limits<T>::lowest();
               for (int kh = 0; kh < kernel_h; ++kh) {
@@ -115,18 +115,18 @@ struct PoolingFunctor : PoolingFunctorBase {
                   }
                 }
               }
-              *output = res;
-              output++;
+              output[out_offset] = res;
             }
           }
         }
       }
     } else if (pooling_type_ == AVG) {
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(4)
       for (int b = 0; b < batch; ++b) {
         for (int h = 0; h < height; ++h) {
           for (int w = 0; w < width; ++w) {
             for (int c = 0; c < channels; ++c) {
+              index_t out_offset = (((b * height) + h) * width + w) * channels + c;
               index_t in_offset = b * in_image_size * input_channels + c;
               T sum = 0;
               int block_size = 0;
@@ -142,8 +142,7 @@ struct PoolingFunctor : PoolingFunctorBase {
                   }
                 }
               }
-              *output = sum / block_size;
-              output++;
+              output[out_offset] = sum / block_size;
             }
           }
         }
diff --git a/mace/kernels/softmax.h b/mace/kernels/softmax.h
index b29514a29e9ec4e1e37df3e643878ca914529b74..2e5bc495b0969bf1ef2a70afefaa2f02a616c097 100644
--- a/mace/kernels/softmax.h
+++ b/mace/kernels/softmax.h
@@ -29,21 +29,20 @@ struct SoftmaxFunctor {
     const index_t num_classes = logits_shape.back();
 #pragma omp parallel for
     for (index_t i = 0; i < batch_size; ++i) {
-      T max_value = *logits_ptr;
+      const index_t pos = i * num_classes;
+      T max_value = logits_ptr[pos];
       for (index_t c = 1; c < num_classes; ++c) {
-        max_value = std::max(max_value, logits_ptr[c]);
+        max_value = std::max(max_value, logits_ptr[pos + c]);
       }
       // TODO: check overflow?
       T sum = 0;
       std::vector<T> exp_data(num_classes);
       for (index_t c = 0; c < num_classes; ++c) {
-        exp_data[c] = ::exp((*logits_ptr - max_value));
+        exp_data[c] = ::exp((logits_ptr[pos + c] - max_value));
         sum += exp_data[c];
-        logits_ptr++;
       }
       for (index_t c = 0; c < num_classes; ++c) {
-        *output_ptr = exp_data[c] / sum;
-        output_ptr++;
+        output_ptr[pos + c] = exp_data[c] / sum;
       }
     }
   }
diff --git a/mace/mace.bzl b/mace/mace.bzl
index e9002885b80e153db7e1f2a529549434fd669a87..3db0ff5c4371bec94f6057c9710f458229e1952a 100644
--- a/mace/mace.bzl
+++ b/mace/mace.bzl
@@ -53,3 +53,9 @@ def if_not_hexagon_enabled(a):
       "//mace:hexagon_enabled": [],
       "//conditions:default": a,
   })
+
+def if_openmp_enabled(a):
+  return select({
+      "//mace:openmp_enabled": a,
+      "//conditions:default": [],
+  })
diff --git a/mace/ops/BUILD b/mace/ops/BUILD
index 442441e0ad1df535d97727d131ca71f7bdfbcc7d..dbe6e5e223a25344ecf7d5e3ad807cade17eb38d 100644
--- a/mace/ops/BUILD
+++ b/mace/ops/BUILD
@@ -7,7 +7,7 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
-load("//mace:mace.bzl", "if_android", "if_neon_enabled")
+load("//mace:mace.bzl", "if_android", "if_neon_enabled", "if_openmp_enabled")
 
 cc_library(
     name = "test",
@@ -34,7 +34,7 @@ cc_library(
         ["*.h"],
         exclude = ["ops_test_util.h"],
     ),
-    copts = if_neon_enabled(["-DMACE_ENABLE_NEON"]),
+    copts = if_openmp_enabled(["-fopenmp"]) + if_neon_enabled(["-DMACE_ENABLE_NEON"]),
     deps = [
         "//mace/kernels",
     ],
diff --git a/mace/utils/utils.h b/mace/utils/utils.h
index 73e1c6db319a150faf4b9a8f9bd87bba403a8aa5..ce682f06080813b52c6c883b25cc899aa40ca174 100644
--- a/mace/utils/utils.h
+++ b/mace/utils/utils.h
@@ -52,7 +52,7 @@ inline std::string ObfuscateString(const std::string &src,
                                    const std::string &lookup_table) {
   std::string dest;
   dest.resize(src.size());
-  for (int i = 0; i < src.size(); i++) {
+  for (size_t i = 0; i < src.size(); i++) {
     dest[i] = src[i] ^ lookup_table[i % lookup_table.size()];
   }
   return std::move(dest);
@@ -73,7 +73,7 @@ inline std::string ObfuscateSymbol(const std::string &src) {
   dest[0] = src[0]; // avoid invalid symbol which starts from 0-9
   const std::string encode_dict =
     "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_";
-  for (int i = 1; i < src.size(); i++) {
+  for (size_t i = 1; i < src.size(); i++) {
     char ch = src[i];
     int idx;
     if (ch >= '0' && ch <= '9') {
diff --git a/tools/bazel-adb-run.sh b/tools/bazel-adb-run.sh
index 9c9033b1aebc7e4198432eab8bfd2b7c14954f5a..6e1e68f2754254ca0e2b1015670231c546cc1875 100755
--- a/tools/bazel-adb-run.sh
+++ b/tools/bazel-adb-run.sh
@@ -43,7 +43,8 @@ bazel build -c opt $STRIP --verbose_failures $BAZEL_TARGET \
    --copt="-D_GLIBCXX_USE_C99_MATH_TR1" \
    --copt="-DMACE_DISABLE_NO_TUNING_WARNING" \
    --copt="-Werror=return-type" \
-   --define neon=false
+   --define neon=false \
+   --define openmp=true
 
 if [ $? -ne 0 ]; then
   exit 1
diff --git a/tools/export_lib.sh b/tools/export_lib.sh
index db3384e93a80fb8e5b2ab2151f6188ab136201cc..74cc0d6162305deae04a4f30622882ea9d76270c 100755
--- a/tools/export_lib.sh
+++ b/tools/export_lib.sh
@@ -71,6 +71,17 @@ build_target()
     $DSP_MODE_BUILD_FLAGS || exit 1
 }
 
+build_local_target()
+{
+  BAZEL_TARGET=$1
+  bazel build --verbose_failures -c opt --strip always $BAZEL_TARGET \
+    --copt="-std=c++11" \
+    --copt="-D_GLIBCXX_USE_C99_MATH_TR1" \
+    --copt="-Werror=return-type" \
+    --copt="-DMACE_OBFUSCATE_LITERALS" \
+    --define openmp=true || exit -1
+}
+
 merge_libs()
 {
   CREATE_LIB_NAME=$1
@@ -113,10 +124,17 @@ bash mace/tools/git/gen_version_source.sh ${CODEGEN_DIR}/version/version.cc || e
 
 echo "Step 3: Build libmace targets"
 bazel clean
-for target in ${all_targets[*]}
-do
-  build_target ${target}
-done
+if [ x"${RUNTIME}" = x"local" ]; then
+  for target in ${all_targets[*]}
+  do
+    build_local_target ${target}
+  done
+else
+  for target in ${all_targets[*]}
+  do
+    build_target ${target}
+  done
+fi
 
 
 echo "Step 4: Create mri files and generate merged libs"